<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=utf8">
<title>/usr/web/sources/contrib/geoff/sdnvme.c - Plan 9 from Bell Labs</title>
<!-- THIS FILE IS AUTOMATICALLY GENERATED. -->
<!-- EDIT sources.tr INSTEAD. -->
</meta>
</head>
<body>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="line-height: 1.2em; margin-left: 1.00in; text-indent: 0.00in; margin-right: 1.00in; margin-top: 0; margin-bottom: 0; text-align: center;">
<span style="font-size: 10pt"><a href="/plan9/">Plan 9 from Bell Labs</a>&rsquo;s /usr/web/sources/contrib/geoff/sdnvme.c</span></p>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center><font size=-1>
Copyright © 2009 Alcatel-Lucent.<br />
Distributed under the
<a href="/plan9/license.html">Lucent Public License version 1.02</a>.
<br />
<a href="/plan9/download.html">Download the Plan 9 distribution.</a>
</font>
</center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<table width="100%" cellspacing=0 border=0><tr><td align="center">
<table cellspacing=0 cellpadding=5 bgcolor="#eeeeff"><tr><td align="left">
<pre>
<!-- END HEADER -->
/*
 * driver for NVM Express 1.1 interface to PCI-Express solid state disk
 * (i.e., flash memory).
 *
 * currently the controller is in the drive, so there's no multiplexing
 * of drives through the controller.  multiple namespaces (actually number
 * spaces) are assumed to refer to different views of the same disk
 * (different block sizes).
 *
 * many features of NVME are ignored in the interest of simplicity and speed.
 * many of them are intended to jump on a bandwagon (e.g., VMs) or check a box.
 * using interrupts rather than polling costs us about 4% in large-block
 * sequential read performance.
 */
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../port/error.h"
#include "../port/sd.h"

/* address of the start of the nvme page (of ctlr-&gt;pgsz bytes) containing p */
#define PAGEOF(ctlr, p) ((uintptr)(p) &amp; ~((ctlr)-&gt;pgsz-1))

/*
 * submission queue occupancy tests.  the queue counts as full when
 * advancing the tail by one would make it equal the head.
 */
#define QFULL(qp)	((qp)-&gt;qidx.hd == qidxplus1((qp), (qp)-&gt;qidx.tl))
#define QEMPTY(qp)	((qp)-&gt;qidx.hd == (qp)-&gt;qidx.tl)

/*
 * issue command `op' on ctlr's admin queue pair via nvmeissue, with no
 * SDreq and lba 0.  every argument is parenthesised so that any
 * expression may be passed safely.
 */
#define nvmeadmissue(ctlr, op, nsid, buf) \
	nvmeissue((ctlr), &amp;(ctlr)-&gt;qpair[Qadmin], nil, (op), (nsid), (buf), 0)

/*
 * driver-wide constants: queue-pair and doorbell ordinals, identify
 * selectors, and compile-time tunables.
 */
enum {
	/* fundamental constants */
	Qadmin,			/* queue-pair ordinals; Qadmin fixed at 0 */
	Qio,
	Nqueues,

	Vall = 1&lt;&lt;Qadmin | 1&lt;&lt;Qio,	/* all interesting vector */

	Subq = 0,		/* doorbell ordinals within one queue pair */
	Complq,
	Qtypes,

	Nsunused = 0,		/* nsid passed when creating the i/o queues */
	Nsall	= ~0ul,		/* nsid that makes mkcmd's Admid identify the ctlr */

	Idns	= 0,		/* cdw10[0] selectors for Admid */
	Idctlr,
	Idnsids,

	Minsect	= 512,		/* smallest sector size accepted (bytes) */

	/* tunable parameters */
	Debugintr = 0,		/* non-zero: trace interrupts and issues */
	Debugns = 0,		/* non-zero: print namespace formats found */

	Timeout = 20*1000,	/* adjust to taste. started at 2000 ms. */

	/*
	 * NVME page size must be &gt;= sector size.  anything over 8K only
	 * benefits bulk copies and benchmarks.
	 */
	Startpgsz = Sdalign,	/* on samsung sm951, 4k ≤ page_size ≤ 128MB */

	Qlen	= 32,	/* defaults; queue lengths must be powers of 2 &lt; 4K */
	Cqlen	= 16,

	NCtlr	= 8,	/* each takes a pci-e or m.2 slot */
	NCtlrdrv= 1,
	NDrive	= NCtlr * NCtlrdrv,

	Reserved = (ushort)~0,		/* placeholder cmdid */
};

/*
 * admin commands: opcodes that mkcmd stores in Cmd.opcode for commands
 * sent on the admin queue (see nvmeadmissue).
 */
enum Adminops {
	Admmkiosq	= 1,	/* create i/o submission q */
	Admmkiocq	= 5,	/* create i/o completion q */
	Admid		= 6,	/* identify */
};

/*
 * I/O commands: opcodes for the i/o queue.  of these, only Cmdread and
 * Cmdwrite are produced by scsiop2nvme in the code visible here.
 */
enum Opcode {
	Cmdflush	= 0,
	Cmdwrite	= 1,
	Cmdread		= 2,
	Cmdwriteuncorr	= 4,
	Cmdcompare	= 5,
	Cmddsm		= 9,
};

typedef struct Cmd Cmd;
typedef struct Completion Completion;
typedef struct Ctlr Ctlr;
typedef struct Ctlrid Ctlrid;
typedef struct Doorbell Doorbell;
typedef struct Lbafmt Lbafmt;
typedef struct Nsid Nsid;
typedef struct Nvindx Nvindx;
typedef struct Qpair Qpair;
typedef struct Regs Regs;
typedef struct Transfer Transfer;

extern SDifc sdnvmeifc;

/* head and tail indices into one circular queue (array of entries) */
struct Nvindx {
	unsigned hd;			/* remove at this index */
	unsigned tl;			/* add at this index */
};

/*
 * a submission queue and the completion queue paired with it.
 * both arrays are allocated by allocqpair and released by freeqpair.
 */
struct Qpair {
	Cmd	*q;			/* base of Cmd array */
	Nvindx	qidx;			/* submission queue head and tail */
	int	sqlen;			/* entries in q */
	int	writelast;		/* flag: read or write in last cmd? */

	Completion *cmpl;		/* base of Completion array */
	Nvindx	cidx;			/* completion queue head (tl is only zeroed) */
	int	cqlen;			/* entries in cmpl */
	int	phase;			/* initial phase bit setting in cmpl */
};

/*
 * bookkeeping for one command between issue and completion.
 * these are reused and never freed; they live on the chain at Ctlr.xfrs.
 */
struct Transfer {
	Transfer *next;			/* next on ctlr's chain */
	Rendez;				/* issuer sleeps here until done */
	int	done;			/* flag for rendezvous */
	int	status;			/* from completion */
	ulong	qtm;			/* time at enqueue */
	uvlong	stcyc;			/* cycles at enqueue */
	ushort	cmdid;			/* 0 means available */
	int	rdwr;			/* Read, Write or 2 (other); indexes maxcyc */
};

/*
 * per-controller state.  since the drive contains the controller, the
 * single drive's parameters are kept here too.
 */
struct Ctlr {
	Regs*	regs;			/* memory-mapped I/O registers */
	SDev*	sdev;
	Intrcommon;
	uintptr	port;			/* physical addr of I/O registers */

	int	pgsz;			/* size of an `nvme page' */
	int	minpgsz;
	int	mdts;			/* actual value, not log2; unit minpgsz */
	int	sqlen;			/* sub q len */
	int	cqlen;			/* compl q len */
	int	stride;	/* bytes from base of one doorbell reg. to the next */

	/* per-drive scalars, since there is only one drive */
	vlong	sectors;		/* total, copy to SDunits */
	int	secsize;		/* sector size, copy to SDunits */
	int	ns;			/* namespace of the single drive */
	/* stats */
	int	maxqlen[2];	/* high water marks of read, write queues */
	/* example results: rd 89 µs, wr 325 µs */
	uvlong	maxcyc[3];  /* high water marks of read, write, admin cycles */

	/* per controller */
	QLock;				/* serialise q notifications */
	Rendez;				/* q empty/full notifications */
	Lock;				/* intr svc */
	Lock	issuelock;		/* inflight &amp; q heads &amp; tail mostly */
	Lock	xfrlock;		/* held by getfreexfr while picking an xfr */
	Lock	shutlock;
	int	inflight;		/* count of xfrs in progress */
	int	intrsena;		/* interrupts we care about */
	Transfer *xfrs;			/* transfers in flight or done */
	Qpair	qpair[Nqueues];		/* use a single admin queue pair */

	/* per-drive arrays */
	char	serial[20+1];		/* NUL-terminated copies made by idcopy */
	char	model[40+1];
	char	fw[8+1];
};

/*
 * memory-mapped controller registers, in the order the NVMe register
 * map gives them.  the fixed registers fill the first 0x38 bytes and
 * the doorbells start at offset 0x1000.
 *
 * offset 0x18 is reserved and the subsystem-reset register sits at 0x20,
 * after csts; the two were previously declared the other way around.
 */
struct Regs {
	uvlong	cap;		/* controller capabilities */
	ulong	vs;		/* version */
	/* intm* bits are actually vector number offsets */
	ulong	intmset;	/* intr mask set: bit # is i/o completion q # */
	ulong	intmclr;	/* intr mask clear: " */
	ulong	cc;		/* controller configuration */
	ulong	_rsvd2;		/* reserved (offset 0x18) */
	ulong	csts;		/* controller status */
	ulong	nssrc;		/* reset, iff cap.nssrs set (offset 0x20) */
	ulong	aqa;		/* admin queue attributes */
	uvlong	asq;		/* admin submit queue base address */
	uvlong	acq;		/* admin completion queue base address */
	uchar	_pad0[0x1000 - 0x38];
	/* this is the nominal doorbell layout, with stride of 4 */
	struct Doorbell {
		ulong	sqtl;	/* submission queue tail */
		ulong	cqhd;	/* completion queue head */
	} doorbell[Nqueues];
};

/*
 * making the doorbell stride variable at run time requires changing the
 * declaration and addressing of the Regs-&gt;doorbell array, making it clunkier.
 * supposedly non-zero strides are only desirable in VMs, for efficiency.
 */
/*
 * clunky doorbell register addressing for any stride.
 * qid is a queue-pair ordinal (Qadmin or Qio), not a Qpair pointer.
 * each expansion is fully parenthesised so it can be used as an operand
 * of any operator.
 */
/* instead of &amp;ctlr-&gt;regs-&gt;doorbell[qid].sqtl */
#define doorbellsqtl(ctlr, qid) ((ulong *)\
	((char *)(ctlr)-&gt;regs-&gt;doorbell + (ctlr)-&gt;stride*(Qtypes*(qid) + Subq)))
/* instead of &amp;ctlr-&gt;regs-&gt;doorbell[qid].cqhd */
#define doorbellcqhd(ctlr, qid) ((ulong *)\
	((char *)(ctlr)-&gt;regs-&gt;doorbell + (ctlr)-&gt;stride*(Qtypes*(qid) + Complq)))

/* bit definitions for the Regs fields named in the section comments */
enum {
	/* cap */
	Nssrs		= 1ull &lt;&lt; 36,

	/* cc */
	Enable		= 1 &lt;&lt; 0,
	Cssnvm		= 0 &lt;&lt; 4,		/* nvm command set */
	Cssmask		= 7 &lt;&lt; 4,
	Shnnone		= 0 &lt;&lt; 14,		/* shutdown style */
	Shnnormal	= 1 &lt;&lt; 14,
	Shnabrupt	= 2 &lt;&lt; 14,
	Shnmask		= 3 &lt;&lt; 14,

	/* csts */
	Rdy		= 1 &lt;&lt; 0,		/* okay to add to sub. q */
	Cfs		= 1 &lt;&lt; 1,		/* controller fatal status */
	Shstnormal	= 0 &lt;&lt; 2,		/* shutdown status */
	Shstoccur	= 1 &lt;&lt; 2,
	Shstcmplt	= 2 &lt;&lt; 2,
	Shstmask	= 3 &lt;&lt; 2,
	Nssro		= 1 &lt;&lt; 4,
};

/*
 * one submission queue entry.  CTASSERT below requires it to be exactly
 * 64 bytes.  mkcmd zeroes and fills one of these in place in the queue.
 */
struct Cmd {
	/* common 40-byte header */
	uchar	opcode;		/* command dword 0 */
	uchar	flags;
	ushort	cmdid;
	ulong	nsid;
	ulong	cdw2[2];	/* not used */
	uvlong	metadata;
	uvlong	prp1;		/* buffer memory address */
	uvlong	prp2;		/* zero, buffer addr, or prp list addr */
	union {
		ulong	cdw10[6]; /* admin: command dwords 10-15 */
		struct {	/* nvm i/o */
			uvlong	slba;		/* starting lba */
			ushort	length;		/* sector count minus 1 (see mkcmd) */
			ushort	control;
			ulong	dsmgmt;
			/* rest are for end-to-end protection only */
			ulong	reftag;
			ushort	apptag;
			ushort	appmask;
		};
	};
};

enum {
	/* cdw10[1] for Admmkiocq */
	Ien	= 1&lt;&lt;1,		/* intr enabled */
	Pc	= 1&lt;&lt;0,		/* physically contiguous */
};

/*
 * one completion queue entry, written by the controller.
 * CTASSERT below requires it to be exactly 16 bytes.
 */
struct Completion {
	ulong	specific;
	ulong	_pad;
	ushort	sqhd;		/* copied into the sub. q head by advancesqhd */
	ushort	sqid;
	ushort	cmdid;		/* matched against Transfer.cmdid */
	ushort	stsphs;		/* status + 1 phase bit */
};

enum {
	Phase	= 1,		/* phase bit in stsphs */
};

/*
 * the start of the data returned by Admid for the controller
 * (nsid Nsall).  only the fields this driver reads are named; the
 * padding arrays place mdts at byte 77 and nns at byte 516.
 */
struct Ctlrid {
	ushort	pcivid;
	ushort	pcissvid;
	char	serial[20];	/* space-padded, unterminated strings */
	char	model[40];
	char	fw[8];
	char	_72_[77-72];
	uchar	mdts;		/* log2(max data xfr size), unit: min pg sz */
				/* 0 is unlimited */
	char	_516_[516-78];	/* ... lots of uninteresting stuff ... */
	ulong	nns;		/* number of namespaces present */
	/* ... lots of uninteresting stuff ... */
};

/*
 * the start of the data returned by Admid for one namespace.
 * bestns scans lbafmt[]; nvmectlrenable takes the sector count from cap.
 */
struct Nsid {
	uvlong	size;
	uvlong	cap;
	uvlong	used;
	uchar	feat;
	uchar	lnbafmts;
	uchar	fmtlbasz;
	uchar	mdcap;
	uchar	dpc;
	uchar	dps;
	uchar	optnmic;
	uchar	optrescap;
	uchar	_pad0[128-32];	/* places lbafmt at byte 128 */
	struct Lbafmt {
		ushort	mdsize;		/* bestns only accepts formats with 0 */
		uchar	lglbasize;	/* log2(lba size) */
		uchar	relperf;
	} lbafmt[16];
	/* ... uninteresting stuff ... */
};

/*
 * configure() programs log2 of these sizes into cc, so catch any
 * accidental change to the entry layouts at compile time.
 */
CTASSERT(sizeof(Cmd) == 64, cmd_wrong_size);
CTASSERT(sizeof(Completion) == 16, compl_wrong_size);

static Lock clocklck;		/* guards clockrunning */
static int clockrunning;	/* set once ckstuck has been hooked to the clock */
static ulong iosttck;		/* tick of most recently-started i/o */
static int nctlrs;		/* number of entries used in ctlrs */
static Ctlr *ctlrs[NCtlr];	/* controllers found by nvmepnp, for ckstuck */

/*
 * step qp's completion queue head index by one, wrapping to zero at
 * ctlr-&gt;cqlen.  each wrap toggles qp-&gt;phase.
 */
static void
cidxincr(Ctlr *ctlr, Qpair *qp)
{
	qp-&gt;cidx.hd++;
	if (qp-&gt;cidx.hd &lt; ctlr-&gt;cqlen)
		return;
	qp-&gt;cidx.hd = 0;
	qp-&gt;phase ^= Phase;
}

#ifdef unused
/* panic, naming `where', if csts reports controller fatal status */
static void
isfatal(Regs *regs, char *where)
{
	if ((regs-&gt;csts &amp; Cfs) == 0)
		return;
	panic("nvme: fatal controller error %s", where);
}
#endif

/*
 * walk ctlr's chain of transfers looking for the one whose cmdid matches.
 * returns it, or nil if no transfer carries that id.
 */
static Transfer *
findxfr(Ctlr *ctlr, int cmdid)
{
	Transfer *t;

	t = ctlr-&gt;xfrs;
	while (t != nil &amp;&amp; t-&gt;cmdid != cmdid)
		t = t-&gt;next;
	return t;
}

/*
 * cqhd is the entry at the head of queue qid's completion queue.
 * find the transfer it belongs to (panic if there is none), record its
 * status and elapsed cycles, mark it done and wake the process sleeping
 * on it.
 */
static void
completexfr(Ctlr *ctlr, Completion *cqhd, int qid)
{
	uvlong elapsed;
	uvlong *hiwater;
	Transfer *xfr;

	if (Debugintr)
		iprint("intr q %d cmdid %d...", qid, cqhd-&gt;cmdid);
	xfr = findxfr(ctlr, cqhd-&gt;cmdid);
	if (xfr == nil)
		panic("sd%C0: nvmeinterrupt: unexpected completion cmd id %d",
			ctlr-&gt;sdev-&gt;idno, cqhd-&gt;cmdid);
	/* finished, but only after the issuer's timeout had passed? */
	if (xfr-&gt;qtm != 0 &amp;&amp; TK2MS(sys-&gt;ticks) - xfr-&gt;qtm &gt;= Timeout)
		iprint("sd%C0: nvmeinterrupt: completed cmd id %d but "
			"took more than %d s.\n",
			ctlr-&gt;sdev-&gt;idno, cqhd-&gt;cmdid, Timeout/1000);

	/* keep the worst-case cycle count per kind of transfer */
	cycles(&amp;elapsed);
	elapsed -= xfr-&gt;stcyc;
	hiwater = &amp;ctlr-&gt;maxcyc[xfr-&gt;rdwr];
	if (*hiwater &lt; elapsed)
		*hiwater = elapsed;

	xfr-&gt;status = cqhd-&gt;stsphs &amp; ~Phase;	/* status without phase bit */
	xfr-&gt;done = 1;
	xfr-&gt;qtm = 0;
	wakeup(xfr);		/* notify of completion */
}

/*
 * adopt the submission queue head reported in completion cqhd as qp's
 * head index, then wake anyone sleeping on ctlr for queue space.
 */
static void
advancesqhd(Ctlr *ctlr, Qpair *qp, Completion *cqhd, int qid)
{
	ushort newhd;

	newhd = cqhd-&gt;sqhd;
	if (Debugintr)
		iprint("sw q %d sqhd set to %d...", qid, newhd);
	qp-&gt;qidx.hd = newhd;
	wakeup(ctlr);		/* notify of sqhd advance */
}

/*
 * consume the entry at the head of qp's completion queue: step the head
 * index (cidxincr) and write the new value to queue qid's completion
 * doorbell, which acknowledges the completion to the controller.
 */
static void
advancecqhd(Ctlr *ctlr, Qpair *qp, int qid)
{
	ulong *bell;

	cidxincr(ctlr, qp);
	if (Debugintr)
		iprint("doorbell q %d cqhd set to %d\n", qid, qp-&gt;cidx.hd);
	bell = doorbellcqhd(ctlr, qid);
	*bell = qp-&gt;cidx.hd;
	coherence();
}

/*
 * Act on and clear the interrupt(s).
 * In order to share PCI IRQs, just ignore spurious interrupts.
 * Advances queue head indices past completed operations.
 *
 * arg is the Ctlr.  Also called directly (with a nil Ureg) by nvmeissue
 * after a timeout and by ckstuck from the clock, i.e., as a poll.
 * Returns Intrforme if at least one completion was processed,
 * otherwise Intrnotforme.
 */
static Intrsvcret
nvmeinterrupt(Ureg *, void* arg)
{
	int qid, ndone, donepass; /* qid's not a great name (see path.qid) */
	ulong causes;
	Completion *cqhd;
	Ctlr *ctlr;
	Qpair *qp;
	Regs *regs;

	ctlr = arg;
	regs = ctlr-&gt;regs;
	causes = regs-&gt;intmset;		/* value is read but not otherwise used */
	USED(causes);
	ilock(&amp;ctlr-&gt;issuelock); /* keep other cpus out of intr svc, indices */
	if (ctlr-&gt;inflight == 0) {	/* not expecting an interrupt? */
		/* probably lost a race with polling: nothing to do */
		iunlock(&amp;ctlr-&gt;issuelock);
		return Intrnotforme;
	}

	ndone = 0;
	/* keep sweeping all queues until a full sweep finds nothing new */
	do {
		donepass = 0;
		for (qid = Nqueues - 1; qid &gt;= 0; qid--) /* scan i/o q 1st */
			for (qp = &amp;ctlr-&gt;qpair[qid]; ; ) {
				cqhd = &amp;qp-&gt;cmpl[qp-&gt;cidx.hd];
				/*
				 * an entry whose phase bit still equals
				 * qp-&gt;phase has not been (re)written yet.
				 */
				if ((cqhd-&gt;stsphs &amp; Phase) == qp-&gt;phase)
					break;
				completexfr(ctlr, cqhd, qid);
				advancesqhd(ctlr, qp, cqhd, qid);
				/*
				 * toggles qp-&gt;phase if qp-&gt;cidx.hd wraps when
				 * incr'd.
				 */
				advancecqhd(ctlr, qp, qid);
				if (--ctlr-&gt;inflight &lt; 0)
					iprint("nvmeinterrupt: inflight botch\n");
				ndone++, donepass++;
			}
	} while (donepass &gt; 0);
	/* unmask intr. sources of interest iff transfers are in flight */
	if (ctlr-&gt;inflight == 0) {
		iosttck = 0;		/* nothing outstanding for ckstuck to time */
		ctlr-&gt;intrsena = 0;
	} else
		regs-&gt;intmclr = Vall;
	iunlock(&amp;ctlr-&gt;issuelock);
	if (ndone &gt; 0)
		return Intrforme;
	else
		return Intrnotforme;
}

/*
 * hand out the next command id from a counter shared by all
 * controllers.  ids whose low 16 bits would be 0 (marks a free Transfer)
 * or Reserved are skipped by restarting the counter at 1.
 */
static int
cidalloc(void)
{
	int id;
	static int cid;
	static Lock cidlck;

	ilock(&amp;cidlck);
	cid++;
	if ((ushort)cid == Reserved || (ushort)cid == 0)
		cid = 1;
	id = cid;
	iunlock(&amp;cidlck);
	return id;
}

/*
 * fill in submission queue entry *cmd.
 *
 * op is the opcode and qid says which queue it is for, which selects
 * how the command dwords are filled in.  buf, if non-nil, is the
 * kernel-virtual data buffer; its PCIWADDR form goes into prp1.
 * r is the originating SDreq (nil for admin commands); it is
 * dereferenced for reads and writes on a non-admin queue.
 * lba is the starting block for reads and writes.
 * a fresh command id is allocated for every call.
 */
static void
mkcmd(Ctlr *ctlr, Cmd *cmd, SDreq *r, int op, ulong nsid, void *buf, int qid,
	vlong lba)
{
	long count;
	uintptr addr;

	memset(cmd, 0, sizeof *cmd);
	cmd-&gt;opcode = op;
	cmd-&gt;cmdid = cidalloc();
	cmd-&gt;nsid = nsid;
	addr = (uintptr)buf;
	if (addr != 0) {
		if (addr &lt; KZERO)
			print("nvme mkcmd: %#p not kernel virtual address\n",		
				addr);
		/* each prp entry points to at most a page */
		cmd-&gt;prp1 = PCIWADDR((void *)addr);
		/*
		 * a transfer of more than one but at most two pages gets a
		 * second entry: the page following the one prp1 lies in.
		 */
		if (r &amp;&amp; r-&gt;dlen &gt; ctlr-&gt;pgsz &amp;&amp; r-&gt;dlen &lt;= 2*ctlr-&gt;pgsz)
			cmd-&gt;prp2 = PAGEOF(ctlr, cmd-&gt;prp1) + ctlr-&gt;pgsz;
		else
			cmd-&gt;prp2 = 0;
	}
	switch (qid) {
	case Qadmin:
		/* we are using single-message msi */
		switch (op) {
		case Admmkiocq:
			/* queue size (minus 1) in the high half, queue id in the low */
			cmd-&gt;cdw10[0] = (ctlr-&gt;cqlen - 1)&lt;&lt;16 | Qio;
			cmd-&gt;cdw10[1] = Ien | Pc;  /* vector 0 since no msi-x */
			break;
		case Admmkiosq:
			cmd-&gt;cdw10[0] = (ctlr-&gt;sqlen - 1)&lt;&lt;16 | Qio;
			cmd-&gt;cdw10[1] = Qio&lt;&lt;16 | Pc;	/* completion q id */
			break;
		case Admid:
			/* Nsall asks about the controller, else about namespace nsid */
			if (nsid == Nsall) {
				cmd-&gt;cdw10[0] = Idctlr;
				cmd-&gt;nsid = 0;
			} else
				cmd-&gt;cdw10[0] = Idns;
			break;
		}
		break;
	default:
		switch (op) {
		case Cmdread:
		case Cmdwrite:
			count = r-&gt;dlen / r-&gt;unit-&gt;secsize;
			if (count == 0) {
				/* complains, but the command is still sent as composed so far */
				print("nvmeissue: zero sector count for i/o "
					"of length %d\n", r-&gt;dlen);
				break;
			}
			cmd-&gt;slba = lba;
			cmd-&gt;length = (ushort)(count - 1);	/* sectors */
			assert(r-&gt;data == buf);
			assert(r-&gt;unit-&gt;secsize * count &lt;= r-&gt;dlen);
			assert(nsid);
			break;
		}
		break;
	}
}

/*
 * note a new high-water mark for submission queue occupancy.
 * the statistic is kept separately per direction, selected by
 * qp-&gt;writelast.
 */
static void
updmaxqlen(Ctlr *ctlr, Qpair *qp)
{
	int depth, dir;

	depth = (qp-&gt;qidx.tl + qp-&gt;sqlen - qp-&gt;qidx.hd) % qp-&gt;sqlen;
	dir = qp-&gt;writelast;
	if (ctlr-&gt;maxqlen[dir] &lt; depth)
		ctlr-&gt;maxqlen[dir] = depth;
}

/*
 * send a command via the submission queue.
 * call with ctlr-&gt;issuelock held.
 * advances submission queue's tail index.
 *
 * qtl is the already-composed entry and xfr the Transfer that will
 * track it; xfr takes over qtl's command id so that completexfr can
 * find it.  NOTE(review): the tail index itself is advanced by the
 * caller (nvmeissue) before this is called; here it is only written
 * to the doorbell.
 */
static void
sendcmd(Ctlr *ctlr, Qpair *qp, Cmd *qtl, Transfer *xfr)
{
	int qid;

	xfr-&gt;done = 0;
	xfr-&gt;cmdid = qtl-&gt;cmdid;
	xfr-&gt;qtm = TK2MS(sys-&gt;ticks);
	qid = qp - ctlr-&gt;qpair;		/* ordinal of qp within ctlr */
	if (Debugintr)
		iprint("issue q %d cmdid %d...", qid, xfr-&gt;cmdid);

	/*
	 * Notify controller of new submission queue entry,
	 * which triggers execution of it.
	 */
	updmaxqlen(ctlr, qp);
	cycles(&amp;xfr-&gt;stcyc);

	ctlr-&gt;inflight++;
	iosttck = sys-&gt;ticks;		/* start of ckstuck's 5 s. window */
	*doorbellsqtl(ctlr, qid) = qp-&gt;qidx.tl;		/* start i/o */
	coherence();
	ctlr-&gt;regs-&gt;intmclr = ctlr-&gt;intrsena = Vall;	/* unmask intrs */
}

static int
doneio(void* arg)
{
	return ((Transfer *)arg)-&gt;done;
}

/* return the submission queue index following idx, wrapping at qp-&gt;sqlen */
static uint
qidxplus1(Qpair *qp, uint idx)
{
	idx++;
	return idx &gt;= qp-&gt;sqlen? 0: idx;
}

static int
qnotfull(void *arg)
{
	return !QFULL((Qpair *)arg);
}

static int
qempty(void *arg)
{
	return QEMPTY((Qpair *)arg);
}

/*
 * obtain a Transfer for a new command: reuse an idle one from ctlr's
 * chain (cmdid 0) or, if all are busy, allocate another and link it at
 * the front.  the result is marked with the Reserved cmdid so that it
 * is no longer seen as idle.  panics if memory runs out.
 */
static Transfer *
getfreexfr(Ctlr *ctlr)
{
	Transfer *fresh;

	ilock(&amp;ctlr-&gt;xfrlock);
	fresh = findxfr(ctlr, 0);
	if (fresh == nil) {
		fresh = malloc(sizeof *fresh);
		if (fresh == nil)
			panic("nvmeissue: out of memory");
		/* push the newcomer onto the chain */
		fresh-&gt;next = ctlr-&gt;xfrs;
		ctlr-&gt;xfrs = fresh;
	}
	fresh-&gt;cmdid = Reserved;
	fresh-&gt;qtm = 0;
	iunlock(&amp;ctlr-&gt;xfrlock);
	return fresh;
}

/*
 * if needed, wait for the sub q to drain a lot or a little.
 * not infallible, so test afterward under lock.
 *
 * "a little": until there is room for one more entry.
 * "a lot": until the queue is empty, when r's direction (r-&gt;write)
 * differs from that of the previous command.  r may be nil (admin
 * commands), in which case only the first wait applies.
 * ctlr's QLock serialises the sleepers; wakeups come from advancesqhd.
 */
static void
qdrain(Ctlr *ctlr, Qpair *qp, SDreq *r)
{
	if (QFULL(qp)) {
		qlock(ctlr);			/* wait for q space */
		while (QFULL(qp))
			sleep(ctlr, qnotfull, qp);
		qunlock(ctlr);
	}
	/*
	 * don't mix reads and writes in the queue, to avoid read-before-write
	 * problems.
	 */
	if (r &amp;&amp; qp-&gt;writelast != r-&gt;write) {
		qlock(ctlr);
		/* re-test: another process may have switched direction meanwhile */
		if (qp-&gt;writelast != r-&gt;write)
			sleep(ctlr, qempty, qp);  /* changing, so drain */
		qp-&gt;writelast = r-&gt;write;
		qunlock(ctlr);
	}
}

/*
 * wait (via qdrain) until a command for r may be added to qp, then
 * return with ctlr-&gt;issuelock held.  since qdrain's tests are made
 * without that lock, they are repeated under it; if another process got
 * in first, drop the lock and start over.
 */
static void
qdrainilock(Ctlr *ctlr, Qpair *qp, SDreq *r)
{
	for (;;) {
		qdrain(ctlr, qp, r);
		ilock(&amp;ctlr-&gt;issuelock);
		/* still room, and still going in our direction? */
		if (!QFULL(qp) &amp;&amp; (r == nil || qp-&gt;writelast == r-&gt;write))
			return;		/* issuelock still held */
		/* lost a race; uncommon case */
		iunlock(&amp;ctlr-&gt;issuelock);
	}
}

/*
 * print a non-zero completion status, broken into its fields.
 * sts is the status with the phase bit (bit 0) already cleared.
 */
static void
prerr(int sts)
{
	if (sts == 0)
		return;
	iprint("nvmeissue: cmd error status %#ux: "
		"code %#ux type %d more %d do-not-retry %d\n",
		sts,
		(sts &gt;&gt;  1) &amp; MASK(8),
		(sts &gt;&gt;  9) &amp; MASK(3),
		(sts &gt;&gt; 14) &amp; MASK(1),
		(sts &gt;&gt; 15) &amp; MASK(1));
}

/*
 * add new nvme command to tail of submission queue of Qpair,
 * and wait for it to complete.  return status with phase bit zeroed.
 *
 * r is the originating SDreq, or nil for admin commands.
 * buf is the kernel-virtual data buffer (may be nil) and lba the
 * starting block for reads and writes.
 * a non-zero status is also reported on the console via prerr.
 * panics if the command has not completed after Timeout ms and one
 * explicit poll of the completion queues.
 */
static int
nvmeissue(Ctlr *ctlr, Qpair *qp, SDreq *r, int op, ulong nsid, void *buf,
	vlong lba)
{
	ushort sts;
	Cmd *qtl;
	Transfer *xfr;

	xfr = getfreexfr(ctlr);
	/* rdwr selects the maxcyc slot updated by completexfr */
	if (op == Cmdwrite)
		xfr-&gt;rdwr = Write;
	else if (op == Cmdread)
		xfr-&gt;rdwr = Read;
	else
		xfr-&gt;rdwr = 2;

	/* serialise composition of cmd in place at sq tail */
	qdrainilock(ctlr, qp, r);
	/* ctlr-&gt;issuelock is now held */

	/* Reserve a space and update sub. q tail index past it. */
	qtl = &amp;qp-&gt;q[qp-&gt;qidx.tl];
	qp-&gt;qidx.tl = qidxplus1(qp, qp-&gt;qidx.tl);

	/*
	 * Compose the command struct at the tail of the submission queue.
	 * mkcmd converts buf to physical address space.
	 */
	mkcmd(ctlr, qtl, r, op, nsid, buf, qp - ctlr-&gt;qpair, lba);
	sendcmd(ctlr, qp, qtl, xfr);			/* start cmd */
	iunlock(&amp;ctlr-&gt;issuelock);

	/* this is the only process waiting for this xfr. */
	/*
	 * NOTE(review): the empty waserror loop presumably makes an error
	 * raised during the sleep (e.g., an interrupted sleep) fall through
	 * to sleep again rather than abandon the command; confirm.
	 */
	while(waserror())
		;
	tsleep(xfr, doneio, xfr, Timeout);
	poperror();
	if (!xfr-&gt;done) {
		/* we see this with the Samsung 983 DCT. */
		/* timed out: poll in case the interrupt was lost */
		nvmeinterrupt(nil, ctlr);
		if (!xfr-&gt;done)
			panic("sd%C0: nvmeissue: cmd id %d didn't complete "
				"in %d s.", ctlr-&gt;sdev-&gt;idno, xfr-&gt;cmdid,
				Timeout/1000);
	}

	sts = xfr-&gt;status;
	xfr-&gt;cmdid = 0;				/* xfr available for re-use */
	if (sts)
		prerr(sts);
	return sts;
}

/*
 * translate the SCSI opcode at *cmd into the matching nvm opcode.
 * only reads and writes are known; anything else is reported on the
 * console and yields -1.
 */
static int
scsiop2nvme(uchar* cmd)
{
	if (isscsiread(*cmd))
		return Cmdread;
	if (isscsiwrite(*cmd))
		return Cmdwrite;
	iprint("scsiop2nvme: scsi cmd %#ux unexpected\n", *cmd);
	return -1;
}

/*
 * perform the read or write described by SCSI request r as a series of
 * nvme commands, each small enough to be described by two prp entries.
 * r-&gt;data and r-&gt;dlen are advanced/overwritten as the pieces are issued;
 * the caller is expected to restore them.
 * returns 0 on success, otherwise the status of the first piece to fail.
 * raises an error if the SCSI command is neither a read nor a write.
 */
static int
issueios(SDreq *r)
{
	int chunk, maxsecs, sts, nvmcmd;
	ulong left;			/* sectors still to transfer */
	uvlong lba;
	Ctlr *ctlr;
	SDunit *unit;

	unit = r-&gt;unit;
	ctlr = unit-&gt;dev-&gt;ctlr;
	nvmcmd = scsiop2nvme(r-&gt;cmd);
	if (nvmcmd == -1)
		error("nvme: scsi cmd unexpected");
	scsilbacount(r-&gt;cmd, r-&gt;clen, &amp;lba, &amp;left);
	/* never transfer more than the caller's buffer holds */
	if(left * unit-&gt;secsize &gt; r-&gt;dlen)
		left = r-&gt;dlen / unit-&gt;secsize;
	maxsecs = 2*ctlr-&gt;pgsz / unit-&gt;secsize;	/* needs 1 or 2 prp addrs */
	/* to do this in generality, need to allocate a prp list page */
	if (0)
		maxsecs = (ctlr-&gt;mdts? ctlr-&gt;mdts * ctlr-&gt;minpgsz: 128*KB) /
			unit-&gt;secsize;

	sts = 0;
	while (left &gt; 0) {
		chunk = MIN(left, maxsecs);
		r-&gt;dlen = chunk * unit-&gt;secsize;
		sts = nvmeissue(ctlr, &amp;ctlr-&gt;qpair[Qio], r, nvmcmd,
			ctlr-&gt;ns, r-&gt;data, lba);
		if (sts)
			break;
		lba += chunk;
		r-&gt;data = (uchar *)r-&gt;data + r-&gt;dlen;
		left -= chunk;
	}
	return sts;
}

/*
 * carry out SCSI request r against the drive and wait for the result.
 * the command and its length are in r-&gt;cmd and r-&gt;clen; data, if any,
 * moves through r-&gt;data, which holds r-&gt;dlen bytes.
 *
 * cache-sync commands are acknowledged at once.  requests that
 * sdfakescsi deals with (it returns other than SDnostatus) finish with
 * its status.  what remains is treated as a transfer and handed to
 * issueios, which alters r-&gt;data and r-&gt;dlen; both are put back before
 * returning and r-&gt;rlen is set to the distance r-&gt;data was advanced.
 */
static int
nvmerio(SDreq* r)
{
	int fake, iostat;
	ulong savedlen;
	uchar *savedata;
	static char info[256];

	if(*r-&gt;cmd == ScmdSynccache || *r-&gt;cmd == ScmdSynccache16)
		return sdsetsense(r, SDok, 0, 0, 0);

	/* scsi command to get information about the drive or disk? */
	fake = sdfakescsi(r, info, sizeof info);
	if(fake != SDnostatus){
		r-&gt;status = fake;
		return fake;
	}

	if(r-&gt;data == nil)
		return SDok;

	/* issueios caps the size of each transfer and repeats as needed */
	savedata = r-&gt;data;
	savedlen = r-&gt;dlen;

	assert(r-&gt;unit-&gt;secsize &gt;= Minsect &amp;&amp;
		r-&gt;unit-&gt;secsize &lt;= ((Ctlr *)r-&gt;unit-&gt;dev-&gt;ctlr)-&gt;pgsz);
	iostat = issueios(r);

	r-&gt;rlen = (uchar *)r-&gt;data - savedata;
	r-&gt;data = savedata;
	r-&gt;dlen = savedlen;
	if (iostat == 0)
		r-&gt;status = SDok;
	else {
		r-&gt;status = SDeio;
		/* 3, 0xc, 2: write error, reallocation failed */
		sdsetsense(r, SDcheck, 3, 0xc, 2);
	}
	return r-&gt;status;
}

/*
 * format the unit's status text into p, which has room for l bytes:
 * a line of controller registers, the serial number and, once the
 * sector count is known, the geometry.  returns the sum of the
 * snprint results, or 0 if the unit has no controller.
 */
static int
nvmerctl(SDunit* unit, char* p, int l)
{
	int used;
	Ctlr *ctlr;
	Regs *regs;

	ctlr = unit-&gt;dev-&gt;ctlr;
	if(ctlr == nil)
		return 0;
	regs = ctlr-&gt;regs;
	used = snprint(p, l, "config %#lux capabilities %#llux status %#lux\n",
		regs-&gt;cc, regs-&gt;cap, regs-&gt;csts);
	/*
	 * no model line: devsd has already generated an "inquiry" line
	 * from it, so repeating it here would be redundant.
	 */
	used += snprint(p+used, l-used, "serial %s\n", ctlr-&gt;serial);
	if(unit-&gt;sectors == 0)
		return used;
	return used + snprint(p+used, l-used, "geometry %lld %lud\n",
		unit-&gt;sectors, unit-&gt;secsize);
}

/*
 * append this controller's summary to the buffer p..e and return the
 * new end, as seprint does.
 * must emit exactly one line per controller (sd(3)).
 */
static char*
nvmertopctl(SDev *sdev, char *p, char *e)
{
	int *qlens;
	uvlong *cycs;
	Ctlr *ctlr;

	ctlr = sdev-&gt;ctlr;
	qlens = ctlr-&gt;maxqlen;
	cycs = ctlr-&gt;maxcyc;
	return seprint(p, e, "sd%c nvme regs %#p irq %d: max q lens, rd %d "
		"wr %d; max cycs, rd %lld wr %lld\n",
		sdev-&gt;idno, ctlr-&gt;port, ctlr-&gt;irq,
		qlens[Read], qlens[Write], cycs[Read], cycs[Write]);
}

/*
 * disable the controller: if cc.Enable is set, wait for csts.Rdy and
 * then clear Enable; in either case wait for Rdy to drop.
 * time-outs of either wait are reported but otherwise ignored.
 */
static void
reset(Regs *regs)
{
	if (regs-&gt;cc &amp; Enable) {
		/* wait for the earlier enable to take effect before undoing it */
		if (awaitbitpat(&amp;regs-&gt;csts, Rdy, Rdy) &lt; 0)
			print("nvme reset timed out awaiting ready\n");
		regs-&gt;cc &amp;= ~Enable;
		coherence();
	}
	/* else may have previously cleared Enable &amp; be waiting for not ready */
	if (awaitbitpat(&amp;regs-&gt;csts, Rdy, 0) &lt; 0)
		print("nvme reset timed out awaiting not ready\n");
}

/*
 * fill in unit from what nvmectlrenable learned about the drive:
 * canned sense and inquiry data (with the model string at byte 8 of
 * the inquiry), sector size and sector count.  announces the drive on
 * the console.
 */
static void
nvmedrive(SDunit *unit)
{
	uchar *inq;
	Ctlr *ctlr;

	ctlr = unit-&gt;dev-&gt;ctlr;

	unit-&gt;sense[0] = 0x70;
	unit-&gt;sense[7] = sizeof(unit-&gt;sense)-7;

	inq = unit-&gt;inquiry;
	memset(inq, 0, sizeof unit-&gt;inquiry);
	inq[0] = SDperdisk;
	inq[2] = 2;
	inq[3] = 2;
	inq[4] = sizeof unit-&gt;inquiry - 4;
	/* model is smaller than unit-&gt;inquiry-8 */
	strncpy((char *)&amp;inq[8], ctlr-&gt;model, sizeof ctlr-&gt;model);

	unit-&gt;secsize = ctlr-&gt;secsize;
	unit-&gt;sectors = ctlr-&gt;sectors;
	print("sd%C%d: nvme %,lld sectors: %s fw %s serial %s\n",
		unit-&gt;dev-&gt;idno, unit-&gt;subno, unit-&gt;sectors,
		ctlr-&gt;model, ctlr-&gt;fw, ctlr-&gt;serial);
}

/*
 * choose the nvme page size: start from Startpgsz, keep it within the
 * range the controller advertises in cap, don't exceed Sdalign (when
 * that is at least 4K) and never go below 4K.
 * also records the controller's minimum page size, the unit of mdts.
 */
static void
pickpgsz(Ctlr *ctlr)
{
	int pgsz;
	ulong smallest, largest;

	smallest = 1 &lt;&lt; (12 + ((ctlr-&gt;regs-&gt;cap &gt;&gt; 48) &amp; MASK(4)));
	largest = 1 &lt;&lt; (12 + ((ctlr-&gt;regs-&gt;cap &gt;&gt; 52) &amp; MASK(4)));
	ctlr-&gt;minpgsz = smallest;		/* for Ctlrid-&gt;mdts */

	pgsz = MIN(Startpgsz, largest);
	if (pgsz &lt; smallest)
		pgsz = smallest;
	if (Sdalign &gt;= 4*KB &amp;&amp; pgsz &gt; Sdalign)
		pgsz = Sdalign;
	if (pgsz &lt; 4*KB)			/* sanity */
		pgsz = 4*KB;
	ctlr-&gt;pgsz = pgsz;
}

/*
 * set the submission and completion queue lengths to the defaults
 * (Qlen, Cqlen), reduced to the controller's advertised maximum if
 * that is smaller.
 */
static void
pickqlens(Ctlr *ctlr)
{
	ulong limit;

	/* low 16 bits of cap hold the max i/o [sc] q len, less one */
	limit = (ctlr-&gt;regs-&gt;cap &amp; MASK(16)) + 1;
	ctlr-&gt;sqlen = MIN(limit, Qlen);
	ctlr-&gt;cqlen = MIN(limit, Cqlen);
}

/*
 * set up the controller behind PCI device p: map its registers, reset
 * it, and choose its page size and queue lengths.
 * returns a new SDev (with a new Ctlr attached) describing it, or nil
 * if the registers can't be mapped or memory runs out.
 * the queues are not allocated nor the controller enabled here; see
 * nvmeenable.
 */
static SDev*
nvmeprobe(Pcidev *p)
{
	int logstride;
	uintptr port;
	Ctlr *ctlr;
	Regs *regs;
	SDev *sdev;
	static int count;		/* controllers probed so far, for naming */

	assert(p-&gt;mem[1].bar == 0);	/* upper 32 bits of 64-bit addr */
	port = p-&gt;mem[0].bar &amp; ~0x0f;	/* strip the BAR's flag bits */
	regs = vmap(port, p-&gt;mem[0].size);
	if(regs == nil){
		print("nvmeprobe: phys address %#p in use did=%#ux\n",
			port, p-&gt;did);
		return nil;
	}

	/* if only the second malloc fails, ctlr is released again */
	if ((ctlr = malloc(sizeof(Ctlr))) == nil ||
	    (sdev = malloc(sizeof(SDev))) == nil) {
		free(ctlr);
		vunmap(regs, p-&gt;mem[0].size);
		return nil;
	}
	ctlr-&gt;regs = regs;
	ctlr-&gt;port = port;
	ctlr-&gt;irq = p-&gt;intl;
	/*
	 * Attempt to hard-reset the board.
	 */
	reset(regs);
	logstride = ((regs-&gt;cap &gt;&gt; 32) &amp; MASK(4));	/* doorbell stride */
	if (logstride != 0)
		panic("nvmeprobe: doorbell stride must be 0 (for now), not %d",
			logstride);
	ctlr-&gt;stride = 1 &lt;&lt; (2 + logstride);	/* 2^(2+logstride) */
	/*
	 * disabled.  NOTE(review): Nssro is defined among the csts bits,
	 * yet is set in cc here; check before enabling this.
	 */
	if (0 &amp;&amp; regs-&gt;cap &amp; Nssrs) {		/* nvm subsys reset avail.? */
		regs-&gt;cc |= Nssro;		/* clear Nssro by setting it */
		regs-&gt;nssrc = 'N'&lt;&lt;24 | 'V'&lt;&lt;16 | 'M'&lt;&lt;8 | 'e';
		if (awaitbitpat(&amp;regs-&gt;csts, Nssro, Nssro) &lt; 0)
			print("nvme subsys reset timed out awaiting Nssro\n");
	}

	pickpgsz(ctlr);
	pickqlens(ctlr);

	sdev-&gt;ifc = &amp;sdnvmeifc;
	sdev-&gt;ctlr = ctlr;
	sdev-&gt;idno = 'n';	/* actually assigned in sdadddevs() */
	sdev-&gt;nunit = NCtlrdrv;	/* max. drives (can be number found) */
	ctlr-&gt;sdev = sdev;

	/*
	 * we (pnp) don't have a `spec' argument, so
	 * we'll assume that sdn0 goes to the first nvme host
	 * adapter found, sdo0 to the next, etc.
	 */
	print("#S/sd%c: nvme: irq %d regs %#p page size %d\n",
		sdev-&gt;idno + count++, ctlr-&gt;irq, ctlr-&gt;port, ctlr-&gt;pgsz);

	/* would probe for drives here if there could be more than one. */
	/* upon return, this many sdev-&gt;units will be allocated. */
	sdev-&gt;nunit = 1;
	return sdev;
}

/*
 * append sdev to the singly-linked list whose first and last elements
 * are *head and *tail; an empty list has a nil *head.
 */
static void
sdevadd(SDev *sdev, SDev **head, SDev **tail)
{
	if(*head == nil)
		*head = sdev;
	else
		(*tail)-&gt;next = sdev;
	*tail = sdev;
}

/*
 * find all nvme controllers
 */
static SDev*
nvmepnp(void)
{
	Ctlr *ctlr;
	Pcidev *p;
	SDev *sdev, *head, *tail;

	p = nil;
	head = tail = nil;
	while(p = pcimatch(p, 0, 0)){
		/* ccrp 2 is NVME */
		if(p-&gt;ccrb != Pcibcstore || p-&gt;ccru != Pciscnvm || p-&gt;ccrp != 2)
			continue;
		if((sdev = nvmeprobe(p)) == nil)
			continue;
		ctlr = sdev-&gt;ctlr;
		ctlr-&gt;pcidev = p;
		sdevadd(sdev, &amp;head, &amp;tail);
		if (nctlrs &gt;= NCtlr)
			print("too many nvme controllers\n");
		else
			ctlrs[nctlrs++] = ctlr;
	}
	return head;
}

/*
 * allocate the submission and completion arrays of qp, each aligned to
 * the nvme page size, using the queue lengths chosen for ctlr.
 * panics if either allocation fails.
 */
static void
allocqpair(Ctlr *ctlr, Qpair *qp)
{
	assert(ctlr-&gt;pgsz);
	qp-&gt;sqlen = ctlr-&gt;sqlen;
	qp-&gt;cqlen = ctlr-&gt;cqlen;
	qp-&gt;q    = mallocalign(qp-&gt;sqlen * sizeof *qp-&gt;q,    ctlr-&gt;pgsz, 0, 0);
	qp-&gt;cmpl = mallocalign(qp-&gt;cqlen * sizeof *qp-&gt;cmpl, ctlr-&gt;pgsz, 0, 0);
	if (qp-&gt;q != nil &amp;&amp; qp-&gt;cmpl != nil)
		return;
	panic("nvmectlrenable: out of memory for queues");
}

/*
 * tell the (disabled) controller where the admin queues are and how
 * big, and set the entry sizes, page size and command set in cc.
 * Enable is not set here; see enable().
 */
static void
configure(Ctlr *ctlr, Qpair *qpadm)
{
	Regs *regs = ctlr-&gt;regs;

	/* both lengths are given as length minus one */
	regs-&gt;aqa = (ctlr-&gt;cqlen - 1)&lt;&lt;16 | (ctlr-&gt;sqlen - 1);
	regs-&gt;asq = PCIWADDR((void *)qpadm-&gt;q);
	regs-&gt;acq = PCIWADDR((void *)qpadm-&gt;cmpl);
	/* sizes are log2; page size is log2 relative to 4K (2^12) */
	regs-&gt;cc = log2(sizeof(Completion))&lt;&lt;20 | log2(sizeof(Cmd))&lt;&lt;16 |
		(log2(ctlr-&gt;pgsz)-12) &lt;&lt; 7 | Cssnvm;
	coherence();
}

/*
 * enable the controller: if cc.Enable is clear, wait for csts.Rdy to
 * be clear and then set Enable; in either case wait for Rdy to be set.
 * time-outs of either wait are reported but otherwise ignored.
 * the mirror image of reset().
 */
static void
enable(Regs *regs)
{
	if (!(regs-&gt;cc &amp; Enable)) {
		/* let any earlier disable finish first */
		if (awaitbitpat(&amp;regs-&gt;csts, Rdy, 0) &lt; 0)
			print("nvme enable timed out awaiting not ready\n");
		regs-&gt;cc |= Enable;
		coherence();
	}
	/* else may have previously set Enable &amp; be waiting for ready */
	if (awaitbitpat(&amp;regs-&gt;csts, Rdy, Rdy) &lt; 0)
		print("nvme enable timed out awaiting ready\n");
}

/*
 * ns numbers start at 1 and are densely-packed.
 * pick one with 512-byte blocks, return preferred lbafmt via *lbafmtp.
 * failing that, settle for the last format seen with 4K blocks.
 * only formats without metadata (mdsize 0) are considered.
 *
 * nns is the number of namespaces; nsid is a page-sized buffer that
 * receives each namespace's identify data in turn.
 * returns the number of the chosen namespace, or 0 if none is suitable.
 * on a non-zero return, *nsid describes the returned namespace and
 * *lbafmtp indexes the chosen entry of nsid-&gt;lbafmt.
 * panics if an identify command fails.
 */
static int
bestns(Ctlr *ctlr, int nns, Nsid *nsid, int *lbafmtp)
{
	int i, ns, second, nssecond, lbasize;
	Lbafmt *lbafmt;

	second = 0;
	nssecond = 0;
	*lbafmtp = 0;
	for (ns = 1; ns &lt;= nns; ns++) {
		if (nvmeadmissue(ctlr, Admid, ns, nsid) != 0)
			panic("nvmectlrenable: Admid(%d) failed", ns);
		for (i = 0; i &lt; nelem(nsid-&gt;lbafmt); i++) {
			lbafmt = &amp;nsid-&gt;lbafmt[i];
			if (lbafmt-&gt;lglbasize == 0)	/* end lbafmt list? */
				break;
			lbasize = 1 &lt;&lt; lbafmt-&gt;lglbasize;
			if (Debugns)
				print("nvme ns %d: lba %d mdsize %d perf %d\n",
					ns, lbasize, lbafmt-&gt;mdsize,
					lbafmt-&gt;relperf &amp; 3);
			if (lbafmt-&gt;mdsize == 0 &amp;&amp; lbasize == Minsect) {
				*lbafmtp = i;
				return ns;
			}
			/* settle for 4k if that's all there is */
			if (lbafmt-&gt;mdsize == 0 &amp;&amp; lbasize == 4096) {
				second = i;
				nssecond = ns;
			}
		}
	}
	if (nssecond == 0)
		return 0;		/* nothing usable */

	/*
	 * the id page now describes the last namespace queried (nns).
	 * the caller reads the chosen namespace's data from it, so fetch
	 * that namespace's data again if it was a different one.
	 */
	if (nssecond != nns &amp;&amp; nvmeadmissue(ctlr, Admid, nssecond, nsid) != 0)
		panic("nvmectlrenable: Admid(%d) failed", nssecond);
	*lbafmtp = second;
	/* the namespace number, not the lbafmt index (which may be 0) */
	return nssecond;
}

/*
 * copy id string from controller, trim trailing blanks, downcase.
 * assumes src is unterminated and dest is at least one byte larger.
 *
 * size is the length of src in bytes; dest receives size bytes plus a
 * terminating NUL.  a field consisting only of blanks yields the empty
 * string (previously its first blank survived the trimming).
 */
static void
idcopy(char *dest, char *src, int size)
{
	int n;

	memmove(dest, src, size);
	dest[size] = '\0';
	/* replace trailing blanks with NULs, working back from the end */
	for (n = size; n != 0; n--) {
		if (dest[n-1] != ' ')
			break;
		dest[n-1] = '\0';
	}
	/* tolower wants a non-negative argument, hence the cast */
	for (n = 0; dest[n] != '\0'; n++)
		dest[n] = tolower((unsigned char)dest[n]);
}

/*
 * register nvmeinterrupt as the handler for sdev's controller, under a
 * name built from the device letter and interface name, then mask all
 * of the controller's interrupt sources.
 */
static void
nvmeintron(SDev *sdev)
{
	char label[32];
	Ctlr *ctlr;

	ctlr = sdev-&gt;ctlr;
	snprint(label, sizeof label, "sd%c (%s)", sdev-&gt;idno, sdev-&gt;ifc-&gt;name);
	enableintr(ctlr, nvmeinterrupt, ctlr, label);
	ctlr-&gt;regs-&gt;intmset = ~0;	/* mask all interrupt sources */
}

/* reset all four head and tail indices of qp to the start of its queues */
static void
zeroqhdtls(Qpair *qp)
{
	qp-&gt;qidx.tl = 0;
	qp-&gt;cidx.hd = 0;
	/* paranoia */
	qp-&gt;qidx.hd = 0;
	qp-&gt;cidx.tl = 0;
	coherence();
}

/*
 * bring a probed controller into service: allocate both queue pairs,
 * configure and enable the controller, hook up its interrupt, identify
 * the controller, create the i/o queues and choose a namespace.
 * the results (id strings, mdts, sector size and count, namespace) are
 * left in ctlr.  always returns 1; every failure panics.
 */
static int
nvmectlrenable(Ctlr* ctlr)
{
	int i, nns, gotns;
	char *idpage;
	Ctlrid *ctlrid;
	Lbafmt *lbafmt;
	Nsid *nsid;
	Qpair *qpadm, *qpio;
	Regs *regs = ctlr-&gt;regs;
	SDev *sdev = ctlr-&gt;sdev;

	/* we need at least one admin queue and one i/o queue */
	qpadm = &amp;ctlr-&gt;qpair[Qadmin];
	allocqpair(ctlr, qpadm);
	qpio = &amp;ctlr-&gt;qpair[Qio];
	allocqpair(ctlr, qpio);

	assert(!(regs-&gt;cc &amp; Enable));
	configure(ctlr, qpadm);	/* must do this while ctlr is disabled */
	enable(regs);
	zeroqhdtls(qpadm);		/* paranoia */

	regs-&gt;intmset = ~0;		/* mask all interrupt sources */
	nvmeintron(sdev);

	/* one page serves for the controller's id data, then each namespace's */
	idpage = mallocalign(BY2PG, ctlr-&gt;pgsz, 0, 0);
	if (idpage == nil)
		panic("nvmectlrenable: out of memory");
	if (nvmeadmissue(ctlr, Admid, Nsall, idpage) != 0)
		panic("nvmectlrenable: Admid(Nsall) failed");
	ctlrid = (Ctlrid *)idpage;
	nns = ctlrid-&gt;nns;		/* save before the page is reused */

	/* smuggle hw id strings into ctlr for later printing */
	idcopy(ctlr-&gt;serial, ctlrid-&gt;serial, sizeof ctlrid-&gt;serial);
	idcopy(ctlr-&gt;model, ctlrid-&gt;model, sizeof ctlrid-&gt;model);
	idcopy(ctlr-&gt;fw, ctlrid-&gt;fw, sizeof ctlrid-&gt;fw);
	/* ctlr-&gt;mdts stays 0 (unlimited) if the controller reports 0 */
	if (ctlrid-&gt;mdts)
		ctlr-&gt;mdts = 1 &lt;&lt; ctlrid-&gt;mdts;
//	iprint("nvme: max xfr size %d\n", ctlr-&gt;mdts * ctlr-&gt;minpgsz);

	/*
	 * create first i/o queue with admin queue cmds.
	 * completion queue must be created first.
	 */
	if (nvmeadmissue(ctlr, Admmkiocq, Nsunused, qpio-&gt;cmpl) != 0)
		panic("nvmectlrenable: Admmkiocq failed");
	if (nvmeadmissue(ctlr, Admmkiosq, Nsunused, qpio-&gt;q) != 0)
		panic("nvmectlrenable: Admmkiosq failed");
	zeroqhdtls(qpio);		/* paranoia */

	/* find a suitable namespace */
	nsid = (Nsid *)idpage;
	gotns = bestns(ctlr, nns, nsid, &amp;i);	/* fills in nsid page */
	if (gotns == 0)
		panic("nvmectlrenable: no suitable namespace found");
	lbafmt = &amp;nsid-&gt;lbafmt[i];
	ctlr-&gt;secsize = 1 &lt;&lt; lbafmt-&gt;lglbasize;	/* remember for SDunit */
	ctlr-&gt;sectors = nsid-&gt;cap;		/* remember for SDunit */
	ctlr-&gt;ns = gotns;
	free(idpage);
	if (Debugns)
		print("nvme best ns: %d: sectors %,lld of %d bytes\n",
			ctlr-&gt;ns, ctlr-&gt;sectors, ctlr-&gt;secsize);
	return 1;
}

/*
 * release both queues of a queue pair and forget the pointers,
 * so the pair reads as unallocated afterwards.
 */
static void
freeqpair(Qpair *qp)
{
	free(qp-&gt;q);
	qp-&gt;q = nil;

	free(qp-&gt;cmpl);
	qp-&gt;cmpl = nil;
}

static void
ckstuck(void)
{
	int i;
	static int whined;

	for (i = 0; i &lt; nctlrs; i++)
		nvmeinterrupt(nil, ctlrs[i]);
	if (iosttck &amp;&amp; sys-&gt;ticks - iosttck &gt; 5*HZ &amp;&amp; ++whined &lt; 5)
		iprint("nvme: stuck for 5 s.\n");
}

/*
 * activate a single nvme controller, sdev.
 * upon return, sdev-&gt;nunit SDunits will be allocated.
 * returns 0 without doing anything if the admin queue already exists,
 * 0 if nvmectlrenable reports failure, and 1 otherwise.
 */
static int
nvmeenable(SDev* sdev)
{
	Ctlr *ctlr = sdev-&gt;ctlr;

	/* an allocated admin queue means this controller was set up before */
	if(ctlr-&gt;qpair[Qadmin].q != nil)
		return 0;

	pcisetbme(ctlr-&gt;pcidev);
	if(nvmectlrenable(ctlr) == 0) {
		freeqpair(&amp;ctlr-&gt;qpair[Qadmin]);
		freeqpair(&amp;ctlr-&gt;qpair[Qio]);
		return 0;
	}

	/*
	 * watch for hardware bugs: hook ckstuck onto the clock, once only,
	 * no matter how many controllers get enabled.
	 */
	lock(&amp;clocklck);
	if (clockrunning == 0) {
		addclock0link(ckstuck, 1000);
		clockrunning = 1;
	}
	unlock(&amp;clocklck);

	return 1;
}

/*
 * set every bit of the controller's interrupt mask-set register, then
 * unregister nvmeinterrupt under the same "sdX (ifcname)" label that
 * nvmeintron builds.
 */
static void
nvmeintroff(SDev *sdev)
{
	Ctlr *ctlr = sdev-&gt;ctlr;
	char intrname[32];

	/* mask all interrupt sources */
	ctlr-&gt;regs-&gt;intmset = ~0;

	snprint(intrname, sizeof intrname, "sd%c (%s)",
		sdev-&gt;idno, sdev-&gt;ifc-&gt;name);
	disableintr(ctlr, nvmeinterrupt, ctlr, intrname);
}

/*
 * returns when all in-flight transfers are done, or after about
 * 1000 one-millisecond waits, whichever comes first.
 * call with shutlock &amp; issuelock held; both are dropped around each
 * delay and retaken (issuelock first) before ctlr-&gt;inflight is
 * examined again, so they are held again on return.
 */
static void
waitnoxfrs(Ctlr *ctlr)
{
	int i;

	for (i = 1000; i-- &gt; 0 &amp;&amp; ctlr-&gt;inflight &gt; 0; ) {
		iunlock(&amp;ctlr-&gt;shutlock);
		iunlock(&amp;ctlr-&gt;issuelock);
		delay(1);
		ilock(&amp;ctlr-&gt;issuelock);
		ilock(&amp;ctlr-&gt;shutlock);
	}
	/*
	 * decide on what is actually outstanding rather than on the
	 * counter: if the last transfer finishes during the final wait,
	 * the loop ends with i == 0 and nothing in flight, which is
	 * not a timeout and must not be reported as one.
	 */
	if (ctlr-&gt;inflight &gt; 0)
		iprint("sdnvme: %d transfers still in flight after 1 s.\n",
			ctlr-&gt;inflight);
}

/*
 * disable interrupts for this sdev: issue a flush (Cmdflush, Nsall) on
 * the i/o queue pair, wait for in-flight transfers, then turn off the
 * controller's interrupts and PCI bus mastering.  always returns 1.
 */
static int
nvmedisable(SDev* sdev)
{
	Ctlr *ctlr = sdev-&gt;ctlr;

	if (ctlr != nil) {
		/* the result of the flush is not examined */
		nvmeissue(ctlr, &amp;ctlr-&gt;qpair[Qio], nil, Cmdflush, Nsall, nil, 0);

		/* issuelock before shutlock; released in the opposite order */
		ilock(&amp;ctlr-&gt;issuelock);
		ilock(&amp;ctlr-&gt;shutlock);

		waitnoxfrs(ctlr);
		nvmeintroff(sdev);
		pciclrbme(ctlr-&gt;pcidev);

		iunlock(&amp;ctlr-&gt;shutlock);
		iunlock(&amp;ctlr-&gt;issuelock);
	}
	return 1;
}

/*
 * clear the interface for this sdev: with both locks held, wait for
 * in-flight transfers and reset the controller.  does nothing if sdev
 * has no controller; skips the wait and reset if the controller has
 * no registers.
 */
static void
nvmeclear(SDev* sdev)
{
	Ctlr *ctlr = sdev-&gt;ctlr;

	if (ctlr == nil)
		return;

	/* issuelock before shutlock; released in the opposite order */
	ilock(&amp;ctlr-&gt;issuelock);
	ilock(&amp;ctlr-&gt;shutlock);
	if (ctlr-&gt;regs != nil) {
		waitnoxfrs(ctlr);
		/* ctlrs and drives are one-to-one */
		reset(ctlr-&gt;regs);
	}
	iunlock(&amp;ctlr-&gt;shutlock);
	iunlock(&amp;ctlr-&gt;issuelock);
}

/*
 * see if a particular drive exists: only sub-unit 0 is accepted.
 * must not set unit-&gt;sectors here, but rather in nvmeonline.
 * returns 1 if the unit exists, 0 otherwise.
 */
static int
nvmeverify(SDunit *unit)
{
	return unit-&gt;subno == 0;
}

/*
 * initialise a drive known to exist.
 * returns boolean for success: 0 for any sub-unit other than 0 or
 * when scsionline fails, 1 once the unit has a sector count.
 */
static int
nvmeonline(SDunit *unit)
{
	/* not me? */
	if (unit-&gt;subno != 0)
		return 0;

	/* already inited? */
	if (unit-&gt;sectors != 0)
		return 1;

	if (scsionline(unit) == 0)
		return 0;
	nvmedrive(unit);

	/*
	 * could hang around until disks are spun up and thus available as
	 * nvram, dos file systems, etc.  you wouldn't expect it, but
	 * the intel 330 sata ssd takes a while to `spin up'.
	 */
	return 1;			/* drive ready */
}

/*
 * interface table that hands this driver's entry points to the sd layer.
 * the initializer is positional, so the order of entries matters; the
 * comment on each line names the slot being filled.  nil marks a slot
 * for which this driver supplies no function (legacy, wctl, probe,
 * wtopctl).  block i/o uses scsibio rather than a driver-specific routine.
 */
SDifc sdnvmeifc = {
	"nvme",				/* name */

	nvmepnp,			/* pnp */
	nil,				/* legacy */
	nvmeenable,			/* enable */
	nvmedisable,			/* disable */

	nvmeverify,			/* verify */
	nvmeonline,			/* online */
	nvmerio,			/* rio */
	nvmerctl,			/* rctl */
	nil,				/* wctl */

	scsibio,			/* bio */
	nil,				/* probe */
	nvmeclear,			/* clear */
	nvmertopctl,			/* rtopctl */
	nil,				/* wtopctl */
};
<!-- BEGIN TAIL -->
</pre>
</td></tr></table>
</td></tr></table>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<p style="line-height: 1.2em; margin-left: 1.00in; text-indent: 0.00in; margin-right: 1.00in; margin-top: 0; margin-bottom: 0; text-align: center;">
<span style="font-size: 10pt"></span></p>
<p style="margin-top: 0; margin-bottom: 0.50in"></p>
<p style="margin-top: 0; margin-bottom: 0.33in"></p>
<center><table border="0"><tr>
<td valign="middle"><a href="http://www.alcatel-lucent.com/"><img border="0" src="/plan9/img/logo_ft.gif" alt="Bell Labs" />
</a></td>
<td valign="middle"><a href="http://www.opensource.org"><img border="0" alt="OSI certified" src="/plan9/img/osi-certified-60x50.gif" />
</a></td>
<td><img style="padding-right: 45px;" alt="Powered by Plan 9" src="/plan9/img/power36.gif" />
</td>
</tr></table></center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center>
<span style="font-size: 10pt">(<a href="/plan9/">Return to Plan 9 Home Page</a>)</span>
</center>
<p style="margin-top: 0; margin-bottom: 0.17in"></p>
<center><font size=-1>
<span style="font-size: 10pt"><a href="http://www.lucent.com/copyright.html">Copyright</a></span>
<span style="font-size: 10pt">© 2009 Alcatel-Lucent.</span>
<span style="font-size: 10pt">All Rights Reserved.</span>
<br />
<span style="font-size: 10pt">Comments to</span>
<span style="font-size: 10pt"><a href="mailto:webmaster@plan9.bell-labs.com">webmaster@plan9.bell-labs.com</a>.</span>
</font></center>
</body>
</html>