Re: [PATCH] SCSI driver for VMware's virtual HBA - V4.

From: Anthony Liguori
Date: Wed Sep 09 2009 - 17:08:29 EST


Hi Alok,

Joining this a bit late as this was just brought to my attention. It would have been nice to CC the virtualization mailing list. Please do in future submissions.

Alok Kataria wrote:
VMware PVSCSI driver - v4.

/
diff --git a/drivers/scsi/pvscsi.h b/drivers/scsi/pvscsi.h
new file mode 100644
index 0000000..96bb655
--- /dev/null
+++ b/drivers/scsi/pvscsi.h
@@ -0,0 +1,395 @@
+/*
+ * VMware PVSCSI header file
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Maintained by: Alok N Kataria <akataria@xxxxxxxxxx>
+ *
+ */
+
+#ifndef _PVSCSI_H_
+#define _PVSCSI_H_
+
+#include <linux/types.h>
+
+#define PVSCSI_DRIVER_VERSION_STRING "1.0.0.0"
+
+#define PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT 128
+
+#define MASK(n) ((1 << (n)) - 1) /* make an n-bit mask */
+
+#define PCI_VENDOR_ID_VMWARE 0x15AD
+#define PCI_DEVICE_ID_VMWARE_PVSCSI 0x07C0
+
+/*
+ * host adapter status/error codes
+ */
+typedef enum {
+ BTSTAT_SUCCESS = 0x00, /* CCB complete normally with no errors */
+ BTSTAT_LINKED_COMMAND_COMPLETED = 0x0a,
+ BTSTAT_LINKED_COMMAND_COMPLETED_WITH_FLAG = 0x0b,
+ BTSTAT_DATA_UNDERRUN = 0x0c,
+ BTSTAT_SELTIMEO = 0x11, /* SCSI selection timeout */
+ BTSTAT_DATARUN = 0x12, /* data overrun/underrun */
+ BTSTAT_BUSFREE = 0x13, /* unexpected bus free */
+ BTSTAT_INVPHASE = 0x14, /* invalid bus phase or sequence requested by target */
+ BTSTAT_LUNMISMATCH = 0x17, /* linked CCB has different LUN from first CCB */
+ BTSTAT_SENSFAILED = 0x1b, /* auto request sense failed */
+ BTSTAT_TAGREJECT = 0x1c, /* SCSI II tagged queueing message rejected by target */
+ BTSTAT_BADMSG = 0x1d, /* unsupported message received by the host adapter */
+ BTSTAT_HAHARDWARE = 0x20, /* host adapter hardware failed */
+ BTSTAT_NORESPONSE = 0x21, /* target did not respond to SCSI ATN, sent a SCSI RST */
+ BTSTAT_SENTRST = 0x22, /* host adapter asserted a SCSI RST */
+ BTSTAT_RECVRST = 0x23, /* other SCSI devices asserted a SCSI RST */
+ BTSTAT_DISCONNECT = 0x24, /* target device reconnected improperly (w/o tag) */
+ BTSTAT_BUSRESET = 0x25, /* host adapter issued BUS device reset */
+ BTSTAT_ABORTQUEUE = 0x26, /* abort queue generated */
+ BTSTAT_HASOFTWARE = 0x27, /* host adapter software error */
+ BTSTAT_HATIMEOUT = 0x30, /* host adapter hardware timeout error */
+ BTSTAT_SCSIPARITY = 0x34, /* SCSI parity error detected */
+} HostBusAdapterStatus;
+
+/*
+ * Register offsets.
+ *
+ * These registers are accessible both via i/o space and mm i/o.
+ */
+
+enum PVSCSIRegOffset {
+ PVSCSI_REG_OFFSET_COMMAND = 0x0,
+ PVSCSI_REG_OFFSET_COMMAND_DATA = 0x4,
+ PVSCSI_REG_OFFSET_COMMAND_STATUS = 0x8,
+ PVSCSI_REG_OFFSET_LAST_STS_0 = 0x100,
+ PVSCSI_REG_OFFSET_LAST_STS_1 = 0x104,
+ PVSCSI_REG_OFFSET_LAST_STS_2 = 0x108,
+ PVSCSI_REG_OFFSET_LAST_STS_3 = 0x10c,
+ PVSCSI_REG_OFFSET_INTR_STATUS = 0x100c,
+ PVSCSI_REG_OFFSET_INTR_MASK = 0x2010,
+ PVSCSI_REG_OFFSET_KICK_NON_RW_IO = 0x3014,
+ PVSCSI_REG_OFFSET_DEBUG = 0x3018,
+ PVSCSI_REG_OFFSET_KICK_RW_IO = 0x4018,
+};
+
+/*
+ * Virtual h/w commands.
+ */
+
+enum PVSCSICommands {
+ PVSCSI_CMD_FIRST = 0, /* has to be first */
+
+ PVSCSI_CMD_ADAPTER_RESET = 1,
+ PVSCSI_CMD_ISSUE_SCSI = 2,
+ PVSCSI_CMD_SETUP_RINGS = 3,
+ PVSCSI_CMD_RESET_BUS = 4,
+ PVSCSI_CMD_RESET_DEVICE = 5,
+ PVSCSI_CMD_ABORT_CMD = 6,
+ PVSCSI_CMD_CONFIG = 7,
+ PVSCSI_CMD_SETUP_MSG_RING = 8,
+ PVSCSI_CMD_DEVICE_UNPLUG = 9,
+
+ PVSCSI_CMD_LAST = 10 /* has to be last */
+};
+
+/*
+ * Command descriptor for PVSCSI_CMD_RESET_DEVICE --
+ */
+
+typedef struct PVSCSICmdDescResetDevice {
+ u32 target;
+ u8 lun[8];
+} __packed PVSCSICmdDescResetDevice;
+
+/*
+ * Command descriptor for PVSCSI_CMD_ABORT_CMD --
+ *
+ * - currently does not support specifying the LUN.
+ * - _pad should be 0.
+ */
+
+typedef struct PVSCSICmdDescAbortCmd {
+ u64 context;
+ u32 target;
+ u32 _pad;
+} __packed PVSCSICmdDescAbortCmd;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_RINGS --
+ *
+ * Notes:
+ * - reqRingNumPages and cmpRingNumPages need to be power of two.
+ * - reqRingNumPages and cmpRingNumPages need to be different from 0,
+ * - reqRingNumPages and cmpRingNumPages need to be inferior to
+ * PVSCSI_SETUP_RINGS_MAX_NUM_PAGES.
+ */
+
+#define PVSCSI_SETUP_RINGS_MAX_NUM_PAGES 32
+typedef struct PVSCSICmdDescSetupRings {
+ u32 reqRingNumPages;
+ u32 cmpRingNumPages;
+ u64 ringsStatePPN;
+ u64 reqRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+ u64 cmpRingPPNs[PVSCSI_SETUP_RINGS_MAX_NUM_PAGES];
+} __packed PVSCSICmdDescSetupRings;
+
+/*
+ * Command descriptor for PVSCSI_CMD_SETUP_MSG_RING --
+ *
+ * Notes:
+ * - this command was not supported in the initial revision of the h/w
+ * interface. Before using it, you need to check that it is supported by
+ * writing PVSCSI_CMD_SETUP_MSG_RING to the 'command' register, then
+ * immediately after read the 'command status' register:
+ * * a value of -1 means that the cmd is NOT supported,
+ * * a value != -1 means that the cmd IS supported.
+ * If it's supported the 'command status' register should return:
+ * sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(u32).
+ * - this command should be issued _after_ the usual SETUP_RINGS so that the
+ * RingsState page is already setup. If not, the command is a nop.
+ * - numPages needs to be a power of two,
+ * - numPages needs to be different from 0,
+ * - _pad should be zero.
+ */
+
+#define PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES 16
+
+typedef struct PVSCSICmdDescSetupMsgRing {
+ u32 numPages;
+ u32 _pad;
+ u64 ringPPNs[PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES];
+} __packed PVSCSICmdDescSetupMsgRing;
+
+enum PVSCSIMsgType {
+ PVSCSI_MSG_DEV_ADDED = 0,
+ PVSCSI_MSG_DEV_REMOVED = 1,
+ PVSCSI_MSG_LAST = 2,
+};
+
+/*
+ * Msg descriptor.
+ *
+ * sizeof(struct PVSCSIRingMsgDesc) == 128.
+ *
+ * - type is of type enum PVSCSIMsgType.
+ * - the content of args depend on the type of event being delivered.
+ */
+
+typedef struct PVSCSIRingMsgDesc {
+ u32 type;
+ u32 args[31];
+} __packed PVSCSIRingMsgDesc;
+
+typedef struct PVSCSIMsgDescDevStatusChanged {
+ u32 type; /* PVSCSI_MSG_DEV _ADDED / _REMOVED */
+ u32 bus;
+ u32 target;
+ u8 lun[8];
+ u32 pad[27];
+} __packed PVSCSIMsgDescDevStatusChanged;
+
+/*
+ * Rings state.
+ *
+ * - the fields:
+ * . msgProdIdx,
+ * . msgConsIdx,
+ * . msgNumEntriesLog2,
+ * .. are only used once the SETUP_MSG_RING cmd has been issued.
+ * - '_pad' helps to ensure that the msg related fields are on their own
+ * cache-line.
+ */
+
+typedef struct PVSCSIRingsState {
+ u32 reqProdIdx;
+ u32 reqConsIdx;
+ u32 reqNumEntriesLog2;
+
+ u32 cmpProdIdx;
+ u32 cmpConsIdx;
+ u32 cmpNumEntriesLog2;
+
+ u8 _pad[104];
+
+ u32 msgProdIdx;
+ u32 msgConsIdx;
+ u32 msgNumEntriesLog2;
+} __packed PVSCSIRingsState;

All of this can be hidden behind a struct virtqueue. You could then introduce a virtio-vmwring that implemented this ABI.

You could then separate out the actual scsi logic into a separate virtio-scsi.c driver.

There are numerous advantages to this layering. You get to remain ABI compatible with yourself. You can potentially reuse the ring logic in other drivers. Other VMMs can use different ring transports without introducing new drivers. For instance, in KVM, we would use virtio-pci + virtio-scsi instead of using virtio-vmwring + virtio-scsi.

The virtio infrastructure has been backported to various old kernels too so there really is no reason not to use it.

We have the interfaces in virtio to do notification suppression and MSI so I don't think there's any real limitation that it would impose on you.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/