[PATCH v5 3/4] uio_pci_generic: add MSI/MSI-X support

From: Vlad Zolotarov
Date: Tue Oct 06 2015 - 13:18:07 EST


Add support for MSI and MSI-X interrupt modes:
- Interrupt mode selection order is:
    INT#x (for backward compatibility) -> MSI-X -> MSI.
- Add ioctl() commands:
    - UIO_PCI_GENERIC_INT_MODE_GET: query the current interrupt mode.
    - UIO_PCI_GENERIC_IRQ_NUM_GET: query the maximum number of IRQs.
    - UIO_PCI_GENERIC_IRQ_SET: bind the IRQ to eventfd (similar to vfio).
- See an example in Documentation/DocBook/uio-howto.

Currently uio_pci_generic requires an INT#x interrupt source to be
available. However, there are devices that simply don't have INT#x
capability, for instance SR-IOV VF devices. For such devices
uio_pci_generic will simply fail (more specifically, its probe() will
fail).

When an IOMMU is either not available (e.g. Amazon EC2) or not acceptable
due to its performance overhead, and thus VFIO is not an option, users who
develop user-space drivers are left with no option but to write their own
proprietary UIO drivers (e.g. the igb_uio driver in Intel's DPDK) just
to be able to use the UIO infrastructure.

This series provides a generic solution for this problem while
preserving the original behaviour for devices that already worked with
the original uio_pci_generic (i.e. INT#x will still be used by default).

Signed-off-by: Vlad Zolotarov <vladz@xxxxxxxxxxxxxxxxxxxx>
---
New in v4:
- Use portable __u32 and __s32 types from asm/types.h for
defining uio_pci_generic_irq_set fields.
- Use proper _IO macros for defining read and write ioctl()
commands.
- Moved bars mapping setting into a separate patch.

New in v3:
- Add __iomem qualifier to temp buffer receiving ioremap value.

New in v2:
- Added #include <linux/uaccess.h> to uio_pci_generic.c
---
drivers/uio/uio_pci_generic.c | 320 ++++++++++++++++++++++++++++++++---
include/uapi/linux/uio_pci_generic.h | 51 ++++++
2 files changed, 348 insertions(+), 23 deletions(-)
create mode 100644 include/uapi/linux/uio_pci_generic.h

diff --git a/drivers/uio/uio_pci_generic.c b/drivers/uio/uio_pci_generic.c
index 2c6e2b1..be92918 100644
--- a/drivers/uio/uio_pci_generic.c
+++ b/drivers/uio/uio_pci_generic.c
@@ -22,16 +22,32 @@
#include <linux/device.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/msi.h>
#include <linux/slab.h>
#include <linux/uio_driver.h>
+#include <linux/eventfd.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/uio_pci_generic.h>

#define DRIVER_VERSION "0.01.0"
#define DRIVER_AUTHOR "Michael S. Tsirkin <mst@xxxxxxxxxx>"
#define DRIVER_DESC "Generic UIO driver for PCI 2.3 devices"

+struct msix_info {
+ u32 num_irqs;
+ struct msix_entry *table;
+ struct uio_msix_irq_ctx {
+ struct eventfd_ctx *trigger; /* MSI-x vector to eventfd */
+ char *name; /* name in /proc/interrupts */
+ } *ctx;
+};
+
struct uio_pci_generic_dev {
struct uio_info info;
struct pci_dev *pdev;
+ struct mutex msix_state_lock; /* ioctl mutex */
+ enum uio_int_mode int_mode;
+ struct msix_info msix;
};

static inline struct uio_pci_generic_dev *
@@ -109,9 +125,107 @@ fail:
return err;
}

-/* Interrupt handler. Read/modify/write the command register to disable
- * the interrupt. */
-static irqreturn_t irqhandler(int irq, struct uio_info *info)
+static irqreturn_t msix_irqhandler(int irq, void *arg);
+
+/* Bind MSI-X vector vec to eventfd fd, replacing any existing binding. */
+static int set_irq_eventfd(struct uio_pci_generic_dev *gdev, u32 vec, int fd)
+{
+ struct uio_msix_irq_ctx *ctx;
+ struct eventfd_ctx *trigger;
+ struct pci_dev *pdev = gdev->pdev;
+ int irq, err;
+
+ if (vec >= gdev->msix.num_irqs) {
+ dev_notice(&gdev->pdev->dev, "vec %u >= num_vec %u\n",
+ vec, gdev->msix.num_irqs);
+ return -ERANGE;
+ }
+
+ irq = gdev->msix.table[vec].vector;
+
+ /* Cleanup existing irq mapping */
+ ctx = &gdev->msix.ctx[vec];
+ if (ctx->trigger) {
+ free_irq(irq, ctx->trigger);
+ eventfd_ctx_put(ctx->trigger);
+ ctx->trigger = NULL;
+ }
+
+ /* Passing a negative value is used to unbind from the interrupt */
+ if (fd < 0)
+ return 0;
+
+ /* Reference from eventfd_ctx_fdget() is put on failure or later unbind. */
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger)) {
+ err = PTR_ERR(trigger);
+ dev_notice(&gdev->pdev->dev,
+ "eventfd ctx get failed: %d\n", err);
+ return err;
+ }
+
+ err = request_irq(irq, msix_irqhandler, 0, ctx->name, trigger);
+ if (err) {
+ dev_notice(&pdev->dev, "request irq failed: %d\n", err);
+ eventfd_ctx_put(trigger);
+ return err;
+ }
+
+ dev_dbg(&pdev->dev, "map vector %u to fd %d trigger %p\n",
+ vec, fd, trigger);
+ ctx->trigger = trigger;
+
+ return 0;
+}
+
+static int uio_pci_generic_ioctl(struct uio_info *info, unsigned int cmd,
+ unsigned long arg)
+{
+ struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);
+ struct uio_pci_generic_irq_set hdr;
+ int err;
+
+ switch (cmd) {
+ case UIO_PCI_GENERIC_IRQ_SET:
+ if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
+ return -EFAULT;
+
+ /* Locking is needed to ensure two IRQ_SET ioctl()'s are not
+ * running in parallel.
+ */
+ mutex_lock(&gdev->msix_state_lock);
+ if (gdev->int_mode != UIO_INT_MODE_MSIX) {
+ mutex_unlock(&gdev->msix_state_lock);
+ return -EOPNOTSUPP;
+ }
+
+ err = set_irq_eventfd(gdev, hdr.vec, (int)hdr.fd);
+ mutex_unlock(&gdev->msix_state_lock);
+
+ break;
+ case UIO_PCI_GENERIC_IRQ_NUM_GET:
+ if (gdev->int_mode == UIO_INT_MODE_NONE)
+ err = put_user(0, (u32 __user *)arg);
+ else if (gdev->int_mode != UIO_INT_MODE_MSIX)
+ err = put_user(1, (u32 __user *)arg);
+ else
+ err = put_user(gdev->msix.num_irqs,
+ (u32 __user *)arg);
+
+ break;
+ case UIO_PCI_GENERIC_INT_MODE_GET:
+ err = put_user(gdev->int_mode, (u32 __user *)arg);
+
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+/* INT#X interrupt handler. */
+static irqreturn_t intx_irqhandler(int irq, struct uio_info *info)
{
struct uio_pci_generic_dev *gdev = to_uio_pci_generic_dev(info);

@@ -122,8 +236,162 @@ static irqreturn_t irqhandler(int irq, struct uio_info *info)
return IRQ_HANDLED;
}

-static int probe(struct pci_dev *pdev,
- const struct pci_device_id *id)
+/* MSI interrupt handler. */
+static irqreturn_t msi_irqhandler(int irq, struct uio_info *info)
+{
+ /* UIO core will signal the user process. */
+ return IRQ_HANDLED;
+}
+
+/* MSI-X interrupt handler. */
+static irqreturn_t msix_irqhandler(int irq, void *arg)
+{
+ struct eventfd_ctx *trigger = arg;
+
+ pr_devel("irq %u trigger %p\n", irq, trigger);
+
+ eventfd_signal(trigger, 1);
+ return IRQ_HANDLED;
+}
+
+static bool enable_intx(struct uio_pci_generic_dev *gdev)
+{
+ struct pci_dev *pdev = gdev->pdev;
+
+ if (!pdev->irq || !pci_intx_mask_supported(pdev))
+ return false;
+
+ gdev->int_mode = UIO_INT_MODE_INTX;
+ gdev->info.irq = pdev->irq;
+ gdev->info.irq_flags = IRQF_SHARED;
+ gdev->info.handler = intx_irqhandler;
+
+ return true;
+}
+
+static void set_pci_master(struct pci_dev *pdev)
+{
+	pci_set_master(pdev);
+	dev_warn(&pdev->dev, "Enabling PCI bus mastering. Bogus userspace application is able to trash kernel memory using DMA\n");
+	add_taint(TAINT_USER, LOCKDEP_STILL_OK); /* userspace now drives DMA */
+}
+
+static bool enable_msi(struct uio_pci_generic_dev *gdev)
+{
+	struct pci_dev *pdev = gdev->pdev;
+
+	set_pci_master(pdev);
+	if (pci_enable_msi(pdev)) {
+		pci_clear_master(pdev); /* undo it, as enable_msix() does */
+		return false;
+	}
+	gdev->int_mode = UIO_INT_MODE_MSI;
+	gdev->info.irq = pdev->irq;
+	gdev->info.irq_flags = 0;
+	gdev->info.handler = msi_irqhandler;
+
+	return true;
+}
+
+static bool enable_msix(struct uio_pci_generic_dev *gdev)
+{
+	struct pci_dev *pdev = gdev->pdev;
+	int i, vectors = pci_msix_vec_count(pdev);
+
+	if (vectors <= 0) /* no MSI-X vectors reported (or an error code) */
+		return false;
+
+	gdev->msix.table = kcalloc(vectors, sizeof(struct msix_entry),
+				   GFP_KERNEL);
+	if (!gdev->msix.table) {
+		dev_err(&pdev->dev, "Failed to allocate memory for MSI-X table\n");
+		return false;
+	}
+
+	gdev->msix.ctx = kcalloc(vectors, sizeof(struct uio_msix_irq_ctx),
+				 GFP_KERNEL);
+	if (!gdev->msix.ctx) {
+		dev_err(&pdev->dev, "Failed to allocate memory for MSI-X contexts\n");
+		goto err_ctx_alloc;
+	}
+
+	for (i = 0; i < vectors; i++) {
+		gdev->msix.table[i].entry = i;
+		gdev->msix.ctx[i].name = kasprintf(GFP_KERNEL,
+						   KBUILD_MODNAME "[%d](%s)",
+						   i, pci_name(pdev));
+		if (!gdev->msix.ctx[i].name)
+			goto err_name_alloc;
+	}
+
+	set_pci_master(pdev);
+
+	if (pci_enable_msix(pdev, gdev->msix.table, vectors))
+		goto err_msix_enable;
+
+	gdev->int_mode = UIO_INT_MODE_MSIX;
+	gdev->info.irq = UIO_IRQ_CUSTOM;
+	gdev->msix.num_irqs = (u32)vectors;
+
+	return true;
+
+err_msix_enable:
+	pci_clear_master(pdev);
+err_name_alloc:
+	for (i = 0; i < vectors; i++) /* unset names are NULL (kcalloc) */
+		kfree(gdev->msix.ctx[i].name);
+
+	kfree(gdev->msix.ctx);
+err_ctx_alloc:
+	kfree(gdev->msix.table);
+
+	return false;
+}
+
+/**
+ * disable_intr - disable interrupts and free related resources
+ * @gdev: device handle
+ *
+ * Should be called after the corresponding UIO device has been unregistered:
+ * this ensures that there are no currently running ioctl()s and that there
+ * won't be any new ones until the next probe() call. The MSI and MSI-X cases
+ * also clear PCI bus mastering; any other mode needs no teardown here.
+ */
+static void disable_intr(struct uio_pci_generic_dev *gdev)
+{
+ struct pci_dev *pdev = gdev->pdev;
+ int i;
+
+ switch (gdev->int_mode) {
+ case UIO_INT_MODE_MSI:
+ pci_disable_msi(pdev);
+ pci_clear_master(pdev);
+
+ break;
+ case UIO_INT_MODE_MSIX:
+ /* No need for locking here since there shouldn't be any
+ * ioctl()s running by now.
+ */
+ for (i = 0; i < gdev->msix.num_irqs; i++) {
+ if (gdev->msix.ctx[i].trigger)
+ set_irq_eventfd(gdev, i, -1);
+
+ kfree(gdev->msix.ctx[i].name);
+ }
+
+ pci_disable_msix(pdev);
+ pci_clear_master(pdev);
+ kfree(gdev->msix.ctx);
+ kfree(gdev->msix.table);
+
+ break;
+ default:
+ break;
+ }
+}
+
+
+static int probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct uio_pci_generic_dev *gdev;
int err;
@@ -135,26 +403,17 @@ static int probe(struct pci_dev *pdev,
return err;
}

- if (!pdev->irq) {
- dev_warn(&pdev->dev, "No IRQ assigned to device: "
- "no support for interrupts?\n");
- pci_disable_device(pdev);
- return -ENODEV;
- }
-
- if (!pci_intx_mask_supported(pdev)) {
- err = -ENODEV;
- goto err_verify;
- }
-
gdev = kzalloc(sizeof(struct uio_pci_generic_dev), GFP_KERNEL);
if (!gdev) {
err = -ENOMEM;
goto err_alloc;
}

+ gdev->pdev = pdev;
gdev->info.name = "uio_pci_generic";
gdev->info.version = DRIVER_VERSION;
+ gdev->info.ioctl = uio_pci_generic_ioctl;
+ mutex_init(&gdev->msix_state_lock);

err = pci_request_regions(pdev, "uio_pci_generic");
if (err != 0) {
@@ -162,10 +421,21 @@ static int probe(struct pci_dev *pdev,
goto err_request_regions;
}

- gdev->info.irq = pdev->irq;
- gdev->info.irq_flags = IRQF_SHARED;
- gdev->info.handler = irqhandler;
- gdev->pdev = pdev;
+ /* Enable the corresponding interrupt mode. Try to enable INT#X first
+ * for backward compatibility.
+ */
+ if (enable_intx(gdev))
+ dev_info(&pdev->dev, "Using INT#x mode: IRQ %ld",
+ gdev->info.irq);
+ else if (enable_msix(gdev))
+ dev_info(&pdev->dev, "Using MSI-X mode: number of IRQs %d",
+ gdev->msix.num_irqs);
+ else if (enable_msi(gdev))
+ dev_info(&pdev->dev, "Using MSI mode: IRQ %ld", gdev->info.irq);
+ else {
+ err = -ENODEV;
+ goto err_verify;
+ }

/* remap resources */
err = setup_maps(pdev, &gdev->info);
@@ -175,6 +445,7 @@ static int probe(struct pci_dev *pdev,
err = uio_register_device(&pdev->dev, &gdev->info);
if (err)
goto err_register;
+
pci_set_drvdata(pdev, gdev);

return 0;
@@ -182,12 +453,14 @@ static int probe(struct pci_dev *pdev,
err_register:
release_iomaps(gdev);
err_maps:
+ disable_intr(gdev);
+err_verify:
pci_release_regions(pdev);
err_request_regions:
kfree(gdev);
err_alloc:
-err_verify:
pci_disable_device(pdev);
+
return err;
}

@@ -196,10 +469,11 @@ static void remove(struct pci_dev *pdev)
struct uio_pci_generic_dev *gdev = pci_get_drvdata(pdev);

uio_unregister_device(&gdev->info);
+ disable_intr(gdev);
release_iomaps(gdev);
pci_release_regions(pdev);
- pci_disable_device(pdev);
kfree(gdev);
+ pci_disable_device(pdev);
pci_set_drvdata(pdev, NULL);
}

diff --git a/include/uapi/linux/uio_pci_generic.h b/include/uapi/linux/uio_pci_generic.h
new file mode 100644
index 0000000..7c13f5c
--- /dev/null
+++ b/include/uapi/linux/uio_pci_generic.h
@@ -0,0 +1,51 @@
+/*
+ * include/uapi/linux/uio_pci_generic.h
+ *
+ * Header file for userspace generic PCI IO driver and applications with public
+ * API.
+ *
+ * Author: Vlad Zolotarov <vladz@xxxxxxxxxxxxxxxxxxxx>
+ *
+ * Licensed under the GPLv2 only.
+ */
+
+#ifndef _UIO_PCI_GENERIC_H_
+#define _UIO_PCI_GENERIC_H_
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+enum uio_int_mode {
+ UIO_INT_MODE_NONE = 0,
+ UIO_INT_MODE_INTX = 1,
+ UIO_INT_MODE_MSI = 2,
+ UIO_INT_MODE_MSIX = 3
+};
+
+/* bind/unbind the requested IRQ to the given eventfd */
+struct uio_pci_generic_irq_set {
+ /* index of the IRQ to connect to starting from 0 */
+ __u32 vec;
+ /* eventfd file descriptor for binding or a negative value for
+ * unbinding.
+ */
+ __s32 fd;
+};
+
+#define UIO_PCI_GENERIC_BASE 0x86
+
+/* Bind/unbind the eventfd file descriptor to/from the specific IRQ vector.
+ * Vector is defined by its index starting from 0.
+ */
+#define UIO_PCI_GENERIC_IRQ_SET _IOW('I', UIO_PCI_GENERIC_BASE + 1, \
+ struct uio_pci_generic_irq_set)
+
+/* Return the number of available IRQs (a __u32 is written to the argument) */
+#define UIO_PCI_GENERIC_IRQ_NUM_GET _IOR('I', UIO_PCI_GENERIC_BASE + 2, \
+				       __u32)
+
+/* Return the device interrupt mode (uio_int_mode value, written as a __u32) */
+#define UIO_PCI_GENERIC_INT_MODE_GET _IOR('I', UIO_PCI_GENERIC_BASE + 3, \
+					__u32)
+
+#endif /* _UIO_PCI_GENERIC_H_ */
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/