[PATCH] vfio/pci: Parallelize device open and release

From: Alex Williamson
Date: Fri Nov 09 2018 - 17:09:50 EST


In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
remove") a mutex was added to freeze the refcnt for a device so that
we can handle errors and perform bus resets on final close. However,
bus resets can be rather slow and a global mutex here is undesirable.
A per-device mutex provides the best granularity, but then our chances
of triggering a bus/slot reset with multiple affected devices is slim
when devices are released in parallel. Instead create a reflck object
shared among all devices under the same bus or slot, allowing devices
on independent buses to be released in parallel while serializing per
bus/slot.

Reported-by: Christian Ehrhardt <christian.ehrhardt@xxxxxxxxxxxxx>
Tested-by: Christian Ehrhardt <christian.ehrhardt@xxxxxxxxxxxxx>
Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---
drivers/vfio/pci/vfio_pci.c | 157 ++++++++++++++++++++++++++++++-----
drivers/vfio/pci/vfio_pci_private.h | 6 +
2 files changed, 139 insertions(+), 24 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 50cdedfca9fe..d443fb7a4e75 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_idle_d3,
"Disable using the PCI D3 low power state for idle, unused devices");

-static DEFINE_MUTEX(driver_lock);
-
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
@@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;

- mutex_lock(&driver_lock);
+ mutex_lock(&vdev->reflck->lock);

if (!(--vdev->refcnt)) {
vfio_spapr_pci_eeh_release(vdev->pdev);
vfio_pci_disable(vdev);
}

- mutex_unlock(&driver_lock);
+ mutex_unlock(&vdev->reflck->lock);

module_put(THIS_MODULE);
}
@@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data)
if (!try_module_get(THIS_MODULE))
return -ENODEV;

- mutex_lock(&driver_lock);
+ mutex_lock(&vdev->reflck->lock);

if (!vdev->refcnt) {
ret = vfio_pci_enable(vdev);
@@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data)
}
vdev->refcnt++;
error:
- mutex_unlock(&driver_lock);
+ mutex_unlock(&vdev->reflck->lock);
if (ret)
module_put(THIS_MODULE);
return ret;
@@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = {
.request = vfio_pci_request,
};

+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct vfio_pci_device *vdev;
@@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return ret;
}

+ ret = vfio_pci_reflck_attach(vdev);
+ if (ret) {
+ vfio_del_group_dev(&pdev->dev);
+ vfio_iommu_group_put(group, &pdev->dev);
+ kfree(vdev);
+ return ret;
+ }
+
if (vfio_pci_is_vga(pdev)) {
vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
vga_set_legacy_decoding(pdev,
@@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
if (!vdev)
return;

+ vfio_pci_reflck_put(vdev->reflck);
+
vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
kfree(vdev->region);
mutex_destroy(&vdev->ioeventfds_lock);
@@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = {
.err_handler = &vfio_err_handlers,
};

+static DEFINE_MUTEX(reflck_lock);
+
+static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
+{
+ struct vfio_pci_reflck *reflck;
+
+ reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+ if (!reflck)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&reflck->kref);
+ mutex_init(&reflck->lock);
+
+ return reflck;
+}
+
+static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
+{
+ kref_get(&reflck->kref);
+}
+
+static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
+{
+ struct vfio_pci_reflck **preflck = data;
+ struct vfio_device *device;
+ struct vfio_pci_device *vdev;
+
+ device = vfio_device_get_from_dev(&pdev->dev);
+ if (!device)
+ return 0;
+
+ if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+ vfio_device_put(device);
+ return 0;
+ }
+
+ vdev = vfio_device_data(device);
+
+ if (vdev->reflck) {
+ vfio_pci_reflck_get(vdev->reflck);
+ *preflck = vdev->reflck;
+ vfio_device_put(device);
+ return 1;
+ }
+
+ vfio_device_put(device);
+ return 0;
+}
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
+{
+ bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
+
+ mutex_lock(&reflck_lock);
+
+ if (pci_is_root_bus(vdev->pdev->bus) ||
+ vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
+ &vdev->reflck, slot) <= 0)
+ vdev->reflck = vfio_pci_reflck_alloc();
+
+ mutex_unlock(&reflck_lock);
+
+ return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
+}
+
+static void vfio_pci_reflck_release(struct kref *kref)
+{
+ struct vfio_pci_reflck *reflck = container_of(kref,
+ struct vfio_pci_reflck,
+ kref);
+
+ kfree(reflck);
+ mutex_unlock(&reflck_lock);
+}
+
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
+{
+ kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
+}
+
struct vfio_devices {
struct vfio_device **devices;
int cur_index;
int max_index;
};

-static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
+static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
struct vfio_devices *devs = data;
struct vfio_device *device;
+ struct vfio_pci_device *vdev;

if (devs->cur_index == devs->max_index)
return -ENOSPC;
@@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
return -EBUSY;
}

+ vdev = vfio_device_data(device);
+
+ /* Only collect unused devices */
+ if (vdev->refcnt) {
+ vfio_device_put(device);
+ return -EBUSY;
+ }
+
devs->devices[devs->cur_index++] = device;
return 0;
}

/*
- * Attempt to do a bus/slot reset if there are devices affected by a reset for
- * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt). Callers are required to hold driver_lock when calling this to
- * prevent device opens and concurrent bus reset attempts. We prevent device
- * unbinds by acquiring and holding a reference to the vfio_device.
+ * Attempt to do a bus/slot reset if there are devices affected by a reset
+ * for this device that are "needs_reset" and all of the affected devices
+ * are unused (!refcnt). Callers are required to hold vdev->reflck->lock
+ * to prevent concurrent device opens, which is expected to protect all
+ * affected devices. A vfio_device reference is also acquired for each
+ * affected device to prevent unbinds.
*
* NB: vfio-core considers a group to be viable even if some devices are
* bound to drivers like pci-stub or pcieport. Here we require all devices
@@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
struct vfio_devices devs = { .cur_index = 0 };
int i = 0, ret = -EINVAL;
- bool needs_reset = false, slot = false;
+ bool slot = false;
struct vfio_pci_device *tmp;

if (!pci_probe_reset_slot(vdev->pdev->slot))
@@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
return;

if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
- vfio_pci_get_devs, &devs, slot))
+ vfio_pci_get_unused_devs,
+ &devs, slot))
goto put_devs;

+ /* Does at least one need a reset? */
for (i = 0; i < devs.cur_index; i++) {
tmp = vfio_device_data(devs.devices[i]);
- if (tmp->needs_reset)
- needs_reset = true;
- if (tmp->refcnt)
- goto put_devs;
+ if (tmp->needs_reset) {
+ ret = pci_reset_bus(vdev->pdev);
+ break;
+ }
}

- if (needs_reset)
- ret = pci_reset_bus(vdev->pdev);
-
put_devs:
for (i = 0; i < devs.cur_index; i++) {
tmp = vfio_device_data(devs.devices[i]);
- if (!ret)
+
+ /*
+ * If reset was successful, affected devices no longer need
+ * a reset and we should return all the collateral devices
+ * to low power. If not successful, we either didn't reset
+ * the bus or timed out waiting for it, so let's not touch
+ * the power state.
+ */
+ if (!ret) {
tmp->needs_reset = false;

- if (!tmp->refcnt && !disable_idle_d3)
- pci_set_power_state(tmp->pdev, PCI_D3hot);
+ if (tmp != vdev && !disable_idle_d3)
+ pci_set_power_state(tmp->pdev, PCI_D3hot);
+ }

vfio_device_put(devs.devices[i]);
}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d3441a..aa2355e67340 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource {
struct list_head res_next;
};

+struct vfio_pci_reflck {
+ struct kref kref;
+ struct mutex lock;
+};
+
struct vfio_pci_device {
struct pci_dev *pdev;
void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
@@ -104,6 +109,7 @@ struct vfio_pci_device {
bool needs_reset;
bool nointx;
struct pci_saved_state *pci_saved_state;
+ struct vfio_pci_reflck *reflck;
int refcnt;
int ioeventfds_nr;
struct eventfd_ctx *err_trigger;