[PATCH 1/5] vfio: Add device tracking during unbind

From: Alex Williamson
Date: Wed Feb 04 2015 - 11:13:01 EST


There's a small window between the vfio bus driver calling
vfio_del_group_dev() and the device being completely unbound where
the vfio group appears to be non-viable. This creates a race for
users like QEMU/KVM where the kvm-vfio module tries to get an
external reference to the group in order to match and release an
existing reference, while the device is potentially being removed
from the vfio bus driver. If the group is momentarily non-viable,
kvm-vfio may not be able to release the group reference until VM
shutdown, making the group unusable until that point.

Bridge the gap between device removal from the group and completion
of the driver unbind by tracking it in a list. The device is added
to the list before the bus driver reference is released and removed
using the existing unbind notifier.

Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---

drivers/vfio/vfio.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 71 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f018d8d..43d5622 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -63,6 +63,11 @@ struct vfio_container {
void *iommu_data;
};

+struct vfio_unbound_dev {
+ struct device *dev;
+ struct list_head unbound_next;
+};
+
struct vfio_group {
struct kref kref;
int minor;
@@ -75,6 +80,8 @@ struct vfio_group {
struct notifier_block nb;
struct list_head vfio_next;
struct list_head container_next;
+ struct list_head unbound_list;
+ struct mutex unbound_lock;
atomic_t opened;
};

@@ -204,6 +211,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
kref_init(&group->kref);
INIT_LIST_HEAD(&group->device_list);
mutex_init(&group->device_lock);
+ INIT_LIST_HEAD(&group->unbound_list);
+ mutex_init(&group->unbound_lock);
atomic_set(&group->container_users, 0);
atomic_set(&group->opened, 0);
group->iommu_group = iommu_group;
@@ -264,9 +273,16 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
static void vfio_group_release(struct kref *kref)
{
struct vfio_group *group = container_of(kref, struct vfio_group, kref);
+ struct vfio_unbound_dev *unbound, *tmp;

WARN_ON(!list_empty(&group->device_list));

+ list_for_each_entry_safe(unbound, tmp,
+ &group->unbound_list, unbound_next) {
+ list_del(&unbound->unbound_next);
+ kfree(unbound);
+ }
+
device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
list_del(&group->vfio_next);
vfio_free_group_minor(group->minor);
@@ -440,17 +456,36 @@ static bool vfio_whitelisted_driver(struct device_driver *drv)
}

/*
- * A vfio group is viable for use by userspace if all devices are either
- * driver-less or bound to a vfio or whitelisted driver. We test the
- * latter by the existence of a struct vfio_device matching the dev.
+ * A vfio group is viable for use by userspace if all devices are in
+ * one of the following states:
+ * - driver-less
+ * - bound to a vfio driver
+ * - bound to a whitelisted driver
+ *
+ * We use two methods to determine whether a device is bound to a vfio
+ * driver. The first is to test whether the device exists in the vfio
+ * group. The second is to test if the device exists on the group
+ * unbound_list, indicating it's in the middle of transitioning from
+ * a vfio driver to driver-less.
*/
static int vfio_dev_viable(struct device *dev, void *data)
{
struct vfio_group *group = data;
struct vfio_device *device;
struct device_driver *drv = ACCESS_ONCE(dev->driver);
+ struct vfio_unbound_dev *unbound;
+ int ret = -EINVAL;

- if (!drv || vfio_whitelisted_driver(drv))
+ mutex_lock(&group->unbound_lock);
+ list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
+ if (dev == unbound->dev) {
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&group->unbound_lock);
+
+ if (!ret || !drv || vfio_whitelisted_driver(drv))
return 0;

device = vfio_group_get_device(group, dev);
@@ -459,7 +494,7 @@ static int vfio_dev_viable(struct device *dev, void *data)
return 0;
}

- return -EINVAL;
+ return ret;
}

/**
@@ -501,6 +536,7 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
{
struct vfio_group *group = container_of(nb, struct vfio_group, nb);
struct device *dev = data;
+ struct vfio_unbound_dev *unbound;

/*
* Need to go through a group_lock lookup to get a reference or we
@@ -550,6 +586,17 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb,
* stop the system to maintain isolation. At a minimum, we'd
* want a toggle to disable driver auto probe for this device.
*/
+
+ mutex_lock(&group->unbound_lock);
+ list_for_each_entry(unbound,
+ &group->unbound_list, unbound_next) {
+ if (dev == unbound->dev) {
+ list_del(&unbound->unbound_next);
+ kfree(unbound);
+ break;
+ }
+ }
+ mutex_unlock(&group->unbound_lock);
break;
}

@@ -657,6 +704,7 @@ void *vfio_del_group_dev(struct device *dev)
struct vfio_group *group = device->group;
struct iommu_group *iommu_group = group->iommu_group;
void *device_data = device->device_data;
+ struct vfio_unbound_dev *unbound;

/*
* The group exists so long as we have a device reference. Get
@@ -664,6 +712,24 @@ void *vfio_del_group_dev(struct device *dev)
*/
vfio_group_get(group);

+ /*
+ * When the device is removed from the group, the group suddenly
+ * becomes non-viable; the device has a driver (until the unbind
+ * completes), but it's not present in the group. This is bad news
+ * for any external users that need to re-acquire a group reference
+ * in order to match and release their existing reference. To
+ * solve this, we track such devices on the unbound_list to bridge
+ * the gap until they're fully unbound.
+ */
+ unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
+ if (unbound) {
+ unbound->dev = dev;
+ mutex_lock(&group->unbound_lock);
+ list_add(&unbound->unbound_next, &group->unbound_list);
+ mutex_unlock(&group->unbound_lock);
+ }
+ WARN_ON(!unbound);
+
vfio_device_put(device);

/* TODO send a signal to encourage this to be released */

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/