Re: [PATCH 1/4] iommu: Add dev_iommu->ops_rwsem

From: Jason Gunthorpe
Date: Mon Feb 13 2023 - 09:16:57 EST


On Mon, Feb 13, 2023 at 03:49:38PM +0800, Lu Baolu wrote:

> +static int iommu_group_freeze_dev_ops(struct iommu_group *group)
> +{
> + struct group_device *device;
> + struct device *dev;
> +
> + mutex_lock(&group->mutex);
> + list_for_each_entry(device, &group->devices, list) {
> + dev = device->dev;
> + down_read(&dev->iommu->ops_rwsem);

This isn't allowed, you can't obtain locks in a loop like this, it
will deadlock.

You don't need more locks, we already have the group mutex, the
release path should be fixed to use it properly as I was trying to do here:

https://lore.kernel.org/kvm/4-v1-ef00ffecea52+2cb-iommu_group_lifetime_jgg@xxxxxxxxxx/
https://lore.kernel.org/kvm/YyyTxx0HnA3maxEk@xxxxxxxxxx/

Then what you'd do in a path like this is:

mutex_lock(&group->mutex);
dev = iommu_group_first_device(group)
if (!dev)
/* Racing with group cleanup */
return -EINVAL;

/* Now dev->ops is valid and must remain valid so long as
group->mutex is held */


The only reason this doesn't work already is because of the above
stuff about release not holding the group mutex when manipulating the
devices in the group. This is simply mis-locked.

Robin explained it was done like this because
arm_iommu_detach_device() re-enters the iommu core during release_dev,
so I suggest fixing that by simply not doing that (see above)

Below is the lastest attempt I had tried, I didn't have time to get back
to it and we fixed the bug another way. It needs a bit of adjusting to
also remove the device from the group after release is called within
the same mutex critical region.

Jason

diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h
index fe9ef6f79e9cfe..ea7198a1786186 100644
--- a/arch/arm/include/asm/dma-iommu.h
+++ b/arch/arm/include/asm/dma-iommu.h
@@ -31,6 +31,7 @@ void arm_iommu_release_mapping(struct dma_iommu_mapping *mapping);
int arm_iommu_attach_device(struct device *dev,
struct dma_iommu_mapping *mapping);
void arm_iommu_detach_device(struct device *dev);
+void arm_iommu_release_device(struct device *dev);

#endif /* __KERNEL__ */
#endif
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c135f6e37a00ca..3d7b385e3981ef 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1680,13 +1680,15 @@ int arm_iommu_attach_device(struct device *dev,
EXPORT_SYMBOL_GPL(arm_iommu_attach_device);

/**
- * arm_iommu_detach_device
+ * arm_iommu_release_device
* @dev: valid struct device pointer
*
- * Detaches the provided device from a previously attached map.
- * This overwrites the dma_ops pointer with appropriate non-IOMMU ops.
+ * This is like arm_iommu_detach_device() except it handles the special
+ * case of being called under an iommu driver's release operation. In this
+ * case the driver must have already detached the device from any attached
+ * domain before calling this function.
*/
-void arm_iommu_detach_device(struct device *dev)
+void arm_iommu_release_device(struct device *dev)
{
struct dma_iommu_mapping *mapping;

@@ -1696,13 +1698,34 @@ void arm_iommu_detach_device(struct device *dev)
return;
}

- iommu_detach_device(mapping->domain, dev);
kref_put(&mapping->kref, release_iommu_mapping);
to_dma_iommu_mapping(dev) = NULL;
set_dma_ops(dev, NULL);

pr_debug("Detached IOMMU controller from %s device.\n", dev_name(dev));
}
+EXPORT_SYMBOL_GPL(arm_iommu_release_device);
+
+/**
+ * arm_iommu_detach_device
+ * @dev: valid struct device pointer
+ *
+ * Detaches the provided device from a previously attached map.
+ * This overwrites the dma_ops pointer with appropriate non-IOMMU ops.
+ */
+void arm_iommu_detach_device(struct device *dev)
+{
+ struct dma_iommu_mapping *mapping;
+
+ mapping = to_dma_iommu_mapping(dev);
+ if (!mapping) {
+ dev_warn(dev, "Not attached\n");
+ return;
+ }
+
+ iommu_detach_device(mapping->domain, dev);
+ arm_iommu_release_device(dev);
+}
EXPORT_SYMBOL_GPL(arm_iommu_detach_device);

static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index de91dd88705bd3..f3dbd980133567 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -101,6 +101,10 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group,
static struct iommu_group *iommu_group_get_for_dev(struct device *dev);
static ssize_t iommu_group_store_type(struct iommu_group *group,
const char *buf, size_t count);
+static struct group_device *
+__iommu_group_remove_device(struct iommu_group *group, struct device *dev);
+static void __iommu_group_remove_device_sysfs(struct iommu_group *group,
+ struct group_device *device);

#define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \
struct iommu_group_attribute iommu_group_attr_##_name = \
@@ -430,18 +434,50 @@ int iommu_probe_device(struct device *dev)

void iommu_release_device(struct device *dev)
{
+ struct iommu_group *group = dev->iommu_group;
const struct iommu_ops *ops;
+ struct group_device *device;

if (!dev->iommu)
return;

iommu_device_unlink(dev->iommu->iommu_dev, dev);

+ mutex_lock(&group->mutex);
+ device = __iommu_group_remove_device(group, dev);
ops = dev_iommu_ops(dev);
+
+ /*
+ * If the group has become empty then ownership must have been released,
+ * and the current domain must be set back to NULL or the default
+ * domain.
+ */
+ if (list_empty(&group->devices))
+ WARN_ON(group->owner_cnt ||
+ group->domain != group->default_domain);
+
+ /*
+ * release_device() must stop using any attached domain on the device.
+ * If there are still other devices in the group they are not effected
+ * by this callback.
+ *
+ * The IOMMU driver must set the device to either an identity or
+ * blocking translation and stop using any domain pointer, as it is
+ * going to be freed.
+ */
if (ops->release_device)
ops->release_device(dev);
+ mutex_unlock(&group->mutex);
+
+ __iommu_group_remove_device_sysfs(group, device);
+
+ /*
+ * This will eventually call iommu_group_release() which will free the
+ * iommu_domains.
+ */
+ dev->iommu_group = NULL;
+ kobject_put(group->devices_kobj);

- iommu_group_remove_device(dev);
module_put(ops->owner);
dev_iommu_free(dev);
}
@@ -1039,44 +1075,71 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
}
EXPORT_SYMBOL_GPL(iommu_group_add_device);

-/**
- * iommu_group_remove_device - remove a device from it's current group
- * @dev: device to be removed
- *
- * This function is called by an iommu driver to remove the device from
- * it's current group. This decrements the iommu group reference count.
- */
-void iommu_group_remove_device(struct device *dev)
+static struct group_device *
+__iommu_group_remove_device(struct iommu_group *group, struct device *dev)
{
- struct iommu_group *group = dev->iommu_group;
- struct group_device *tmp_device, *device = NULL;
+ struct group_device *device;
+
+ lockdep_assert_held(&group->mutex);

if (!group)
- return;
+ return NULL;

dev_info(dev, "Removing from iommu group %d\n", group->id);

- mutex_lock(&group->mutex);
- list_for_each_entry(tmp_device, &group->devices, list) {
- if (tmp_device->dev == dev) {
- device = tmp_device;
+ list_for_each_entry(device, &group->devices, list) {
+ if (device->dev == dev) {
list_del(&device->list);
- break;
+ return device;
}
}
- mutex_unlock(&group->mutex);
+ return NULL;
+}

+static void __iommu_group_remove_device_sysfs(struct iommu_group *group,
+ struct group_device *device)
+{
if (!device)
return;

sysfs_remove_link(group->devices_kobj, device->name);
- sysfs_remove_link(&dev->kobj, "iommu_group");
+ sysfs_remove_link(&device->dev->kobj, "iommu_group");

- trace_remove_device_from_group(group->id, dev);
+ trace_remove_device_from_group(group->id, device->dev);

kfree(device->name);
kfree(device);
- dev->iommu_group = NULL;
+}
+
+/**
+ * iommu_group_remove_device - remove a device from it's current group
+ * @dev: device to be removed
+ *
+ * This function is used by non-iommu drivers to create non-iommu subystem
+ * groups (eg VFIO mdev, SPAPR) Using this from inside an iommu driver is an
+ * abuse. Instead the driver should return the correct group from
+ * ops->device_group()
+ */
+void iommu_group_remove_device(struct device *dev)
+{
+ struct iommu_group *group = dev->iommu_group;
+ struct group_device *device;
+
+ /*
+ * Since we don't do ops->release_device() there is no way for the
+ * driver to stop using any attached domain before we free it. If a
+ * domain is attached while this function is called it will eventually
+ * UAF.
+ *
+ * Thus it is only useful for cases like VFIO/SPAPR that don't use an
+ * iommu driver, or for cases like FSL that don't use default domains.
+ */
+ WARN_ON(group->domain);
+
+ mutex_lock(&group->mutex);
+ device = __iommu_group_remove_device(group, dev);
+ mutex_unlock(&group->mutex);
+ __iommu_group_remove_device_sysfs(group, device);
kobject_put(group->devices_kobj);
}
EXPORT_SYMBOL_GPL(iommu_group_remove_device);
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index a003bd5fc65c13..703a6083172de1 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -302,11 +302,8 @@ static void ipmmu_utlb_enable(struct ipmmu_vmsa_domain *domain,
/*
* Disable MMU translation for the microTLB.
*/
-static void ipmmu_utlb_disable(struct ipmmu_vmsa_domain *domain,
- unsigned int utlb)
+static void ipmmu_utlb_disable(struct ipmmu_vmsa_device *mmu, unsigned int utlb)
{
- struct ipmmu_vmsa_device *mmu = domain->mmu;
-
ipmmu_imuctr_write(mmu, utlb, 0);
mmu->utlb_ctx[utlb] = IPMMU_CTX_INVALID;
}
@@ -647,11 +644,11 @@ static void ipmmu_detach_device(struct iommu_domain *io_domain,
struct device *dev)
{
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
- struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
+ struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
unsigned int i;

for (i = 0; i < fwspec->num_ids; ++i)
- ipmmu_utlb_disable(domain, fwspec->ids[i]);
+ ipmmu_utlb_disable(mmu, fwspec->ids[i]);

/*
* TODO: Optimize by disabling the context when no device is attached.
@@ -847,7 +844,8 @@ static void ipmmu_probe_finalize(struct device *dev)

static void ipmmu_release_device(struct device *dev)
{
- arm_iommu_detach_device(dev);
+ ipmmu_detach_device(NULL, dev);
+ arm_iommu_release_device(dev);
}

static struct iommu_group *ipmmu_find_group(struct device *dev)