Re: [PATCH v3] iommu/s390: Fix race with release_device ops

From: Matthew Rosato
Date: Wed Aug 31 2022 - 10:13:53 EST


On 8/31/22 5:05 AM, Pierre Morel wrote:
>
>
> On 8/30/22 22:15, Matthew Rosato wrote:
>> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
>> calls") s390-iommu is supposed to handle dynamic switching between IOMMU
>> domains and the DMA API handling.  However, this commit does not
>> sufficiently handle the case where the device is released via a call
>> to the release_device op as it may occur at the same time as an opposing
>> attach_dev or detach_dev since the group mutex is not held over
>> release_device.  This was observed if the device is deconfigured during a
>> small window during vfio-pci initialization and can result in WARNs and
>> potential kernel panics.
>>
>> Handle this by tracking when the device is probed/released via
>> dev_iommu_priv_set/get().  Ensure that once the device is released only
>> release_device handles the re-init of the device DMA.
>>
>> Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls")
>> Signed-off-by: Matthew Rosato <mjrosato@xxxxxxxxxxxxx>
>> ---
>> Changes since v2:
>> - Relocate the list_empty and aperture chekcs in attach_dev to the
>>    their original locations so they are all done under a single
>>    acquisition of the spinlock (Heiko)
>> ---
>>   arch/s390/include/asm/pci.h |  1 +
>>   arch/s390/pci/pci.c         |  1 +
>>   drivers/iommu/s390-iommu.c  | 39 ++++++++++++++++++++++++++++++++++---
>>   3 files changed, 38 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
>> index 7b4cdadbc023..080251e7b275 100644
>> --- a/arch/s390/include/asm/pci.h
>> +++ b/arch/s390/include/asm/pci.h
>> @@ -157,6 +157,7 @@ struct zpci_dev {
>>       /* DMA stuff */
>>       unsigned long    *dma_table;
>>       spinlock_t    dma_table_lock;
>> +    struct mutex    dma_domain_lock; /* protects s390_domain value */
>>       int        tlb_refresh;
>>         spinlock_t    iommu_bitmap_lock;
>> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
>> index 73cdc5539384..973edd32ecc9 100644
>> --- a/arch/s390/pci/pci.c
>> +++ b/arch/s390/pci/pci.c
>> @@ -832,6 +832,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
>>       kref_init(&zdev->kref);
>>       mutex_init(&zdev->lock);
>>       mutex_init(&zdev->kzdev_lock);
>> +    mutex_init(&zdev->dma_domain_lock);
>>         rc = zpci_init_iommu(zdev);
>>       if (rc)
>> diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
>> index c898bcbbce11..1137d669e849 100644
>> --- a/drivers/iommu/s390-iommu.c
>> +++ b/drivers/iommu/s390-iommu.c
>> @@ -99,6 +99,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>>       if (!domain_device)
>>           return -ENOMEM;
>>   +    /* Leave now if the device has already been released */
>> +    mutex_lock(&zdev->dma_domain_lock);
>> +    if (!dev_iommu_priv_get(dev)) {
>> +        mutex_unlock(&zdev->dma_domain_lock);
>> +        kfree(domain_device);
>> +        return 0;
>> +    }
>> +
>>       if (zdev->dma_table && !zdev->s390_domain) {
>>           cc = zpci_dma_exit_device(zdev);
>>           if (cc) {
>> @@ -132,9 +140,10 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>>           goto out_restore;
>>       }
>>       domain_device->zdev = zdev;
>> -    zdev->s390_domain = s390_domain;
>>       list_add(&domain_device->list, &s390_domain->devices);
>>       spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>> +    zdev->s390_domain = s390_domain;
>> +    mutex_unlock(&zdev->dma_domain_lock);
>>         return 0;
>>   @@ -147,6 +156,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
>>                      virt_to_phys(zdev->dma_table));
>>       }
>>   out_free:
>> +    mutex_unlock(&zdev->dma_domain_lock);
>>       kfree(domain_device);
>>         return rc;
>> @@ -176,17 +186,22 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
>>       }
>>       spin_unlock_irqrestore(&s390_domain->list_lock, flags);
>>   -    if (found && (zdev->s390_domain == s390_domain)) {
>> +    mutex_lock(&zdev->dma_domain_lock);
>> +    if (found && (zdev->s390_domain == s390_domain) &&
>> +        dev_iommu_priv_get(dev)) {
>>           zdev->s390_domain = NULL;
>>           zpci_unregister_ioat(zdev, 0);
>>           zpci_dma_init_device(zdev);
>>       }
>> +    mutex_unlock(&zdev->dma_domain_lock);
>>   }
>>     static struct iommu_device *s390_iommu_probe_device(struct device *dev)
>>   {
>>       struct zpci_dev *zdev = to_zpci_dev(dev);
>>   +    dev_iommu_priv_set(dev, zdev);
>> +
>>       return &zdev->iommu_dev;
>>   }
>>   @@ -206,10 +221,28 @@ static void s390_iommu_release_device(struct device *dev)
>>        *
>>        * So let's call detach_dev from here if it hasn't been called before.
>>        */
>> -    if (zdev && zdev->s390_domain) {
>> +    if (zdev) {
>> +        /*
>> +         * Clear priv to block further attaches for this device,
>> +         * ensure detaches don't init DMA.  Hold the domain lock
>> +         * to ensure that attach/detach get a consistent view of
>> +         * whether or not the device is released.
>> +         */
>> +        mutex_lock(&zdev->dma_domain_lock);
>> +        dev_iommu_priv_set(dev, NULL);
>> +        mutex_unlock(&zdev->dma_domain_lock);
>> +        /* Make sure this device is removed from the domain list */
>>           domain = iommu_get_domain_for_dev(dev);
>
>
> Shouldn't you take the group_mutex before calling directly s390_iommu_detach_device from here?
>

Well, we can't do that directly as the group mutex is only accessible to the iommu core. But perhaps the right answer is to call back into the core here by just replacing the existing call to s390_iommu_detach_device below with a call to iommu_detach_device (1 line change). Then we still ensure DMA is initialized next after this point. I've been running tests with such a change for the last few hours and this works well.

But this also made me notice another subtle issue re: a mismatch between the number of kallocs and kfrees for s390_domain_device. Need to investigate that further before a next version.

>
>>           if (domain)
>>               s390_iommu_detach_device(domain, dev);
>
>
>
>
>> +        /* Now ensure DMA is initialized from here */
>> +        mutex_lock(&zdev->dma_domain_lock);
>> +        if (zdev->s390_domain) {
>> +            zdev->s390_domain = NULL;
>> +            zpci_unregister_ioat(zdev, 0);
>> +            zpci_dma_init_device(zdev);
>> +        }
>> +        mutex_unlock(&zdev->dma_domain_lock);
>>       }
>>   }
>>  
>