Re: [PATCH] vfio/pci: make the vfio_pci_mmap_fault reentrant

From: Jason Gunthorpe
Date: Tue Mar 09 2021 - 18:46:18 EST


On Tue, Mar 09, 2021 at 12:56:39PM -0700, Alex Williamson wrote:

> And I think this is what we end up with for the current code base:

Yeah, that looks Ok

> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 65e7e6b44578..2f247ab18c66 100644
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -1568,19 +1568,24 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd)
> }
>
> /* Caller holds vma_lock */
> -static int __vfio_pci_add_vma(struct vfio_pci_device *vdev,
> - struct vm_area_struct *vma)
> +struct vfio_pci_mmap_vma *__vfio_pci_add_vma(struct vfio_pci_device *vdev,
> + struct vm_area_struct *vma)
> {
> struct vfio_pci_mmap_vma *mmap_vma;
>
> + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
> + if (mmap_vma->vma == vma)
> + return ERR_PTR(-EEXIST);
> + }
> +
> mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
> if (!mmap_vma)
> - return -ENOMEM;
> + return ERR_PTR(-ENOMEM);
>
> mmap_vma->vma = vma;
> list_add(&mmap_vma->vma_next, &vdev->vma_list);
>
> - return 0;
> + return mmap_vma;
> }
>
> /*
> @@ -1612,30 +1617,39 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
> {
> struct vm_area_struct *vma = vmf->vma;
> struct vfio_pci_device *vdev = vma->vm_private_data;
> - vm_fault_t ret = VM_FAULT_NOPAGE;
> + struct vfio_pci_mmap_vma *mmap_vma;
> + unsigned long vaddr, pfn;
> + vm_fault_t ret;
>
> mutex_lock(&vdev->vma_lock);
> down_read(&vdev->memory_lock);
>
> if (!__vfio_pci_memory_enabled(vdev)) {
> ret = VM_FAULT_SIGBUS;
> - mutex_unlock(&vdev->vma_lock);
> goto up_out;
> }
>
> - if (__vfio_pci_add_vma(vdev, vma)) {
> - ret = VM_FAULT_OOM;
> - mutex_unlock(&vdev->vma_lock);
> + mmap_vma = __vfio_pci_add_vma(vdev, vma);
> + if (IS_ERR(mmap_vma)) {
> + /* A concurrent fault might have already inserted the page */
> + ret = (PTR_ERR(mmap_vma) == -EEXIST) ? VM_FAULT_NOPAGE :
> + VM_FAULT_OOM;

I think -EEXIST should not be an error; let's just go down to the
vmf_insert_pfn() and let the MM resolve the race naturally.

I suspect returning VM_FAULT_NOPAGE will be adverse to userspace if
it hits this race?

Also, the _prot does look needed, at least due to SME, but possibly
also to ensure NC gets set.

Jason