Re: [PATCH v3 19/20] PCI/P2PDMA: introduce pci_mmap_p2pmem()

From: Bjorn Helgaas
Date: Mon Sep 27 2021 - 14:49:37 EST


On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote:
> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
> a hunk of p2pmem into userspace.
>
> Pages are allocated from the genalloc in bulk and their reference count
> incremented. They are returned to the genalloc when the page is put.
>
> The VMA does not take a reference to the pages when they are inserted
> with vmf_insert_mixed() (which is necessary for zone device pages) so
> the backing P2P memory is stored in a structure in vm_private_data.
>
> A pseudo mount is used to allocate an inode for each PCI device. The
> inode's address_space is used in the file doing the mmap so that all
> VMAs are collected and can be unmapped if the PCI device is unbound.
> After unmapping, the VMAs are iterated through and their pages are
> put so the device can continue to be unbound. An active flag is used
> to signal to VMAs not to allocate any further P2P memory once the
> removal process starts. The flag is synchronized against concurrent
> access using RCU.
>
> The VMAs and inode will survive after the unbind of the device, but no
> pages will be present in the VMA and a subsequent access will result
> in a SIGBUS error.
>
> Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx>

Acked-by: Bjorn Helgaas <bhelgaas@xxxxxxxxxx>

I would capitalize "Introduce" in the subject line.

> ---
> drivers/pci/p2pdma.c | 263 ++++++++++++++++++++++++++++++++++++-
> include/linux/pci-p2pdma.h | 11 ++
> include/uapi/linux/magic.h | 1 +
> 3 files changed, 273 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 2422af5a529c..a5adf57af53a 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -16,14 +16,19 @@
> #include <linux/genalloc.h>
> #include <linux/memremap.h>
> #include <linux/percpu-refcount.h>
> +#include <linux/pfn_t.h>
> +#include <linux/pseudo_fs.h>
> #include <linux/random.h>
> #include <linux/seq_buf.h>
> #include <linux/xarray.h>
> +#include <uapi/linux/magic.h>
>
> struct pci_p2pdma {
> struct gen_pool *pool;
> bool p2pmem_published;
> struct xarray map_types;
> + struct inode *inode;
> + bool active;
> };
>
> struct pci_p2pdma_pagemap {
> @@ -32,6 +37,14 @@ struct pci_p2pdma_pagemap {
> u64 bus_offset;
> };
>
> +struct pci_p2pdma_map {
> + struct kref ref;
> + struct pci_dev *pdev;
> + struct inode *inode;
> + void *kaddr;
> + size_t len;
> +};
> +
> static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
> {
> return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap);
> @@ -100,6 +113,26 @@ static const struct attribute_group p2pmem_group = {
> .name = "p2pmem",
> };
>
> +/*
> + * P2PDMA internal mount
> + * Fake an internal VFS mount-point in order to allocate struct address_space
> + * mappings to remove VMAs on unbind events.
> + */
> +static int pci_p2pdma_fs_cnt;
> +static struct vfsmount *pci_p2pdma_fs_mnt;
> +
> +static int pci_p2pdma_fs_init_fs_context(struct fs_context *fc)
> +{
> + return init_pseudo(fc, P2PDMA_MAGIC) ? 0 : -ENOMEM;
> +}
> +
> +static struct file_system_type pci_p2pdma_fs_type = {
> + .name = "p2dma",
> + .owner = THIS_MODULE,
> + .init_fs_context = pci_p2pdma_fs_init_fs_context,
> + .kill_sb = kill_anon_super,
> +};
> +
> static void p2pdma_page_free(struct page *page)
> {
> struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
> @@ -128,6 +161,9 @@ static void pci_p2pdma_release(void *data)
> gen_pool_destroy(p2pdma->pool);
> sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
> xa_destroy(&p2pdma->map_types);
> +
> + iput(p2pdma->inode);
> + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
> }
>
> static int pci_p2pdma_setup(struct pci_dev *pdev)
> @@ -145,17 +181,32 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
> if (!p2p->pool)
> goto out;
>
> - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
> + error = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt,
> + &pci_p2pdma_fs_cnt);
> if (error)
> goto out_pool_destroy;
>
> + p2p->inode = alloc_anon_inode(pci_p2pdma_fs_mnt->mnt_sb);
> + if (IS_ERR(p2p->inode)) {
> + error = -ENOMEM;
> + goto out_unpin_fs;
> + }
> +
> + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
> + if (error)
> + goto out_put_inode;
> +
> error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
> if (error)
> - goto out_pool_destroy;
> + goto out_put_inode;
>
> rcu_assign_pointer(pdev->p2pdma, p2p);
> return 0;
>
> +out_put_inode:
> + iput(p2p->inode);
> +out_unpin_fs:
> + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
> out_pool_destroy:
> gen_pool_destroy(p2p->pool);
> out:
> @@ -163,6 +214,45 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
> return error;
> }
>
> +static void pci_p2pdma_map_free_pages(struct pci_p2pdma_map *pmap)
> +{
> + int i;
> +
> + if (!pmap->kaddr)
> + return;
> +
> + for (i = 0; i < pmap->len; i += PAGE_SIZE)
> + put_page(virt_to_page(pmap->kaddr + i));
> +
> + pmap->kaddr = NULL;
> +}
> +
> +static void pci_p2pdma_free_mappings(struct address_space *mapping)
> +{
> + struct vm_area_struct *vma;
> +
> + i_mmap_lock_write(mapping);
> + if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
> + goto out;
> +
> + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, -1)
> + pci_p2pdma_map_free_pages(vma->vm_private_data);
> +
> +out:
> + i_mmap_unlock_write(mapping);
> +}
> +
> +static void pci_p2pdma_unmap_mappings(void *data)
> +{
> + struct pci_dev *pdev = data;
> + struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> +
> + p2pdma->active = false;
> + synchronize_rcu();
> + unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1);
> + pci_p2pdma_free_mappings(p2pdma->inode->i_mapping);
> +}
> +
> /**
> * pci_p2pdma_add_resource - add memory for use as p2p memory
> * @pdev: the device to add the memory to
> @@ -221,6 +311,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> goto pgmap_free;
> }
>
> + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
> + pdev);
> + if (error)
> + goto pages_free;
> +
> p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
> error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
> pci_bus_address(pdev, bar) + offset,
> @@ -229,6 +324,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
> if (error)
> goto pages_free;
>
> + p2pdma->active = true;
> pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n",
> pgmap->range.start, pgmap->range.end);
>
> @@ -1029,3 +1125,166 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
> return sprintf(page, "%s\n", pci_name(p2p_dev));
> }
> EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
> +
> +static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev,
> + size_t len)
> +{
> + struct pci_p2pdma_map *pmap;
> +
> + pmap = kzalloc(sizeof(*pmap), GFP_KERNEL);
> + if (!pmap)
> + return NULL;
> +
> + kref_init(&pmap->ref);
> + pmap->pdev = pci_dev_get(pdev);
> + pmap->len = len;
> +
> + return pmap;
> +}
> +
> +static void pci_p2pdma_map_free(struct kref *ref)
> +{
> + struct pci_p2pdma_map *pmap =
> + container_of(ref, struct pci_p2pdma_map, ref);
> +
> + pci_p2pdma_map_free_pages(pmap);
> + pci_dev_put(pmap->pdev);
> + iput(pmap->inode);
> + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt);
> + kfree(pmap);
> +}
> +
> +static void pci_p2pdma_vma_open(struct vm_area_struct *vma)
> +{
> + struct pci_p2pdma_map *pmap = vma->vm_private_data;
> +
> + kref_get(&pmap->ref);
> +}
> +
> +static void pci_p2pdma_vma_close(struct vm_area_struct *vma)
> +{
> + struct pci_p2pdma_map *pmap = vma->vm_private_data;
> +
> + kref_put(&pmap->ref, pci_p2pdma_map_free);
> +}
> +
> +static vm_fault_t pci_p2pdma_vma_fault(struct vm_fault *vmf)
> +{
> + struct pci_p2pdma_map *pmap = vmf->vma->vm_private_data;
> + struct pci_p2pdma *p2pdma;
> + void *vaddr;
> + pfn_t pfn;
> + int i;
> +
> + if (!pmap->kaddr) {
> + rcu_read_lock();
> + p2pdma = rcu_dereference(pmap->pdev->p2pdma);
> + if (!p2pdma)
> + goto err_out;
> +
> + if (!p2pdma->active)
> + goto err_out;
> +
> + pmap->kaddr = (void *)gen_pool_alloc(p2pdma->pool, pmap->len);
> + if (!pmap->kaddr)
> + goto err_out;
> +
> + for (i = 0; i < pmap->len; i += PAGE_SIZE)
> + get_page(virt_to_page(pmap->kaddr + i));
> +
> + rcu_read_unlock();
> + }
> +
> + vaddr = pmap->kaddr + (vmf->pgoff << PAGE_SHIFT);
> + pfn = phys_to_pfn_t(virt_to_phys(vaddr), PFN_DEV | PFN_MAP);
> +
> + return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
> +
> +err_out:
> + rcu_read_unlock();
> + return VM_FAULT_SIGBUS;
> +}
> +static const struct vm_operations_struct pci_p2pdma_vmops = {
> + .open = pci_p2pdma_vma_open,
> + .close = pci_p2pdma_vma_close,
> + .fault = pci_p2pdma_vma_fault,
> +};
> +
> +/**
> + * pci_p2pdma_mmap_file_open - setup file mapping to store P2PMEM VMAs
> + * @pdev: the device to allocate memory from
> + * @file: the file whose f_mapping is pointed at the p2pdma inode's mapping
> + *
> + * Set f_mapping of the file to the p2pdma inode so that mappings
> + * can be torn down on device unbind.
> + *
> + * Does nothing if @pdev has no p2pdma state; there is no return value.
> + */
> +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file)
> +{
> + struct pci_p2pdma *p2pdma;
> +
> + rcu_read_lock();
> + p2pdma = rcu_dereference(pdev->p2pdma);
> + if (p2pdma)
> + file->f_mapping = p2pdma->inode->i_mapping;
> + rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(pci_p2pdma_mmap_file_open);
> +
> +/**
> + * pci_mmap_p2pmem - setup an mmap region to be backed with P2PDMA memory
> + * that was registered with pci_p2pdma_add_resource()
> + * @pdev: the device to allocate memory from
> + * @vma: the userspace vma to map the memory to
> + *
> + * The file must call pci_p2pdma_mmap_file_open() in its open() operation.
> + *
> + * Returns 0 on success, or a negative error code on failure
> + */
> +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma)
> +{
> + struct pci_p2pdma_map *pmap;
> + struct pci_p2pdma *p2pdma;
> + int ret;
> +
> + /* prevent private mappings from being established */
> + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
> + pci_info_ratelimited(pdev,
> + "%s: fail, attempted private mapping\n",
> + current->comm);
> + return -EINVAL;
> + }
> +
> + pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start);
> + if (!pmap)
> + return -ENOMEM;
> +
> + rcu_read_lock();
> + p2pdma = rcu_dereference(pdev->p2pdma);
> + if (!p2pdma) {
> + ret = -ENODEV;
> + goto out;
> + }
> +
> + ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt,
> + &pci_p2pdma_fs_cnt);
> + if (ret)
> + goto out;
> +
> + ihold(p2pdma->inode);
> + pmap->inode = p2pdma->inode;
> + rcu_read_unlock();
> +
> + vma->vm_flags |= VM_MIXEDMAP;
> + vma->vm_private_data = pmap;
> + vma->vm_ops = &pci_p2pdma_vmops;
> +
> + return 0;
> +
> +out:
> + rcu_read_unlock();
> + kfree(pmap);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(pci_mmap_p2pmem);
> diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
> index 0c33a40a86e7..f9f19f3db676 100644
> --- a/include/linux/pci-p2pdma.h
> +++ b/include/linux/pci-p2pdma.h
> @@ -81,6 +81,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
> bool *use_p2pdma);
> ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
> bool use_p2pdma);
> +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file);
> +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma);
> #else /* CONFIG_PCI_P2PDMA */
> static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
> size_t size, u64 offset)
> @@ -152,6 +154,15 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
> {
> return sprintf(page, "none\n");
> }
> +static inline void pci_p2pdma_mmap_file_open(struct pci_dev *pdev,
> + struct file *file)
> +{
> +}
> +static inline int pci_mmap_p2pmem(struct pci_dev *pdev,
> + struct vm_area_struct *vma)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif /* CONFIG_PCI_P2PDMA */
>
>
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 35687dcb1a42..af737842c56f 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -88,6 +88,7 @@
> #define BPF_FS_MAGIC 0xcafe4a11
> #define AAFS_MAGIC 0x5a3c69f0
> #define ZONEFS_MAGIC 0x5a4f4653
> +#define P2PDMA_MAGIC 0x70327064
>
> /* Since UDF 2.01 is ISO 13346 based... */
> #define UDF_SUPER_MAGIC 0x15013346
> --
> 2.30.2
>