Re: [PATCH v3 14/24] vfio: powerpc/spapr: Register memory

From: Alexey Kardashevskiy
Date: Tue Feb 03 2015 - 00:52:18 EST


On 02/03/2015 11:11 AM, Alex Williamson wrote:
> On Thu, 2015-01-29 at 20:21 +1100, Alexey Kardashevskiy wrote:
>> The existing implementation accounts the whole DMA window in
>> the locked_vm counter which is going to be even worse with multiple
>> containers and huge DMA windows.
>>
>> This introduces 2 ioctls to register/unregister DMA memory which
>> receive user space address and size of the memory region which
>> needs to be pinned/unpinned and counted in locked_vm.
>>
>> If any memory region was registered, all subsequent DMA map requests
>> should address already pinned memory. If no memory was registered,
>> then the amount of memory required for a single default memory will be
>> accounted when the container is enabled and every map/unmap will pin/unpin
>> a page.
>>
>> Dynamic DMA window and in-kernel acceleration will require memory to
>> be registered in order to work.
>>
>> The accounting is done per VFIO container. When the support of
>> multiple groups per container is added, we will have accurate locked_vm
>> accounting.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
>> ---
>> drivers/vfio/vfio_iommu_spapr_tce.c | 333 ++++++++++++++++++++++++++++++++----
>> include/uapi/linux/vfio.h | 29 ++++
>> 2 files changed, 331 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index 8256275..d0987ae 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -86,8 +86,169 @@ struct tce_container {
>> struct mutex lock;
>> struct iommu_group *grp;
>> bool enabled;
>> + struct list_head mem_list;
>> };
>>
>> +struct tce_memory {
>> + struct list_head next;
>> + struct rcu_head rcu;
>> + __u64 vaddr;
>> + __u64 size;
>> + __u64 pfns[];
>> +};
>
> So we're using 2MB of kernel memory per 1G of user mapped memory, right?
> Or are we using bigger pages here? I'm not sure the kmalloc below is
> the appropriate allocator for something that can be so large.

ok, vmalloc it is then.


>> +
>> +static void tce_unpin_pages(struct tce_container *container,
>> + struct tce_memory *mem, __u64 vaddr, __u64 size)
>> +{
>> + __u64 off;
>> + struct page *page = NULL;
>> +
>> +
>> + for (off = 0; off < size; off += PAGE_SIZE) {
>> + if (!mem->pfns[off >> PAGE_SHIFT])
>> + continue;
>> +
>> + page = pfn_to_page(mem->pfns[off >> PAGE_SHIFT]);
>> + if (!page)
>> + continue;
>> +
>> + put_page(page);
>> + mem->pfns[off >> PAGE_SHIFT] = 0;
>> + }
>
> Seems cleaner to count by 1 rather than PAGE_SIZE (ie. shift size once
> instead of off 3 times).
>
>> +}
>> +
>> +static void release_tce_memory(struct rcu_head *head)
>> +{
>> + struct tce_memory *mem = container_of(head, struct tce_memory, rcu);
>> +
>> + kfree(mem);
>> +}
>> +
>> +static void tce_do_unregister_pages(struct tce_container *container,
>> + struct tce_memory *mem)
>> +{
>> + tce_unpin_pages(container, mem, mem->vaddr, mem->size);
>> + decrement_locked_vm(mem->size);
>> + list_del_rcu(&mem->next);
>> + call_rcu_sched(&mem->rcu, release_tce_memory);
>> +}
>> +
>> +static long tce_unregister_pages(struct tce_container *container,
>> + __u64 vaddr, __u64 size)
>> +{
>> + struct tce_memory *mem, *memtmp;
>> +
>> + if (container->enabled)
>> + return -EBUSY;
>> +
>> + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
>> + return -EINVAL;
>> +
>> + list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) {
>> + if ((mem->vaddr == vaddr) && (mem->size == size)) {
>> + tce_do_unregister_pages(container, mem);
>> + return 0;
>> + }
>> + }
>> +
>> + return -ENOENT;
>> +}
>> +
>> +static long tce_pin_pages(struct tce_container *container,
>> + struct tce_memory *mem, __u64 vaddr, __u64 size)
>> +{
>> + __u64 off;
>> + struct page *page = NULL;
>> +
>> + for (off = 0; off < size; off += PAGE_SIZE) {
>> + if (1 != get_user_pages_fast(vaddr + off,
>> + 1/* pages */, 1/* iswrite */, &page)) {
>> + tce_unpin_pages(container, mem, vaddr, off);
>> + return -EFAULT;
>> + }
>> +
>> + mem->pfns[off >> PAGE_SHIFT] = page_to_pfn(page);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static long tce_register_pages(struct tce_container *container,
>> + __u64 vaddr, __u64 size)
>> +{
>> + long ret;
>> + struct tce_memory *mem;
>> +
>> + if (container->enabled)
>> + return -EBUSY;
>> +
>> + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
>> + ((vaddr + size) < vaddr))
>> + return -EINVAL;
>> +
>> + /* Any overlap with registered chunks? */
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(mem, &container->mem_list, next) {
>> + if ((mem->vaddr < (vaddr + size)) &&
>> + (vaddr < (mem->vaddr + mem->size))) {
>> + ret = -EBUSY;
>> + goto unlock_exit;
>> + }
>> + }
>> +
>> + ret = try_increment_locked_vm(size >> PAGE_SHIFT);
>> + if (ret)
>> + goto unlock_exit;
>> +
>> + mem = kzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)), GFP_KERNEL);
>
>
> I suspect that userspace can break kmalloc with the potential size of
> this structure. You might need a vmalloc. I also wonder if there isn't
> a more efficient tree structure to use.


Right, I'll use vmalloc. All this time I was thinking kmalloc() allocates
non-contiguous memory :) How would a tree be more efficient here? I store
the pfns once and unpin them once as well.



>> + if (!mem)
>> + goto unlock_exit;
>> +
>> + if (tce_pin_pages(container, mem, vaddr, size))
>> + goto free_exit;
>> +
>> + mem->vaddr = vaddr;
>> + mem->size = size;
>> +
>> + list_add_rcu(&mem->next, &container->mem_list);
>> + rcu_read_unlock();
>> +
>> + return 0;
>> +
>> +free_exit:
>> + kfree(mem);
>> +
>> +unlock_exit:
>> + decrement_locked_vm(size >> PAGE_SHIFT);
>> + rcu_read_unlock();
>> +
>> + return ret;
>> +}
>> +
>> +static inline bool tce_preregistered(struct tce_container *container)
>> +{
>> + return !list_empty(&container->mem_list);
>> +}
>> +
>> +static bool tce_pinned(struct tce_container *container,
>> + __u64 vaddr, __u64 size)
>> +{
>> + struct tce_memory *mem;
>> + bool ret = false;
>> +
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(mem, &container->mem_list, next) {
>> + if ((mem->vaddr <= vaddr) &&
>> + (vaddr + size <= mem->vaddr + mem->size)) {
>> + ret = true;
>> + break;
>> + }
>> + }
>> + rcu_read_unlock();
>> +
>> + return ret;
>> +}
>> +
>> static bool tce_check_page_size(struct page *page, unsigned page_shift)
>> {
>> unsigned shift;
>> @@ -166,14 +327,16 @@ static int tce_iommu_enable(struct tce_container *container)
>> * as this information is only available from KVM and VFIO is
>> * KVM agnostic.
>> */
>> - iommu = iommu_group_get_iommudata(container->grp);
>> - if (!iommu)
>> - return -EFAULT;
>> + if (!tce_preregistered(container)) {
>> + iommu = iommu_group_get_iommudata(container->grp);
>> + if (!iommu)
>> + return -EFAULT;
>>
>> - tbl = &iommu->tables[0];
>> - ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl));
>> - if (ret)
>> - return ret;
>> + tbl = &iommu->tables[0];
>> + ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl));
>> + if (ret)
>> + return ret;
>> + }
>>
>> container->enabled = true;
>>
>> @@ -193,12 +356,14 @@ static void tce_iommu_disable(struct tce_container *container)
>> if (!container->grp || !current->mm)
>> return;
>>
>> - iommu = iommu_group_get_iommudata(container->grp);
>> - if (!iommu)
>> - return;
>> + if (!tce_preregistered(container)) {
>> + iommu = iommu_group_get_iommudata(container->grp);
>> + if (!iommu)
>> + return;
>>
>> - tbl = &iommu->tables[0];
>> - decrement_locked_vm(IOMMU_TABLE_PAGES(tbl));
>> + tbl = &iommu->tables[0];
>> + decrement_locked_vm(IOMMU_TABLE_PAGES(tbl));
>> + }
>> }
>>
>> static void *tce_iommu_open(unsigned long arg)
>> @@ -215,6 +380,7 @@ static void *tce_iommu_open(unsigned long arg)
>> return ERR_PTR(-ENOMEM);
>>
>> mutex_init(&container->lock);
>> + INIT_LIST_HEAD_RCU(&container->mem_list);
>>
>> return container;
>> }
>> @@ -222,6 +388,7 @@ static void *tce_iommu_open(unsigned long arg)
>> static void tce_iommu_release(void *iommu_data)
>> {
>> struct tce_container *container = iommu_data;
>> + struct tce_memory *mem, *memtmp;
>>
>> WARN_ON(container->grp);
>> tce_iommu_disable(container);
>> @@ -229,14 +396,19 @@ static void tce_iommu_release(void *iommu_data)
>> if (container->grp)
>> tce_iommu_detach_group(iommu_data, container->grp);
>>
>> + list_for_each_entry_safe(mem, memtmp, &container->mem_list, next)
>> + tce_do_unregister_pages(container, mem);
>> +
>> mutex_destroy(&container->lock);
>>
>> kfree(container);
>> }
>>
>> -static void tce_iommu_unuse_page(unsigned long oldtce)
>> +static void tce_iommu_unuse_page(struct tce_container *container,
>> + unsigned long oldtce)
>> {
>> struct page *page;
>> + bool do_put = !tce_preregistered(container);
>>
>> if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
>> return;
>> @@ -245,7 +417,8 @@ static void tce_iommu_unuse_page(unsigned long oldtce)
>> if (oldtce & TCE_PCI_WRITE)
>> SetPageDirty(page);
>>
>> - put_page(page);
>> + if (do_put)
>> + put_page(page);
>> }
>>
>> static int tce_iommu_clear(struct tce_container *container,
>> @@ -261,7 +434,7 @@ static int tce_iommu_clear(struct tce_container *container,
>> if (ret)
>> continue;
>>
>> - tce_iommu_unuse_page(oldtce);
>> + tce_iommu_unuse_page(container, oldtce);
>> }
>>
>> return 0;
>> @@ -279,42 +452,91 @@ static enum dma_data_direction tce_iommu_direction(unsigned long tce)
>> return DMA_NONE;
>> }
>>
>> +static unsigned long tce_get_hva_cached(struct tce_container *container,
>> + unsigned page_shift, unsigned long tce)
>> +{
>> + struct tce_memory *mem;
>> + struct page *page = NULL;
>> + unsigned long hva = -1;
>> +
>> + tce &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
>> + rcu_read_lock();
>> + list_for_each_entry_rcu(mem, &container->mem_list, next) {
>> + if ((mem->vaddr <= tce) && (tce < (mem->vaddr + mem->size))) {
>> + unsigned long gfn = (tce - mem->vaddr) >> PAGE_SHIFT;
>> + unsigned long hpa = mem->pfns[gfn] << PAGE_SHIFT;
>> +
>> + page = pfn_to_page(mem->pfns[gfn]);
>> +
>> + if (!tce_check_page_size(page, page_shift))
>> + break;
>> +
>> + hva = (unsigned long) __va(hpa);
>> + break;
>> + }
>> + }
>> + rcu_read_unlock();
>> +
>> + return hva;
>> +}
>> +
>> +static unsigned long tce_get_hva(struct tce_container *container,
>> + unsigned page_shift, unsigned long tce)
>> +{
>> + long ret = 0;
>> + struct page *page = NULL;
>> + unsigned long hva = -1;
>> + enum dma_data_direction direction = tce_iommu_direction(tce);
>> +
>> + ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> + direction != DMA_TO_DEVICE, &page);
>> + if (unlikely(ret != 1))
>> + return -1;
>> +
>> + if (!tce_check_page_size(page, page_shift)) {
>> + put_page(page);
>> + return -1;
>> + }
>> +
>> + hva = (unsigned long) page_address(page) +
>> + (tce & ~((1ULL << page_shift) - 1) & ~PAGE_MASK);
>> +
>> + return hva;
>> +}
>> +
>> static long tce_iommu_build(struct tce_container *container,
>> struct iommu_table *tbl,
>> unsigned long entry, unsigned long tce, unsigned long pages)
>> {
>> long i, ret = 0;
>> - struct page *page = NULL;
>> unsigned long hva, oldtce;
>> enum dma_data_direction direction = tce_iommu_direction(tce);
>> + bool do_put = false;
>>
>> for (i = 0; i < pages; ++i) {
>> - ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> - direction != DMA_TO_DEVICE, &page);
>> - if (unlikely(ret != 1)) {
>> - ret = -EFAULT;
>> - break;
>> + hva = tce_get_hva_cached(container, tbl->it_page_shift, tce);
>> + if (hva == -1) {
>> + do_put = true;
>> + WARN_ON_ONCE(1);
>> + hva = tce_get_hva(container, tbl->it_page_shift, tce);
>> }
>>
>> - if (!tce_check_page_size(page, tbl->it_page_shift)) {
>> - ret = -EFAULT;
>> - break;
>> - }
>> -
>> - hva = (unsigned long) page_address(page) +
>> - (tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK);
>> oldtce = 0;
>> -
>> ret = iommu_tce_xchg(tbl, entry + i, hva, &oldtce, direction);
>> if (ret) {
>> - put_page(page);
>> + if (do_put)
>> + put_page(pfn_to_page(__pa(hva) >> PAGE_SHIFT));
>> pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
>> __func__, entry << tbl->it_page_shift,
>> tce, ret);
>> break;
>> }
>>
>> - tce_iommu_unuse_page(oldtce);
>> + if (do_put)
>> + put_page(pfn_to_page(__pa(hva) >> PAGE_SHIFT));
>> +
>> + tce_iommu_unuse_page(container, oldtce);
>> +
>> tce += IOMMU_PAGE_SIZE(tbl);
>> }
>>
>> @@ -416,6 +638,11 @@ static long tce_iommu_ioctl(void *iommu_data,
>> if (ret)
>> return ret;
>>
>> + /* If any memory is pinned, only allow pages from that region */
>> + if (tce_preregistered(container) &&
>> + !tce_pinned(container, param.vaddr, param.size))
>> + return -EPERM;
>> +
>> ret = tce_iommu_build(container, tbl,
>> param.iova >> tbl->it_page_shift,
>> tce, param.size >> tbl->it_page_shift);
>> @@ -464,6 +691,50 @@ static long tce_iommu_ioctl(void *iommu_data,
>>
>> return ret;
>> }
>> + case VFIO_IOMMU_REGISTER_MEMORY: {
>> + struct vfio_iommu_type1_register_memory param;
>> +
>> + minsz = offsetofend(struct vfio_iommu_type1_register_memory,
>> + size);
>> +
>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + /* No flag is supported now */
>> + if (param.flags)
>> + return -EINVAL;
>> +
>> + mutex_lock(&container->lock);
>> + ret = tce_register_pages(container, param.vaddr, param.size);
>> + mutex_unlock(&container->lock);
>> +
>> + return ret;
>> + }
>> + case VFIO_IOMMU_UNREGISTER_MEMORY: {
>> + struct vfio_iommu_type1_unregister_memory param;
>> +
>> + minsz = offsetofend(struct vfio_iommu_type1_unregister_memory,
>> + size);
>> +
>> + if (copy_from_user(&param, (void __user *)arg, minsz))
>> + return -EFAULT;
>> +
>> + if (param.argsz < minsz)
>> + return -EINVAL;
>> +
>> + /* No flag is supported now */
>> + if (param.flags)
>> + return -EINVAL;
>> +
>> + mutex_lock(&container->lock);
>> + tce_unregister_pages(container, param.vaddr, param.size);
>> + mutex_unlock(&container->lock);
>> +
>> + return 0;
>> + }
>> case VFIO_IOMMU_ENABLE:
>> mutex_lock(&container->lock);
>> ret = tce_iommu_enable(container);
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 29715d2..2bb0c9b 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -437,6 +437,35 @@ struct vfio_iommu_type1_dma_unmap {
>> #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15)
>> #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16)
>>
>> +/**
>> + * VFIO_IOMMU_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_type1_register_memory)
>> + *
>> + * Registers user space memory where DMA is allowed. It pins
>> + * user pages and does the locked memory accounting so
>> + * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
>> + * get simpler.
>> + */
>> +struct vfio_iommu_type1_register_memory {
>> + __u32 argsz;
>> + __u32 flags;
>> + __u64 vaddr; /* Process virtual address */
>> + __u64 size; /* Size of mapping (bytes) */
>> +};
>> +#define VFIO_IOMMU_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
>> +
>> +/**
>> + * VFIO_IOMMU_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_type1_unregister_memory)
>> + *
>> + * Unregisters user space memory registered with VFIO_IOMMU_REGISTER_MEMORY.
>> + */
>> +struct vfio_iommu_type1_unregister_memory {
>> + __u32 argsz;
>> + __u32 flags;
>> + __u64 vaddr; /* Process virtual address */
>> + __u64 size; /* Size of mapping (bytes) */
>> +};
>> +#define VFIO_IOMMU_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
>> +
>
> Is the user allowed to unregister arbitrary sub-regions of previously
> registered memory? (I think I know the answer, but it should be
> documented)

The answer is "no" :) - unregister has to be called with exactly the same
vaddr and size as the original register call. I'll update
Documentation/vfio.txt.


> Why are these "type1" structures, shouldn't they be down below?


Pretty much because they do not do anything powerpc-specific from the
userspace perspective, just like DMA map/unmap.


> Do we need an extension or flag bit to describe these as present or is
> it sufficient to call and fail?

Sorry, I do not follow you here. A flag to describe what as present? As it
is now, in QEMU I set up a memory listener which walks through all RAM
regions and calls VFIO_IOMMU_REGISTER_MEMORY for every slot, once, when the
container starts being used, and I expect that this can fail (because of
the memlock RLIMIT, etc.).
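
To make this concrete, a simplified sketch of what the listener does per
RAM slot (error handling trimmed; "container_fd" is the already opened and
set up VFIO container; this uses the uapi additions from this patch):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int register_ram_slot(int container_fd, void *hva, uint64_t size)
{
	struct vfio_iommu_type1_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (uint64_t)(uintptr_t)hva,
		.size = size,
	};

	/* This may legitimately fail, e.g. when the memlock RLIMIT is hit */
	return ioctl(container_fd, VFIO_IOMMU_REGISTER_MEMORY, &reg);
}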


> Do we need two ioctls or one?

There are map/unmap, enable/disable and set/unset-container pairs, so I
thought it would look natural to have a pin/unpin (register/unregister)
pair as well, no?


> What about Documentation/vfio.txt?

Yep. Thanks for the review!


>
>> /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>>
>> /*
>
>
>


--
Alexey