Re: [PATCH v4 21/66] mm/mmap: Change do_brk_flags() to expand existing VMA and add do_brk_munmap()

From: Vlastimil Babka
Date: Thu Jan 13 2022 - 07:59:24 EST


On 12/1/21 15:29, Liam Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>
>
> Avoid allocating a new VMA when it a vma modification can occur. When a
> brk() can expand or contract a VMA, then the single store operation will
> only modify one index of the maple tree instead of causing a node to
> split or coalesce. This avoids unnecessary allocations/frees of maple
> tree nodes and VMAs.
>
> Use the advanced API for the maple tree to avoid unnecessary walks of
> the tree.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
> ---
> mm/mmap.c | 258 +++++++++++++++++++++++++++++++++++++++++++-----------
> 1 file changed, 207 insertions(+), 51 deletions(-)
>
> +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
> + unsigned long addr, unsigned long len,
> + unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> - struct vm_area_struct *vma, *prev;
> - pgoff_t pgoff = addr >> PAGE_SHIFT;
> + struct vm_area_struct *prev = NULL;
> int error;
> unsigned long mapped_addr;
> validate_mm_mt(mm);
> @@ -2740,11 +2854,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> if (error)
> return error;
>
> - /* Clear old maps, set up prev and uf */
> - if (munmap_vma_range(mm, addr, len, &prev, uf))
> - return -ENOMEM;
> -
> - /* Check against address space limits *after* clearing old maps... */
> + /* Check against address space limits by the changed size */

Can that cause spurious ENOMEM because now the check assumes 'len' worth of
purely new pages and no reuse?

> if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
> return -ENOMEM;
>
> @@ -2754,28 +2864,57 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
> return -ENOMEM;
>
> - /* Can we just expand an old private anonymous mapping? */
> - vma = vma_merge(mm, prev, addr, addr + len, flags,
> - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
> - if (vma)
> - goto out;
> + mas->last = addr + len - 1;
> + if (vma) {
> + /* Expand the existing vma if possible; almost never a singular
> + * list, so this will almost always fail. */
>
> - /*
> - * create a vma struct for an anonymous mapping
> - */
> - vma = vm_area_alloc(mm);
> - if (!vma) {
> - vm_unacct_memory(len >> PAGE_SHIFT);
> - return -ENOMEM;
> + if ((!vma->anon_vma ||
> + list_is_singular(&vma->anon_vma_chain)) &&

Hmm I feel uneasy about this part that mimics what vma_merge() does. Looks
like something e.g. we can easily forget to adjust when changing vma_merge()
itself.
Is this optimization worth the trouble given the comment above "so this will
almost always fail"?

> + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
> + mas->index = vma->vm_start;
> +
> + vma_adjust_trans_huge(vma, addr, addr + len, 0);
> + if (vma->anon_vma) {
> + anon_vma_lock_write(vma->anon_vma);
> + anon_vma_interval_tree_pre_update_vma(vma);
> + }
> + vma->vm_end = addr + len;
> + vma->vm_flags |= VM_SOFTDIRTY;
> + if (mas_store_gfp(mas, vma, GFP_KERNEL))
> + goto mas_mod_fail;
> +
> + if (vma->anon_vma) {
> + anon_vma_interval_tree_post_update_vma(vma);
> + anon_vma_unlock_write(vma->anon_vma);
> + }
> + khugepaged_enter_vma_merge(vma, flags);
> + goto out;
> + }
> + prev = vma;
> }
> + mas->index = addr;
> + mas_walk(mas);
> +
> + /* create a vma struct for an anonymous mapping */
> + vma = vm_area_alloc(mm);
> + if (!vma)
> + goto vma_alloc_fail;
>
> vma_set_anonymous(vma);
> vma->vm_start = addr;
> vma->vm_end = addr + len;
> - vma->vm_pgoff = pgoff;
> + vma->vm_pgoff = addr >> PAGE_SHIFT;
> vma->vm_flags = flags;
> vma->vm_page_prot = vm_get_page_prot(flags);
> - vma_link(mm, vma, prev);
> + if (vma_mas_store(vma, mas))
> + goto mas_store_fail;
> +
> + if (!prev)
> + prev = mas_prev(mas, 0);
> +
> + __vma_link_list(mm, vma, prev);
> + mm->map_count++;
> out:
> perf_event_mmap(vma);
> mm->total_vm += len >> PAGE_SHIFT;
> @@ -2785,15 +2924,31 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> vma->vm_flags |= VM_SOFTDIRTY;
> validate_mm_mt(mm);
> return 0;
> +
> +mas_store_fail:
> + vm_area_free(vma);
> +vma_alloc_fail:
> + vm_unacct_memory(len >> PAGE_SHIFT);
> + return -ENOMEM;
> +
> +mas_mod_fail:
> + vma->vm_end = addr;
> + if (vma->anon_vma) {
> + anon_vma_interval_tree_post_update_vma(vma);
> + anon_vma_unlock_write(vma->anon_vma);
> + }
> + return -ENOMEM;
> +
> }
>
> int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> + struct vm_area_struct *vma = NULL;
> unsigned long len;
> int ret;
> bool populate;
> - LIST_HEAD(uf);
> + MA_STATE(mas, &mm->mm_mt, addr, addr);
>
> len = PAGE_ALIGN(request);
> if (len < request)
> @@ -2804,10 +2959,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
> if (mmap_write_lock_killable(mm))
> return -EINTR;
>
> - ret = do_brk_flags(addr, len, flags, &uf);
> + // This vma left intentionally blank.

This comment using unintentionally bad syntax (// vs /* */)

Also if we leave it blank it means this path won't ever expand an existing
vma, while previously it could succeed the vma_merge, no? Or all callers of
vm_brk_flags() in a scenario where there's no expand anyway? Maybe just have
a more verbose comment...

> + mas_walk(&mas);
> + ret = do_brk_flags(&mas, vma, addr, len, flags);
> populate = ((mm->def_flags & VM_LOCKED) != 0);
> mmap_write_unlock(mm);
> - userfaultfd_unmap_complete(mm, &uf);

Looks like this part is removed completely from vm_brk_flags() paths?

OK it seems the whole patch makes some asumption that vm_brk_flags() never
has to unmap a pre-existing area, and in the brk() syscall this is now
delegated to do_brk_munmap(), and do_brk_flags() loses the support. While it
might be safe, it should be discussed in the patch that vm_brk_flags()
didn't actually need to support the unmap part, because x y z. And best if
there are some DEBUG_VM based assertions supporting that.

But then again, is the optimized scenario happening often enough to warrant it?

> if (populate && !ret)
> mm_populate(addr, len);
> return ret;