From ff86c690f9236d8ba74cb2eb6f529b6354a96223 Mon Sep 17 00:00:00 2001
From: Mina Almasry
Date: Tue, 11 May 2021 17:01:50 -0700
Subject: [PATCH] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY

The test passes as-is. With a forced copy failure, the test fails and
resv_huge_pages underflows permanently. It appears the page is added to
the page cache, removed, and then not found in the cache again,
consuming the reservation twice.

Signed-off-by: Mina Almasry
Cc: Axel Rasmussen
Cc: Peter Xu
Cc: linux-mm@kvack.org
Cc: Mike Kravetz
Cc: Andrew Morton
Cc: linux-kernel@vger.kernel.org
Change-Id: Ida20f35fdfa7fce598582dcfa199845115eaac18
---
 fs/hugetlbfs/inode.c |   6 +++
 mm/hugetlb.c         | 124 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 14 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2a42335e8fd..ba0b3fe88f18 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -525,6 +525,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 		 * to be adjusted.
 		 */
 		VM_BUG_ON(PagePrivate(page));
+		printk("%s:%d:%s removing huge page, idx=%lu h->resv_huge_pages=%lu\n",
+		       __FILE__, __LINE__, __func__, index,
+		       h->resv_huge_pages);
 		remove_huge_page(page);
 		freed++;
 		if (!truncate_op) {
@@ -532,6 +535,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 						index, index + 1, 1)))
 				hugetlb_fix_reserve_counts(inode);
 		}
+		printk("%s:%d:%s removed huge page, idx=%lu h->resv_huge_pages=%lu\n",
+		       __FILE__, __LINE__, __func__, index,
+		       h->resv_huge_pages);
 
 		unlock_page(page);
 		if (!truncate_op)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 629aa4c2259c..2d9b557cbafa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -93,6 +93,28 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool)
 	return true;
 }
 
+/*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page. We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+				 int nr_pages)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < nr_pages;) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
 					     unsigned long irq_flags)
 {
@@ -1165,6 +1187,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
 	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
 		SetHPageRestoreReserve(page);
+		WARN_ON_ONCE(!h->resv_huge_pages);
 		h->resv_huge_pages--;
 	}
 
@@ -2405,6 +2428,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 	int ret, idx;
 	struct hugetlb_cgroup *h_cg;
 	bool deferred_reserve;
+	pgoff_t pageidx = vma_hugecache_offset(h, vma, addr);
 
 	idx = hstate_index(h);
 	/*
@@ -2508,6 +2532,9 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 			hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
 					pages_per_huge_page(h), page);
 	}
+	printk("%s:%d:%s allocated page idx=%lu, deferred_reserve=%d, avoid_reserve=%d\n",
+	       __FILE__, __LINE__, __func__, pageidx, deferred_reserve,
+	       avoid_reserve);
 	return page;
 
 out_uncharge_cgroup:
@@ -4571,6 +4598,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			if (huge_pte_none(huge_ptep_get(ptep)))
 				ret = vmf_error(PTR_ERR(page));
 			spin_unlock(ptl);
+			printk("%s:%d:%s failed\n", __FILE__, __LINE__,
+			       __func__);
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4578,8 +4607,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		new_page = true;
 
 		if (vma->vm_flags & VM_MAYSHARE) {
+			printk("%s:%d:%s adding page to cache idx=%lu mapping=%px\n",
+			       __FILE__, __LINE__, __func__, idx, mapping);
 			int err = huge_add_to_page_cache(page, mapping, idx);
 			if (err) {
+				printk("%s:%d:%s failed adding page to cache idx=%lu\n",
+				       __FILE__, __LINE__, __func__, idx);
 				put_page(page);
 				if (err == -EEXIST)
 					goto retry;
@@ -4868,44 +4901,102 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct page **pagep)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-	struct address_space *mapping;
-	pgoff_t idx;
+	struct hstate *h = hstate_vma(dst_vma);
+	struct address_space *mapping = dst_vma->vm_file->f_mapping;
+	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
-	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
 	spinlock_t *ptl;
-	int ret;
+	int ret = -ENOMEM;
 	struct page *page;
 	int writable;
-
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+	unsigned long resv_temp = 0;
 
 	if (is_continue) {
 		ret = -EFAULT;
 		page = find_lock_page(mapping, idx);
-		if (!page)
+		if (!page) {
+			ret = -ENOMEM;
 			goto out;
+		}
 	} else if (!*pagep) {
-		ret = -ENOMEM;
+		/* If a page already exists, then it's UFFDIO_COPY for
+		 * a non-missing case. Return -EEXIST.
+		 */
+		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			ret = -EEXIST;
+			printk("%s:%d:%s found in cache. idx=%lu mapping=%px\n",
+			       __FILE__, __LINE__, __func__, idx,
+			       dst_vma->vm_file->f_mapping);
+			goto out;
+		}
+
+		printk("%s:%d:%s not found in cache, allocating and consuming reservation idx=%lu mapping=%px\n",
+		       __FILE__, __LINE__, __func__, idx,
+		       dst_vma->vm_file->f_mapping);
 		page = alloc_huge_page(dst_vma, dst_addr, 0);
-		if (IS_ERR(page))
+		BUG_ON(IS_ERR(page));
+		if (IS_ERR(page)) {
+			printk("%s:%d:%s page allocation failed\n", __FILE__,
+			       __LINE__, __func__);
 			goto out;
+		}
 
+#if 0
 		ret = copy_huge_page_from_user(page,
-						(const void __user *) src_addr,
-						pages_per_huge_page(h), false);
+					       (const void __user *)src_addr,
+					       pages_per_huge_page(h), false);
+#else
+		ret = -ENOENT;
+#endif
 
 		/* fallback to copy_from_user outside mmap_lock */
 		if (unlikely(ret)) {
 			ret = -ENOENT;
+			printk("%s:%d:%s copy failed, freeing page idx=%lu\n",
+			       __FILE__, __LINE__, __func__, idx);
+			resv_temp = h->resv_huge_pages;
 			put_page(page);
+
+			/* Reallocate the page outside the reserves. */
+			struct mempolicy *mpol;
+			nodemask_t *nodemask;
+			gfp_t gfp_mask = htlb_alloc_mask(h);
+			int node = huge_node(dst_vma, dst_addr, gfp_mask, &mpol,
+					     &nodemask);
+			resv_temp = h->resv_huge_pages;
+			page = alloc_migrate_huge_page(h, gfp_mask, node,
+						       nodemask);
+			VM_BUG_ON(h->resv_huge_pages != resv_temp);
+			if (IS_ERR(page)) {
+				VM_BUG_ON(true);
+				ret = -ENOMEM;
+				printk("%s:%d:%s failed allocating migrate_huge_page\n",
+				       __FILE__, __LINE__, __func__);
+				goto out;
+			}
 			*pagep = page;
 			/* don't free the page */
 			goto out;
 		}
 	} else {
-		page = *pagep;
+		if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			printk("%s:%d:%s found huge_page in cache idx=%lu mapping=%px\n",
+			       __FILE__, __LINE__, __func__, idx,
+			       dst_vma->vm_file->f_mapping);
+			put_page(*pagep);
+			ret = -EEXIST;
+			goto out;
+		}
+
+		printk("%s:%d:%s not found in cache, allocating and consuming reservation idx=%lu mapping=%px\n",
+		       __FILE__, __LINE__, __func__, idx,
+		       dst_vma->vm_file->f_mapping);
+		page = alloc_huge_page(dst_vma, dst_addr, 0);
+		VM_BUG_ON(IS_ERR(page));
+		__copy_gigantic_page(page, *pagep, pages_per_huge_page(h));
+		put_page(*pagep);
 		*pagep = NULL;
 	}
 
@@ -4929,9 +5020,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		 * hugetlb_fault_mutex_table that here must be hold by
 		 * the caller.
 		 */
+		printk("%s:%d:%s adding page to cache idx=%lu mapping=%px\n",
+		       __FILE__, __LINE__, __func__, idx, mapping);
 		ret = huge_add_to_page_cache(page, mapping, idx);
-		if (ret)
+		if (ret) {
+			printk("%s:%d:%s failed adding to cache\n", __FILE__,
+			       __LINE__, __func__);
 			goto out_release_nounlock;
+		}
 	}
 
 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
-- 
2.31.1.818.g46aad6cb9e-goog
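
For context, the path this patch instruments can be driven from userspace
with a plain UFFDIO_COPY on a hugetlb mapping. The program below is only an
illustrative sketch, not the selftest the commit message refers to; the file
name and constants are made up for the example, and it assumes 2MB huge
pages with a few reserved (e.g. echo 16 > /proc/sys/vm/nr_hugepages).

/*
 * uffd_hugetlb_copy.c - illustration only; not the test referenced above.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HUGE_SZ (2UL << 20)	/* assumed default huge page size */

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = { 0 };
	struct uffdio_copy copy = { 0 };
	char *dst, *src;
	long uffd;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api)) {
		perror("userfaultfd/UFFDIO_API");
		return 1;
	}

	/* Shared hugetlb mapping; a hugetlbfs file mapping behaves the same. */
	dst = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (dst == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}

	reg.range.start = (unsigned long)dst;
	reg.range.len = HUGE_SZ;
	reg.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("UFFDIO_REGISTER");
		return 1;
	}

	/* The source buffer does not need to be huge page backed. */
	src = malloc(HUGE_SZ);
	if (!src)
		return 1;
	memset(src, 0xab, HUGE_SZ);

	/* Fill the still-missing huge page; this goes through
	 * hugetlb_mcopy_atomic_pte() and consumes one reservation. */
	copy.dst = (unsigned long)dst;
	copy.src = (unsigned long)src;
	copy.len = HUGE_SZ;
	if (ioctl(uffd, UFFDIO_COPY, &copy))
		perror("UFFDIO_COPY");

	printf("copied %lld bytes, dst[0]=0x%02x\n",
	       (long long)copy.copy, (unsigned char)dst[0]);
	return 0;
}

With the forced -ENOENT in the #if 0 hunk above, the copy retries through
hugetlb_mcopy_atomic_pte() with *pagep set, so comparing HugePages_Rsvd in
/proc/meminfo before and after the ioctl (or following the printks added by
this patch) is one way to watch how the reservation for a single index is
consumed.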