[PATCH] mm, hugetlb: fix resv_huge_pages underflow on UFFDIO_COPY

From: Mina Almasry
Date: Thu May 13 2021 - 19:43:17 EST


When hugetlb_mcopy_atomic_pte() is called with:
- mode == MCOPY_ATOMIC_NORMAL, and
- a page already present in the page cache corresponding to the
associated address,

we allocate a huge page from the reserves, then fail to insert it into
the cache and return -EEXIST. In this case we should return -EEXIST
without allocating a new page, since the page already exists in the
cache. Allocating the extra page causes resv_huge_pages to underflow
temporarily until the extra page is freed.

To fix this, check whether a page already exists in the cache and, if
so, return -EEXIST without allocating. Otherwise allocate the page and
insert it into the cache immediately, while the hugetlb fault mutex is
still held by the caller, and only then copy the user-supplied contents
into it.
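
In rough outline, the reordered UFFDIO_COPY path for a shared mapping
becomes the following (a simplified sketch of the mm/hugetlb.c hunks
below; the *pagep retry case and most error unwinding are trimmed):

    /* First pass: mode == MCOPY_ATOMIC_NORMAL, *pagep == NULL */
    if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr))
        return -EEXIST;         /* never touches the reserves */

    page = alloc_huge_page(dst_vma, dst_addr, 0);
    if (IS_ERR(page))
        goto out;               /* ret is initialized to -ENOMEM */

    /*
     * Insert into the page cache before copying, while the caller
     * still holds the hugetlb fault mutex for this index.
     */
    ret = huge_add_to_page_cache(page, mapping, idx);
    if (ret)
        goto out;

    /*
     * Only now copy from userspace; if this fails, the page stays
     * in the cache with PageUptodate(page) == false.
     */
    ret = copy_huge_page_from_user(page,
                                   (const void __user *)src_addr,
                                   pages_per_huge_page(h), false);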

As a side effect, the cache may now contain pages for which the copy
from userspace failed; for these pages PageUptodate(page) == false.
Modify the code that queries the cache to treat such pages as holes,
the same as missing pages.
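
For example, the read path in hugetlbfs_read_iter() now falls back to
its existing hole handling when the cached page is not uptodate; a
minimal sketch of the pattern (this is the fs/hugetlbfs/inode.c hunk
below, with the surrounding context paraphrased):

    /* Find the page */
    page = find_lock_page(mapping, index);
    if (unlikely(page == NULL || !PageUptodate(page))) {
        /*
         * Hole, or a page left behind by a failed UFFDIO_COPY:
         * never expose its contents; zero out the user buffer
         * for the length of the hole or request instead.
         */
        copied = iov_iter_zero(nr, to);
    }

The hugetlb_no_page() change follows the same rule, allocating a fresh
page only when no page is present in the cache at all.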

Tested using:
./tools/testing/selftests/vm/userfaultfd hugetlb_shared 1024 200 \
/mnt/huge

Test passes, and dmesg shows no underflow warnings.

Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx>
Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
Cc: Peter Xu <peterx@xxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx

---
fs/hugetlbfs/inode.c | 2 +-
mm/hugetlb.c | 109 +++++++++++++++++++++++--------------------
2 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a2a42335e8fd..cc027c335242 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -346,7 +346,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)

/* Find the page */
page = find_lock_page(mapping, index);
- if (unlikely(page == NULL)) {
+ if (unlikely(page == NULL || !PageUptodate(page))) {
/*
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 629aa4c2259c..a5a5fbf7ac25 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4543,7 +4543,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,

retry:
page = find_lock_page(mapping, idx);
- if (!page) {
+ if (!page || !PageUptodate(page)) {
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
ret = hugetlb_handle_userfault(vma, mapping, idx,
@@ -4552,26 +4552,30 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto out;
}

- page = alloc_huge_page(vma, haddr, 0);
- if (IS_ERR(page)) {
- /*
- * Returning error will result in faulting task being
- * sent SIGBUS. The hugetlb fault mutex prevents two
- * tasks from racing to fault in the same page which
- * could result in false unable to allocate errors.
- * Page migration does not take the fault mutex, but
- * does a clear then write of pte's under page table
- * lock. Page fault code could race with migration,
- * notice the clear pte and try to allocate a page
- * here. Before returning error, get ptl and make
- * sure there really is no pte entry.
- */
- ptl = huge_pte_lock(h, mm, ptep);
- ret = 0;
- if (huge_pte_none(huge_ptep_get(ptep)))
- ret = vmf_error(PTR_ERR(page));
- spin_unlock(ptl);
- goto out;
+ if (!page) {
+ page = alloc_huge_page(vma, haddr, 0);
+ if (IS_ERR(page)) {
+ /*
+ * Returning error will result in faulting task
+ * being sent SIGBUS. The hugetlb fault mutex
+ * prevents two tasks from racing to fault in
+ * the same page which could result in false
+ * unable to allocate errors. Page migration
+ * does not take the fault mutex, but does a
+ * clear then write of pte's under page table
+ * lock. Page fault code could race with
+ * migration, notice the clear pte and try to
+ * allocate a page here. Before returning
+ * error, get ptl and make sure there really is
+ * no pte entry.
+ */
+ ptl = huge_pte_lock(h, mm, ptep);
+ ret = 0;
+ if (huge_pte_none(huge_ptep_get(ptep)))
+ ret = vmf_error(PTR_ERR(page));
+ spin_unlock(ptl);
+ goto out;
+ }
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
@@ -4868,31 +4872,55 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
- struct address_space *mapping;
- pgoff_t idx;
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
+ int ret = -ENOMEM;
struct page *page;
int writable;

- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
if (is_continue) {
ret = -EFAULT;
- page = find_lock_page(mapping, idx);
- if (!page)
+ page = hugetlbfs_pagecache_page(h, dst_vma, dst_addr);
+ if (!page) {
+ ret = -ENOMEM;
goto out;
+ }
} else if (!*pagep) {
- ret = -ENOMEM;
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page))
goto out;

+ /* Add shared, newly allocated pages to the page cache. */
+ if (vm_shared) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ ret = -EFAULT;
+ if (idx >= size)
+ goto out;
+
+ /*
+ * Serialization between remove_inode_hugepages() and
+ * huge_add_to_page_cache() below happens through the
+ * hugetlb_fault_mutex_table that here must be held by
+ * the caller.
+ */
+ ret = huge_add_to_page_cache(page, mapping, idx);
+ if (ret)
+ goto out;
+ }
+
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
@@ -4916,24 +4944,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*/
__SetPageUptodate(page);

- /* Add shared, newly allocated pages to the page cache. */
- if (vm_shared && !is_continue) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
- goto out_release_nounlock;
-
- /*
- * Serialization between remove_inode_hugepages() and
- * huge_add_to_page_cache() below happens through the
- * hugetlb_fault_mutex_table that here must be hold by
- * the caller.
- */
- ret = huge_add_to_page_cache(page, mapping, idx);
- if (ret)
- goto out_release_nounlock;
- }
-
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
spin_lock(ptl);

@@ -4994,7 +5004,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spin_unlock(ptl);
if (vm_shared || is_continue)
unlock_page(page);
-out_release_nounlock:
put_page(page);
goto out;
}
--
2.31.1.751.gd2f1c929bd-goog