[RFC PATCH 4/5] hugetlbfs: catch and handle truncate racing with page faults

From: Mike Kravetz
Date: Wed Apr 06 2022 - 17:34:05 EST


Most hugetlb fault handling paths check for faults beyond i_size.
While there are early checks in those paths, the most difficult to
handle are the ones discovered after taking the page table lock. At
that point, we may have allocated a page, consumed the associated
reservation, and added the page to the page cache.

When a fault beyond i_size is discovered, two things must happen (see
the sketch below):
- The page must be removed from the page cache; otherwise it will sit
there until the file is removed.
- The reservation consumed for the page must not be restored; otherwise
an outstanding reservation would be left for an offset beyond the end
of the file.
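
In sketch form (condensed from the hugetlb_no_page() hunk below, with
the backout labels and surrounding context elided), the backout path
now does roughly the following:

  /* fault raced with truncate: idx was found to be beyond i_size */
  if (new_pagecache_page && beyond_i_size)
          hugetlb_delete_from_page_cache(page);
  unlock_page(page);
  /* restore the reserve only for new pages never added to the cache */
  if (!new_pagecache_page) {
          if (reserve_alloc)
                  SetHPageRestoreReserve(page);
          if (new_page)
                  restore_reserve_on_error(h, vma, haddr, page);
  }
  put_page(page);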

The 'truncation' code in remove_inode_hugepages() must deal with the
fault code potentially removing a page from the page cache after the
page was returned by pagevec_lookup() and before it is locked. This
is detected by a change in page_mapping().
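
Condensed from the remove_inode_hugepages() hunk below (the
hugetlb_fix_reserve_counts() error handling is elided), the per-page
loop now rechecks the mapping after locking the page, roughly:

  lock_page(page);
  /*
   * The fault backout path may have removed the page from the page
   * cache after pagevec_lookup() returned it; only a page that still
   * belongs to this mapping is freed and unreserved here.
   */
  if (page_mapping(page) == mapping) {
          hugetlb_delete_from_page_cache(page);
          freed++;
          if (!truncate_op)
                  hugetlb_unreserve_pages(inode, index, index + 1, 1);
  }
  unlock_page(page);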

Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
---
fs/hugetlbfs/inode.c | 40 ++++++++++++++++++++++------------------
mm/hugetlb.c         | 28 ++++++++++++++++++++--------
2 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0cf352555354..341156c2a7d0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -490,13 +490,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* unmapped in caller. Unmap (again) now after taking
* the fault mutex. The mutex will prevent faults
* until we finish removing the page.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
*/
if (unlikely(page_mapped(page))) {
- BUG_ON(truncate_op);
-
i_mmap_lock_write(mapping);
hugetlb_vmdelete_list(&mapping->i_mmap,
index * pages_per_huge_page(h),
@@ -506,22 +501,31 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

lock_page(page);
/*
- * We must free the huge page and remove from page
- * cache BEFORE removing the region/reserve map
- * (hugetlb_unreserve_pages). In rare out of memory
- * conditions, removal of the region/reserve map could
- * fail. Correspondingly, the subpool and global
- * reserve usage count can need to be adjusted.
+ * After locking page, make sure mapping is the same.
+ * We could have raced with page fault populate and
+ * backout code.
*/
- VM_BUG_ON(HPageRestoreReserve(page));
- hugetlb_delete_from_page_cache(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
+ if (page_mapping(page) == mapping) {
+ /*
+ * We must free the huge page and remove from
+ * page cache BEFORE removing the region/
+ * reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of
+ * the region/reserve map could fail.
+ * Correspondingly, the subpool and global
+ * reserve usage count can need to be adjusted.
+ */
+ VM_BUG_ON(HPageRestoreReserve(page));
+ hugetlb_delete_from_page_cache(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(
+ hugetlb_unreserve_pages(inode,
index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
+ hugetlb_fix_reserve_counts(
+ inode);
+ }
}
-
unlock_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c6d76f61de98..b8f994961a68 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5361,6 +5361,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
+ bool beyond_i_size = false;
+ bool reserve_alloc = false;

/*
* Currently, we are forced to kill the process in the event the
@@ -5417,6 +5419,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
new_page = true;
+ if (HPageRestoreReserve(page))
+ reserve_alloc = true;

if (vma->vm_flags & VM_MAYSHARE) {
int err = hugetlb_add_to_page_cache(page, mapping, idx);
@@ -5475,8 +5479,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,

ptl = huge_pte_lock(h, mm, ptep);
size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
+ if (idx >= size) {
+ beyond_i_size = true;
goto backout;
+ }

ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -5514,10 +5520,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
backout:
spin_unlock(ptl);
backout_unlocked:
+ if (new_pagecache_page && beyond_i_size)
+ hugetlb_delete_from_page_cache(page);
unlock_page(page);
/* restore reserve for newly allocated pages not in page cache */
- if (new_page && !new_pagecache_page)
- restore_reserve_on_error(h, vma, haddr, page);
+ if (!new_pagecache_page) {
+ if (reserve_alloc)
+ SetHPageRestoreReserve(page);
+ if (new_page)
+ restore_reserve_on_error(h, vma, haddr, page);
+ }
put_page(page);
goto out;
}
@@ -5812,15 +5824,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* Recheck the i_size after holding PT lock to make sure not
* to leave any page mapped (as page_mapped()) beyond the end
* of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
+ * enforcing that). If we bail out here, remove the page
+ * added to the radix tree.
*/
size = i_size_read(mapping->host) >> huge_page_shift(h);
ret = -EFAULT;
- if (idx >= size)
+ if (idx >= size) {
+ hugetlb_delete_from_page_cache(page);
goto out_release_unlock;
+ }

ret = -EEXIST;
if (!huge_pte_none(huge_ptep_get(dst_pte)))
--
2.35.1