[PATCH 13/15] mm: Add a huge page fault handler for files

From: Matthew Wilcox
Date: Tue Sep 24 2019 - 20:52:47 EST


From: William Kucharski <william.kucharski@xxxxxxxxxx>

Add filemap_huge_fault() to attempt to satisfy page
faults on memory-mapped read-only text pages using THP when possible.

Signed-off-by: William Kucharski <william.kucharski@xxxxxxxxxx>
[rebased on top of mm prep patches -- Matthew]
Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
---
include/linux/mm.h | 10 +++
include/linux/pagemap.h | 8 ++
mm/filemap.c | 165 ++++++++++++++++++++++++++++++++++++++--
3 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 04bea9f9282c..623878f11eaf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2414,6 +2414,16 @@ extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size);
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static inline vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return VM_FAULT_FALLBACK; /* stub: always reports fallback */
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
extern void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d6d97f9fb762..ae09788f5345 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -354,6 +354,14 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
}

+/* Assert @page is a head page indexed at @offset rounded down to its size */
+static inline void page_cache_assert(struct page *page, pgoff_t offset)
+{
+ VM_BUG_ON_PAGE(PageTail(page), page); /* head pages only */
+ VM_BUG_ON_PAGE(page->index != (offset & ~(compound_nr(page) - 1)),
+ page); /* BUG on an index mismatch, not on a match */
+}
+
static inline struct page *find_subpage(struct page *page, pgoff_t offset)
{
if (PageHuge(page))
diff --git a/mm/filemap.c b/mm/filemap.c
index b07ef9469861..8017e905df7a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1590,7 +1590,8 @@ static bool pagecache_is_conflict(struct page *page)
*
* Looks up the page cache entries at @mapping between @offset and
* @offset + 2^@order. If there is a page cache page, it is returned with
- * an increased refcount unless it is smaller than @order.
+ * an increased refcount unless it is smaller than @order. This function
+ * returns the head page, not a tail page.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
@@ -1601,7 +1602,7 @@ static bool pagecache_is_conflict(struct page *page)
static struct page *__find_get_page(struct address_space *mapping,
unsigned long offset, unsigned int order)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, offset & ~((1UL << order) - 1));
struct page *page;

rcu_read_lock();
@@ -1635,7 +1636,6 @@ static struct page *__find_get_page(struct address_space *mapping,
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();

@@ -1741,11 +1741,12 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ page_cache_assert(page, offset);
}

if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
+ page = find_subpage(page, offset);

no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
@@ -2638,7 +2639,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ page_cache_assert(page, offset);

/*
* We have a locked page in the page cache, now we need to check
@@ -2711,6 +2712,160 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * filemap_huge_fault - Handle a read fault on a file using a PMD-sized page.
+ * @vmf: struct vm_fault containing details of the fault.
+ * @pe_size: Page entry size; anything but PE_SIZE_PMD falls back.
+ *
+ * filemap_huge_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ *
+ * vma->vm_mm->mmap_sem must be held on entry.
+ *
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
+ * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ *
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
+ * has not been released.
+ *
+ * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes; %VM_FAULT_FALLBACK if not handled.
+ */
+vm_fault_t filemap_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ int error;
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
+ struct file *fpin = NULL;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t offset = vmf->pgoff;
+ pgoff_t max_off;
+ struct page *page;
+ vm_fault_t ret = 0;
+
+ if (pe_size != PE_SIZE_PMD)
+ return VM_FAULT_FALLBACK;
+ /* Write faults are not handled here for now */
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ return VM_FAULT_FALLBACK;
+ if (vma->vm_start & ~HPAGE_PMD_MASK)
+ return VM_FAULT_FALLBACK; /* VMA start is not PMD-aligned */
+ /* Fall back if the PMD-sized range around the fault reaches past EOF */
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely((offset | (HPAGE_PMD_NR - 1)) >= max_off))
+ return VM_FAULT_FALLBACK;
+
+ /*
+ * Do we have something in the page cache already?
+ */
+ page = __find_get_page(mapping, offset, HPAGE_PMD_ORDER);
+ if (likely(page)) {
+ if (pagecache_is_conflict(page))
+ return VM_FAULT_FALLBACK;
+ /* Readahead the next huge page here? */
+ page = find_subpage(page, offset & ~(HPAGE_PMD_NR - 1));
+ } else {
+ /* Nothing cached: count a major fault, then try to create the page */
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+ ret = VM_FAULT_MAJOR;
+retry_find:
+ page = pagecache_get_page(mapping, offset,
+ FGP_CREAT | FGP_FOR_MMAP | FGP_PMD,
+ vmf->gfp_mask |
+ __GFP_NOWARN | __GFP_NORETRY);
+ if (!page)
+ return VM_FAULT_FALLBACK;
+ }
+
+ if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ goto out_retry;
+
+ /* Did it get truncated? If so, drop it and look it up again. */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON_PAGE(page_to_index(page) != offset, page);
+ /* NOTE(review): hit path looks up the aligned offset; confirm pgoff is aligned */
+ /*
+ * We have a locked page in the page cache, now we need to check
+ * that it's up-to-date. Because we don't readahead in huge_fault,
+ * this may or may not be due to an error.
+ */
+ if (!PageUptodate(page))
+ goto page_not_uptodate;
+
+ /*
+ * We've made it this far and we had to drop our mmap_sem, now is the
+ * time to return to the upper layer and have it re-find the vma and
+ * redo the fault.
+ */
+ if (fpin) {
+ unlock_page(page);
+ goto out_retry;
+ }
+
+ /*
+ * Found the page and have a reference on it.
+ * We must recheck i_size under page lock.
+ */
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off)) {
+ unlock_page(page);
+ put_page(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ ret |= alloc_set_pte(vmf, NULL, page);
+ unlock_page(page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ put_page(page);
+ return ret;
+
+page_not_uptodate:
+ ClearPageError(page);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ error = mapping->a_ops->readpage(file, page);
+ if (!error) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ error = -EIO;
+ }
+ if (fpin)
+ goto out_retry;
+ put_page(page);
+
+ if (!error || error == AOP_TRUNCATED_PAGE)
+ goto retry_find;
+
+ /* The read failed with an error other than AOP_TRUNCATED_PAGE */
+ return VM_FAULT_SIGBUS;
+
+out_retry:
+ /*
+ * We dropped the mmap_sem, we need to return to the fault handler to
+ * re-find the vma and come back and find our hopefully still populated
+ * page.
+ */
+ if (page)
+ put_page(page);
+ if (fpin)
+ fput(fpin);
+ return ret | VM_FAULT_RETRY;
+}
+EXPORT_SYMBOL(filemap_huge_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
--
2.23.0