[PATCH v2 22/46] hugetlb: add HGM support to copy_hugetlb_page_range

From: James Houghton
Date: Fri Feb 17 2023 - 19:30:18 EST


This allows fork() to work with high-granularity mappings. The page
table structure is copied such that partially mapped regions will remain
partially mapped in the same way for the new process.

A page's reference count is incremented for *each* portion of it that
is mapped in the page table. For example, if you have a PMD-mapped 1G
page, the reference count will be incremented by 512.

mapcount is handled similar to THPs: if you're completely mapping a
hugepage, then the compound_mapcount is incremented. If you're mapping a
part of it, the subpages that are getting mapped will have their
mapcounts incremented.

Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1a1a71868dfd..2fe1eb6897d4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -162,6 +162,8 @@ void hugepage_put_subpool(struct hugepage_subpool *spool);

void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
struct hstate *h, struct vm_area_struct *vma);
+void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
+ struct hstate *h, struct vm_area_struct *vma);

void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 693332b7e186..210c6f2b16a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -141,6 +141,37 @@ void hugetlb_remove_rmap(struct page *subpage, unsigned long shift,
page_remove_rmap(subpage, vma, false);
}
}
+/*
+ * hugetlb_add_file_rmap() - increment the mapcounts for file-backed hugetlb
+ * pages appropriately.
+ *
+ * For pages that are being mapped with their hstate-level PTE (e.g., a 1G page
+ * being mapped with a 1G PUD), then we increment the compound_mapcount for the
+ * head page.
+ *
+ * For pages that are being mapped with high-granularity, we increment the
+ * mapcounts for the individual subpages that are getting mapped.
+ */
+void hugetlb_add_file_rmap(struct page *subpage, unsigned long shift,
+ struct hstate *h, struct vm_area_struct *vma)
+{
+ struct page *hpage = compound_head(subpage);
+
+ if (shift == huge_page_shift(h)) {
+ VM_BUG_ON_PAGE(subpage != hpage, subpage);
+ page_add_file_rmap(hpage, vma, true);
+ } else {
+ unsigned long nr_subpages = 1UL << (shift - PAGE_SHIFT);
+ struct page *final_page = &subpage[nr_subpages];
+
+ VM_BUG_ON_PAGE(HPageVmemmapOptimized(hpage), hpage);
+ /*
+ * Increment the mapcount on each page that is getting mapped.
+ */
+ for (; subpage < final_page; ++subpage)
+ page_add_file_rmap(subpage, vma, false);
+ }
+}

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -5210,7 +5241,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry;
- struct page *ptepage;
+ struct hugetlb_pte src_hpte, dst_hpte;
+ struct page *ptepage, *hpage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
@@ -5238,18 +5270,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}

last_addr_mask = hugetlb_mask_last_page(h);
- for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
+ addr = src_vma->vm_start;
+ while (addr < src_vma->vm_end) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = hugetlb_walk(src_vma, addr, sz);
- if (!src_pte) {
- addr |= last_addr_mask;
+ unsigned long hpte_sz;
+
+ if (hugetlb_full_walk(&src_hpte, src_vma, addr)) {
+ addr = (addr | last_addr_mask) + sz;
continue;
}
- dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
- if (!dst_pte) {
- ret = -ENOMEM;
+ ret = hugetlb_full_walk_alloc(&dst_hpte, dst_vma, addr,
+ hugetlb_pte_size(&src_hpte));
+ if (ret)
break;
- }
+
+ src_pte = src_hpte.ptep;
+ dst_pte = dst_hpte.ptep;
+
+ hpte_sz = hugetlb_pte_size(&src_hpte);

/*
* If the pagetables are shared don't copy or take references.
@@ -5259,13 +5297,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* another vma. So page_count of ptep page is checked instead
* to reliably determine whether pte is shared.
*/
- if (page_count(virt_to_page(dst_pte)) > 1) {
- addr |= last_addr_mask;
+ if (hugetlb_pte_size(&dst_hpte) == sz &&
+ page_count(virt_to_page(dst_pte)) > 1) {
+ addr = (addr | last_addr_mask) + sz;
continue;
}

- dst_ptl = huge_pte_lock(h, dst, dst_pte);
- src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
+ dst_ptl = hugetlb_pte_lock(&dst_hpte);
+ src_ptl = hugetlb_pte_lockptr(&src_hpte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
again:
@@ -5309,10 +5348,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
if (userfaultfd_wp(dst_vma))
set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else if (!hugetlb_pte_present_leaf(&src_hpte, entry)) {
+ /* Retry the walk. */
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ continue;
} else {
- entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
- get_page(ptepage);
+ hpage = compound_head(ptepage);
+ get_page(hpage);

/*
* Failing to duplicate the anon rmap is a rare case
@@ -5324,13 +5368,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (!PageAnon(ptepage)) {
- page_add_file_rmap(ptepage, src_vma, true);
- } else if (page_try_dup_anon_rmap(ptepage, true,
+ if (!PageAnon(hpage)) {
+ hugetlb_add_file_rmap(ptepage,
+ src_hpte.shift, h, src_vma);
+ }
+ /*
+ * It is currently impossible to get anonymous HugeTLB
+ * high-granularity mappings, so we use 'hpage' here.
+ *
+ * This will need to be changed when HGM support for
+ * anon mappings is added.
+ */
+ else if (page_try_dup_anon_rmap(hpage, true,
src_vma)) {
pte_t src_pte_old = entry;
struct folio *new_folio;

+ /*
+ * If we are mapped at high granularity, we
+ * may end up allocating lots and lots of
+ * hugepages when we only need one. Bail out
+ * now.
+ */
+ if (hugetlb_pte_size(&src_hpte) != sz) {
+ put_page(hpage);
+ ret = -EINVAL;
+ break;
+ }
+
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
@@ -5342,7 +5407,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
npages);
- put_page(ptepage);
+ put_page(hpage);

/* Install the new hugetlb folio if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
@@ -5360,6 +5425,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+ addr += hugetlb_pte_size(&src_hpte);
continue;
}

@@ -5376,10 +5442,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}

set_huge_pte_at(dst, addr, dst_pte, entry);
- hugetlb_count_add(npages, dst);
+ hugetlb_count_add(
+ hugetlb_pte_size(&dst_hpte) / PAGE_SIZE,
+ dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
+ addr += hugetlb_pte_size(&src_hpte);
}

if (cow) {
--
2.39.2.637.g21b0678d19-goog