Re: [PATCH v7 5/7] mm: Make alloc_contig_range handle free hugetlb pages

From: David Hildenbrand
Date: Wed Apr 14 2021 - 08:26:29 EST


+static inline int isolate_or_dissolve_huge_page(struct page *page)
+{
+ return -ENOMEM;

Without CONFIG_HUGETLB_PAGE, there is no way someone could possibly pass in a valid hugetlb page here. It doesn't matter too much, but -EINVAL or similar sounds a bit better than -ENOMEM.
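Just a sketch of what I mean, keeping the stub otherwise as you have it:

static inline int isolate_or_dissolve_huge_page(struct page *page)
{
	/*
	 * Without CONFIG_HUGETLB_PAGE there cannot be a hugetlb page to
	 * isolate or dissolve, so reject the request instead of pretending
	 * we ran out of memory.
	 */
	return -EINVAL;
}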

+}
+
static inline struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr,
int avoid_reserve)
diff --git a/mm/compaction.c b/mm/compaction.c
index eeba4668c22c..89426b6d1ea3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -788,7 +788,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
* Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
- * or 0.
+ * -ENOMEM in case we could not allocate a page, or 0.
* cc->migrate_pfn will contain the next pfn to scan (which may be both less,
* equal to or more that end_pfn).
*
@@ -809,6 +809,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
bool skip_updated = false;
+ bool fatal_error = false;

Can't we use "ret == -ENOMEM" instead of fatal_error?
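Untested sketch of what I have in mind, reusing ret instead of carrying the extra flag around (all names as in your patch):

	if (PageHuge(page) && cc->alloc_contig) {
		ret = isolate_or_dissolve_huge_page(page);

		if (ret < 0) {
			/* Do not report -EBUSY down the chain. */
			if (ret != -ENOMEM)
				ret = 0;
			low_pfn += (1UL << compound_order(page)) - 1;
			goto isolate_fail;
		}
	}

and then at the failure path resp. the end of the loop body:

isolate_fail:
	if (!skip_on_failure && ret != -ENOMEM)
		continue;

	/* ... existing skip_on_failure handling stays as is ... */

	if (ret == -ENOMEM)
		break;

That way -ENOMEM remains the only case that aborts the scan, without the additional bool.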

int ret = 0;
cc->migrate_pfn = low_pfn;
@@ -907,6 +908,33 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
valid_page = page;
}
+ if (PageHuge(page) && cc->alloc_contig) {
+ ret = isolate_or_dissolve_huge_page(page);
+
+ /*
+ * Fail isolation in case isolate_or_dissolve_huge_page
+ * reports an error. In case of -ENOMEM, abort right away.
+ */
+ if (ret < 0) {
+ /*
+ * Do not report -EBUSY down the chain.
+ */
+ if (ret == -ENOMEM)
+ fatal_error = true;
+ else
+ ret = 0;
+ low_pfn += (1UL << compound_order(page)) - 1;
+ goto isolate_fail;
+ }
+
+ /*
+ * Ok, the hugepage was dissolved. Now these pages are
+ * Buddy and cannot be re-allocated because they are
+ * isolated. Fall-through as the check below handles
+ * Buddy pages.
+ */
+ }
+
/*
* Skip if free. We read page order here without zone lock
* which is generally unsafe, but the race window is small and
@@ -1066,7 +1094,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
put_page(page);
isolate_fail:
- if (!skip_on_failure)
+ if (!skip_on_failure && !fatal_error)
continue;
/*
@@ -1092,6 +1120,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
next_skip_pfn += 1UL << cc->order;
}
+
+ if (fatal_error)
+ break;
}
/*
@@ -1145,7 +1176,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* @end_pfn: The one-past-last PFN.
*
* Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion,
- * or 0.
+ * -ENOMEM in case we could not allocate a page, or 0.
*/
int
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0607b2b71ac6..4a664d6e82c1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2266,6 +2266,121 @@ static void restore_reserve_on_error(struct hstate *h,
}
}
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * Returns 0 on success, otherwise negated error.
+ */
+
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nid = page_to_nid(old_page);
+ struct page *new_page;
+ int ret = 0;
+
+ /*
+ * Before dissolving the page, we need to allocate a new one for the
+ * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+ * not having to deal with prep_new_page() and avoids dealing of any
+ * counters. This simplifies and let us do the whole thing under the
+ * lock.
+ */
+ new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+ if (!new_page)
+ return -ENOMEM;
+
+retry:
+ spin_lock_irq(&hugetlb_lock);
+ if (!PageHuge(old_page)) {
+ /*
+ * Freed from under us. Drop new_page too.
+ */
+ goto free_new;
+ } else if (page_count(old_page)) {
+ /*
+ * Someone has grabbed the page, fail for now.
+ */
+ ret = -EBUSY;
+ goto free_new;
+ } else if (!HPageFreed(old_page)) {
+ /*
+ * Page's refcount is 0 but it has not been enqueued in the
+ * freelist yet. Race window is small, so we can succeed here if
+ * we retry.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ cond_resched();
+ goto retry;
+ } else {
+ /*
+ * Ok, old_page is still a genuine free hugepage. Remove it from
+ * the freelist and decrease the counters. These will be
+ * incremented again when calling __prep_account_new_huge_page()
+ * and enqueue_huge_page() for new_page. The counters will remain
+ * stable since this happens under the lock.
+ */
+ remove_hugetlb_page(h, old_page, false);
+
+ /*
+ * Call __prep_new_huge_page() to construct the hugetlb page, and
+ * enqueue it then to place it in the freelists. After this,
+ * counters are back on track. Free hugepages have a refcount of 0,
+ * so we need to decrease new_page's count as well.
+ */
+ __prep_new_huge_page(new_page);
+ __prep_account_new_huge_page(h, nid);
+ page_ref_dec(new_page);
+ enqueue_huge_page(h, new_page);
+
+ /*
+ * Pages have been replaced, we can safely free the old one.
+ */
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_page(h, old_page);
+ }
+
+ return ret;
+
+free_new:
+ spin_unlock_irq(&hugetlb_lock);
+ __free_pages(new_page, huge_page_order(h));
+
+ return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page)
+{
+ struct hstate *h;
+ struct page *head;
+
+ /*
+ * The page might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ * Return success when racing as if we dissolved the page ourselves.
+ */
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHuge(page)) {
+ head = compound_head(page);
+ h = page_hstate(head);
+ } else {
+ spin_unlock_irq(&hugetlb_lock);
+ return 0;
+ }
+ spin_unlock_irq(&hugetlb_lock);
+
+ /*
+ * Fence off gigantic pages as there is a cyclic dependency between
+ * alloc_contig_range and them. Return -ENOME as this has the effect

s/-ENOME/-ENOMEM/

+ * of bailing out right away without further retrying.
+ */
+ if (hstate_is_gigantic(h))
+ return -ENOMEM;
+
+ return alloc_and_dissolve_huge_page(h, head);
+}
+
struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{


Complicated stuff, but looks good to me.

--
Thanks,

David / dhildenb