Re: [RFC PATCH v2 4/7] mm: make alloc_contig_range work at pageblock granularity

From: Eric Ren
Date: Fri Dec 10 2021 - 03:13:03 EST


Hi,

On 2021/12/10 07:04, Zi Yan wrote:
From: Zi Yan <ziy@xxxxxxxxxx>

alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
pageblocks with different migratetypes. It might unnecessarily convert
extra pageblocks at the beginning and at the end of the range. Change
alloc_contig_range() to work at pageblock granularity.

It is done by restoring pageblock types and split >pageblock_order free
pages after isolating at MAX_ORDER-1 granularity and migrating pages
away at pageblock granularity. The reason for this process is that
during isolation, some pages, either free or in-use, might have >pageblock
sizes and isolating part of them can cause free accounting issues.
Could you elaborate on how the acconting issue would happen in details?


Restoring the migratetypes of the pageblocks not in the interesting
range later is much easier.

Signed-off-by: Zi Yan <ziy@xxxxxxxxxx>
---
mm/page_alloc.c | 169 ++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 149 insertions(+), 20 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 107a5f186d3b..5ffbeb1b7512 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8981,8 +8981,8 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
- return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages) - 1);
+ return ALIGN_DOWN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
+ pageblock_nr_pages));
}
static unsigned long pfn_max_align_up(unsigned long pfn)
@@ -9071,6 +9071,52 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
return 0;
}
+static inline int save_migratetypes(unsigned char *migratetypes,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn = start_pfn;
+ int num = 0;
+
+ while (pfn < end_pfn) {
+ migratetypes[num] = get_pageblock_migratetype(pfn_to_page(pfn));
+ num++;
+ pfn += pageblock_nr_pages;
+ }
+ return num;
+}
+
+static inline int restore_migratetypes(unsigned char *migratetypes,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long pfn = start_pfn;
+ int num = 0;
+
+ while (pfn < end_pfn) {
+ set_pageblock_migratetype(pfn_to_page(pfn), migratetypes[num]);
+ num++;
+ pfn += pageblock_nr_pages;
+ }
+ return num;
+}
+
+static inline void split_free_page_into_pageblocks(struct page *free_page,
+ int order, struct zone *zone)
+{
+ unsigned long pfn;
+
+ spin_lock(&zone->lock);
+ del_page_from_free_list(free_page, zone, order);
+ for (pfn = page_to_pfn(free_page);
+ pfn < page_to_pfn(free_page) + (1UL << order);
+ pfn += pageblock_nr_pages) {
+ int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+ __free_one_page(pfn_to_page(pfn), pfn, zone, pageblock_order,
+ mt, FPI_NONE);
+ }
+ spin_unlock(&zone->lock);
+}
+
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -9096,8 +9142,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
+ unsigned long isolate_start = pfn_max_align_down(start);
+ unsigned long isolate_end = pfn_max_align_up(end);
+ unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages);
+ unsigned long alloc_end = ALIGN(end, pageblock_nr_pages);
What is the differecence between isolate_* and alloc_*?
+ unsigned long num_pageblock_to_save;
unsigned int order;
int ret = 0;
+ unsigned char *saved_mt;
+ int num;
struct compact_control cc = {
.nr_migratepages = 0,
@@ -9111,11 +9164,30 @@ int alloc_contig_range(unsigned long start, unsigned long end,
};
INIT_LIST_HEAD(&cc.migratepages);
+ /*
+ * TODO: make MIGRATE_ISOLATE a standalone bit to avoid overwriting
+ * the exiting migratetype. Then, we will not need the save and restore
+ * process here.
+ */
+
+ /* Save the migratepages of the pageblocks before start and after end */
+ num_pageblock_to_save = (alloc_start - isolate_start) / pageblock_nr_pages
+ + (isolate_end - alloc_end) / pageblock_nr_pages;
+ saved_mt =
+ kmalloc_array(num_pageblock_to_save,
+ sizeof(unsigned char), GFP_KERNEL);
+ if (!saved_mt)
+ return -ENOMEM;
+
+ num = save_migratetypes(saved_mt, isolate_start, alloc_start);
+
+ num = save_migratetypes(&saved_mt[num], alloc_end, isolate_end);
the two lines above looks confusing to figure out the logic. Could you help explain a bit?

Thanks,
Eric
+
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
- * work, we align the range to biggest of the two pages so
+ * work, we align the isolation range to biggest of the two so
* that page allocator won't try to merge buddies from
* different pageblocks and change MIGRATE_ISOLATE to some
* other migration type.
@@ -9125,6 +9197,20 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
+ * Afterwards, we restore the migratetypes of the pageblocks not
+ * in range, split free pages spanning outside the range,
+ * and put split free pages (at pageblock_order) to the right
+ * migratetype list.
+ *
+ * NOTE: the above approach is used because it can cause free
+ * page accounting issues during isolation, if a page, either
+ * free or in-use, contains multiple pageblocks and we only
+ * isolate a subset of them. For example, if only the second
+ * pageblock is isolated from a page with 2 pageblocks, after
+ * the page is free, it will be put in the first pageblock
+ * migratetype list instead of having 2 pageblocks in two
+ * separate migratetype lists.
+ *
* When this is done, we take the pages in range from page
* allocator removing them from the buddy system. This way
* page allocator will never consider using them.
@@ -9135,10 +9221,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype, 0);
+ ret = start_isolate_page_range(isolate_start, isolate_end, migratetype, 0);
if (ret)
- return ret;
+ goto done;
drain_all_pages(cc.zone);
@@ -9174,6 +9259,19 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* isolated thus they won't get removed from buddy.
*/
+ /*
+ * Restore migratetypes of pageblocks outside [start, end)
+ * TODO: remove it when MIGRATE_ISOLATE becomes a standalone bit
+ */
+
+ num = restore_migratetypes(saved_mt, isolate_start, alloc_start);
+
+ num = restore_migratetypes(&saved_mt[num], alloc_end, isolate_end);
+
+ /*
+ * Split free page spanning [isolate_start, alloc_start) and put the
+ * pageblocks in the right migratetype lists.
+ */
order = 0;
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
@@ -9188,37 +9286,68 @@ int alloc_contig_range(unsigned long start, unsigned long end,
order = buddy_order(pfn_to_page(outer_start));
/*
- * outer_start page could be small order buddy page and
- * it doesn't include start page. Adjust outer_start
- * in this case to report failed page properly
- * on tracepoint in test_pages_isolated()
+ * split the free page has start page and put the pageblocks
+ * in the right migratetype list
*/
- if (outer_start + (1UL << order) <= start)
- outer_start = start;
+ if (outer_start + (1UL << order) > start) {
+ struct page *free_page = pfn_to_page(outer_start);
+
+ split_free_page_into_pageblocks(free_page, order, cc.zone);
+ }
+ }
+
+ /*
+ * Split free page spanning [alloc_end, isolate_end) and put the
+ * pageblocks in the right migratetype list
+ */
+ order = 0;
+ outer_end = end;
+ while (!PageBuddy(pfn_to_page(outer_end))) {
+ if (++order >= MAX_ORDER) {
+ outer_end = end;
+ break;
+ }
+ outer_end &= ~0UL << order;
+ }
+
+ if (outer_end != end) {
+ order = buddy_order(pfn_to_page(outer_end));
+
+ /*
+ * split the free page has start page and put the pageblocks
+ * in the right migratetype list
+ */
+ VM_BUG_ON(outer_end + (1UL << order) <= end);
+ {
+ struct page *free_page = pfn_to_page(outer_end);
+
+ split_free_page_into_pageblocks(free_page, order, cc.zone);
+ }
}
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end, 0)) {
+ if (test_pages_isolated(alloc_start, alloc_end, 0)) {
ret = -EBUSY;
goto done;
}
/* Grab isolated pages from freelists. */
- outer_end = isolate_freepages_range(&cc, outer_start, end);
+ outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end);
if (!outer_end) {
ret = -EBUSY;
goto done;
}
/* Free head and tail (if any) */
- if (start != outer_start)
- free_contig_range(outer_start, start - outer_start);
- if (end != outer_end)
- free_contig_range(end, outer_end - end);
+ if (start != alloc_start)
+ free_contig_range(alloc_start, start - alloc_start);
+ if (end != alloc_end)
+ free_contig_range(end, alloc_end - end);
done:
- undo_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype);
+ kfree(saved_mt);
+ undo_isolate_page_range(alloc_start,
+ alloc_end, migratetype);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range);