[RFC 2/3] mm/page_alloc: Access lists in 'struct per_cpu_pages' indirectly

From: Nicolas Saenz Julienne
Date: Fri Oct 08 2021 - 12:19:44 EST


In preparation for adding remote pcplists drain support, let's bundle
'struct per_cpu_pages' list heads and page count into a new structure,
'struct pcplists', and have all code access it indirectly through a
pointer. It'll be used by upcoming patches, which will maintain multiple
versions of pcplists and switch the pointer atomically.
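
As a rough illustration (not part of this patch, and the exact
mechanism is left to the later patches in the series), the indirection
is what makes a switch along these lines possible:

    struct pcplists *old = pcp->lp;

    WRITE_ONCE(pcp->lp, new_lp);	/* publish an empty set of lists */
    /* ... make sure no CPU still uses 'old' ... */
    free_pcppages_bulk(zone, old->count, pcp, old);

Here 'new_lp' stands in for whatever spare set of lists the follow-up
patches end up maintaining.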

free_pcppages_bulk() also gains a new argument, since we want to avoid
dereferencing the pcplists pointer twice per critical section (delimited
by the 'pagesets.lock' local lock).
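
In practice the callers end up with the shape drain_pages_zone() has
below: one pcp->lp load per locked section, reused for both the count
check and the actual freeing:

    local_lock_irqsave(&pagesets.lock, flags);
    pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
    lp = pcp->lp;
    if (lp->count)
        free_pcppages_bulk(zone, lp->count, pcp, lp);
    local_unlock_irqrestore(&pagesets.lock, flags);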

The 'struct pcplists' data is marked as __private to make sure nobody
accesses it directly, except for the initialization code. Note that the
backing 'struct pcplists' is embedded in 'struct per_cpu_pages' rather
than allocated separately, since 'struct per_cpu_pages' is used during
boot, when no allocation is possible.
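
For reference, __private and ACCESS_PRIVATE() come from
include/linux/compiler_types.h and boil down to (roughly):

    #ifdef __CHECKER__
    # define __private	__attribute__((noderef))
    # define ACCESS_PRIVATE(p, member) \
    	(*((typeof((p)->member) __force *) &(p)->member))
    #else
    # define __private
    # define ACCESS_PRIVATE(p, member) ((p)->member)
    #endif

so direct accesses to 'pcplists' get flagged by sparse, the
initialization code below goes through ACCESS_PRIVATE(pcp, pcplists),
and everything compiles down to a plain member access in normal builds.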

Signed-off-by: Nicolas Saenz Julienne <nsaenzju@xxxxxxxxxx>
---
include/linux/mmzone.h | 10 +++++--
mm/page_alloc.c        | 66 +++++++++++++++++++++++++-----------------
mm/vmstat.c            |  6 ++--
3 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6a1d79d84675..fb023da9a181 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -358,7 +358,6 @@ enum zone_watermarks {

/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
- int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
short free_factor; /* batch scaling factor during free */
@@ -366,8 +365,13 @@ struct per_cpu_pages {
short expire; /* When 0, remote pagesets are drained */
#endif

- /* Lists of pages, one per migrate type stored on the pcp-lists */
- struct list_head lists[NR_PCP_LISTS];
+ struct pcplists *lp;
+ struct pcplists {
+ /* Number of pages in the lists */
+ int count;
+ /* Lists of pages, one per migrate type stored on the pcp-lists */
+ struct list_head lists[NR_PCP_LISTS];
+ } __private pcplists;
};

struct per_cpu_zonestat {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd89933503b4..842816f269da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1438,7 +1438,8 @@ static inline void prefetch_buddy(struct page *page)
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+ struct per_cpu_pages *pcp,
+ struct pcplists *lp)
{
int pindex = 0;
int batch_free = 0;
@@ -1453,7 +1454,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
- count = min(pcp->count, count);
+ count = min(lp->count, count);
while (count > 0) {
struct list_head *list;

@@ -1468,7 +1469,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
batch_free++;
if (++pindex == NR_PCP_LISTS)
pindex = 0;
- list = &pcp->lists[pindex];
+ list = &lp->lists[pindex];
} while (list_empty(list));

/* This is the only non-empty list. Free them all. */
@@ -1508,7 +1509,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
}
} while (count > 0 && --batch_free && !list_empty(list));
}
- pcp->count -= nr_freed;
+ lp->count -= nr_freed;

/*
* local_lock_irq held so equivalent to spin_lock_irqsave for
@@ -3069,14 +3070,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
+ struct pcplists *lp;
unsigned long flags;
int to_drain, batch;

local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
- to_drain = min(pcp->count, batch);
+ lp = pcp->lp;
+ to_drain = min(lp->count, batch);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
+ free_pcppages_bulk(zone, to_drain, pcp, lp);
local_unlock_irqrestore(&pagesets.lock, flags);
}
#endif
@@ -3092,12 +3095,14 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
struct per_cpu_pages *pcp;
+ struct pcplists *lp;

local_lock_irqsave(&pagesets.lock, flags);

pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
+ lp = pcp->lp;
+ if (lp->count)
+ free_pcppages_bulk(zone, lp->count, pcp, lp);

local_unlock_irqrestore(&pagesets.lock, flags);
}
@@ -3158,7 +3163,7 @@ static void drain_local_pages_wq(struct work_struct *work)
*
* drain_all_pages() is optimized to only execute on cpus where pcplists are
* not empty. The check for non-emptiness can however race with a free to
- * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * pcplist that has not yet increased the lp->count from 0 to 1. Callers
* that need the guarantee that every CPU has drained can disable the
* optimizing racy check.
*/
@@ -3200,21 +3205,22 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
struct per_cpu_pages *pcp;
struct zone *z;
bool has_pcps = false;
+ struct pcplists *lp;

if (force_all_cpus) {
/*
- * The pcp.count check is racy, some callers need a
+ * The lp->count check is racy, some callers need a
* guarantee that no cpu is missed.
*/
has_pcps = true;
} else if (zone) {
- pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- if (pcp->count)
+ lp = per_cpu_ptr(zone->per_cpu_pageset, cpu)->lp;
+ if (lp->count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
- pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
- if (pcp->count) {
+ lp = per_cpu_ptr(z->per_cpu_pageset, cpu)->lp;
+ if (lp->count) {
has_pcps = true;
break;
}
@@ -3366,19 +3372,21 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
+ struct pcplists *lp;
int high;
int pindex;

__count_vm_event(PGFREE);
pcp = this_cpu_ptr(zone->per_cpu_pageset);
+ lp = pcp->lp;
pindex = order_to_pindex(migratetype, order);
- list_add(&page->lru, &pcp->lists[pindex]);
- pcp->count += 1 << order;
+ list_add(&page->lru, &lp->lists[pindex]);
+ lp->count += 1 << order;
high = nr_pcp_high(pcp, zone);
- if (pcp->count >= high) {
+ if (lp->count >= high) {
int batch = READ_ONCE(pcp->batch);

- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp, lp);
}
}

@@ -3603,9 +3611,11 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
struct per_cpu_pages *pcp)
{
struct list_head *list;
+ struct pcplists *lp;
struct page *page;

- list = &pcp->lists[order_to_pindex(migratetype, order)];
+ lp = pcp->lp;
+ list = &lp->lists[order_to_pindex(migratetype, order)];

do {
if (list_empty(list)) {
@@ -3625,14 +3635,14 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
batch, list,
migratetype, alloc_flags);

- pcp->count += alloced << order;
+ lp->count += alloced << order;
if (unlikely(list_empty(list)))
return NULL;
}

page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
- pcp->count -= 1 << order;
+ lp->count -= 1 << order;
} while (check_new_pcp(page));

return page;
@@ -5877,7 +5887,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
continue;

for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->lp->count;
}

printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
@@ -5971,7 +5981,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)

free_pcp = 0;
for_each_online_cpu(cpu)
- free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+ free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->lp->count;

show_node(zone);
printk(KERN_CONT
@@ -6012,7 +6022,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
- K(this_cpu_read(zone->per_cpu_pageset->count)),
+ K(this_cpu_read(zone->per_cpu_pageset)->lp->count),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -6848,7 +6858,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)

/*
* pcp->high and pcp->batch values are related and generally batch is lower
- * than high. They are also related to pcp->count such that count is lower
+ * than high. They are also related to pcp->lp->count such that count is lower
* than high, and as soon as it reaches high, the pcplist is flushed.
*
* However, guaranteeing these relations at all times would require e.g. write
@@ -6856,7 +6866,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
* thus be prone to error and bad for performance. Thus the update only prevents
* store tearing. Any new users of pcp->batch and pcp->high should ensure they
* can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
+ * pcp->lp->count field on the local CPU with interrupts disabled.
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
@@ -6876,8 +6886,10 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
memset(pcp, 0, sizeof(*pcp));
memset(pzstats, 0, sizeof(*pzstats));

+ pcp->lp = &ACCESS_PRIVATE(pcp, pcplists);
+
for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
- INIT_LIST_HEAD(&pcp->lists[pindex]);
+ INIT_LIST_HEAD(&pcp->lp->lists[pindex]);

/*
* Set batch and high values safe for a boot pageset. A true percpu
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ce2620344b2..5279d3f34e0b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -856,7 +856,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
* if not then there is nothing to expire.
*/
if (!__this_cpu_read(pcp->expire) ||
- !__this_cpu_read(pcp->count))
+ !this_cpu_ptr(pcp)->lp->count)
continue;

/*
@@ -870,7 +870,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
if (__this_cpu_dec_return(pcp->expire))
continue;

- if (__this_cpu_read(pcp->count)) {
+ if (this_cpu_ptr(pcp)->lp->count) {
drain_zone_pages(zone, this_cpu_ptr(pcp));
changes++;
}
@@ -1707,7 +1707,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n high: %i"
"\n batch: %i",
i,
- pcp->count,
+ pcp->lp->count,
pcp->high,
pcp->batch);
#ifdef CONFIG_SMP
--
2.31.1