Re: [PATCH] percpu: rework memcg accounting

From: Roman Gushchin
Date: Thu May 13 2021 - 14:27:24 EST


On Thu, May 13, 2021 at 03:41:46PM +0000, Dennis Zhou wrote:
> Hello,
>
> On Tue, May 11, 2021 at 05:35:04PM -0700, Roman Gushchin wrote:
> > The current implementation of memcg accounting of percpu memory is
> > based on the idea of having a separate set of chunks for accounted
> > and non-accounted memory. This approach has the advantage of not
> > wasting any extra memory on memcg data for non-accounted chunks;
> > however, it complicates the code and leads to less effective chunk
> > usage due to lower utilization.
> >
> > Instead of having two chunk types, it's possible to have only one and
> > allocate the space for memcg data (the objcg array) dynamically. This
> > makes the code simpler in many places (no need to iterate over multiple
> > chunk types) and saves memory. The only downside is that in the unlikely
> > event that a percpu allocation succeeds but the allocation of the
> > objcg array fails, the percpu allocation goes unaccounted. That's not
> > a big deal (a single unaccounted allocation is unlikely to cause any
> > systematic problems), and it's how slab accounting works too.
> >
> > On my test VM the percpu size just after boot decreased from 7680 kB
> > to 7488 kB and the number of chunks decreased by 1. It's not a big win,
> > but on production machines with many chunks the win will likely be
> > bigger.
>
> Overall I'm fine with this approach. I would like to see some production
> numbers to better understand the # of memcg-aware chunks vs not, as well
> as how that evolves over time. I suspect that non-memcg-aware chunks
> will be converted and most chunks will be memcg aware after some period
> of time.

I just looked at a random host in our production fleet: there are 20 chunks
(including the reserved one); 6 are !memcg aware and 14 are memcg aware.

This is how free bytes are distributed:
free_bytes : 8092 memcg_aware : 0
free_bytes : 12708 memcg_aware : 0
free_bytes : 0 memcg_aware : 0
free_bytes : 2320 memcg_aware : 0
free_bytes : 13272 memcg_aware : 0
free_bytes : 131944 memcg_aware : 0
free_bytes : 20880 memcg_aware : 1
free_bytes : 12536 memcg_aware : 1
free_bytes : 10088 memcg_aware : 1
free_bytes : 10080 memcg_aware : 1
free_bytes : 2904 memcg_aware : 1
free_bytes : 16176 memcg_aware : 1
free_bytes : 15440 memcg_aware : 1
free_bytes : 5280 memcg_aware : 1
free_bytes : 145864 memcg_aware : 1
free_bytes : 248800 memcg_aware : 1
free_bytes : 256240 memcg_aware : 1
free_bytes : 259664 memcg_aware : 1
free_bytes : 256240 memcg_aware : 1
free_bytes : 262144 memcg_aware : 1
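
(For context, these numbers come from the CONFIG_PERCPU_STATS debugfs file.
A rough userspace helper along the following lines can tally chunks and free
bytes by memcg-awareness; it assumes debugfs is mounted at /sys/kernel/debug
and relies on the "free_bytes"/"memcg_aware" keys printed by
chunk_map_stats(), the latter only present with CONFIG_MEMCG_KMEM.)

/*
 * Rough helper, not part of the patch: tally percpu chunks and free bytes
 * by memcg-awareness from /sys/kernel/debug/percpu_stats.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/percpu_stats", "r");
	char line[256];
	long free_bytes = -1, val;
	long nr[2] = {0, 0}, bytes[2] = {0, 0};

	if (!f) {
		perror("percpu_stats");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, " free_bytes : %ld", &val) == 1) {
			/* remember until the chunk's memcg_aware line shows up */
			free_bytes = val;
		} else if (sscanf(line, " memcg_aware : %ld", &val) == 1 &&
			   free_bytes >= 0) {
			nr[!!val]++;
			bytes[!!val] += free_bytes;
			free_bytes = -1;
		}
	}
	fclose(f);

	printf("!memcg aware: %ld chunks, %ld free bytes\n", nr[0], bytes[0]);
	printf(" memcg aware: %ld chunks, %ld free bytes\n", nr[1], bytes[1]);
	return 0;
}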

>
> If the numbers aren't compelling, let's just allocate all chunks to be
> memcg aware.

This is an option too; however, it would require some tricks to correctly
handle the first chunk, the case where memcg accounting is disabled via a
boot option, etc. On-demand allocation is just simpler IMO.

>
> >
> > Signed-off-by: Roman Gushchin <guro@xxxxxx>
> > ---
> > mm/percpu-internal.h | 52 +------------
> > mm/percpu-km.c | 5 +-
> > mm/percpu-stats.c | 45 ++++-------
> > mm/percpu-vm.c | 11 ++-
> > mm/percpu.c | 180 ++++++++++++++++++-------------------------
> > 5 files changed, 98 insertions(+), 195 deletions(-)
> >
> > diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
> > index 10604dce806f..b6dc22904088 100644
> > --- a/mm/percpu-internal.h
> > +++ b/mm/percpu-internal.h
> > @@ -5,25 +5,6 @@
> > #include <linux/types.h>
> > #include <linux/percpu.h>
> >
> > -/*
> > - * There are two chunk types: root and memcg-aware.
> > - * Chunks of each type have separate slots list.
> > - *
> > - * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
> > - * used to store memcg membership data of a percpu object. Obj_cgroups are
> > - * ref-counted pointers to a memory cgroup with an ability to switch dynamically
> > - * to the parent memory cgroup. This allows to reclaim a deleted memory cgroup
> > - * without reclaiming of all outstanding objects, which hold a reference at it.
> > - */
> > -enum pcpu_chunk_type {
> > - PCPU_CHUNK_ROOT,
> > -#ifdef CONFIG_MEMCG_KMEM
> > - PCPU_CHUNK_MEMCG,
> > -#endif
> > - PCPU_NR_CHUNK_TYPES,
> > - PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
> > -};
> > -
> > /*
> > * pcpu_block_md is the metadata block struct.
> > * Each chunk's bitmap is split into a number of full blocks.
> > @@ -91,7 +72,7 @@ extern struct list_head *pcpu_chunk_lists;
> > extern int pcpu_nr_slots;
> > extern int pcpu_sidelined_slot;
> > extern int pcpu_to_depopulate_slot;
> > -extern int pcpu_nr_empty_pop_pages[];
> > +extern int pcpu_nr_empty_pop_pages;
> >
> > extern struct pcpu_chunk *pcpu_first_chunk;
> > extern struct pcpu_chunk *pcpu_reserved_chunk;
> > @@ -132,37 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
> > return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
> > }
> >
> > -#ifdef CONFIG_MEMCG_KMEM
> > -static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
> > -{
> > - if (chunk->obj_cgroups)
> > - return PCPU_CHUNK_MEMCG;
> > - return PCPU_CHUNK_ROOT;
> > -}
> > -
> > -static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
> > -{
> > - return chunk_type == PCPU_CHUNK_MEMCG;
> > -}
> > -
> > -#else
> > -static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
> > -{
> > - return PCPU_CHUNK_ROOT;
> > -}
> > -
> > -static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
> > -{
> > - return false;
> > -}
> > -#endif
> > -
> > -static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
> > -{
> > - return &pcpu_chunk_lists[pcpu_nr_slots *
> > - pcpu_is_memcg_chunk(chunk_type)];
> > -}
> > -
> > #ifdef CONFIG_PERCPU_STATS
> >
> > #include <linux/spinlock.h>
> > diff --git a/mm/percpu-km.c b/mm/percpu-km.c
> > index c84a9f781a6c..c9d529dc7651 100644
> > --- a/mm/percpu-km.c
> > +++ b/mm/percpu-km.c
> > @@ -44,8 +44,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
> > /* nada */
> > }
> >
> > -static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
> > - gfp_t gfp)
> > +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
> > {
> > const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
> > struct pcpu_chunk *chunk;
> > @@ -53,7 +52,7 @@ static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
> > unsigned long flags;
> > int i;
> >
> > - chunk = pcpu_alloc_chunk(type, gfp);
> > + chunk = pcpu_alloc_chunk(gfp);
> > if (!chunk)
> > return NULL;
> >
> > diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
> > index 2125981acfb9..7103515525e9 100644
> > --- a/mm/percpu-stats.c
> > +++ b/mm/percpu-stats.c
> > @@ -34,15 +34,11 @@ static int find_max_nr_alloc(void)
> > {
> > struct pcpu_chunk *chunk;
> > int slot, max_nr_alloc;
> > - enum pcpu_chunk_type type;
> >
> > max_nr_alloc = 0;
> > - for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
> > - for (slot = 0; slot < pcpu_nr_slots; slot++)
> > - list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
> > - list)
> > - max_nr_alloc = max(max_nr_alloc,
> > - chunk->nr_alloc);
> > + for (slot = 0; slot < pcpu_nr_slots; slot++)
> > + list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list)
> > + max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
> >
> > return max_nr_alloc;
> > }
> > @@ -134,7 +130,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
> > P("cur_med_alloc", cur_med_alloc);
> > P("cur_max_alloc", cur_max_alloc);
> > #ifdef CONFIG_MEMCG_KMEM
> > - P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
> > + P("memcg_aware", !!chunk->obj_cgroups);
> > #endif
> > seq_putc(m, '\n');
> > }
> > @@ -144,8 +140,6 @@ static int percpu_stats_show(struct seq_file *m, void *v)
> > struct pcpu_chunk *chunk;
> > int slot, max_nr_alloc;
> > int *buffer;
> > - enum pcpu_chunk_type type;
> > - int nr_empty_pop_pages;
> >
> > alloc_buffer:
> > spin_lock_irq(&pcpu_lock);
> > @@ -166,10 +160,6 @@ static int percpu_stats_show(struct seq_file *m, void *v)
> > goto alloc_buffer;
> > }
> >
> > - nr_empty_pop_pages = 0;
> > - for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
> > - nr_empty_pop_pages += pcpu_nr_empty_pop_pages[type];
> > -
> > #define PL(X) \
> > seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
> >
> > @@ -201,7 +191,7 @@ static int percpu_stats_show(struct seq_file *m, void *v)
> > PU(nr_max_chunks);
> > PU(min_alloc_size);
> > PU(max_alloc_size);
> > - P("empty_pop_pages", nr_empty_pop_pages);
> > + P("empty_pop_pages", pcpu_nr_empty_pop_pages);
> > seq_putc(m, '\n');
> >
> > #undef PU
> > @@ -215,20 +205,17 @@ static int percpu_stats_show(struct seq_file *m, void *v)
> > chunk_map_stats(m, pcpu_reserved_chunk, buffer);
> > }
> >
> > - for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
> > - for (slot = 0; slot < pcpu_nr_slots; slot++) {
> > - list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
> > - list) {
> > - if (chunk == pcpu_first_chunk)
> > - seq_puts(m, "Chunk: <- First Chunk\n");
> > - else if (slot == pcpu_to_depopulate_slot)
> > - seq_puts(m, "Chunk (to_depopulate)\n");
> > - else if (slot == pcpu_sidelined_slot)
> > - seq_puts(m, "Chunk (sidelined):\n");
> > - else
> > - seq_puts(m, "Chunk:\n");
> > - chunk_map_stats(m, chunk, buffer);
> > - }
> > + for (slot = 0; slot < pcpu_nr_slots; slot++) {
> > + list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
> > + if (chunk == pcpu_first_chunk)
> > + seq_puts(m, "Chunk: <- First Chunk\n");
> > + else if (slot == pcpu_to_depopulate_slot)
> > + seq_puts(m, "Chunk (to_depopulate)\n");
> > + else if (slot == pcpu_sidelined_slot)
> > + seq_puts(m, "Chunk (sidelined):\n");
> > + else
> > + seq_puts(m, "Chunk:\n");
> > + chunk_map_stats(m, chunk, buffer);
> > }
> > }
> >
> > diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
> > index c75f6f24f2d5..e59566b949d1 100644
> > --- a/mm/percpu-vm.c
> > +++ b/mm/percpu-vm.c
> > @@ -328,13 +328,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
> > pcpu_free_pages(chunk, pages, page_start, page_end);
> > }
> >
> > -static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
> > - gfp_t gfp)
> > +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
> > {
> > struct pcpu_chunk *chunk;
> > struct vm_struct **vms;
> >
> > - chunk = pcpu_alloc_chunk(type, gfp);
> > + chunk = pcpu_alloc_chunk(gfp);
> > if (!chunk)
> > return NULL;
> >
> > @@ -403,7 +402,7 @@ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
> > * chunk, move it to the to_depopulate list.
> > */
> > return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
> > - (pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] >
> > - PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages &&
> > - chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
> > + (pcpu_nr_empty_pop_pages > PCPU_EMPTY_POP_PAGES_HIGH +
> > + chunk->nr_empty_pop_pages &&
> > + chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
> > }
> > diff --git a/mm/percpu.c b/mm/percpu.c
> > index 79eebc80860d..09624d920dc5 100644
> > --- a/mm/percpu.c
> > +++ b/mm/percpu.c
> > @@ -179,10 +179,10 @@ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
> > static LIST_HEAD(pcpu_map_extend_chunks);
> >
> > /*
> > - * The number of empty populated pages by chunk type, protected by pcpu_lock.
> > + * The number of empty populated pages, protected by pcpu_lock.
> > * The reserved chunk doesn't contribute to the count.
> > */
> > -int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
> > +int pcpu_nr_empty_pop_pages;
> >
> > /*
> > * The number of populated pages in use by the allocator, protected by
> > @@ -532,13 +532,10 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
> > bool move_front)
> > {
> > if (chunk != pcpu_reserved_chunk) {
> > - struct list_head *pcpu_slot;
> > -
> > - pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
> > if (move_front)
> > - list_move(&chunk->list, &pcpu_slot[slot]);
> > + list_move(&chunk->list, &pcpu_chunk_lists[slot]);
> > else
> > - list_move_tail(&chunk->list, &pcpu_slot[slot]);
> > + list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
> > }
> > }
> >
> > @@ -574,27 +571,22 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
> >
> > static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
> > {
> > - enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
> > - struct list_head *pcpu_slot = pcpu_chunk_list(type);
> > -
> > lockdep_assert_held(&pcpu_lock);
> >
> > if (!chunk->isolated) {
> > chunk->isolated = true;
> > - pcpu_nr_empty_pop_pages[type] -= chunk->nr_empty_pop_pages;
> > + pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
> > }
> > - list_move(&chunk->list, &pcpu_slot[pcpu_to_depopulate_slot]);
> > + list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
> > }
> >
> > static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
> > {
> > - enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
> > -
> > lockdep_assert_held(&pcpu_lock);
> >
> > if (chunk->isolated) {
> > chunk->isolated = false;
> > - pcpu_nr_empty_pop_pages[type] += chunk->nr_empty_pop_pages;
> > + pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
> > pcpu_chunk_relocate(chunk, -1);
> > }
> > }
> > @@ -612,7 +604,7 @@ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
> > {
> > chunk->nr_empty_pop_pages += nr;
> > if (chunk != pcpu_reserved_chunk && !chunk->isolated)
> > - pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
> > + pcpu_nr_empty_pop_pages += nr;
> > }
> >
> > /*
> > @@ -1447,7 +1439,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
> > return chunk;
> > }
> >
> > -static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
> > +static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
> > {
> > struct pcpu_chunk *chunk;
> > int region_bits;
> > @@ -1475,16 +1467,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
> > if (!chunk->md_blocks)
> > goto md_blocks_fail;
> >
> > -#ifdef CONFIG_MEMCG_KMEM
> > - if (pcpu_is_memcg_chunk(type)) {
> > - chunk->obj_cgroups =
> > - pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
> > - sizeof(struct obj_cgroup *), gfp);
> > - if (!chunk->obj_cgroups)
> > - goto objcg_fail;
> > - }
> > -#endif
> > -
> > pcpu_init_md_blocks(chunk);
> >
> > /* init metadata */
> > @@ -1492,10 +1474,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
> >
> > return chunk;
> >
> > -#ifdef CONFIG_MEMCG_KMEM
> > -objcg_fail:
> > - pcpu_mem_free(chunk->md_blocks);
> > -#endif
> > md_blocks_fail:
> > pcpu_mem_free(chunk->bound_map);
> > bound_map_fail:
> > @@ -1589,8 +1567,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
> > int page_start, int page_end, gfp_t gfp);
> > static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
> > int page_start, int page_end);
> > -static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
> > - gfp_t gfp);
> > +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
> > static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
> > static struct page *pcpu_addr_to_page(void *addr);
> > static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
> > @@ -1633,55 +1610,68 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
> > }
> >
> > #ifdef CONFIG_MEMCG_KMEM
> > -static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
> > - struct obj_cgroup **objcgp)
> > +static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
> > + struct obj_cgroup **objcgp)
> > {
> > struct obj_cgroup *objcg;
> >
> > if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
> > - return PCPU_CHUNK_ROOT;
> > + return true;
> >
> > objcg = get_obj_cgroup_from_current();
> > if (!objcg)
> > - return PCPU_CHUNK_ROOT;
> > + return true;
> >
> > if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
> > obj_cgroup_put(objcg);
> > - return PCPU_FAIL_ALLOC;
> > + return false;
> > }
> >
> > *objcgp = objcg;
> > - return PCPU_CHUNK_MEMCG;
> > + return true;
> > }
> >
> > static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
> > struct pcpu_chunk *chunk, int off,
> > - size_t size)
> > + size_t size, gfp_t gfp)
> > {
> > if (!objcg)
> > return;
> >
> > - if (chunk) {
> > - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
> > + if (!chunk)
> > + goto cancel_charge;
> >
> > - rcu_read_lock();
> > - mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
> > - size * num_possible_cpus());
> > - rcu_read_unlock();
> > - } else {
> > - obj_cgroup_uncharge(objcg, size * num_possible_cpus());
> > - obj_cgroup_put(objcg);
> > + if (!chunk->obj_cgroups) {
> > + chunk->obj_cgroups = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
> > + sizeof(struct obj_cgroup *), gfp);
>
> Percpu does some weird things with gfp flags. This might need to be
> fixed if we don't always pre-allocate this for our definition of atomic.
> I haven't thought too much about this just yet; see my thoughts above.

Yeah, I thought about it, but it seems that passing gfp here is OK, or at
least I don't see any specific problem. If you do, please let me know.

Thanks!