[PATCH 3/5] mm: memcontrol: reparent the kmem pages on cgroup removal

From: Muchun Song
Date: Mon Mar 01 2021 - 01:29:02 EST


Currently, slab objects are already reparented to the parent memcg on
cgroup removal. But there are still some corner-case objects which are
not reparented, e.g. allocations larger than an order-1 page on SLUB.
Those objects are allocated directly from the buddy allocator and are
charged as kmem to the memcg via __memcg_kmem_charge_page(), so they
are not reparented on cgroup removal.
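
For reference, this is roughly how such pages get charged today (a
paraphrased sketch of the __GFP_ACCOUNT handling in the page
allocator, see __alloc_pages_nodemask(); not verbatim kernel code):

	/* in __alloc_pages_nodemask(), after the page is allocated: */
	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
	    unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
		__free_pages(page, order);
		page = NULL;
	}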

This patch reparents such kmem pages on cgroup removal. Doing so is
straightforward with the help of the obj_cgroup infrastructure. As a
result, page->memcg_data of a kmem page now points to an object
cgroup rather than directly to a memory cgroup.
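
With that in place, resolving the memcg of a charged kmem page goes
through the object cgroup, e.g. (a minimal sketch built on the
helpers touched by this patch):

	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	if (PageMemcgKmem(page)) {
		struct obj_cgroup *objcg = page_objcg(page);

		/* may return the parent if the original memcg is offline */
		memcg = obj_cgroup_memcg(objcg);
	}
	rcu_read_unlock();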

Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>
---
include/linux/memcontrol.h | 66 +++++++++++--------
mm/memcontrol.c | 155 ++++++++++++++++++++++++---------------------
2 files changed, 124 insertions(+), 97 deletions(-)
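
Note: the reparenting itself relies on the obj_cgroup machinery that
already exists for slab objects (see memcg_reparent_objcgs()); on
cgroup offline it conceptually does something like the following
(simplified illustration, not part of this patch):

	struct obj_cgroup *iter;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(iter, &memcg->objcg_list, list)
		xchg(&iter->memcg, parent);
	list_splice(&memcg->objcg_list, &parent->objcg_list);
	spin_unlock_irq(&css_set_lock);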

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1d2c82464c8c..27043478220f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -370,23 +370,15 @@ static inline bool page_memcg_charged(struct page *page)
}

/*
- * page_memcg_kmem - get the memory cgroup associated with a kmem page.
- * @page: a pointer to the page struct
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
*
- * Returns a pointer to the memory cgroup associated with the kmem page,
- * or NULL. This function assumes that the page is known to have a proper
- * memory cgroup pointer. It is only suitable for kmem pages which means
- * PageMemcgKmem() returns true for this page.
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
*/
-static inline struct mem_cgroup *page_memcg_kmem(struct page *page)
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
- unsigned long memcg_data = page->memcg_data;
-
- VM_BUG_ON_PAGE(PageSlab(page), page);
- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
- VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
-
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return READ_ONCE(objcg->memcg);
}

/*
@@ -462,6 +454,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;

+ if (memcg_data & MEMCG_DATA_KMEM) {
+ struct obj_cgroup *objcg;
+
+ /*
+ * The caller must ensure that the returned memcg won't be
+ * released: e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+ objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return obj_cgroup_memcg(objcg);
+ }
+
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}

@@ -520,6 +523,24 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}

+/*
+ * page_objcg - get the object cgroup associated with a kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroup associated with the kmem page,
+ * or NULL. This function assumes that the page is known to have an
+ * associated object cgroup. It's only safe to call this function
+ * against kmem pages (PageMemcgKmem() returns true).
+ */
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+
+ return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
#else
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
@@ -530,6 +551,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
return NULL;
}
+
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+ return NULL;
+}
#endif

static __always_inline bool memcg_stat_item_in_bytes(int idx)
@@ -748,18 +774,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}

-/*
- * After the initialization objcg->memcg is always pointing at
- * a valid memcg, but can be atomically swapped to the parent memcg.
- *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
- */
-static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
-{
- return READ_ONCE(objcg->memcg);
-}
-
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bfd6efe1e196..39cb8c5bf8b2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -856,10 +856,16 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
- pg_data_t *pgdat = page_pgdat(page);
+ pg_data_t *pgdat;
struct lruvec *lruvec;

- memcg = PageMemcgKmem(head) ? page_memcg_kmem(head) : page_memcg(head);
+ if (PageMemcgKmem(head)) {
+ __mod_lruvec_kmem_state(page_to_virt(head), idx, val);
+ return;
+ }
+
+ pgdat = page_pgdat(head);
+ memcg = page_memcg(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
@@ -1056,24 +1062,6 @@ static __always_inline struct mem_cgroup *active_memcg(void)
return current->active_memcg;
}

-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = active_memcg();
- if (memcg) {
- /* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- else
- memcg = current->active_memcg;
- }
- rcu_read_unlock();
-
- return memcg;
-}
-
static __always_inline bool memcg_kmem_bypass(void)
{
/* Allow remote memcg charging from any context. */
@@ -1088,20 +1076,6 @@ static __always_inline bool memcg_kmem_bypass(void)
}

/**
- * If active memcg is set, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
-{
- if (memcg_kmem_bypass())
- return NULL;
-
- if (unlikely(active_memcg()))
- return get_active_memcg();
-
- return get_mem_cgroup_from_mm(current->mm);
-}
-
-/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
@@ -3148,18 +3122,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
*/
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
int ret = 0;

- memcg = get_mem_cgroup_from_current();
- if (memcg && !mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ objcg = get_obj_cgroup_from_current();
+ if (objcg) {
+ ret = obj_cgroup_charge_page(objcg, gfp, 1 << order);
if (!ret) {
- page->memcg_data = (unsigned long)memcg |
+ page->memcg_data = (unsigned long)objcg |
MEMCG_DATA_KMEM;
return 0;
}
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
return ret;
}
@@ -3171,17 +3145,18 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;

if (!page_memcg_charged(page))
return;

- memcg = page_memcg_kmem(page);
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge(memcg, nr_pages);
+ VM_BUG_ON_PAGE(!PageMemcgKmem(page), page);
+
+ objcg = page_objcg(page);
+ obj_cgroup_uncharge_page(objcg, nr_pages);
page->memcg_data = 0;
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}

static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -6798,8 +6773,12 @@ struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_pages;
unsigned long pgpgout;
- unsigned long nr_kmem;
struct page *dummy_page;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *objcg;
+ unsigned long nr_kmem;
+#endif
};

static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6811,12 +6790,21 @@ static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;

+#ifdef CONFIG_MEMCG_KMEM
+ if (ug->objcg) {
+ obj_cgroup_uncharge_page(ug->objcg, ug->nr_kmem);
+ /* drop reference from uncharge_kmem_page */
+ obj_cgroup_put(ug->objcg);
+ }
+#endif
+
+ if (!ug->memcg)
+ return;
+
if (!mem_cgroup_is_root(ug->memcg)) {
page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
- page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}

@@ -6826,26 +6814,40 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);

- /* drop reference from uncharge_page */
+ /* drop reference from uncharge_user_page */
css_put(&ug->memcg->css);
}

-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+#ifdef CONFIG_MEMCG_KMEM
+static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
{
- unsigned long nr_pages;
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg = page_objcg(page);

- VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
+ uncharge_batch(ug);
+ uncharge_gather_clear(ug);
+ }
+ ug->objcg = objcg;

- if (!page_memcg_charged(page))
- return;
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(ug->objcg);
+ }
+
+ ug->nr_kmem += compound_nr(page);
+ page->memcg_data = 0;
+ obj_cgroup_put(ug->objcg);
+}
+#else
+static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
+{
+}
+#endif
+
+static void uncharge_user_page(struct page *page, struct uncharge_gather *ug)
+{
+ struct mem_cgroup *memcg = page_memcg(page);

- /*
- * Nobody should be changing or seriously looking at
- * page memcg at this point, we have fully exclusive
- * access to the page.
- */
- memcg = PageMemcgKmem(page) ? page_memcg_kmem(page) : page_memcg(page);
if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
@@ -6856,18 +6858,30 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
}
+ ug->pgpgout++;
+ ug->dummy_page = page;
+
+ ug->nr_pages += compound_nr(page);
+ page->memcg_data = 0;
+ css_put(&ug->memcg->css);
+}

- nr_pages = compound_nr(page);
- ug->nr_pages += nr_pages;
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);

+ if (!page_memcg_charged(page))
+ return;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * page memcg at this point, we have fully exclusive
+ * access to the page.
+ */
if (PageMemcgKmem(page))
- ug->nr_kmem += nr_pages;
+ uncharge_kmem_page(page, ug);
else
- ug->pgpgout++;
-
- ug->dummy_page = page;
- page->memcg_data = 0;
- css_put(&ug->memcg->css);
+ uncharge_user_page(page, ug);
}

/**
@@ -6910,8 +6924,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
uncharge_gather_clear(&ug);
list_for_each_entry(page, page_list, lru)
uncharge_page(page, &ug);
- if (ug.memcg)
- uncharge_batch(&ug);
+ uncharge_batch(&ug);
}

/**
--
2.11.0