[RFC][PATCH 3/4] memcg: reduce size of mem_cgroup by removing per-node info array

From: KAMEZAWA Hiroyuki
Date: Mon Sep 27 2010 - 05:59:17 EST


From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

Now, memcgroup's per-zone structure is looked up as

mem->info.nodeinfo[nid]->zoneinfo[zid]

1st. This nodeinfo is an array of pointers of MAX_NUMNODES size. This makes
sizeof(struct mem_cgroup) very large, so struct mem_cgroup is allocated in the
vmalloc() area because its size is larger than PAGE_SIZE.
(This will never be fixed even when node hotplug is supported.)

2nd. Now, page_cgroup->mem_cgroup is an ID. So we need a 2-level lookup
to access the per-zone structure, as

mem = css_lookup(pc->mem_cgroup);
mz = mem->info.nodeinfo[nid]->zoneinfo[zid]

This lookup seems wasteful. This patch removes mem->info and moves all per-zone
memcg structures into a radix tree. A mem_cgroup_per_zone structure can be found by

radix_tree_lookup(&memcg_lrus, id_func(memcg, nid, zid)).

This makes memcg small (4440 bytes => 344 bytes) and combines the 2 lookups into one.

A following patch will add memory hotplug support.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 57 insertions(+), 29 deletions(-)

Index: mmotm-0922/mm/memcontrol.c
===================================================================
--- mmotm-0922.orig/mm/memcontrol.c
+++ mmotm-0922/mm/memcontrol.c
@@ -122,13 +122,16 @@ struct mem_cgroup_per_zone {
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])

-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
+RADIX_TREE(memcg_lrus, GFP_KERNEL);
+DEFINE_SPINLOCK(memcg_lrutable_lock);

-struct mem_cgroup_lru_info {
- struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
-};
+static inline long node_zone_idx(int memcg, int node, int zone) {
+ unsigned long id;
+
+ id = ((node) << ZONES_SHIFT | (zone)) << 16;
+ id |= memcg;
+ return id;
+}

/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
@@ -206,11 +209,6 @@ struct mem_cgroup {
* the counter to account for mem+swap usage.
*/
struct res_counter memsw;
- /*
- * Per cgroup active and inactive list, similar to the
- * per zone LRU lists.
- */
- struct mem_cgroup_lru_info info;

/*
protect against reclaim related member.
@@ -388,9 +386,14 @@ static struct mem_cgroup *memcg_lookup(u
}

static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+mem_cgroup_zoneinfo(int memcgid, int nid, int zid)
{
- return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+ struct mem_cgroup_per_zone *mz;
+
+ rcu_read_lock();
+ mz = radix_tree_lookup(&memcg_lrus, node_zone_idx(memcgid, nid, zid));
+ rcu_read_unlock();
+ return mz;
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
@@ -401,14 +404,13 @@ struct cgroup_subsys_state *mem_cgroup_c
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
- struct mem_cgroup *mem = memcg_lookup(pc->mem_cgroup);
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);

- if (!mem)
+ if (!pc->mem_cgroup)
return NULL;

- return mem_cgroup_zoneinfo(mem, nid, zid);
+ return mem_cgroup_zoneinfo(pc->mem_cgroup, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
@@ -496,7 +498,7 @@ static void mem_cgroup_update_tree(struc
* because their event counter is not touched.
*/
for (; mem; mem = parent_mem_cgroup(mem)) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ mz = mem_cgroup_zoneinfo(css_id(&mem->css), nid, zid);
excess = res_counter_soft_limit_excess(&mem->res);
/*
* We have to update the tree if mz is on RB-tree or
@@ -525,7 +527,7 @@ static void mem_cgroup_remove_from_trees

for_each_node_state(node, N_POSSIBLE) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = mem_cgroup_zoneinfo(mem, node, zone);
+ mz = mem_cgroup_zoneinfo(css_id(&mem->css), node, zone);
mctz = soft_limit_tree_node_zone(node, zone);
mem_cgroup_remove_exceeded(mem, mz, mctz);
}
@@ -658,7 +660,7 @@ static unsigned long mem_cgroup_get_loca

for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ mz = mem_cgroup_zoneinfo(css_id(&mem->css), nid, zid);
total += MEM_CGROUP_ZSTAT(mz, idx);
}
return total;
@@ -1039,7 +1041,9 @@ unsigned long mem_cgroup_zone_nr_pages(s
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zoneinfo(css_id(&memcg->css), nid, zid);

return MEM_CGROUP_ZSTAT(mz, lru);
}
@@ -1049,7 +1053,9 @@ struct zone_reclaim_stat *mem_cgroup_get
{
int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zoneinfo(css_id(&memcg->css), nid, zid);

return &mz->reclaim_stat;
}
@@ -1099,7 +1105,7 @@ unsigned long mem_cgroup_isolate_pages(u
int ret;

BUG_ON(!mem_cont);
- mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+ mz = mem_cgroup_zoneinfo(css_id(&mem_cont->css), nid, zid);
src = &mz->lists[lru];

scan = 0;
@@ -3179,7 +3185,7 @@ static int mem_cgroup_force_empty_list(s
int ret = 0;

zone = &NODE_DATA(node)->node_zones[zid];
- mz = mem_cgroup_zoneinfo(mem, node, zid);
+ mz = mem_cgroup_zoneinfo(css_id(&mem->css), node, zid);
list = &mz->lists[lru];

loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -3676,7 +3682,8 @@ static int mem_control_stat_show(struct

for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+ mz = mem_cgroup_zoneinfo(
+ css_id(&mem_cont->css), nid, zid);

recent_rotated[0] +=
mz->reclaim_stat.recent_rotated[0];
@@ -4173,10 +4180,9 @@ static int register_memsw_files(struct c

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
- struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
enum lru_list l;
- int zone, tmp = node;
+ int id, zone, ret, tmp = node;
/*
* This routine is called against possible nodes.
* But it's BUG to call kmalloc() against offline node.
@@ -4187,27 +4193,51 @@ static int alloc_mem_cgroup_per_zone_inf
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
- if (!pn)
- return 1;
-
- mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = &pn->zoneinfo[zone];
+ mz = kzalloc_node(sizeof(struct mem_cgroup_per_zone),
+ GFP_KERNEL, tmp);
+ if (!mz)
+ break;
+ radix_tree_preload(GFP_KERNEL);
+ spin_lock_irq(&memcg_lrutable_lock);
+ id = node_zone_idx(css_id(&mem->css), node, zone);
+ ret = radix_tree_insert(&memcg_lrus, id, mz);
+ spin_unlock_irq(&memcg_lrutable_lock);
+ if (ret)
+ break;
for_each_lru(l)
INIT_LIST_HEAD(&mz->lists[l]);
- mz->usage_in_excess = 0;
mz->on_tree = false;
mz->mem = mem;
}
- return 0;
+
+ if (zone == MAX_NR_ZONES)
+ return 0;
+
+ for (; zone >= 0; zone--) {
+ id = node_zone_idx(css_id(&mem->css), node, zone);
+ spin_lock_irq(&memcg_lrutable_lock);
+ mz = radix_tree_delete(&memcg_lrus, id);
+ spin_unlock_irq(&memcg_lrutable_lock);
+ kfree(mz);
+ }
+
+ return 1;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
{
- kfree(mem->info.nodeinfo[node]);
+ int id, zone;
+ struct mem_cgroup_per_zone *mz;
+ unsigned long flags;
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ id = node_zone_idx(css_id(&mem->css), node, zone);
+ spin_lock_irqsave(&memcg_lrutable_lock, flags);
+ mz = radix_tree_delete(&memcg_lrus, id);
+ spin_unlock_irqrestore(&memcg_lrutable_lock, flags);
+ kfree(mz);
+ }
}

static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -4234,6 +4264,7 @@ static struct mem_cgroup *mem_cgroup_all
mem = NULL;
}
spin_lock_init(&mem->pcp_counter_lock);
+
return mem;
}

@@ -4343,13 +4374,14 @@ mem_cgroup_create(struct cgroup_subsys *
if (!mem)
return ERR_PTR(error);

+ error = alloc_css_id(ss, cont, &mem->css);
+ if (error)
+ goto free_out;
+
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;

- error = alloc_css_id(ss, cont, &mem->css);
- if (error)
- goto free_out;
/* Here, css_id(&mem->css) works. but css_lookup(id)->mem doesn't */

/* root ? */

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/