[RFC][PATCH 2/2] memcg: hierarchy reclaim with CGROUP ID

From: KAMEZAWA Hiroyuki
Date: Thu Nov 27 2008 - 02:09:28 EST


Implement hierarchy reclaim by cgroup_id.

What changes:
- Reclaim no longer uses a tree-walk algorithm.
- mem_cgroup->last_scan_child is an ID, not a pointer.
- cgroup_lock() is no longer taken.
- The scanning order is defined simply by ID order
(scanned in round-robin fashion).
- The scanning order can be changed easily later (maybe).

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxx>


mm/memcontrol.c | 129 +++++++++++---------------------------------------------
1 file changed, 27 insertions(+), 102 deletions(-)

Index: mmotm-2.6.28-Nov24/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Nov24.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Nov24/mm/memcontrol.c
@@ -148,7 +148,7 @@ struct mem_cgroup {
* While reclaiming in a hiearchy, we cache the last child we
* reclaimed from. Protected by cgroup_lock()
*/
- struct mem_cgroup *last_scanned_child;
+ int last_scan_child;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -472,102 +472,31 @@ unsigned long mem_cgroup_isolate_pages(u
return nr_taken;
}

-#define mem_cgroup_from_res_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
+#define mem_cgroup_from_res_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
/*
- * This routine finds the DFS walk successor. This routine should be
- * called with cgroup_mutex held
+ * Get the next reclaim target in the hierarchy under root_mem. The scan starts
+ * from root_mem->last_scan_child, and root_mem->last_scan_child is updated.
*/
static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
-{
- struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
-
- curr_cgroup = curr->css.cgroup;
- root_cgroup = root_mem->css.cgroup;
-
- if (!list_empty(&curr_cgroup->children)) {
- /*
- * Walk down to children
- */
- mem_cgroup_put(curr);
- cgroup = list_entry(curr_cgroup->children.next,
- struct cgroup, sibling);
- curr = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(curr);
- goto done;
- }
-
-visit_parent:
- if (curr_cgroup == root_cgroup) {
- mem_cgroup_put(curr);
- curr = root_mem;
- mem_cgroup_get(curr);
- goto done;
- }
-
- /*
- * Goto next sibling
- */
- if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
- mem_cgroup_put(curr);
- cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
- sibling);
- curr = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(curr);
- goto done;
- }
-
- /*
- * Go up to next parent and next parent's sibling if need be
- */
- curr_cgroup = curr_cgroup->parent;
- goto visit_parent;
-
-done:
- root_mem->last_scanned_child = curr;
- return curr;
-}
-
-/*
- * Visit the first child (need not be the first child as per the ordering
- * of the cgroup list, since we track last_scanned_child) of @mem and use
- * that to reclaim free pages from.
- */
-static struct mem_cgroup *
-mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+mem_cgroup_get_reclaim_target(struct mem_cgroup *root_mem)
{
struct cgroup *cgroup;
+ struct cgroup *root = root_mem->css.cgroup;
struct mem_cgroup *ret;
- bool obsolete = (root_mem->last_scanned_child &&
- root_mem->last_scanned_child->obsolete);
-
- /*
- * Scan all children under the mem_cgroup mem
- */
- cgroup_lock();
- if (list_empty(&root_mem->css.cgroup->children)) {
- ret = root_mem;
- goto done;
- }
-
- if (!root_mem->last_scanned_child || obsolete) {
-
- if (obsolete)
- mem_cgroup_put(root_mem->last_scanned_child);
+ int id;

- cgroup = list_first_entry(&root_mem->css.cgroup->children,
- struct cgroup, sibling);
+ while (!ret) {
+ rcu_read_lock();
+ cgroup = cgroup_get_next(root_mem->last_scan_child, root, &id);
ret = mem_cgroup_from_cont(cgroup);
- mem_cgroup_get(ret);
- } else
- ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
- root_mem);
+ rcu_read_unlock();
+ root_mem->last_scan_child = id + 1;
+ if (ret->obsolete)
+ ret = NULL;
+ }
+ mem_cgroup_get(ret);

-done:
- root_mem->last_scanned_child = ret;
- cgroup_unlock();
return ret;
}

@@ -581,7 +510,7 @@ done:
static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
gfp_t gfp_mask, bool noswap)
{
- struct mem_cgroup *next_mem;
+ struct mem_cgroup *next_mem, *start;
int ret = 0;

/*
@@ -595,23 +524,21 @@ static int mem_cgroup_hierarchical_recla
if (res_counter_check_under_limit(&root_mem->res))
return 0;

- next_mem = mem_cgroup_get_first_node(root_mem);
-
- while (next_mem != root_mem) {
+ next_mem = mem_cgroup_get_reclaim_target(root_mem);
+ start = next_mem;
+ do {
if (next_mem->obsolete) {
mem_cgroup_put(next_mem);
- cgroup_lock();
- next_mem = mem_cgroup_get_first_node(root_mem);
- cgroup_unlock();
+ next_mem = mem_cgroup_get_reclaim_target(root_mem);
continue;
}
ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
+ mem_cgroup_put(next_mem);
if (res_counter_check_under_limit(&root_mem->res))
- return 0;
- cgroup_lock();
- next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
- cgroup_unlock();
- }
+ break;
+ next_mem = mem_cgroup_get_reclaim_target(root_mem);
+ } while (start != next_mem);
+
return ret;
}

@@ -1959,8 +1886,6 @@ mem_cgroup_create(struct cgroup_subsys *
res_counter_init(&mem->memsw, NULL);
}

- mem->last_scanned_child = NULL;
-
return &mem->css;
free_out:
for_each_node_state(node, N_POSSIBLE)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/