[PATCH] memcg: relax memcg iter caching

From: Michal Hocko
Date: Mon Feb 11 2013 - 14:23:51 EST


Now that the per-node-zone-priority iterator caches memory cgroups rather
than their css ids, we have to be careful and remove them from the
iterator when they are on the way out; otherwise they might hang around
for an unbounded amount of time (until the global/targeted reclaim
triggers the zone under priority, finds out the group is dead and lets it
find its final rest).

We can fix this issue by relaxing the rules for the last_visited memcg as
well.
Instead of taking a reference to the css before it is stored into
iter->last_visited, we can just store its pointer and track the number of
removed groups for each memcg. This number is stored into the iterator
every time a memcg is cached. If the iter count doesn't match the
current walker root's one, we start over from the root again. The
group counter is incremented upwards through the hierarchy every time a
group is removed.

Locking rules are a bit complicated, but we primarily rely on RCU, which
protects the css from disappearing while it is proved to be still valid.
The validity is checked in two steps. First, iter->last_dead_count has
to match root->dead_count, and second, css_tryget has to confirm that
the group is still alive; it then pins the group until we get the next
memcg.

Spotted-by: Ying Han <yinghan@xxxxxxxxxx>
Original-idea-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Signed-off-by: Michal Hocko <mhocko@xxxxxxx>
---
mm/memcontrol.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e9f5c47..f9b5719 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -144,8 +144,13 @@ struct mem_cgroup_stat_cpu {
};

struct mem_cgroup_reclaim_iter {
- /* last scanned hierarchy member with elevated css ref count */
+ /*
+ * last scanned hierarchy member. Valid only if last_dead_count
+ * matches memcg->dead_count of the hierarchy root group.
+ */
struct mem_cgroup *last_visited;
+ unsigned int last_dead_count;
+
/* scan generation, increased every round-trip */
unsigned int generation;
/* lock to protect the position and generation */
@@ -357,6 +362,7 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;

+ atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
#endif
@@ -1158,19 +1164,33 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
int nid = zone_to_nid(reclaim->zone);
int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
+ unsigned int dead_count;

mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
spin_lock(&iter->iter_lock);
- last_visited = iter->last_visited;
if (prev && reclaim->generation != iter->generation) {
- if (last_visited) {
- css_put(&last_visited->css);
- iter->last_visited = NULL;
- }
+ iter->last_visited = NULL;
spin_unlock(&iter->iter_lock);
goto out_unlock;
}
+
+ /*
+ * last_visited might be invalid if some of the group
+ * downwards was removed. As we do not know which one
+ * disappeared we have to start all over again from the
+ * root.
+ * css ref count then makes sure that css won't
+ * disappear while we iterate to the next memcg
+ */
+ last_visited = iter->last_visited;
+ dead_count = atomic_read(&root->dead_count);
+ smp_rmb();
+ if (last_visited &&
+ ((dead_count != iter->last_dead_count) ||
+ !css_tryget(&last_visited->css))) {
+ last_visited = NULL;
+ }
}

/*
@@ -1210,10 +1230,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (css && !memcg)
curr = mem_cgroup_from_css(css);

- /* make sure that the cached memcg is not removed */
- if (curr)
- css_get(&curr->css);
+ /*
+ * No memory barrier is needed here because we are
+ * protected by iter_lock
+ */
iter->last_visited = curr;
+ iter->last_dead_count = atomic_read(&root->dead_count);

if (!css)
iter->generation++;
@@ -6375,10 +6397,36 @@ free_out:
return ERR_PTR(error);
}

+/*
+ * Announce all parents that a group from their hierarchy is gone by bumping
+ * their dead_count; mem_cgroup_iter drops a cached memcg on a count mismatch.
+ */
+static void mem_cgroup_uncache_from_reclaim(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = memcg;
+
+ while ((parent = parent_mem_cgroup(parent)))
+ atomic_inc(&parent->dead_count);
+
+ /*
+ * If the root memcg is not hierarchical we have to update it explicitly.
+ * parent is NULL once the loop above ends, so it must not be used here.
+ */
+ if (!root_mem_cgroup->use_hierarchy)
+ atomic_inc(&root_mem_cgroup->dead_count);
+
+ /*
+ * Make sure that dead_count updates are visible before other
+ * cleanup from css_offline. Pairs with smp_rmb in mem_cgroup_iter.
+ */
+ smp_wmb();
+}
+
static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

+ mem_cgroup_uncache_from_reclaim(memcg); /* bump ancestors' dead_count before the cleanup below */
mem_cgroup_reparent_charges(memcg);
mem_cgroup_destroy_all_caches(memcg);
}
--
1.7.10.4


--
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/