[patch 1/2] mm: memcg: per-memcg reclaim statistics

From: Johannes Weiner
Date: Tue Jan 10 2012 - 10:03:13 EST


With the single per-zone LRU gone and global reclaim scanning
individual memcgs, it's straightforward to collect meaningful and
accurate per-memcg reclaim statistics.

This adds the following items to memory.stat:

pgreclaim
pgscan

Number of pages reclaimed/scanned from that memcg due to its own
hard limit (or physical limit in case of the root memcg) by the
allocating task.

kswapd_pgreclaim
kswapd_pgscan

Reclaim activity from kswapd due to the memcg's own limit. Only
applicable to the root memcg for now since kswapd is only triggered
by physical limits, but kswapd-style reclaim based on memcg hard
limits is being developed.

hierarchy_pgreclaim
hierarchy_pgscan
hierarchy_kswapd_pgreclaim
hierarchy_kswapd_pgscan

Reclaim activity due to limitations in one of the memcg's parents.

Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
Documentation/cgroups/memory.txt | 4 ++
include/linux/memcontrol.h | 10 +++++
mm/memcontrol.c | 84 +++++++++++++++++++++++++++++++++++++-
mm/vmscan.c | 7 +++
4 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index cc0ebc5..eb9e982 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -389,6 +389,10 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
pgpgin - # of pages paged in (equivalent to # of charging events).
pgpgout - # of pages paged out (equivalent to # of uncharging events).
swap - # of bytes of swap usage
+pgreclaim - # of pages reclaimed due to this memcg's limit
+pgscan - # of pages scanned due to this memcg's limit
+kswapd_* - # reclaim activity by background daemon due to this memcg's limit
+hierarchy_* - # reclaim activity due to pressure from parental memcg
inactive_anon - # of bytes of anonymous memory and swap cache memory on
LRU list.
active_anon - # of bytes of anonymous and swap cache memory on active
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bd3b102..6c1d69e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -121,6 +121,8 @@ struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone);
struct zone_reclaim_stat*
mem_cgroup_get_reclaim_stat_from_page(struct page *page);
+void mem_cgroup_account_reclaim(struct mem_cgroup *, struct mem_cgroup *,
+ unsigned long, unsigned long, bool);
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
extern void mem_cgroup_replace_page_cache(struct page *oldpage,
@@ -347,6 +349,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return NULL;
}

+static inline void mem_cgroup_account_reclaim(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ bool kswapd)
+{
+}
+
static inline void
mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e2a80d..170dff4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -91,12 +91,23 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_NSTATS,
};

+#define MEM_CGROUP_EVENTS_KSWAPD 2
+#define MEM_CGROUP_EVENTS_HIERARCHY 4
+
enum mem_cgroup_events_index {
MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
+ MEM_CGROUP_EVENTS_PGRECLAIM,
+ MEM_CGROUP_EVENTS_PGSCAN,
+ MEM_CGROUP_EVENTS_KSWAPD_PGRECLAIM,
+ MEM_CGROUP_EVENTS_KSWAPD_PGSCAN,
+ MEM_CGROUP_EVENTS_HIERARCHY_PGRECLAIM,
+ MEM_CGROUP_EVENTS_HIERARCHY_PGSCAN,
+ MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGRECLAIM,
+ MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGSCAN,
MEM_CGROUP_EVENTS_NSTATS,
};
/*
@@ -889,6 +900,38 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}

+/**
+ * mem_cgroup_account_reclaim - update per-memcg reclaim statistics
+ * @root: memcg that triggered reclaim
+ * @memcg: memcg that is actually being scanned
+ * @nr_reclaimed: number of pages reclaimed from @memcg
+ * @nr_scanned: number of pages scanned from @memcg
+ * @kswapd: whether reclaiming task is kswapd or allocator itself
+ */
+void mem_cgroup_account_reclaim(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ bool kswapd)
+{
+ unsigned int offset = 0;
+
+ if (!root)
+ root = root_mem_cgroup;
+
+ if (kswapd)
+ offset += MEM_CGROUP_EVENTS_KSWAPD;
+ if (root != memcg)
+ offset += MEM_CGROUP_EVENTS_HIERARCHY;
+
+ preempt_disable();
+ __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGRECLAIM + offset],
+ nr_reclaimed);
+ __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGSCAN + offset],
+ nr_scanned);
+ preempt_enable();
+}
+
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
struct mem_cgroup *memcg;
@@ -1662,6 +1705,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

while (1) {
+ unsigned long nr_reclaimed;
+
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (!victim) {
loop++;
@@ -1687,8 +1732,11 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
}
if (!mem_cgroup_reclaimable(victim, false))
continue;
- total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
- zone, &nr_scanned);
+ nr_reclaimed = mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+ zone, &nr_scanned);
+ mem_cgroup_account_reclaim(root_mem_cgroup, victim, nr_reclaimed,
+ nr_scanned, current_is_kswapd());
+ total += nr_reclaimed;
*total_scanned += nr_scanned;
if (!res_counter_soft_limit_excess(&root_memcg->res))
break;
@@ -4023,6 +4071,14 @@ enum {
MCS_SWAP,
MCS_PGFAULT,
MCS_PGMAJFAULT,
+ MCS_PGRECLAIM,
+ MCS_PGSCAN,
+ MCS_KSWAPD_PGRECLAIM,
+ MCS_KSWAPD_PGSCAN,
+ MCS_HIERARCHY_PGRECLAIM,
+ MCS_HIERARCHY_PGSCAN,
+ MCS_HIERARCHY_KSWAPD_PGRECLAIM,
+ MCS_HIERARCHY_KSWAPD_PGSCAN,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -4047,6 +4103,14 @@ struct {
{"swap", "total_swap"},
{"pgfault", "total_pgfault"},
{"pgmajfault", "total_pgmajfault"},
+ {"pgreclaim", "total_pgreclaim"},
+ {"pgscan", "total_pgscan"},
+ {"kswapd_pgreclaim", "total_kswapd_pgreclaim"},
+ {"kswapd_pgscan", "total_kswapd_pgscan"},
+ {"hierarchy_pgreclaim", "total_hierarchy_pgreclaim"},
+ {"hierarchy_pgscan", "total_hierarchy_pgscan"},
+ {"hierarchy_kswapd_pgreclaim", "total_hierarchy_kswapd_pgreclaim"},
+ {"hierarchy_kswapd_pgscan", "total_hierarchy_kswapd_pgscan"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -4079,6 +4143,22 @@ mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
s->stat[MCS_PGFAULT] += val;
val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
s->stat[MCS_PGMAJFAULT] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGRECLAIM);
+ s->stat[MCS_PGRECLAIM] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGSCAN);
+ s->stat[MCS_PGSCAN] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_KSWAPD_PGRECLAIM);
+ s->stat[MCS_KSWAPD_PGRECLAIM] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_KSWAPD_PGSCAN);
+ s->stat[MCS_KSWAPD_PGSCAN] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_PGRECLAIM);
+ s->stat[MCS_HIERARCHY_PGRECLAIM] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_PGSCAN);
+ s->stat[MCS_HIERARCHY_PGSCAN] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGRECLAIM);
+ s->stat[MCS_HIERARCHY_KSWAPD_PGRECLAIM] += val;
+ val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGSCAN);
+ s->stat[MCS_HIERARCHY_KSWAPD_PGSCAN] += val;

/* per zone stat */
val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c631234..e3fd8a7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2115,12 +2115,19 @@ static void shrink_zone(int priority, struct zone *zone,

memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long nr_scanned = sc->nr_scanned;
struct mem_cgroup_zone mz = {
.mem_cgroup = memcg,
.zone = zone,
};

shrink_mem_cgroup_zone(priority, &mz, sc);
+
+ mem_cgroup_account_reclaim(root, memcg,
+ sc->nr_reclaimed - nr_reclaimed,
+ sc->nr_scanned - nr_scanned,
+ current_is_kswapd());
/*
* Limit reclaim has historically picked one memcg and
* scanned it with decreasing priority levels until
--
1.7.7.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/