[PATCH 4/7] memcg: update numa information based on event counter

From: KAMEZAWA Hiroyuki
Date: Wed Jun 15 2011 - 03:19:46 EST


Commit 889976 added NUMA node round-robin for memcg, but the node
information it relies on is refreshed only once every 10 seconds.

This patch changes the update trigger from jiffies to memcg's event count.
After this patch, numa scan information will be updated when

- the number of pagein/out events is larger than 3% of limit
or
- the number of pagein/out events is larger than 16k
(==64MB pagein/pageout if pagesize==4k.)

The counter mem->numascan_update is the sum of the percpu event counters.
When a task hits the limit, it checks mem->numascan_update. If it is over
min(3% of limit, 16k), the numa information will be updated.

This patch also adds a mutex for updating the information. This allows us
to avoid unnecessary scans.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
mm/memcontrol.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 6 deletions(-)

Index: mmotm-0615/mm/memcontrol.c
===================================================================
--- mmotm-0615.orig/mm/memcontrol.c
+++ mmotm-0615/mm/memcontrol.c
@@ -108,10 +108,12 @@ enum mem_cgroup_events_index {
enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMASCAN,
MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET (128)
#define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMASCAN_EVENTS_TARGET (1024)

struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
@@ -288,8 +290,9 @@ struct mem_cgroup {
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
- unsigned long next_scan_node_update;
+ struct mutex numascan_mutex;
#endif
+ atomic_t numascan_update;
/*
* Should the accounting and control be hierarchical, per subtree?
*/
@@ -741,6 +744,9 @@ static void __mem_cgroup_target_update(s
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
+ case MEM_CGROUP_TARGET_NUMASCAN:
+ next = val + NUMASCAN_EVENTS_TARGET;
+ break;
default:
return;
}
@@ -764,6 +770,13 @@ static void memcg_check_events(struct me
__mem_cgroup_target_update(mem,
MEM_CGROUP_TARGET_SOFTLIMIT);
}
+ if (unlikely(__memcg_event_check(mem,
+ MEM_CGROUP_TARGET_NUMASCAN))) {
+ atomic_add(NUMASCAN_EVENTS_TARGET, /* events per trigger */
+ &mem->numascan_update);
+ __mem_cgroup_target_update(mem,
+ MEM_CGROUP_TARGET_NUMASCAN);
+ }
}
}

@@ -1616,17 +1629,32 @@ mem_cgroup_select_victim(struct mem_cgro
/*
* Always updating the nodemask is not very good - even if we have an empty
* list or the wrong list here, we can start from some node and traverse all
- * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ * nodes based on the zonelist.
*
+ * The counter mem->numascan_update is updated once per
+ * NUMASCAN_EVENTS_TARGET. We update the numa information when we see
+ * the number of events exceed 3% of limit or 64MB of pagein/pageout.
*/
+#define NUMASCAN_UPDATE_RATIO (3)
+#define NUMASCAN_UPDATE_THRESH (16384UL) /* 16k events of pagein/pageout */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
{
int nid;
-
- if (time_after(mem->next_scan_node_update, jiffies))
+ unsigned long long limit;
+ /* if no limit, we never reach here */
+ limit = res_counter_read_u64(&mem->res, RES_LIMIT);
+ limit /= PAGE_SIZE;
+ /* 3% of limit */
+ limit = (limit * NUMASCAN_UPDATE_RATIO/100UL);
+ limit = min_t(unsigned long long, limit, NUMASCAN_UPDATE_THRESH);
+ /*
+ * If the number of pagein/out event is larger than 3% of limit or
+ * 64MB pagein/out, refresh numa information.
+ */
+ if (atomic_read(&mem->numascan_update) < limit ||
+ !mutex_trylock(&mem->numascan_mutex))
return;
-
- mem->next_scan_node_update = jiffies + 10*HZ;
+ atomic_set(&mem->numascan_update, 0);
/* make a nodemask where this memcg uses memory from */
mem->scan_nodes = node_states[N_HIGH_MEMORY];

@@ -1642,6 +1670,7 @@ static void mem_cgroup_may_update_nodema
continue;
node_clear(nid, mem->scan_nodes);
}
+ mutex_unlock(&mem->numascan_mutex);
}

/*
@@ -1679,11 +1708,20 @@ int mem_cgroup_select_victim_node(struct
return node;
}

+static void mem_cgroup_numascan_init(struct mem_cgroup *mem)
+{
+ mutex_init(&mem->numascan_mutex); /* taken while scan_nodes is rebuilt */
+}
+
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
return 0;
}
+static void mem_cgroup_numascan_init(struct mem_cgroup *mem)
+{
+ /* Stub: nothing to initialize in this configuration. */
+}
#endif


@@ -5054,6 +5092,7 @@ mem_cgroup_create(struct cgroup_subsys *
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
+ mem_cgroup_numascan_init(mem);
spin_lock_init(&mem->scanstat.lock);
return &mem->css;
free_out:

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/