[PATCH 01/10] Revert "slub: move synchronize_sched out of slab_mutex on shrink"

From: Tejun Heo
Date: Tue Jan 17 2017 - 19:02:15 EST


This reverts commit 89e364db71fb5e7fc8d93228152abfa67daf35fa.

With kmem cgroup support enabled, kmem_caches can be created and
destroyed frequently and a great number of near empty kmem_caches can
accumulate if there are a lot of transient cgroups and the system is
not under memory pressure. When memory reclaim starts under such
conditions, it can lead to consecutive deactivation and destruction of
many kmem_caches, easily hundreds of thousands on moderately large
systems, exposing scalability issues in the current slab management
code. This is one of the patches to address the issue.

Moving synchronize_sched() out of slab_mutex isn't enough as it's
still inside cgroup_mutex. The whole deactivation / release path will
be updated to avoid all synchronous RCU operations. Revert this
insufficient optimization in preparation to ease future changes.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Reported-by: Jay Vana <jsvana@xxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Pekka Enberg <penberg@xxxxxxxxxx>
Cc: David Rientjes <rientjes@xxxxxxxxxx>
Cc: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---
mm/slab.c | 4 ++--
mm/slab.h | 2 +-
mm/slab_common.c | 27 ++-------------------------
mm/slob.c | 2 +-
mm/slub.c | 19 +++++++++++++++++--
5 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 29bc6c0..767e8e4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2314,7 +2314,7 @@ static int drain_freelist(struct kmem_cache *cache,
return nr_freed;
}

-int __kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
{
int ret = 0;
int node;
@@ -2334,7 +2334,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)

int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
- return __kmem_cache_shrink(cachep);
+ return __kmem_cache_shrink(cachep, false);
}

void __kmem_cache_release(struct kmem_cache *cachep)
diff --git a/mm/slab.h b/mm/slab.h
index de6579d..4acc644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -161,7 +161,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,

int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);

struct seq_file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ae32384..46ff746 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -579,29 +579,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
get_online_cpus();
get_online_mems();

-#ifdef CONFIG_SLUB
- /*
- * In case of SLUB, we need to disable empty slab caching to
- * avoid pinning the offline memory cgroup by freeable kmem
- * pages charged to it. SLAB doesn't need this, as it
- * periodically purges unused slabs.
- */
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
- if (c) {
- c->cpu_partial = 0;
- c->min_partial = 0;
- }
- }
- mutex_unlock(&slab_mutex);
- /*
- * kmem_cache->cpu_partial is checked locklessly (see
- * put_cpu_partial()). Make sure the change is visible.
- */
- synchronize_sched();
-#endif
-
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
if (!is_root_cache(s))
@@ -613,7 +590,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
if (!c)
continue;

- __kmem_cache_shrink(c);
+ __kmem_cache_shrink(c, true);
arr->entries[idx] = NULL;
}
mutex_unlock(&slab_mutex);
@@ -784,7 +761,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep);
+ ret = __kmem_cache_shrink(cachep, false);
put_online_mems();
put_online_cpus();
return ret;
diff --git a/mm/slob.c b/mm/slob.c
index eac04d4..5ec1580 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
{
}

-int __kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
}
diff --git a/mm/slub.c b/mm/slub.c
index 067598a..68b84f9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
int node;
int i;
@@ -3895,6 +3895,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
unsigned long flags;
int ret = 0;

+ if (deactivate) {
+ /*
+ * Disable empty slabs caching. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial),
+ * so we have to make sure the change is visible.
+ */
+ synchronize_sched();
+ }
+
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
INIT_LIST_HEAD(&discard);
@@ -3951,7 +3966,7 @@ static int slab_mem_going_offline_callback(void *arg)

mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ __kmem_cache_shrink(s, false);
mutex_unlock(&slab_mutex);

return 0;
--
2.9.3