[PATCH 1/1] fs: Let filesystems opt out of memcg awareness

From: Bharata B Rao
Date: Wed Apr 14 2021 - 01:51:24 EST


All filesystem mounts are memcg aware by default and hence
end up creating shrinker list_lrus for all the memcgs. Due to
the way memcg_nr_cache_ids grows and the list_lru heads are
allocated for all memcgs, a huge amount of memory gets consumed
by the kmalloc-32 slab cache when running thousands of containers.

Improve this situation by allowing filesystems to opt out
of memcg awareness. In this patch, tmpfs, proc and ramfs
opt out of memcg awareness. This leads to considerable memory
savings when running 10k containers.

Signed-off-by: Bharata B Rao <bharata@xxxxxxxxxxxxx>
---
fs/proc/root.c | 1 +
fs/ramfs/inode.c | 1 +
fs/super.c | 27 +++++++++++++++++++--------
include/linux/fs_context.h | 2 ++
mm/shmem.c | 1 +
5 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..7856bc2ca9f4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -257,6 +257,7 @@ static int proc_init_fs_context(struct fs_context *fc)
fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
fc->fs_private = ctx;
fc->ops = &proc_fs_context_ops;
+ fc->memcg_optout = true;
return 0;
}

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 9ebd17d7befb..576a88bb7407 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -278,6 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc)
fsi->mount_opts.mode = RAMFS_DEFAULT_MODE;
fc->s_fs_info = fsi;
fc->ops = &ramfs_context_ops;
+ fc->memcg_optout = true;
return 0;
}

diff --git a/fs/super.c b/fs/super.c
index 8c1baca35c16..59aa22c678e6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -198,7 +198,8 @@ static void destroy_unused_super(struct super_block *s)
* returns a pointer new superblock or %NULL if allocation had failed.
*/
static struct super_block *alloc_super(struct file_system_type *type, int flags,
- struct user_namespace *user_ns)
+ struct user_namespace *user_ns,
+ bool memcg_optout)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
@@ -266,13 +267,22 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_shrink.scan_objects = super_cache_scan;
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+ s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+ if (!memcg_optout)
+ s->s_shrink.flags |= SHRINKER_MEMCG_AWARE;
if (prealloc_shrinker(&s->s_shrink))
goto fail;
- if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
- goto fail;
- if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
- goto fail;
+ if (memcg_optout) {
+ if (list_lru_init(&s->s_dentry_lru))
+ goto fail;
+ if (list_lru_init(&s->s_inode_lru))
+ goto fail;
+ } else {
+ if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+ goto fail;
+ if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+ goto fail;
+ }
return s;

fail:
@@ -527,7 +537,8 @@ struct super_block *sget_fc(struct fs_context *fc,
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+ s = alloc_super(fc->fs_type, fc->sb_flags, user_ns,
+ fc->memcg_optout);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
@@ -610,7 +621,7 @@ struct super_block *sget(struct file_system_type *type,
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
+ s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns, false);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 37e1e8f7f08d..73388c0b6950 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -110,6 +110,8 @@ struct fs_context {
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
bool oldapi:1; /* Coming from mount(2) */
+ bool memcg_optout:1; /* Opt out from per-memcg
+ lru handling */
};

struct fs_context_operations {
diff --git a/mm/shmem.c b/mm/shmem.c
index b2db4ed0fbc7..0c9b2af52825 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3915,6 +3915,7 @@ int shmem_init_fs_context(struct fs_context *fc)

fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
+ fc->memcg_optout = true;
return 0;
}

--
2.26.2