[PATCH] [RFC] mnt: restrict a number of "struct mnt"

From: Andrey Vagin
Date: Mon Jun 17 2013 - 04:31:18 EST


I found that a few processes can eat all host memory and nobody can kill them.
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XXXXXX` & done

All this processes are unkillable, because they took i_mutex and waits
namespace_lock.

...
21715 pts/0 ÂÂÂD ÂÂÂÂÂ0:00 ÂÂÂÂÂÂÂÂÂ\_ mount --bind /mnt /mnt/test.ht6jzO
21716 pts/0 ÂÂÂD ÂÂÂÂÂ0:00 ÂÂÂÂÂÂÂÂÂ\_ mount --bind /mnt /mnt/test.97K4mI
21717 pts/0 ÂÂÂR ÂÂÂÂÂ0:01 ÂÂÂÂÂÂÂÂÂ\_ mount --bind /mnt /mnt/test.gO2CD9
...

Each of this process doubles a number of mounts, so at the end we will
have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
need about 1TB of RAM.

Another problem is that âumountâ of a big tree is very hard operation
and it requires a lot of time.
E.g.:
16411
umount("/tmp/xxx", MNT_DETACH) ÂÂÂÂÂÂÂÂÂ= 0 <7.852066> (7.8 sec)
32795
umount("/tmp/xxx", MNT_DETACH) ÂÂÂÂÂÂÂÂÂ= 0 <34.485501> ( 34 sec)

For all this time sys_umoun takes namespace_sem and vfsmount_lock...

Due to all this reasons I suggest to restrict a number of mounts.
Probably we can optimize this code in a future, but now this restriction
can help.

Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
Cc: "Serge E. Hallyn" <serge@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Signed-off-by: Andrey Vagin <avagin@xxxxxxxxxx>
---
fs/namespace.c | 66 +++++++++++++++++++++++++------------------
include/linux/mnt_namespace.h | 2 ++
kernel/sysctl.c | 8 ++++++
3 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b1ca9b..d22e54c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -41,6 +41,9 @@ static struct list_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
static struct rw_semaphore namespace_sem;

+unsigned int sysctl_mount_nr __read_mostly = 16384;
+static atomic_t mount_nr = ATOMIC_INIT(0);
+
/* /sys/fs */
struct kobject *fs_kobj;
EXPORT_SYMBOL_GPL(fs_kobj);
@@ -164,43 +167,49 @@ unsigned int mnt_get_count(struct mount *mnt)

static struct mount *alloc_vfsmnt(const char *name)
{
- struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
- if (mnt) {
- int err;
+ struct mount *mnt;
+ int err;

- err = mnt_alloc_id(mnt);
- if (err)
- goto out_free_cache;
+ if (atomic_inc_return(&mount_nr) > sysctl_mount_nr)
+ goto out_dec_mount_nr;

- if (name) {
- mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
- if (!mnt->mnt_devname)
- goto out_free_id;
- }
+ mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+ if (!mnt)
+ goto out_dec_mount_nr;
+
+ err = mnt_alloc_id(mnt);
+ if (err)
+ goto out_free_cache;
+
+ if (name) {
+ mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+ if (!mnt->mnt_devname)
+ goto out_free_id;
+ }

#ifdef CONFIG_SMP
- mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
- if (!mnt->mnt_pcp)
- goto out_free_devname;
+ mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
+ if (!mnt->mnt_pcp)
+ goto out_free_devname;

- this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
+ this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
- mnt->mnt_count = 1;
- mnt->mnt_writers = 0;
+ mnt->mnt_count = 1;
+ mnt->mnt_writers = 0;
#endif

- INIT_LIST_HEAD(&mnt->mnt_hash);
- INIT_LIST_HEAD(&mnt->mnt_child);
- INIT_LIST_HEAD(&mnt->mnt_mounts);
- INIT_LIST_HEAD(&mnt->mnt_list);
- INIT_LIST_HEAD(&mnt->mnt_expire);
- INIT_LIST_HEAD(&mnt->mnt_share);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
- INIT_LIST_HEAD(&mnt->mnt_slave);
+ INIT_LIST_HEAD(&mnt->mnt_hash);
+ INIT_LIST_HEAD(&mnt->mnt_child);
+ INIT_LIST_HEAD(&mnt->mnt_mounts);
+ INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_expire);
+ INIT_LIST_HEAD(&mnt->mnt_share);
+ INIT_LIST_HEAD(&mnt->mnt_slave_list);
+ INIT_LIST_HEAD(&mnt->mnt_slave);
#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
+ INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
- }
+
return mnt;

#ifdef CONFIG_SMP
@@ -211,6 +220,8 @@ out_free_id:
mnt_free_id(mnt);
out_free_cache:
kmem_cache_free(mnt_cache, mnt);
+out_dec_mount_nr:
+ atomic_dec(&mount_nr);
return NULL;
}

@@ -546,6 +557,7 @@ static void free_vfsmnt(struct mount *mnt)
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
#endif
+ atomic_dec(&mount_nr);
kmem_cache_free(mnt_cache, mnt);
}

diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 12b2ab5..d8e5ec9 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -2,6 +2,8 @@
#define _NAMESPACE_H_
#ifdef __KERNEL__

+extern unsigned int sysctl_mount_nr;
+
struct mnt_namespace;
struct fs_struct;
struct user_namespace;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf45..bebfdd7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
+#include <linux/mnt_namespace.h>
#include <linux/sched/sysctl.h>

#include <asm/uaccess.h>
@@ -1616,6 +1617,13 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
+ {
+ .procname = "mount-nr",
+ .data = &sysctl_mount_nr,
+ .maxlen = sizeof(sysctl_mount_nr),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};

--
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/