Re: [PATCH 11/23] fs: Add /proc/namespaces/ directory

From: Christian Brauner
Date: Thu Jul 30 2020 - 09:26:26 EST


On Thu, Jul 30, 2020 at 03:00:19PM +0300, Kirill Tkhai wrote:
> This is a new directory to show all namespaces, which can be
> accessed from this /proc tasks credentials.
>
> Every /proc is related to a pid_namespace, and the pid_namespace
> is related to a user_namespace. The items we show in this
> /proc/namespaces/ directory are the namespaces whose
> user_namespaces are the same as /proc's user_namespace,
> or its descendants.
>
> Say, /proc has pid_ns->user_ns, so in /proc/namespaces/ we show
> only a ns for which in_userns(pid_ns->user_ns, ns->user_ns) is true.
>
> The final result is like below:
>
> # ls /proc/namespaces/ -l
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'cgroup:[4026531835]' -> 'cgroup:[4026531835]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'ipc:[4026531839]' -> 'ipc:[4026531839]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026531840]' -> 'mnt:[4026531840]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026531861]' -> 'mnt:[4026531861]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532133]' -> 'mnt:[4026532133]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532134]' -> 'mnt:[4026532134]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532135]' -> 'mnt:[4026532135]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'mnt:[4026532136]' -> 'mnt:[4026532136]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'net:[4026531993]' -> 'net:[4026531993]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'pid:[4026531836]' -> 'pid:[4026531836]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'time:[4026531834]' -> 'time:[4026531834]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'user:[4026531837]' -> 'user:[4026531837]'
> lrwxrwxrwx 1 root root 0 Jul 29 16:50 'uts:[4026531838]' -> 'uts:[4026531838]'

So usually, the /proc/<pid>/ns entries are guarded by
ptrace_may_access() but from skimming the patch it seems that
/proc/namespaces/ would be accessible by any user.

I think we should guard /proc/namespaces/, either by restricting it to
userns CAP_SYS_ADMIN or - to make it work with unprivileged CRIU - by
ns_capable(proc's_pid_ns->user_ns, CAP_SYS_PTRACE).


This should probably also be a mount option on procfs given that we now
allow a restricted view of procfs.

Christian

>
> Every namespace may be opened like an ordinary file in /proc/[pid]/ns.
>
> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
> ---
> fs/nsfs.c | 2
> fs/proc/Makefile | 1
> fs/proc/internal.h | 16 ++
> fs/proc/namespaces.c | 314 +++++++++++++++++++++++++++++++++++++++++++++++
> fs/proc/root.c | 17 ++-
> include/linux/proc_fs.h | 1
> 6 files changed, 345 insertions(+), 6 deletions(-)
> create mode 100644 fs/proc/namespaces.c
>
> diff --git a/fs/nsfs.c b/fs/nsfs.c
> index ee4be67d3a0b..61b789d2089c 100644
> --- a/fs/nsfs.c
> +++ b/fs/nsfs.c
> @@ -58,7 +58,7 @@ static void nsfs_evict(struct inode *inode)
> ns->ops->put(ns);
> }
>
> -static int __ns_get_path(struct path *path, struct ns_common *ns)
> +int __ns_get_path(struct path *path, struct ns_common *ns)
> {
> struct vfsmount *mnt = nsfs_mnt;
> struct dentry *dentry;
> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index dc2d51f42905..34ff671c6d59 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -25,6 +25,7 @@ proc-y += util.o
> proc-y += version.o
> proc-y += softirqs.o
> proc-y += task_namespaces.o
> +proc-y += namespaces.o
> proc-y += self.o
> proc-y += thread_self.o
> proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 572757ff97be..d19fe5574799 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -134,10 +134,11 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
> kuid_t *ruid, kgid_t *rgid);
>
> unsigned name_to_int(const struct qstr *qstr);
> -/*
> - * Offset of the first process in the /proc root directory..
> - */
> -#define FIRST_PROCESS_ENTRY 256
> +
> +/* Offset of "namespaces" entry in /proc root directory */
> +#define NAMESPACES_ENTRY 256
> +/* Offset of the first process in the /proc root directory */
> +#define FIRST_PROCESS_ENTRY (NAMESPACES_ENTRY + 1)
>
> /* Worst case buffer size needed for holding an integer. */
> #define PROC_NUMBUF 13
> @@ -168,6 +169,7 @@ extern void proc_pid_evict_inode(struct proc_inode *);
> extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
> extern void pid_update_inode(struct task_struct *, struct inode *);
> extern int pid_delete_dentry(const struct dentry *);
> +extern int proc_emit_namespaces(struct file *, struct dir_context *);
> extern int proc_pid_readdir(struct file *, struct dir_context *);
> struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
> extern loff_t mem_lseek(struct file *, loff_t, int);
> @@ -222,6 +224,12 @@ void set_proc_pid_nlink(void);
> extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
> extern void proc_entry_rundown(struct proc_dir_entry *);
>
> +/*
> + * namespaces.c
> + */
> +extern int proc_setup_namespaces(struct super_block *);
> +extern void proc_namespaces_init(void);
> +
> /*
> * task_namespaces.c
> */
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> new file mode 100644
> index 000000000000..ab47e1555619
> --- /dev/null
> +++ b/fs/proc/namespaces.c
> @@ -0,0 +1,314 @@
> +#include <linux/pid_namespace.h>
> +#include <linux/user_namespace.h>
> +#include <linux/namei.h>
> +#include "internal.h"
> +
> +static unsigned namespaces_inum __ro_after_init;
> +
> +int proc_emit_namespaces(struct file *file, struct dir_context *ctx)
> +{
> + struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
> + struct inode *inode = d_inode(fs_info->proc_namespaces);
> +
> + return dir_emit(ctx, "namespaces", 10, inode->i_ino, DT_DIR);
> +}
> +
> +static int parse_namespace_dentry_name(const struct dentry *dentry,
> + const char **type, unsigned int *type_len, unsigned int *inum)
> +{
> + const char *p, *name;
> + int count;
> +
> + *type = name = dentry->d_name.name;
> + p = strchr(name, ':');
> + *type_len = p - name;
> + if (!p || p == name)
> + return -ENOENT;

Hm, rather do the NULL check first, so that "p - name" is never
computed when strchr() returned NULL:

p = strchr(name, ':');
if (!p || p == name)
return -ENOENT;
*type_len = p - name;

> +
> + p += 1;
> + if (sscanf(p, "[%u]%n", inum, &count) != 1 || *(p + count) != '\0' ||
> + *inum < PROC_NS_MIN_INO)
> + return -ENOENT;
> +
> + return 0;
> +}
> +
> +static struct ns_common *get_namespace_by_dentry(struct pid_namespace *pid_ns,
> + const struct dentry *dentry)
> +{
> + unsigned int type_len, inum, p_inum;
> + struct user_namespace *user_ns;
> + struct ns_common *ns;
> + const char *type;
> +
> + if (parse_namespace_dentry_name(dentry, &type, &type_len, &inum) < 0)
> + return NULL;
> +
> + p_inum = inum - 1;
> + ns = ns_get_next(&p_inum);
> + if (!ns)
> + return NULL;
> +
> + if (ns->inum != inum || strncmp(type, ns->ops->name, type_len) != 0 ||
> + ns->ops->name[type_len] != '\0') {
> + ns->ops->put(ns);
> + return NULL;
> + }
> +
> + if (ns->ops != &userns_operations)
> + user_ns = ns->ops->owner(ns);
> + else
> + user_ns = container_of(ns, struct user_namespace, ns);
> +
> + if (!in_userns(pid_ns->user_ns, user_ns)) {
> + ns->ops->put(ns);
> + return NULL;
> + }
> +
> + return ns;
> +}
> +
> +static struct dentry *proc_namespace_instantiate(struct dentry *dentry,
> + struct task_struct *task, const void *ptr);
> +
> +static struct dentry *proc_namespaces_lookup(struct inode *dir, struct dentry *dentry,
> + unsigned int flags)
> +{
> + struct pid_namespace *pid_ns = proc_pid_ns(dir->i_sb);
> + struct task_struct *task;
> + struct ns_common *ns;
> +
> + ns = get_namespace_by_dentry(pid_ns, dentry);
> + if (!ns)
> + return ERR_PTR(-ENOENT);
> +
> + read_lock(&tasklist_lock);
> + task = get_task_struct(pid_ns->child_reaper);
> + read_unlock(&tasklist_lock);
> +
> + dentry = proc_namespace_instantiate(dentry, task, ns);
> + put_task_struct(task);
> + ns->ops->put(ns);
> +
> + return dentry;
> +}
> +
> +static int proc_namespaces_permission(struct inode *inode, int mask)
> +{
> + if ((mask & MAY_EXEC) && S_ISLNK(inode->i_mode))
> + return -EACCES;
> +
> + return 0;
> +}
> +
> +static int proc_namespaces_getattr(const struct path *path, struct kstat *stat,
> + u32 request_mask, unsigned int query_flags)
> +{
> + struct inode *inode = d_inode(path->dentry);
> +
> + generic_fillattr(inode, stat);
> + return 0;
> +}
> +
> +static const struct inode_operations proc_namespaces_inode_operations = {
> + .lookup = proc_namespaces_lookup,
> + .permission = proc_namespaces_permission,
> + .getattr = proc_namespaces_getattr,
> +};
> +
> +static int proc_namespaces_readlink(struct dentry *dentry, char __user *buffer, int buflen)
> +{
> + struct inode *dir = dentry->d_parent->d_inode;
> + struct pid_namespace *pid_ns = proc_pid_ns(dir->i_sb);
> + struct ns_common *ns;
> +
> + ns = get_namespace_by_dentry(pid_ns, dentry);
> + if (!ns)
> + return -ENOENT;
> + ns->ops->put(ns);
> +
> + /* proc_namespaces_readdir() creates dentry names in namespace format */
> + return readlink_copy(buffer, buflen, dentry->d_iname);
> +}
> +
> +int __ns_get_path(struct path *path, struct ns_common *ns);
> +
> +static const char *proc_namespaces_getlink(struct dentry *dentry,
> + struct inode *inode, struct delayed_call *done)
> +{
> + struct pid_namespace *pid_ns = proc_pid_ns(inode->i_sb);
> + struct ns_common *ns;
> + struct path path;
> + int ret;
> +
> + if (!dentry)
> + return ERR_PTR(-ECHILD);
> +
> + while (1) {
> + ret = -ENOENT;
> + ns = get_namespace_by_dentry(pid_ns, dentry);
> + if (!ns)
> + goto out;
> +
> + ret = __ns_get_path(&path, ns);
> + if (ret == -EAGAIN)
> + continue;
> + if (ret)
> + goto out;
> + break;
> + }
> +
> + ret = nd_jump_link(&path);
> +out:
> + return ERR_PTR(ret);
> +}
> +
> +static const struct inode_operations proc_namespaces_link_inode_operations = {
> + .readlink = proc_namespaces_readlink,
> + .get_link = proc_namespaces_getlink,
> +};
> +
> +static int namespace_delete_dentry(const struct dentry *dentry)
> +{
> + struct inode *dir = dentry->d_parent->d_inode;
> + struct pid_namespace *pid_ns = proc_pid_ns(dir->i_sb);
> + struct ns_common *ns;
> +
> + ns = get_namespace_by_dentry(pid_ns, dentry);
> + if (!ns)
> + return 1;
> +
> + ns->ops->put(ns);
> + return 0;
> +}
> +
> +const struct dentry_operations namespaces_dentry_operations = {
> + .d_delete = namespace_delete_dentry,
> +};
> +
> +static void namespace_update_inode(struct inode *inode)
> +{
> + struct user_namespace *user_ns = proc_pid_ns(inode->i_sb)->user_ns;
> +
> + inode->i_uid = make_kuid(user_ns, 0);
> + if (!uid_valid(inode->i_uid))
> + inode->i_uid = GLOBAL_ROOT_UID;
> +
> + inode->i_gid = make_kgid(user_ns, 0);
> + if (!gid_valid(inode->i_gid))
> + inode->i_gid = GLOBAL_ROOT_GID;
> +}
> +
> +static struct dentry *proc_namespace_instantiate(struct dentry *dentry,
> + struct task_struct *task, const void *ptr)
> +{
> + const struct ns_common *ns = ptr;
> + struct inode *inode;
> + struct proc_inode *ei;
> +
> + /*
> + * Create inode with credentials of @task, and add it to @task's
> + * quick removal list.
> + */
> + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
> + if (!inode)
> + return ERR_PTR(-ENOENT);
> +
> + ei = PROC_I(inode);
> + inode->i_op = &proc_namespaces_link_inode_operations;
> + ei->ns_ops = ns->ops;
> + namespace_update_inode(inode);
> +
> + d_set_d_op(dentry, &namespaces_dentry_operations);
> + return d_splice_alias(inode, dentry);
> +}
> +
> +static int proc_namespaces_readdir(struct file *file, struct dir_context *ctx)
> +{
> + struct pid_namespace *pid_ns = proc_pid_ns(file_inode(file)->i_sb);
> + struct user_namespace *user_ns;
> + struct task_struct *task;
> + struct ns_common *ns;
> + unsigned int inum;
> +
> + read_lock(&tasklist_lock);
> + task = get_task_struct(pid_ns->child_reaper);
> + read_unlock(&tasklist_lock);
> +
> + if (!dir_emit_dots(file, ctx))
> + goto out;
> +
> + inum = ctx->pos - 2;
> + while ((ns = ns_get_next(&inum)) != NULL) {
> + unsigned int len;
> + char name[32];
> +
> + if (ns->ops != &userns_operations)
> + user_ns = ns->ops->owner(ns);
> + else
> + user_ns = container_of(ns, struct user_namespace, ns);
> +
> + if (!in_userns(pid_ns->user_ns, user_ns))
> + goto next;
> +
> + len = snprintf(name, sizeof(name), "%s:[%u]", ns->ops->name, inum);
> +
> + if (!proc_fill_cache(file, ctx, name, len,
> + proc_namespace_instantiate, task, ns)) {
> + ns->ops->put(ns);
> + break;
> + }
> +next:
> + ns->ops->put(ns);
> + ctx->pos = inum + 2;
> + }
> +out:
> + put_task_struct(task);
> + return 0;
> +}
> +
> +static const struct file_operations proc_namespaces_file_operations = {
> + .read = generic_read_dir,
> + .iterate_shared = proc_namespaces_readdir,
> + .llseek = generic_file_llseek,
> +};
> +
> +int proc_setup_namespaces(struct super_block *s)
> +{
> + struct proc_fs_info *fs_info = proc_sb_info(s);
> + struct inode *root_inode = d_inode(s->s_root);
> + struct dentry *namespaces;
> + int ret = -ENOMEM;
> +
> + inode_lock(root_inode);
> + namespaces = d_alloc_name(s->s_root, "namespaces");
> + if (namespaces) {
> + struct inode *inode = new_inode_pseudo(s);
> + if (inode) {
> + inode->i_ino = namespaces_inum;
> + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
> + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
> + inode->i_uid = GLOBAL_ROOT_UID;
> + inode->i_gid = GLOBAL_ROOT_GID;
> + inode->i_op = &proc_namespaces_inode_operations;
> + inode->i_fop = &proc_namespaces_file_operations;
> + d_add(namespaces, inode);
> + ret = 0;
> + } else {
> + dput(namespaces);
> + }
> + }
> + inode_unlock(root_inode);
> +
> + if (ret)
> + pr_err("proc_setup_namespaces: can't allocate /proc/namespaces\n");
> + else
> + fs_info->proc_namespaces = namespaces;
> +
> + return ret;
> +}
> +
> +void __init proc_namespaces_init(void)
> +{
> + proc_alloc_inum(&namespaces_inum);
> +}
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index 5e444d4f9717..e4e4f90fca3d 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -206,6 +206,10 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
> return -ENOMEM;
> }
>
> + ret = proc_setup_namespaces(s);
> + if (ret)
> + return ret;
> +
> ret = proc_setup_self(s);
> if (ret) {
> return ret;
> @@ -272,6 +276,9 @@ static void proc_kill_sb(struct super_block *sb)
> dput(fs_info->proc_self);
> dput(fs_info->proc_thread_self);
>
> + if (fs_info->proc_namespaces)
> + dput(fs_info->proc_namespaces);
> +
> kill_anon_super(sb);
> put_pid_ns(fs_info->pid_ns);
> kfree(fs_info);
> @@ -289,6 +296,7 @@ void __init proc_root_init(void)
> {
> proc_init_kmemcache();
> set_proc_pid_nlink();
> + proc_namespaces_init();
> proc_self_init();
> proc_thread_self_init();
> proc_symlink("mounts", NULL, "self/mounts");
> @@ -326,8 +334,15 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
>
> static int proc_root_readdir(struct file *file, struct dir_context *ctx)
> {
> - if (ctx->pos < FIRST_PROCESS_ENTRY) {
> + if (ctx->pos < NAMESPACES_ENTRY) {
> int error = proc_readdir(file, ctx);
> + if (unlikely(error <= 0))
> + return error;
> + ctx->pos = NAMESPACES_ENTRY;
> + }
> +
> + if (ctx->pos == NAMESPACES_ENTRY) {
> + int error = proc_emit_namespaces(file, ctx);
> if (unlikely(error <= 0))
> return error;
> ctx->pos = FIRST_PROCESS_ENTRY;
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 97b3f5f06db9..8b0002a6cacf 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -61,6 +61,7 @@ struct proc_fs_info {
> struct pid_namespace *pid_ns;
> struct dentry *proc_self; /* For /proc/self */
> struct dentry *proc_thread_self; /* For /proc/thread-self */
> + struct dentry *proc_namespaces; /* For /proc/namespaces */
> kgid_t pid_gid;
> enum proc_hidepid hide_pid;
> enum proc_pidonly pidonly;
>
>