Re: [PATCH v2 5/7] loop: preserve sysfs backwards compatibility

From: Serge E. Hallyn
Date: Wed Apr 22 2020 - 21:17:14 EST


On Wed, Apr 22, 2020 at 04:54:35PM +0200, Christian Brauner wrote:
> For sysfs the initial namespace is special. All devices currently
> propagate into all non-initial namespaces. For example, sysfs is usually
> mounted in a privileged or unprivileged container and all devices are
> visible to the container but are owned by global root. Even though none
> of the propagated files can be used there are still a lot of read-only
> values that are accessed or read by tools running in non-initial
> namespaces. Some devices though, which can be moved or created in
> another namespace, will only show up in the corresponding namespace.
> This currently includes network and loop devices but no other ones.
> Since all current workloads depend on devices from the inital namespace
> being visible this behavior cannot be simply changed. This patch just
> makes sure to keep propagating devices that share the same device class
> with loop devices from the initial namespaces into all non-initial
> namespaces as before. In short, nothing changes only loopfs loop devices
> will be shown in their correct namespace.
>
> Cc: Jens Axboe <axboe@xxxxxxxxx>
> Cc: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Christian Brauner <christian.brauner@xxxxxxxxxx>

Hi,

two comments below:

> ---
> /* v2 */
> - Christian Brauner <christian.brauner@xxxxxxxxxx>:
> - Protect init_net with a CONFIG_NET ifdef in case it is set to "n".
> - As Tejun pointed out there is argument to be made that a new mount
> option for sysfs could be added that would change how devices are
> propagated. This patch does not prevent this but it is an orthogonal
> problem.
> ---
> block/genhd.c | 79 +++++++++++++++++++++++++++++++++++++
> fs/kernfs/dir.c | 34 +++++++++++++---
> fs/kernfs/kernfs-internal.h | 24 +++++++++++
> fs/sysfs/mount.c | 4 ++
> include/linux/genhd.h | 3 ++
> include/linux/kernfs.h | 22 +++++++++++
> include/linux/kobject_ns.h | 4 ++
> lib/kobject.c | 2 +
> 8 files changed, 167 insertions(+), 5 deletions(-)
>
> diff --git a/block/genhd.c b/block/genhd.c
> index 06b642b23a07..b5b2601c4311 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1198,11 +1198,81 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data)
> return NULL;
> }
>
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> +static void *user_grab_current_ns(void)
> +{
> + struct user_namespace *ns = current_user_ns();
> + return get_user_ns(ns);
> +}
> +
> +static const void *user_initial_ns(void)
> +{
> + return &init_user_ns;
> +}
> +
> +static void user_put_ns(void *p)
> +{
> + struct user_namespace *ns = p;
> + put_user_ns(ns);
> +}
> +
> +static bool user_current_may_mount(void)
> +{
> + return ns_capable(current_user_ns(), CAP_SYS_ADMIN);
> +}
> +
> +const struct kobj_ns_type_operations user_ns_type_operations = {
> + .type = KOBJ_NS_TYPE_USER,
> + .current_may_mount = user_current_may_mount,
> + .grab_current_ns = user_grab_current_ns,
> + .initial_ns = user_initial_ns,
> + .drop_ns = user_put_ns,
> +};
> +
> +static const void *block_class_user_namespace(struct device *dev)
> +{
> + struct gendisk *disk;
> +
> + if (dev->type == &part_type)
> + disk = part_to_disk(dev_to_part(dev));
> + else
> + disk = dev_to_disk(dev);
> +
> + return disk->user_ns;
> +}
> +
> +static void block_class_get_ownership(struct device *dev, kuid_t *uid, kgid_t *gid)
> +{
> + struct gendisk *disk;
> + struct user_namespace *ns;
> +
> + if (dev->type == &part_type)
> + disk = part_to_disk(dev_to_part(dev));
> + else
> + disk = dev_to_disk(dev);
> +
> + ns = disk->user_ns;
> + if (ns && ns != &init_user_ns) {
> + kuid_t ns_root_uid = make_kuid(ns, 0);
> + kgid_t ns_root_gid = make_kgid(ns, 0);
> +
> + if (uid_valid(ns_root_uid))
> + *uid = ns_root_uid;
> +
> + if (gid_valid(ns_root_gid))
> + *gid = ns_root_gid;
> + }

You're not setting uid and gid in the else case?

> +}
> +#endif /* CONFIG_BLK_DEV_LOOPFS */
> +
> static int __init genhd_device_init(void)
> {
> int error;
>
> block_class.dev_kobj = sysfs_dev_block_kobj;
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + kobj_ns_type_register(&user_ns_type_operations);
> +#endif
> error = class_register(&block_class);
> if (unlikely(error))
> return error;
> @@ -1524,8 +1594,14 @@ static void disk_release(struct device *dev)
> blk_put_queue(disk->queue);
> kfree(disk);
> }
> +
> struct class block_class = {
> .name = "block",
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + .ns_type = &user_ns_type_operations,
> + .namespace = block_class_user_namespace,
> + .get_ownership = block_class_get_ownership,
> +#endif
> };
>
> static char *block_devnode(struct device *dev, umode_t *mode,
> @@ -1715,6 +1791,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
> disk_to_dev(disk)->class = &block_class;
> disk_to_dev(disk)->type = &disk_type;
> device_initialize(disk_to_dev(disk));
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + disk->user_ns = &init_user_ns;
> +#endif
> }
> return disk;
> }
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 1f2d894ae454..02796ba6521a 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -575,10 +575,15 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
> goto out_bad;
>
> /* The kernfs node has been moved to a different namespace */
> - if (kn->parent && kernfs_ns_enabled(kn->parent) &&
> - kernfs_info(dentry->d_sb)->ns[kn->ns_type] != kn->ns)
> - goto out_bad;
> + if (kn->parent && kernfs_ns_enabled(kn->parent)) {
> + if (kernfs_init_ns_propagates(kn->parent) &&
> + kn->ns == kernfs_init_ns(kn->parent->ns_type))
> + goto out_good;
> + if (kernfs_info(dentry->d_sb)->ns[kn->parent->ns_type] != kn->ns)
> + goto out_bad;
> + }
>
> +out_good:
> mutex_unlock(&kernfs_mutex);
> return 1;
> out_bad:
> @@ -1090,6 +1095,10 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
> ns = kernfs_info(dir->i_sb)->ns[parent->ns_type];
>
> kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> + if (!kn && kernfs_init_ns_propagates(parent)) {
> + ns = kernfs_init_ns(parent->ns_type);
> + kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
> + }
>
> /* no such entry */
> if (!kn || !kernfs_active(kn)) {
> @@ -1614,6 +1623,8 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
> static struct kernfs_node *kernfs_dir_pos(const void *ns,
> struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
> {
> + const void *init_ns;
> +
> if (pos) {
> int valid = kernfs_active(pos) &&
> pos->parent == parent && hash == pos->hash;
> @@ -1621,6 +1632,12 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> if (!valid)
> pos = NULL;
> }
> +
> + if (kernfs_init_ns_propagates(parent))
> + init_ns = kernfs_init_ns(parent->ns_type);
> + else
> + init_ns = NULL;
> +
> if (!pos && (hash > 1) && (hash < INT_MAX)) {
> struct rb_node *node = parent->dir.children.rb_node;
> while (node) {
> @@ -1635,7 +1652,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
> }
> }
> /* Skip over entries which are dying/dead or in the wrong namespace */
> - while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
> + while (pos && (!kernfs_active(pos) || (pos->ns != ns && pos->ns != init_ns))) {
> struct rb_node *node = rb_next(&pos->rb);
> if (!node)
> pos = NULL;
> @@ -1650,13 +1667,20 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
> {
> pos = kernfs_dir_pos(ns, parent, ino, pos);
> if (pos) {
> + const void *init_ns;
> + if (kernfs_init_ns_propagates(parent))
> + init_ns = kernfs_init_ns(parent->ns_type);
> + else
> + init_ns = NULL;
> +
> do {
> struct rb_node *node = rb_next(&pos->rb);
> if (!node)
> pos = NULL;
> else
> pos = rb_to_kn(node);
> - } while (pos && (!kernfs_active(pos) || pos->ns != ns));
> + } while (pos && (!kernfs_active(pos) ||
> + (pos->ns != ns && pos->ns != init_ns)));
> }
> return pos;
> }
> diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> index 7c972c00f84a..74eb6c447361 100644
> --- a/fs/kernfs/kernfs-internal.h
> +++ b/fs/kernfs/kernfs-internal.h
> @@ -80,6 +80,30 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
> return d_inode(dentry)->i_private;
> }
>
> +#ifdef CONFIG_NET
> +extern struct net init_net;
> +#endif
> +
> +extern struct user_namespace init_user_ns;
> +
> +static inline const void *kernfs_init_ns(enum kobj_ns_type ns_type)
> +{
> + switch (ns_type) {
> + case KOBJ_NS_TYPE_NET:
> +#ifdef CONFIG_NET
> + return &init_net;
> +#else
> + break;
> +#endif
> + case KOBJ_NS_TYPE_USER:
> + return &init_user_ns;
> + default:
> + pr_debug("Unsupported namespace type %d for kernfs\n", ns_type);
> + }
> +
> + return NULL;
> +}
> +
> extern const struct super_operations kernfs_sops;
> extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
>
> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
> index 5e2ec88a709e..99b82a0ae7ea 100644
> --- a/fs/sysfs/mount.c
> +++ b/fs/sysfs/mount.c
> @@ -43,6 +43,8 @@ static void sysfs_fs_context_free(struct fs_context *fc)
>
> if (kfc->ns_tag[KOBJ_NS_TYPE_NET])
> kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag[KOBJ_NS_TYPE_NET]);
> + if (kfc->ns_tag[KOBJ_NS_TYPE_USER])
> + kobj_ns_drop(KOBJ_NS_TYPE_USER, kfc->ns_tag[KOBJ_NS_TYPE_USER]);
> kernfs_free_fs_context(fc);
> kfree(kfc);
> }
> @@ -67,6 +69,7 @@ static int sysfs_init_fs_context(struct fs_context *fc)
> return -ENOMEM;
>
> kfc->ns_tag[KOBJ_NS_TYPE_NET] = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
> + kfc->ns_tag[KOBJ_NS_TYPE_USER] = kobj_ns_grab_current(KOBJ_NS_TYPE_USER);

It's nice and tidy this way so maybe worth it, but getting
the kobj_ns_type_lock spinlock twice in a row here seems
unfortunate.

> kfc->root = sysfs_root;
> kfc->magic = SYSFS_MAGIC;
> fc->fs_private = kfc;
> @@ -85,6 +88,7 @@ static void sysfs_kill_sb(struct super_block *sb)
>
> kernfs_kill_sb(sb);
> kobj_ns_drop(KOBJ_NS_TYPE_NET, ns[KOBJ_NS_TYPE_NET]);
> + kobj_ns_drop(KOBJ_NS_TYPE_USER, ns[KOBJ_NS_TYPE_USER]);
> }
>
> static struct file_system_type sysfs_fs_type = {
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 9b3fffdf4011..4edf8a82e815 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -220,6 +220,9 @@ struct gendisk {
> int node_id;
> struct badblocks *bb;
> struct lockdep_map lockdep_map;
> +#ifdef CONFIG_BLK_DEV_LOOPFS
> + struct user_namespace *user_ns;
> +#endif
> };
>
> static inline struct gendisk *part_to_disk(struct hd_struct *part)
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index d0544f2e0c99..7809584a7c8e 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -53,6 +53,7 @@ enum kernfs_node_flag {
> KERNFS_SUICIDED = 0x0800,
> KERNFS_EMPTY_DIR = 0x1000,
> KERNFS_HAS_RELEASE = 0x2000,
> + KERNFS_NS_PROPAGATE = 0x4000,
> };
>
> /* @flags for kernfs_create_root() */
> @@ -337,6 +338,27 @@ static inline void kernfs_enable_ns(struct kernfs_node *kn,
> kn->ns_type = ns_type;
> }
>
> +static inline void kernfs_enable_init_ns_propagates(struct kernfs_node *kn)
> +{
> + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
> + WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
> + WARN_ON_ONCE(!(kn->flags & KERNFS_NS));
> + kn->flags |= KERNFS_NS_PROPAGATE;
> +}
> +
> +/**
> + * kernfs_init_ns_propagates - test whether init ns propagates
> + * @kn: the node to test
> + *
> + * Test whether kernfs entries created in the init namespace propagate into
> + * other namespaces.
> + */
> +static inline bool kernfs_init_ns_propagates(const struct kernfs_node *kn)
> +{
> + return ((kn->flags & (KERNFS_NS | KERNFS_NS_PROPAGATE)) ==
> + (KERNFS_NS | KERNFS_NS_PROPAGATE));
> +}
> +
> /**
> * kernfs_ns_enabled - test whether namespace is enabled
> * @kn: the node to test
> diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h
> index 991a9286bcea..a9c45bcce235 100644
> --- a/include/linux/kobject_ns.h
> +++ b/include/linux/kobject_ns.h
> @@ -26,6 +26,7 @@ struct kobject;
> enum kobj_ns_type {
> KOBJ_NS_TYPE_NONE = 0,
> KOBJ_NS_TYPE_NET,
> + KOBJ_NS_TYPE_USER,
> KOBJ_NS_TYPES
> };
>
> @@ -34,6 +35,8 @@ enum kobj_ns_type {
> * @grab_current_ns: return a new reference to calling task's namespace
> * @initial_ns: return the initial namespace (i.e. init_net_ns)
> * @drop_ns: drops a reference to namespace
> + * @initial_ns_propagates: whether devices in the initial namespace propagate
> + * to all other namespaces
> */
> struct kobj_ns_type_operations {
> enum kobj_ns_type type;
> @@ -41,6 +44,7 @@ struct kobj_ns_type_operations {
> void *(*grab_current_ns)(void);
> const void *(*initial_ns)(void);
> void (*drop_ns)(void *);
> + bool (*initial_ns_propagates)(void);
> };
>
> int kobj_ns_type_register(const struct kobj_ns_type_operations *ops);
> diff --git a/lib/kobject.c b/lib/kobject.c
> index c58c62d49a10..96bb8c732d1c 100644
> --- a/lib/kobject.c
> +++ b/lib/kobject.c
> @@ -121,6 +121,8 @@ static int create_dir(struct kobject *kobj)
> BUG_ON(!kobj_ns_type_registered(ops->type));
>
> sysfs_enable_ns(kobj->sd, ops->type);
> + if (ops->initial_ns_propagates && ops->initial_ns_propagates())
> + kernfs_enable_init_ns_propagates(kobj->sd);
> }
>
> return 0;
> --
> 2.26.1