Re: [PATCH 3/4] cgroup: introduce sane_behavior mount option

From: Serge Hallyn
Date: Sun Apr 14 2013 - 21:06:09 EST


Quoting Tejun Heo (tj@xxxxxxxxxx):
> It's a sad fact that at this point various cgroup controllers are
> carrying so many idiosyncrasies and pure insanities that it simply
> isn't possible to reach any sort of sane consistent behavior while
> maintaining staying fully compatible with what already has been
> exposed to userland.
>
> As we can't break exposed userland interface, transitioning to sane
> behaviors can only be done in steps while maintaining backwards
> compatibility. This patch introduces a new mount option -
> __DEVEL__sane_behavior - which disables crazy features and enforces
> consistent behaviors in cgroup core proper and various controllers.
> As exactly which behaviors it changes are still being determined, the
> mount option, at this point, is useful only for development of the new
> behaviors. As such, the mount option is prefixed with __DEVEL__ and
> generates a warning message when used.
>
> Eventually, once we get to the point where all controller's behaviors
> are consistent enough to implement unified hierarchy, the __DEVEL__
> prefix will be dropped, and more importantly, unified-hierarchy will
> enforce sane_behavior by default. Maybe we'll able to completely drop
> the crazy stuff after a while, maybe not, but we at least have a
> strategy to move on to saner behaviors.
>
> This patch introduces the mount option and changes the following
> behaviors in cgroup core.
>
> * Mount options "noprefix" and "clone_children" are disallowed. Also,
> cgroupfs file cgroup.clone_children is not created.
>
> * When mounting an existing superblock, mount options should match.
> This is currently pretty crazy. If one mounts a cgroup, creates a
> subdirectory, unmounts it and then mount it again with different
> option, it looks like the new options are applied but they aren't.
>
> * Remount is disallowed.
>
> The behaviors changes are documented in the comment above
> CGRP_ROOT_SANE_BEHAVIOR enum and will be expanded as different
> controllers are converted and planned improvements progress.
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>

Acked-by: Serge E. Hallyn <serge.hallyn@xxxxxxxxxx>

> Cc: Li Zefan <lizefan@xxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxx>
> Cc: Vivek Goyal <vgoyal@xxxxxxxxxx>
> ---
> include/linux/cgroup.h | 43 +++++++++++++++++++++++++++++++++++++++++++
> kernel/cgroup.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 92 insertions(+)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index b21881e..9c300ad 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -156,6 +156,8 @@ enum {
> * specified at mount time and thus is implemented here.
> */
> CGRP_CPUSET_CLONE_CHILDREN,
> + /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
> + CGRP_SANE_BEHAVIOR,
> };
>
> struct cgroup_name {
> @@ -243,6 +245,37 @@ struct cgroup {
>
> /* cgroupfs_root->flags */
> enum {
> + /*
> + * Unfortunately, cgroup core and various controllers are riddled
> + * with idiosyncrasies and pointless options. The following flag,
> + * when set, will force sane behavior - some options are forced on,
> + * others are disallowed, and some controllers will change their
> + * hierarchical or other behaviors.
> + *
> + * The set of behaviors affected by this flag are still being
> + * determined and developed and the mount option for this flag is
> + * prefixed with __DEVEL__. The prefix will be dropped once we
> + * reach the point where all behaviors are compatible with the
> + * planned unified hierarchy, which will automatically turn on this
> + * flag.
> + *
> + * The followings are the behaviors currently affected this flag.
> + *
> + * - Mount options "noprefix" and "clone_children" are disallowed.
> + * Also, cgroupfs file cgroup.clone_children is not created.
> + *
> + * - When mounting an existing superblock, mount options should
> + * match.
> + *
> + * - Remount is disallowed.
> + *
> + * The followings are planned changes.
> + *
> + * - release_agent will be disallowed once replacement notification
> + * mechanism is implemented.
> + */
> + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
> +
> CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
> CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
> };
> @@ -360,6 +393,7 @@ struct cgroup_map_cb {
> /* cftype->flags */
> #define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */
> #define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create on root cg */
> +#define CFTYPE_INSANE (1U << 2) /* don't create if sane_behavior */
>
> #define MAX_CFTYPE_NAME 64
>
> @@ -486,6 +520,15 @@ struct cgroup_scanner {
> void *data;
> };
>
> +/*
> + * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
> + * function can be called as long as @cgrp is accessible.
> + */
> +static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
> +{
> + return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
> +}
> +
> /* Caller should hold rcu_read_lock() */
> static inline const char *cgroup_name(const struct cgroup *cgrp)
> {
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 8848070..e229800 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1080,6 +1080,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
> mutex_lock(&cgroup_root_mutex);
> for_each_subsys(root, ss)
> seq_printf(seq, ",%s", ss->name);
> + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
> + seq_puts(seq, ",sane_behavior");
> if (root->flags & CGRP_ROOT_NOPREFIX)
> seq_puts(seq, ",noprefix");
> if (root->flags & CGRP_ROOT_XATTR)
> @@ -1144,6 +1146,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
> all_ss = true;
> continue;
> }
> + if (!strcmp(token, "__DEVEL__sane_behavior")) {
> + opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
> + continue;
> + }
> if (!strcmp(token, "noprefix")) {
> opts->flags |= CGRP_ROOT_NOPREFIX;
> continue;
> @@ -1231,6 +1237,20 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
>
> /* Consistency checks */
>
> + if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
> + pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
> +
> + if (opts->flags & CGRP_ROOT_NOPREFIX) {
> + pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
> + return -EINVAL;
> + }
> +
> + if (opts->cpuset_clone_children) {
> + pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
> + return -EINVAL;
> + }
> + }
> +
> /*
> * Option noprefix was introduced just for backward compatibility
> * with the old cpuset, so we allow noprefix only if mounting just
> @@ -1307,6 +1327,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> struct cgroup_sb_opts opts;
> unsigned long added_mask, removed_mask;
>
> + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
> + pr_err("cgroup: sane_behavior: remount is not allowed\n");
> + return -EINVAL;
> + }
> +
> mutex_lock(&cgrp->dentry->d_inode->i_mutex);
> mutex_lock(&cgroup_mutex);
> mutex_lock(&cgroup_root_mutex);
> @@ -1657,6 +1682,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> * any) is not needed
> */
> cgroup_drop_root(opts.new_root);
> +
> + if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
> + root->flags != opts.flags) {
> + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
> + ret = -EINVAL;
> + goto drop_new_super;
> + }
> +
> /* no subsys rebinding, so refcounts don't change */
> drop_parsed_module_refcounts(opts.subsys_mask);
> }
> @@ -2197,6 +2230,13 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
> return 0;
> }
>
> +static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
> + struct seq_file *seq)
> +{
> + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
> + return 0;
> +}
> +
> /* A buffer size big enough for numbers or short strings */
> #define CGROUP_LOCAL_BUFFER_SIZE 64
>
> @@ -2678,6 +2718,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
>
> for (cft = cfts; cft->name[0] != '\0'; cft++) {
> /* does cft->flags tell us to skip this file on @cgrp? */
> + if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
> + continue;
> if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
> continue;
> if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
> @@ -3915,10 +3957,17 @@ static struct cftype files[] = {
> },
> {
> .name = "cgroup.clone_children",
> + .flags = CFTYPE_INSANE,
> .read_u64 = cgroup_clone_children_read,
> .write_u64 = cgroup_clone_children_write,
> },
> {
> + .name = "cgroup.sane_behavior",
> + .flags = CFTYPE_ONLY_ON_ROOT,
> + .read_seq_string = cgroup_sane_behavior_show,
> + .mode = S_IRUGO,
> + },
> + {
> .name = "release_agent",
> .flags = CFTYPE_ONLY_ON_ROOT,
> .read_seq_string = cgroup_release_agent_show,
> --
> 1.8.1.4
>
> _______________________________________________
> Containers mailing list
> Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
> https://lists.linuxfoundation.org/mailman/listinfo/containers
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/