Re: [PATCH 2/2] Customize sched domain via cpuset (v2)

From: Peter Zijlstra
Date: Thu Apr 10 2008 - 10:53:30 EST


On Fri, 2008-04-04 at 18:11 +0900, Hidetoshi Seto wrote:
> The implementation has some updates...
>
> > - Add 2 new cpuset files:
> > sched_wake_idle_far
> > sched_balance_newidle_far
> -> Merged into 1 file, having levels:
> sched_relax_domain_level
>
> > - Modify partition_sched_domains() and build_sched_domains()
> > to take flags parameter passed from cpuset.
> -> Changed to "attributes" rather than "flags."
>
> > - Fill newidle_idx for node domains which is currently unused but
> > might be required for sched_balance_newidle_far.
>
> + We can change the "default" level by boot option 'relax_domain_level='.
>
> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@xxxxxxxxxxxxxx>

This seems like a sufficiently flexible interface. Paul, have you got
any outstanding objections?

Acked-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>

> ---
> include/asm-ia64/topology.h | 2 -
> include/asm-sh/topology.h | 2 -
> include/asm-x86/topology.h | 2 -
> include/linux/sched.h | 23 +++++++++++-
> kernel/cpuset.c | 61 ++++++++++++++++++++++++++++++++
> kernel/sched.c | 82 +++++++++++++++++++++++++++++++++++++++++---
> kernel/sched_fair.c | 4 +-
> 7 files changed, 165 insertions(+), 11 deletions(-)
>
> Index: GIT-torvalds/include/linux/sched.h
> ===================================================================
> --- GIT-torvalds.orig/include/linux/sched.h
> +++ GIT-torvalds/include/linux/sched.h
> @@ -704,6 +704,7 @@ enum cpu_idle_type {
> #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
> #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
> #define SD_SERIALIZE 1024 /* Only a single load balancing instance */
> +#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
>
> #define BALANCE_FOR_MC_POWER \
> (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
> @@ -733,6 +734,24 @@ struct sched_group {
> u32 reciprocal_cpu_power;
> };
>
> +enum sched_domain_level {
> + SD_LV_NONE = 0,
> + SD_LV_SIBLING,
> + SD_LV_MC,
> + SD_LV_CPU,
> + SD_LV_NODE,
> + SD_LV_ALLNODES,
> + SD_LV_MAX
> +};
> +
> +struct sched_domain_attr {
> + int relax_domain_level;
> +};
> +
> +#define SD_ATTR_INIT (struct sched_domain_attr) { \
> + .relax_domain_level = -1, \
> +}
> +
> struct sched_domain {
> /* These fields must be setup */
> struct sched_domain *parent; /* top domain must be null terminated */
> @@ -750,6 +769,7 @@ struct sched_domain {
> unsigned int wake_idx;
> unsigned int forkexec_idx;
> int flags; /* See SD_* */
> + enum sched_domain_level level;
>
> /* Runtime fields. */
> unsigned long last_balance; /* init to jiffies. units in jiffies */
> @@ -789,7 +809,8 @@ struct sched_domain {
> #endif
> };
>
> -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
> +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
> + struct sched_domain_attr *dattr_new);
> extern int arch_reinit_sched_domains(void);
>
> #endif /* CONFIG_SMP */
> Index: GIT-torvalds/kernel/sched.c
> ===================================================================
> --- GIT-torvalds.orig/kernel/sched.c
> +++ GIT-torvalds/kernel/sched.c
> @@ -6582,11 +6582,42 @@ static void init_sched_groups_power(int
> } while (group != child->groups);
> }
>
> +static int default_relax_domain_level = -1;
> +
> +static int __init setup_relax_domain_level(char *str)
> +{
> + default_relax_domain_level = simple_strtoul(str, NULL, 0);
> + return 1;
> +}
> +__setup("relax_domain_level=", setup_relax_domain_level);
> +
> +static void set_domain_attribute(struct sched_domain *sd,
> + struct sched_domain_attr *attr)
> +{
> + int request;
> +
> + if (!attr || attr->relax_domain_level < 0) {
> + if (default_relax_domain_level < 0)
> + return;
> + else
> + request = default_relax_domain_level;
> + } else
> + request = attr->relax_domain_level;
> + if (request < sd->level) {
> + /* turn off idle balance on this domain */
> + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
> + } else {
> + /* turn on idle balance on this domain */
> + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
> + }
> +}
> +
> /*
> * Build sched domains for a given set of cpus and attach the sched domains
> * to the individual cpus
> */
> -static int build_sched_domains(const cpumask_t *cpu_map)
> +static int __build_sched_domains(const cpumask_t *cpu_map,
> + struct sched_domain_attr *attr)
> {
> int i;
> struct root_domain *rd;
> @@ -6626,7 +6657,9 @@ static int build_sched_domains(const cpu
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
> sd = &per_cpu(allnodes_domains, i);
> *sd = SD_ALLNODES_INIT;
> + sd->level = SD_LV_ALLNODES;
> sd->span = *cpu_map;
> + set_domain_attribute(sd, attr);
> cpu_to_allnodes_group(i, cpu_map, &sd->groups);
> p = sd;
> sd_allnodes = 1;
> @@ -6635,7 +6668,9 @@ static int build_sched_domains(const cpu
>
> sd = &per_cpu(node_domains, i);
> *sd = SD_NODE_INIT;
> + sd->level = SD_LV_NODE;
> sd->span = sched_domain_node_span(cpu_to_node(i));
> + set_domain_attribute(sd, attr);
> sd->parent = p;
> if (p)
> p->child = sd;
> @@ -6645,7 +6680,9 @@ static int build_sched_domains(const cpu
> p = sd;
> sd = &per_cpu(phys_domains, i);
> *sd = SD_CPU_INIT;
> + sd->level = SD_LV_CPU;
> sd->span = nodemask;
> + set_domain_attribute(sd, attr);
> sd->parent = p;
> if (p)
> p->child = sd;
> @@ -6655,8 +6692,10 @@ static int build_sched_domains(const cpu
> p = sd;
> sd = &per_cpu(core_domains, i);
> *sd = SD_MC_INIT;
> + sd->level = SD_LV_MC;
> sd->span = cpu_coregroup_map(i);
> cpus_and(sd->span, sd->span, *cpu_map);
> + set_domain_attribute(sd, attr);
> sd->parent = p;
> p->child = sd;
> cpu_to_core_group(i, cpu_map, &sd->groups);
> @@ -6666,8 +6705,10 @@ static int build_sched_domains(const cpu
> p = sd;
> sd = &per_cpu(cpu_domains, i);
> *sd = SD_SIBLING_INIT;
> + sd->level = SD_LV_SIBLING;
> sd->span = per_cpu(cpu_sibling_map, i);
> cpus_and(sd->span, sd->span, *cpu_map);
> + set_domain_attribute(sd, attr);
> sd->parent = p;
> p->child = sd;
> cpu_to_cpu_group(i, cpu_map, &sd->groups);
> @@ -6840,8 +6881,15 @@ error:
> #endif
> }
>
> +static int build_sched_domains(const cpumask_t *cpu_map)
> +{
> + return __build_sched_domains(cpu_map, NULL);
> +}
> +
> static cpumask_t *doms_cur; /* current sched domains */
> static int ndoms_cur; /* number of sched domains in 'doms_cur' */
> +static struct sched_domain_attr *dattr_cur; /* attributes of custom domains
> + in 'doms_cur' */
>
> /*
> * Special case: If a kmalloc of a doms_cur partition (array of
> @@ -6868,6 +6916,7 @@ static int arch_init_sched_domains(const
> doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
> if (!doms_cur)
> doms_cur = &fallback_doms;
> + dattr_cur = NULL;
> cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
> err = build_sched_domains(doms_cur);
> register_sched_domain_sysctl();
> @@ -6896,6 +6945,22 @@ static void detach_destroy_domains(const
> arch_destroy_sched_domains(cpu_map);
> }
>
> +/* handle null as "default" */
> +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
> + struct sched_domain_attr *new, int idx_new)
> +{
> + struct sched_domain_attr tmp;
> +
> + /* fast path */
> + if (!new && !cur)
> + return 1;
> +
> + tmp = SD_ATTR_INIT;
> + return !memcmp(cur ? (cur + idx_cur) : &tmp,
> + new ? (new + idx_new) : &tmp,
> + sizeof(struct sched_domain_attr));
> +}
> +
> /*
> * Partition sched domains as specified by the 'ndoms_new'
> * cpumasks in the array doms_new[] of cpumasks. This compares
> @@ -6917,7 +6982,8 @@ static void detach_destroy_domains(const
> *
> * Call with hotplug lock held
> */
> -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
> +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
> + struct sched_domain_attr *dattr_new)
> {
> int i, j;
>
> @@ -6929,13 +6995,15 @@ void partition_sched_domains(int ndoms_n
> if (doms_new == NULL) {
> ndoms_new = 1;
> doms_new = &fallback_doms;
> + dattr_new = NULL;
> cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
> }
>
> /* Destroy deleted domains */
> for (i = 0; i < ndoms_cur; i++) {
> for (j = 0; j < ndoms_new; j++) {
> - if (cpus_equal(doms_cur[i], doms_new[j]))
> + if (cpus_equal(doms_cur[i], doms_new[j])
> + && dattrs_equal(dattr_cur, i, dattr_new, j))
> goto match1;
> }
> /* no match - a current sched domain not in new doms_new[] */
> @@ -6947,11 +7015,13 @@ match1:
> /* Build new domains */
> for (i = 0; i < ndoms_new; i++) {
> for (j = 0; j < ndoms_cur; j++) {
> - if (cpus_equal(doms_new[i], doms_cur[j]))
> + if (cpus_equal(doms_new[i], doms_cur[j])
> + && dattrs_equal(dattr_new, i, dattr_cur, j))
> goto match2;
> }
> /* no match - add a new doms_new */
> - build_sched_domains(doms_new + i);
> + __build_sched_domains(doms_new + i,
> + dattr_new ? dattr_new + i : NULL);
> match2:
> ;
> }
> @@ -6959,7 +7029,9 @@ match2:
> /* Remember the new sched domains */
> if (doms_cur != &fallback_doms)
> kfree(doms_cur);
> + kfree(dattr_cur); /* kfree(NULL) is safe */
> doms_cur = doms_new;
> + dattr_cur = dattr_new;
> ndoms_cur = ndoms_new;
>
> register_sched_domain_sysctl();
> Index: GIT-torvalds/kernel/sched_fair.c
> ===================================================================
> --- GIT-torvalds.orig/kernel/sched_fair.c
> +++ GIT-torvalds/kernel/sched_fair.c
> @@ -957,7 +957,9 @@ static int wake_idle(int cpu, struct tas
> return cpu;
>
> for_each_domain(cpu, sd) {
> - if (sd->flags & SD_WAKE_IDLE) {
> + if ((sd->flags & SD_WAKE_IDLE)
> + || ((sd->flags & SD_WAKE_IDLE_FAR)
> + && !task_hot(p, task_rq(p)->clock, sd))) {
> cpus_and(tmp, sd->span, p->cpus_allowed);
> for_each_cpu_mask(i, tmp) {
> if (idle_cpu(i)) {
> Index: GIT-torvalds/kernel/cpuset.c
> ===================================================================
> --- GIT-torvalds.orig/kernel/cpuset.c
> +++ GIT-torvalds/kernel/cpuset.c
> @@ -98,6 +98,9 @@ struct cpuset {
> /* partition number for rebuild_sched_domains() */
> int pn;
>
> + /* for custom sched domain */
> + int relax_domain_level;
> +
> /* used for walking a cpuset heirarchy */
> struct list_head stack_list;
> };
> @@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset
> return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
> }
>
> +static void
> +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
> +{
> + if (!dattr)
> + return;
> + if (dattr->relax_domain_level < c->relax_domain_level)
> + dattr->relax_domain_level = c->relax_domain_level;
> + return;
> +}
> +
> /*
> * rebuild_sched_domains()
> *
> @@ -553,12 +566,14 @@ static void rebuild_sched_domains(void)
> int csn; /* how many cpuset ptrs in csa so far */
> int i, j, k; /* indices for partition finding loops */
> cpumask_t *doms; /* resulting partition; i.e. sched domains */
> + struct sched_domain_attr *dattr; /* attributes for custom domains */
> int ndoms; /* number of sched domains in result */
> int nslot; /* next empty doms[] cpumask_t slot */
>
> q = NULL;
> csa = NULL;
> doms = NULL;
> + dattr = NULL;
>
> /* Special case for the 99% of systems with one, full, sched domain */
> if (is_sched_load_balance(&top_cpuset)) {
> @@ -566,6 +581,11 @@ static void rebuild_sched_domains(void)
> doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
> if (!doms)
> goto rebuild;
> + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
> + if (dattr) {
> + *dattr = SD_ATTR_INIT;
> + update_domain_attr(dattr, &top_cpuset);
> + }
> *doms = top_cpuset.cpus_allowed;
> goto rebuild;
> }
> @@ -622,6 +642,7 @@ restart:
> doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
> if (!doms)
> goto rebuild;
> + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
>
> for (nslot = 0, i = 0; i < csn; i++) {
> struct cpuset *a = csa[i];
> @@ -644,12 +665,15 @@ restart:
> }
>
> cpus_clear(*dp);
> + if (dattr)
> + *(dattr + nslot) = SD_ATTR_INIT;
> for (j = i; j < csn; j++) {
> struct cpuset *b = csa[j];
>
> if (apn == b->pn) {
> cpus_or(*dp, *dp, b->cpus_allowed);
> b->pn = -1;
> + update_domain_attr(dattr, b);
> }
> }
> nslot++;
> @@ -660,7 +684,7 @@ restart:
> rebuild:
> /* Have scheduler rebuild sched domains */
> get_online_cpus();
> - partition_sched_domains(ndoms, doms);
> + partition_sched_domains(ndoms, doms, dattr);
> put_online_cpus();
>
> done:
> @@ -668,6 +692,7 @@ done:
> kfifo_free(q);
> kfree(csa);
> /* Don't kfree(doms) -- partition_sched_domains() does that. */
> + /* Don't kfree(dattr) -- partition_sched_domains() does that. */
> }
>
> static inline int started_after_time(struct task_struct *t1,
> @@ -1011,6 +1036,21 @@ static int update_memory_pressure_enable
> return 0;
> }
>
> +static int update_relax_domain_level(struct cpuset *cs, char *buf)
> +{
> + int val = simple_strtol(buf, NULL, 10);
> +
> + if (val < 0)
> + val = -1;
> +
> + if (val != cs->relax_domain_level) {
> + cs->relax_domain_level = val;
> + rebuild_sched_domains();
> + }
> +
> + return 0;
> +}
> +
> /*
> * update_flag - read a 0 or a 1 in a file and update associated flag
> * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
> @@ -1202,6 +1242,7 @@ typedef enum {
> FILE_CPU_EXCLUSIVE,
> FILE_MEM_EXCLUSIVE,
> FILE_SCHED_LOAD_BALANCE,
> + FILE_SCHED_RELAX_DOMAIN_LEVEL,
> FILE_MEMORY_PRESSURE_ENABLED,
> FILE_MEMORY_PRESSURE,
> FILE_SPREAD_PAGE,
> @@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(
> case FILE_SCHED_LOAD_BALANCE:
> retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
> break;
> + case FILE_SCHED_RELAX_DOMAIN_LEVEL:
> + retval = update_relax_domain_level(cs, buffer);
> + break;
> case FILE_MEMORY_MIGRATE:
> retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
> break;
> @@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(s
> case FILE_SCHED_LOAD_BALANCE:
> *s++ = is_sched_load_balance(cs) ? '1' : '0';
> break;
> + case FILE_SCHED_RELAX_DOMAIN_LEVEL:
> + s += sprintf(s, "%d", cs->relax_domain_level);
> + break;
> case FILE_MEMORY_MIGRATE:
> *s++ = is_memory_migrate(cs) ? '1' : '0';
> break;
> @@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_bala
> .private = FILE_SCHED_LOAD_BALANCE,
> };
>
> +static struct cftype cft_sched_relax_domain_level = {
> + .name = "sched_relax_domain_level",
> + .read = cpuset_common_file_read,
> + .write = cpuset_common_file_write,
> + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
> +};
> +
> static struct cftype cft_memory_migrate = {
> .name = "memory_migrate",
> .read = cpuset_common_file_read,
> @@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup
> return err;
> if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
> return err;
> + if ((err = cgroup_add_file(cont, ss,
> + &cft_sched_relax_domain_level)) < 0)
> + return err;
> if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
> return err;
> if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
> @@ -1559,6 +1616,7 @@ static struct cgroup_subsys_state *cpuse
> cs->mems_allowed = NODE_MASK_NONE;
> cs->mems_generation = cpuset_mems_generation++;
> fmeter_init(&cs->fmeter);
> + cs->relax_domain_level = -1;
>
> cs->parent = parent;
> number_of_cpusets++;
> @@ -1631,6 +1689,7 @@ int __init cpuset_init(void)
> fmeter_init(&top_cpuset.fmeter);
> top_cpuset.mems_generation = cpuset_mems_generation++;
> set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
> + top_cpuset.relax_domain_level = -1;
>
> err = register_filesystem(&cpuset_fs_type);
> if (err < 0)
> Index: GIT-torvalds/include/asm-ia64/topology.h
> ===================================================================
> --- GIT-torvalds.orig/include/asm-ia64/topology.h
> +++ GIT-torvalds/include/asm-ia64/topology.h
> @@ -93,7 +93,7 @@ void build_cpu_to_node_map(void);
> .cache_nice_tries = 2, \
> .busy_idx = 3, \
> .idle_idx = 2, \
> - .newidle_idx = 0, /* unused */ \
> + .newidle_idx = 2, \
> .wake_idx = 1, \
> .forkexec_idx = 1, \
> .flags = SD_LOAD_BALANCE \
> Index: GIT-torvalds/include/asm-sh/topology.h
> ===================================================================
> --- GIT-torvalds.orig/include/asm-sh/topology.h
> +++ GIT-torvalds/include/asm-sh/topology.h
> @@ -16,7 +16,7 @@
> .cache_nice_tries = 2, \
> .busy_idx = 3, \
> .idle_idx = 2, \
> - .newidle_idx = 0, \
> + .newidle_idx = 2, \
> .wake_idx = 1, \
> .forkexec_idx = 1, \
> .flags = SD_LOAD_BALANCE \
> Index: GIT-torvalds/include/asm-x86/topology.h
> ===================================================================
> --- GIT-torvalds.orig/include/asm-x86/topology.h
> +++ GIT-torvalds/include/asm-x86/topology.h
> @@ -129,7 +129,7 @@ extern unsigned long node_remap_size[];
>
> # define SD_CACHE_NICE_TRIES 2
> # define SD_IDLE_IDX 2
> -# define SD_NEWIDLE_IDX 0
> +# define SD_NEWIDLE_IDX 2
> # define SD_FORKEXEC_IDX 1
>
> #endif
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/