Re: [PATCH 2/2] Customize sched domain via cpuset

From: Peter Zijlstra
Date: Tue Apr 01 2008 - 07:52:20 EST


On Tue, 2008-04-01 at 20:27 +0900, Hidetoshi Seto wrote:
> The implementation is here.
>
> - Add 2 new cpuset files:
> sched_wake_idle_far
> sched_balance_newidle_far
>
> - Modify partition_sched_domains() and build_sched_domains()
> to take flags parameter passed from cpuset.
>
> - Fill newidle_idx for node domains which currently unused but
> might be required for sched_balance_newidle_far.

Just to be clear; the same effect can be had by poking into:

/proc/sys/kernel/sched_domain/$cpu/$domain/flags

but the interface you now propose is more stable, in that with the
/proc interface you'd have to re-do your settings after every cpuset
change (admittedly those are rare, but I can see how it could be a nuisance).

Or do you actually add something that wasn't available through the
initial domain interface?

> Signed-off-by: Hidetoshi Seto <seto.hidetoshi@xxxxxxxxxxxxxx>
>
> ---
> include/asm-ia64/topology.h | 2
> include/asm-sh/topology.h | 2
> include/asm-x86/topology.h | 2
> include/linux/sched.h | 4 +
> kernel/cpuset.c | 89 ++++++++++++++++++++++++++++++++++++++++++--
> kernel/sched.c | 38 ++++++++++++++++--
> kernel/sched_fair.c | 4 +
> 7 files changed, 128 insertions(+), 13 deletions(-)
>
> Index: GIT-torvalds/kernel/sched_fair.c
> ===================================================================
> --- GIT-torvalds.orig/kernel/sched_fair.c
> +++ GIT-torvalds/kernel/sched_fair.c
> @@ -957,7 +957,9 @@ static int wake_idle(int cpu, struct tas
> return cpu;
>
> for_each_domain(cpu, sd) {
> - if (sd->flags & SD_WAKE_IDLE) {
> + if ((sd->flags & SD_WAKE_IDLE)
> + || ((sd->flags & SD_WAKE_IDLE_FAR)
> + && !task_hot(p, task_rq(p)->clock, sd))) {
> cpus_and(tmp, sd->span, p->cpus_allowed);
> for_each_cpu_mask(i, tmp) {
> if (idle_cpu(i)) {
> Index: GIT-torvalds/kernel/cpuset.c
> ===================================================================
> --- GIT-torvalds.orig/kernel/cpuset.c
> +++ GIT-torvalds/kernel/cpuset.c
> @@ -126,6 +126,8 @@ typedef enum {
> CS_MEM_EXCLUSIVE,
> CS_MEMORY_MIGRATE,
> CS_SCHED_LOAD_BALANCE,
> + CS_SCHED_BALANCE_NEWIDLE_FAR,
> + CS_SCHED_WAKE_IDLE_FAR,
> CS_SPREAD_PAGE,
> CS_SPREAD_SLAB,
> } cpuset_flagbits_t;
> @@ -146,6 +148,16 @@ static inline int is_sched_load_balance(
> return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
> }
>
> +static inline int is_sched_balance_newidle_far(const struct cpuset *cs)
> +{
> + return test_bit(CS_SCHED_BALANCE_NEWIDLE_FAR, &cs->flags);
> +}
> +
> +static inline int is_sched_wake_idle_far(const struct cpuset *cs)
> +{
> + return test_bit(CS_SCHED_WAKE_IDLE_FAR, &cs->flags);
> +}
> +
> static inline int is_memory_migrate(const struct cpuset *cs)
> {
> return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
> @@ -161,6 +173,11 @@ static inline int is_spread_slab(const s
> return test_bit(CS_SPREAD_SLAB, &cs->flags);
> }
>
> +static inline int is_sched_custom_domain(const struct cpuset *cs)
> +{
> + return is_sched_balance_newidle_far(cs) || is_sched_wake_idle_far(cs);
> +}
> +
> /*
> * Increment this integer everytime any cpuset changes its
> * mems_allowed value. Users of cpusets can track this generation
> @@ -553,12 +570,14 @@ static void rebuild_sched_domains(void)
> int csn; /* how many cpuset ptrs in csa so far */
> int i, j, k; /* indices for partition finding loops */
> cpumask_t *doms; /* resulting partition; i.e. sched domains */
> + int *flags; /* flags for custom sched domains */
> int ndoms; /* number of sched domains in result */
> int nslot; /* next empty doms[] cpumask_t slot */
>
> q = NULL;
> csa = NULL;
> doms = NULL;
> + flags = NULL;
>
> /* Special case for the 99% of systems with one, full, sched domain */
> if (is_sched_load_balance(&top_cpuset)) {
> @@ -566,6 +585,13 @@ static void rebuild_sched_domains(void)
> doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
> if (!doms)
> goto rebuild;
> + if (is_sched_custom_domain(&top_cpuset)) {
> + flags = kzalloc(sizeof(int), GFP_KERNEL);
> + if (flags && is_sched_balance_newidle_far(&top_cpuset))
> + *flags |= SD_BALANCE_NEWIDLE;
> + if (flags && is_sched_wake_idle_far(&top_cpuset))
> + *flags |= SD_WAKE_IDLE_FAR;
> + }
> *doms = top_cpuset.cpus_allowed;
> goto rebuild;
> }
> @@ -622,6 +648,7 @@ restart:
> doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
> if (!doms)
> goto rebuild;
> + flags = kzalloc(ndoms * sizeof(int), GFP_KERNEL);
>
> for (nslot = 0, i = 0; i < csn; i++) {
> struct cpuset *a = csa[i];
> @@ -650,6 +677,13 @@ restart:
> if (apn == b->pn) {
> cpus_or(*dp, *dp, b->cpus_allowed);
> b->pn = -1;
> + if (flags
> + && is_sched_balance_newidle_far(b))
> + *(flags + nslot) |=
> + SD_BALANCE_NEWIDLE;
> + if (flags && is_sched_wake_idle_far(b))
> + *(flags + nslot) |=
> + SD_WAKE_IDLE_FAR;
> }
> }
> nslot++;
> @@ -660,7 +694,7 @@ restart:
> rebuild:
> /* Have scheduler rebuild sched domains */
> get_online_cpus();
> - partition_sched_domains(ndoms, doms);
> + partition_sched_domains(ndoms, doms, flags);
> put_online_cpus();
>
> done:
> @@ -668,6 +702,7 @@ done:
> kfifo_free(q);
> kfree(csa);
> /* Don't kfree(doms) -- partition_sched_domains() does that. */
> + /* Don't kfree(flags) -- partition_sched_domains() does that. */
> }
>
> static inline int started_after_time(struct task_struct *t1,
> @@ -1011,10 +1046,26 @@ static int update_memory_pressure_enable
> return 0;
> }
>
> +static int need_rebuild_domains(struct cpuset *cs, struct cpuset *tcs)
> +{
> + if (is_sched_load_balance(cs) != is_sched_load_balance(tcs))
> + return 1;
> + if (!is_sched_load_balance(tcs))
> + return 0;
> + if (is_sched_balance_newidle_far(cs) !=
> + is_sched_balance_newidle_far(tcs))
> + return 1;
> + if (is_sched_wake_idle_far(cs) != is_sched_wake_idle_far(tcs))
> + return 1;
> + return 0;
> +}
> +
> /*
> * update_flag - read a 0 or a 1 in a file and update associated flag
> * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
> * CS_SCHED_LOAD_BALANCE,
> * CS_SCHED_BALANCE_NEWIDLE_FAR,
> + * CS_SCHED_WAKE_IDLE_FAR,
> * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
> * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
> * cs: the cpuset to update
> @@ -1043,8 +1094,7 @@ static int update_flag(cpuset_flagbits_t
> return err;
>
> cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
> - balance_flag_changed = (is_sched_load_balance(cs) !=
> - is_sched_load_balance(&trialcs));
> + balance_flag_changed = need_rebuild_domains(cs, &trialcs);
>
> mutex_lock(&callback_mutex);
> cs->flags = trialcs.flags;
> @@ -1202,6 +1252,8 @@ typedef enum {
> FILE_CPU_EXCLUSIVE,
> FILE_MEM_EXCLUSIVE,
> FILE_SCHED_LOAD_BALANCE,
> + FILE_SCHED_BALANCE_NEWIDLE_FAR,
> + FILE_SCHED_WAKE_IDLE_FAR,
> FILE_MEMORY_PRESSURE_ENABLED,
> FILE_MEMORY_PRESSURE,
> FILE_SPREAD_PAGE,
> @@ -1256,6 +1308,12 @@ static ssize_t cpuset_common_file_write(
> case FILE_SCHED_LOAD_BALANCE:
> retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
> break;
> + case FILE_SCHED_BALANCE_NEWIDLE_FAR:
> + retval = update_flag(CS_SCHED_BALANCE_NEWIDLE_FAR, cs, buffer);
> + break;
> + case FILE_SCHED_WAKE_IDLE_FAR:
> + retval = update_flag(CS_SCHED_WAKE_IDLE_FAR, cs, buffer);
> + break;
> case FILE_MEMORY_MIGRATE:
> retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
> break;
> @@ -1354,6 +1412,12 @@ static ssize_t cpuset_common_file_read(s
> case FILE_SCHED_LOAD_BALANCE:
> *s++ = is_sched_load_balance(cs) ? '1' : '0';
> break;
> + case FILE_SCHED_BALANCE_NEWIDLE_FAR:
> + *s++ = is_sched_balance_newidle_far(cs) ? '1' : '0';
> + break;
> + case FILE_SCHED_WAKE_IDLE_FAR:
> + *s++ = is_sched_wake_idle_far(cs) ? '1' : '0';
> + break;
> case FILE_MEMORY_MIGRATE:
> *s++ = is_memory_migrate(cs) ? '1' : '0';
> break;
> @@ -1424,6 +1488,20 @@ static struct cftype cft_sched_load_bala
> .private = FILE_SCHED_LOAD_BALANCE,
> };
>
> +static struct cftype cft_sched_balance_newidle_far = {
> + .name = "sched_balance_newidle_far",
> + .read = cpuset_common_file_read,
> + .write = cpuset_common_file_write,
> + .private = FILE_SCHED_BALANCE_NEWIDLE_FAR,
> +};
> +
> +static struct cftype cft_sched_wake_idle_far = {
> + .name = "sched_wake_idle_far",
> + .read = cpuset_common_file_read,
> + .write = cpuset_common_file_write,
> + .private = FILE_SCHED_WAKE_IDLE_FAR,
> +};
> +
> static struct cftype cft_memory_migrate = {
> .name = "memory_migrate",
> .read = cpuset_common_file_read,
> @@ -1475,6 +1553,11 @@ static int cpuset_populate(struct cgroup
> return err;
> if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
> return err;
> + if ((err = cgroup_add_file(cont, ss,
> + &cft_sched_balance_newidle_far)) < 0)
> + return err;
> + if ((err = cgroup_add_file(cont, ss, &cft_sched_wake_idle_far)) < 0)
> + return err;
> if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
> return err;
> if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
> Index: GIT-torvalds/include/linux/sched.h
> ===================================================================
> --- GIT-torvalds.orig/include/linux/sched.h
> +++ GIT-torvalds/include/linux/sched.h
> @@ -704,6 +704,7 @@ enum cpu_idle_type {
> #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
> #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
> #define SD_SERIALIZE 1024 /* Only a single load balancing instance */
> +#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
>
> #define BALANCE_FOR_MC_POWER \
> (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
> @@ -789,7 +790,8 @@ struct sched_domain {
> #endif
> };
>
> -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
> +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
> + int *flags_new);
> extern int arch_reinit_sched_domains(void);
>
> #endif /* CONFIG_SMP */
> Index: GIT-torvalds/kernel/sched.c
> ===================================================================
> --- GIT-torvalds.orig/kernel/sched.c
> +++ GIT-torvalds/kernel/sched.c
> @@ -6586,7 +6586,7 @@ static void init_sched_groups_power(int
> * Build sched domains for a given set of cpus and attach the sched domains
> * to the individual cpus
> */
> -static int build_sched_domains(const cpumask_t *cpu_map)
> +static int __build_sched_domains(const cpumask_t *cpu_map, int flags)
> {
> int i;
> struct root_domain *rd;
> @@ -6627,6 +6627,7 @@ static int build_sched_domains(const cpu
> sd = &per_cpu(allnodes_domains, i);
> *sd = SD_ALLNODES_INIT;
> sd->span = *cpu_map;
> + /* prohibit "sd->flags |= flags" for allnodes_domain */
> cpu_to_allnodes_group(i, cpu_map, &sd->groups);
> p = sd;
> sd_allnodes = 1;
> @@ -6636,6 +6637,7 @@ static int build_sched_domains(const cpu
> sd = &per_cpu(node_domains, i);
> *sd = SD_NODE_INIT;
> sd->span = sched_domain_node_span(cpu_to_node(i));
> + sd->flags |= flags;
> sd->parent = p;
> if (p)
> p->child = sd;
> @@ -6646,6 +6648,7 @@ static int build_sched_domains(const cpu
> sd = &per_cpu(phys_domains, i);
> *sd = SD_CPU_INIT;
> sd->span = nodemask;
> + sd->flags |= flags;
> sd->parent = p;
> if (p)
> p->child = sd;
> @@ -6657,6 +6660,7 @@ static int build_sched_domains(const cpu
> *sd = SD_MC_INIT;
> sd->span = cpu_coregroup_map(i);
> cpus_and(sd->span, sd->span, *cpu_map);
> + sd->flags |= flags;
> sd->parent = p;
> p->child = sd;
> cpu_to_core_group(i, cpu_map, &sd->groups);
> @@ -6668,6 +6672,7 @@ static int build_sched_domains(const cpu
> *sd = SD_SIBLING_INIT;
> sd->span = per_cpu(cpu_sibling_map, i);
> cpus_and(sd->span, sd->span, *cpu_map);
> + sd->flags |= flags;
> sd->parent = p;
> p->child = sd;
> cpu_to_cpu_group(i, cpu_map, &sd->groups);
> @@ -6840,8 +6845,14 @@ error:
> #endif
> }
>
> +static int build_sched_domains(const cpumask_t *cpu_map)
> +{
> + return __build_sched_domains(cpu_map, 0);
> +}
> +
> static cpumask_t *doms_cur; /* current sched domains */
> static int ndoms_cur; /* number of sched domains in 'doms_cur' */
> +static int *flags_cur; /* custom flags of domains in 'doms_cur' */
>
> /*
> * Special case: If a kmalloc of a doms_cur partition (array of
> @@ -6868,6 +6879,7 @@ static int arch_init_sched_domains(const
> doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
> if (!doms_cur)
> doms_cur = &fallback_doms;
> + flags_cur = NULL;
> cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
> err = build_sched_domains(doms_cur);
> register_sched_domain_sysctl();
> @@ -6896,6 +6908,16 @@ static void detach_destroy_domains(const
> arch_destroy_sched_domains(cpu_map);
> }
>
> +/* handle null as 0s array */
> +static inline int flags_equal(int *cur, int idx_cur, int *new, int idx_new)
> +{
> + if (!new)
> + return (!cur || !cur[idx_cur]);
> + if (!cur)
> + return (!new[idx_new]);
> + return (cur[idx_cur] == new[idx_new]);
> +}
> +
> /*
> * Partition sched domains as specified by the 'ndoms_new'
> * cpumasks in the array doms_new[] of cpumasks. This compares
> @@ -6917,7 +6939,7 @@ static void detach_destroy_domains(const
> *
> * Call with hotplug lock held
> */
> -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
> +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, int *flags_new)
> {
> int i, j;
>
> @@ -6929,13 +6951,15 @@ void partition_sched_domains(int ndoms_n
> if (doms_new == NULL) {
> ndoms_new = 1;
> doms_new = &fallback_doms;
> + flags_new = NULL;
> cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
> }
>
> /* Destroy deleted domains */
> for (i = 0; i < ndoms_cur; i++) {
> for (j = 0; j < ndoms_new; j++) {
> - if (cpus_equal(doms_cur[i], doms_new[j]))
> + if (cpus_equal(doms_cur[i], doms_new[j])
> + && flags_equal(flags_cur, i, flags_new, j))
> goto match1;
> }
> /* no match - a current sched domain not in new doms_new[] */
> @@ -6947,11 +6971,13 @@ match1:
> /* Build new domains */
> for (i = 0; i < ndoms_new; i++) {
> for (j = 0; j < ndoms_cur; j++) {
> - if (cpus_equal(doms_new[i], doms_cur[j]))
> + if (cpus_equal(doms_new[i], doms_cur[j])
> + && flags_equal(flags_new, i, flags_cur, j))
> goto match2;
> }
> /* no match - add a new doms_new */
> - build_sched_domains(doms_new + i);
> + __build_sched_domains(doms_new + i,
> + flags_new ? flags_new[i] : 0);
> match2:
> ;
> }
> @@ -6959,7 +6985,9 @@ match2:
> /* Remember the new sched domains */
> if (doms_cur != &fallback_doms)
> kfree(doms_cur);
> + kfree(flags_cur); /* kfree(NULL) is safe */
> doms_cur = doms_new;
> + flags_cur = flags_new;
> ndoms_cur = ndoms_new;
>
> register_sched_domain_sysctl();
> Index: GIT-torvalds/include/asm-ia64/topology.h
> ===================================================================
> --- GIT-torvalds.orig/include/asm-ia64/topology.h
> +++ GIT-torvalds/include/asm-ia64/topology.h
> @@ -93,7 +93,7 @@ void build_cpu_to_node_map(void);
> .cache_nice_tries = 2, \
> .busy_idx = 3, \
> .idle_idx = 2, \
> - .newidle_idx = 0, /* unused */ \
> + .newidle_idx = 2, \
> .wake_idx = 1, \
> .forkexec_idx = 1, \
> .flags = SD_LOAD_BALANCE \
> Index: GIT-torvalds/include/asm-sh/topology.h
> ===================================================================
> --- GIT-torvalds.orig/include/asm-sh/topology.h
> +++ GIT-torvalds/include/asm-sh/topology.h
> @@ -16,7 +16,7 @@
> .cache_nice_tries = 2, \
> .busy_idx = 3, \
> .idle_idx = 2, \
> - .newidle_idx = 0, \
> + .newidle_idx = 2, \
> .wake_idx = 1, \
> .forkexec_idx = 1, \
> .flags = SD_LOAD_BALANCE \
> Index: GIT-torvalds/include/asm-x86/topology.h
> ===================================================================
> --- GIT-torvalds.orig/include/asm-x86/topology.h
> +++ GIT-torvalds/include/asm-x86/topology.h
> @@ -129,7 +129,7 @@ extern unsigned long node_remap_size[];
>
> # define SD_CACHE_NICE_TRIES 2
> # define SD_IDLE_IDX 2
> -# define SD_NEWIDLE_IDX 0
> +# define SD_NEWIDLE_IDX 2
> # define SD_FORKEXEC_IDX 1
>
> #endif
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/