[RFC PATCH 12/12] housekeeping: Reimplement isolcpus on housekeeping

From: Frederic Weisbecker
Date: Tue Aug 22 2017 - 21:52:23 EST


We want to centralize the isolation features in the housekeeping
subsystem, and scheduler isolation is a significant part of them.

While at it, this is a proposal for a reimplementation of isolcpus=
that doesn't involve scheduler domain isolation. Therefore this
brings a behaviour change: all user tasks inherit the affinity of
init/1, which avoids the isolcpus= range. But if a task later overrides
its affinity with a mask that turns out to intersect an isolated CPU,
load balancing may occur on it.

OTOH, such a reimplementation, which doesn't shortcut scheduler
internals, makes a better candidate for an interface extension to cpuset.

Signed-off-by: Frederic Weisbecker <fweisbec@xxxxxxxxx>
Cc: Chris Metcalf <cmetcalf@xxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Mike Galbraith <efault@xxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Wanpeng Li <kernellwp@xxxxxxxxx>
Cc: Luiz Capitulino <lcapitulino@xxxxxxxxxx>
---
drivers/base/cpu.c | 10 ++++++++-
include/linux/sched.h | 2 --
kernel/cgroup/cpuset.c | 13 ++---------
kernel/housekeeping.c | 57 +++++++++++++++++++++++++++++++++++++++++--------
kernel/sched/core.c | 16 +-------------
kernel/sched/topology.c | 19 ++---------------
6 files changed, 62 insertions(+), 55 deletions(-)

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 2c3b359..35b2b10 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -18,6 +18,7 @@
#include <linux/cpufeature.h>
#include <linux/tick.h>
#include <linux/pm_qos.h>
+#include <linux/housekeeping.h>

#include "base.h"

@@ -271,8 +272,15 @@ static ssize_t print_cpus_isolated(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = 0, len = PAGE_SIZE-2;
+ cpumask_var_t isolated;

- n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
+ if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_andnot(isolated, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_SCHED));
+ n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
+
+ free_cpumask_var(isolated);

return n;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c28b182..816ff52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -166,8 +166,6 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN 16

-extern cpumask_var_t cpu_isolated_map;
-
extern void scheduler_tick(void);

#define MAX_SCHEDULE_TIMEOUT LONG_MAX
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8d51516..5d71020 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -639,7 +639,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -649,10 +648,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;

- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -665,8 +660,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ cpumask_copy(doms[0], top_cpuset.effective_cpus);

goto done;
}
@@ -689,8 +683,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
* the corresponding sched domain.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ !(is_sched_load_balance(cp)))
continue;

if (is_sched_load_balance(cp))
@@ -772,7 +765,6 @@ static int generate_sched_domains(cpumask_var_t **domains,

if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);

@@ -785,7 +777,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
BUG_ON(nslot != ndoms);

done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);

/*
diff --git a/kernel/housekeeping.c b/kernel/housekeeping.c
index 633a0d9..1fd9316 100644
--- a/kernel/housekeeping.c
+++ b/kernel/housekeeping.c
@@ -58,30 +58,69 @@ void __init housekeeping_init(void)
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}

-static int __init housekeeping_nohz_full_setup(char *str)
+static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;

alloc_bootmem_cpumask_var(&non_housekeeping_mask);
if (cpulist_parse(str, non_housekeeping_mask) < 0) {
- pr_warn("Housekeeping: Incorrect nohz_full cpumask\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}

- alloc_bootmem_cpumask_var(&housekeeping_mask);
- cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask);
+ if (!housekeeping_flags) {
+ alloc_bootmem_cpumask_var(&housekeeping_mask);
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, non_housekeeping_mask);
+ if (cpumask_empty(housekeeping_mask))
+ cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ } else {
+ cpumask_var_t tmp;

- if (cpumask_empty(housekeeping_mask))
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ alloc_bootmem_cpumask_var(&tmp);
+ cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+ if (!cpumask_equal(tmp, housekeeping_mask)) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ free_bootmem_cpumask_var(tmp);
+ free_bootmem_cpumask_var(non_housekeeping_mask);
+ return 0;
+ }
+ free_bootmem_cpumask_var(tmp);
+ }

- housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER |
- HK_FLAG_RCU | HK_FLAG_MISC;
+ if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK))
+ tick_nohz_full_setup(non_housekeeping_mask);

- tick_nohz_full_setup(non_housekeeping_mask);
+ housekeeping_flags |= flags;

free_bootmem_cpumask_var(non_housekeeping_mask);

return 1;
}
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+ unsigned int flags;
+ int ret;
+
+ flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ ret = housekeeping_setup(str, flags);
+ if (!ret)
+ pr_warn("Housekeeping: Incorrect nohz_full cpumask\n");
+ return ret;
+}
__setup("nohz_full=", housekeeping_nohz_full_setup);
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+ unsigned int flags;
+ int ret;
+
+ flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC |
+ HK_FLAG_SCHED | HK_FLAG_WORKQUEUE | HK_FLAG_KTHREAD;
+ ret = housekeeping_setup(str, flags);
+ if (!ret)
+ pr_warn("Housekeeping: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ return ret;
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 877c85d..269f3ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,9 +84,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;

-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -5672,10 +5669,6 @@ static inline void sched_init_smt(void) { }

void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();

/*
@@ -5685,16 +5678,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);

/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_SCHED)) < 0)
BUG();
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);

init_sched_rt_class();
init_sched_dl_class();
@@ -5898,9 +5887,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;

#ifdef CONFIG_SMP
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index bd8b6d6..e060e28 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -466,21 +466,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}

-/* Setup the mask of CPUs configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
- int ret;
-
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
- ret = cpulist_parse(str, cpu_isolated_map);
- if (ret) {
- pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
- return 0;
- }
- return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -1775,7 +1760,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+ cpumask_copy(doms_cur[0], cpu_map);
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();

@@ -1871,7 +1856,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
if (doms_new == NULL) {
n = 0;
doms_new = &fallback_doms;
- cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ cpumask_copy(doms_new[0], cpu_active_mask);
WARN_ON_ONCE(dattr_new);
}

--
2.7.4