Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy

From: Peter Zijlstra
Date: Thu Sep 17 2015 - 11:16:26 EST


On Thu, Sep 17, 2015 at 04:35:27PM +0200, Peter Zijlstra wrote:
> I'd be happy to fail a CPU down for user tasks where this is the last
> runnable CPU of.

A little like so. Completely untested.

---
Subject: sched: Refuse to unplug a CPU if this will violate user task affinity

It's bad policy to allow unplugging a CPU for which a user set explicit
affinity, either strictly on this CPU or in case this was the last
online CPU in its mask.

Either would end up forcing the thread on a random other CPU, violating
the sys_sched_setaffinity() constraint.

Disallow this by default; root might not be aware of all user
affinities, but can negotiate and change affinities for all tasks.

Provide a sysctl to go back to the old behaviour.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/sched/sysctl.h | 1 +
kernel/sched/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sysctl.c | 9 +++++++++
3 files changed, 56 insertions(+)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..9444b549914b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_strict_affinity;

enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6ab415aa15c4..457c8b912fc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -284,6 +284,11 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;

+/*
+ * Disallows cpu unplug if that would result in a task without runnable CPUs.
+ */
+unsigned int sysctl_sched_strict_affinity = 1;
+
/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;

@@ -5430,6 +5435,42 @@ static void set_rq_offline(struct rq *rq)
}

/*
+ * Return false (and warn) if a user task on @cpu has no other active CPU.
+ */
+static bool migration_possible(int cpu)
+{
+ struct task_struct *g, *p;
+ bool ret = true;
+ int next;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
+ /* if it's running elsewhere, this cannot be its last cpu */
+ if (task_cpu(p) != cpu)
+ continue;
+
+ /* we only care about user state */
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ next = -1;
+again:
+ next = cpumask_next_and(next, tsk_cpus_allowed(p), cpu_active_mask);
+ if (next >= nr_cpu_ids) {
+ printk(KERN_WARNING "task %s-%d refused unplug of CPU %d\n",
+ p->comm, p->pid, cpu);
+ ret = false;
+ goto unlock; /* break would only leave the inner thread loop */
+ }
+ if (next == cpu)
+ goto again;
+ }
+unlock:
+ read_unlock(&tasklist_lock);
+ return ret;
+}
+
+/*
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
@@ -5440,6 +5481,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned long flags;
struct rq *rq = cpu_rq(cpu);

+ if (action == CPU_DOWN_PREPARE && sysctl_sched_strict_affinity) {
+ if (!migration_possible(cpu))
+ return notifier_from_errno(-EBUSY);
+ }
+
switch (action & ~CPU_TASKS_FROZEN) {

case CPU_UP_PREPARE:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..9d0edcc73cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -283,6 +283,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SMP
+ {
+ .procname = "sched_strict_affinity",
+ .data = &sysctl_sched_strict_affinity,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_SMP */
#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_min_granularity_ns",
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/