[PATCH RT] Defer migrate_enable migration while task state != TASK_RUNNING

From: joe . korty
Date: Fri Mar 23 2018 - 11:10:14 EST


I see the below kernel splat in 4.9-rt when I run a test program that
continually changes the affinity of some set of running pids:

do not call blocking ops when !TASK_RUNNING; state=2 set at ...
...
stop_one_cpu+0x60/0x80
migrate_enable+0x21f/0x3e0
rt_spin_unlock+0x2f/0x40
prepare_to_wait+0x5c/0x80
...

The reason is that spin_unlock, write_unlock, and read_unlock call
migrate_enable, and since 4.4-rt, migrate_enable will sleep if it discovers
that a migration is in order. But sleeping in the unlock services is not
expected by most kernel developers, and where that counts most is in code
sequences like the following:

set_current_state(TASK_UNINTERRUPIBLE);
spin_unlock(&s);
schedule();

Rather than try to fix up such expectations, this patch merely restores
the non-sleeping nature of unlocking by the simple expediate of deferring,
to a future migrate_enable, and actual migration-with-sleep operation,
for as long as migrate_enable sees task state != TASK_RUNNING. This works
well as the kernel is littered with lock/unlock sequences, so if the
current unlock cannot migrate, another unlock will come along shortly
and it will do the deferred migration for us.

With this patch and a debug harness applied to 4.9.84-rt62, I get the
following results when a set of stress(1) threads have their affinities
randomly changes as fast as possible:

[ 111.331805] 6229(stress): migrate_enable() deferred.
[ 111.332112] 6229(stress): migrate_enable() deferral APPLIED.
[ 111.977399] 6231(stress): migrate_enable() deferred.
[ 111.977833] 6231(stress): migrate_enable() deferral APPLIED.
[ 135.240704] 6231(stress): migrate_enable() deferred.
[ 135.241164] 6231(stress): migrate_enable() deferral APPLIED.
[ 139.734616] 6229(stress): migrate_enable() deferred.
[ 139.736553] 6229(stress): migrate_enable() deferral APPLIED.
[ 156.441660] 6229(stress): migrate_enable() deferred.
[ 156.441709] 6229(stress): migrate_enable() deferral APPLIED.
[ 163.600191] 6231(stress): migrate_enable() deferred.
[ 163.600561] 6231(stress): migrate_enable() deferral APPLIED.
[ 181.593035] 6231(stress): migrate_enable() deferred.
[ 181.593136] 6231(stress): migrate_enable() deferral APPLIED.
[ 198.376387] 6229(stress): migrate_enable() deferred.
[ 198.376591] 6229(stress): migrate_enable() deferral APPLIED.
[ 202.355940] 6231(stress): migrate_enable() deferred.
[ 202.356939] 6231(stress): migrate_enable() deferral APPLIED.
[ 203.773164] 6229(stress): migrate_enable() deferred.
[ 203.773243] 6229(stress): migrate_enable() deferral APPLIED.
...

The test sequence used:

stress --cpu 8 --io 1 --vm 2 &
./rotate $(ps -eLotid,comm | fgrep stress | awk '{print $1}')

The test program used:

cat <<EOF >rotate.c
/*
* rotate.c
* Randomly and rapidly change the affinity of some set of processes.
* Does not return.
* Limitations: Hard coded to use cpus 0-7. May not work on systems
* with more than 64 cpus.
*
* Usage: rotate pid pid ...
*
* Algorithm: In a loop, for each pid given, randomly select 2 of
* the 8 cpus 37.5% of the time; otherwise, randomly select a
* single cpu from that same set.
*/

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <errno.h>

int pids[100000];
int npids;
int ncpus = 8;

int main(int argc, char **argv)
{
int pid, cpu, status;
unsigned long mask = 0;
int i;

setbuf(stdout, NULL);

argc--,argv++;
npids = argc;
for(i=0; i<npids; i++) {
pids[i] = atoi(argv[i]);
if (!pids[i]) {
fprintf(stderr, "tid #%d is bad\n", i);
return 1;
}
}

for(;;) {
for(i=0; i<npids; i++) {
cpu = (unsigned long long)rand() * ncpus / RAND_MAX;
mask = 1U << cpu;
if (rand() & 0x400) {
cpu = (unsigned long long)rand() * ncpus / RAND_MAX;
mask |= 1U << cpu;
}
pid = pids[i];
status = sched_setaffinity(pid, sizeof(mask), (cpu_set_t *)&mask);
if (status == -1) {
fprintf(stderr, "sched_setaffinity(%d, 8, %016lx) failed\n", pid, mask);
perror("sched_setaffinity");
return 1;
}
}
}
return 0;
}
EOF

The debug patch used:

cat patches/debug <<EOF
1 XXXXXXXX--- a/include/linux/sched.h
2 XXXXXXXX+++ b/include/linux/sched.h
3 XXXXXXXX@@ -1558,6 +1558,7 @@ struct task_struct {
4 XXXXXXXX #ifdef CONFIG_PREEMPT_RT_FULL
5 XXXXXXXX int migrate_disable;
6 XXXXXXXX int migrate_disable_update;
7 XXXXXXXX+ int migrate_enable_deferred;
8 XXXXXXXX # ifdef CONFIG_SCHED_DEBUG
9 XXXXXXXX int migrate_disable_atomic;
10 XXXXXXXX # endif
11 XXXXXXXX--- a/kernel/sched/core.c
12 XXXXXXXX+++ b/kernel/sched/core.c
13 XXXXXXXX@@ -3468,6 +3468,12 @@ void migrate_enable(void)
14 XXXXXXXX struct rq *rq;
15 XXXXXXXX struct rq_flags rf;
16 XXXXXXXX
17 XXXXXXXX+ if (p->migrate_enable_deferred) {
18 XXXXXXXX+ p->migrate_enable_deferred = 0;
19 XXXXXXXX+ pr_info("%d(%s): migrate_enable() deferral APPLIED.\n",
20 XXXXXXXX+ p->pid, p->comm);
21 XXXXXXXX+ }
22 XXXXXXXX+
23 XXXXXXXX rq = task_rq_lock(p, &rf);
24 XXXXXXXX update_rq_clock(rq);
25 XXXXXXXX
26 XXXXXXXX@@ -3499,6 +3505,15 @@ void migrate_enable(void)
27 XXXXXXXX tlb_migrate_finish(p->mm);
28 XXXXXXXX return;
29 XXXXXXXX }
30 XXXXXXXX+ } else if (p->migrate_disable_update && p->state != TASK_RUNNING) {
31 XXXXXXXX+ if (p->migrate_enable_deferred)
32 XXXXXXXX+ pr_info("%d(%s): migrate_enable() deferred (again).\n",
33 XXXXXXXX+ p->pid, p->comm);
34 XXXXXXXX+ else {
35 XXXXXXXX+ pr_info("%d(%s): migrate_enable() deferred.\n",
36 XXXXXXXX+ p->pid, p->comm);
37 XXXXXXXX+ p->migrate_enable_deferred = 1;
38 XXXXXXXX+ }
39 XXXXXXXX }
40 XXXXXXXX
41 XXXXXXXX unpin_current_cpu();
EOF

The rt patch sched-migrate-disable-handle-updated-task-mask-mg-di.patch
appears to have introduced this issue, around the 4.4-rt timeframe.

Signed-off-by: Joe Korty <joe.korty@xxxxxxxxxxxxxxxxx>

Index: b/kernel/sched/core.c
===================================================================
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3457,7 +3457,14 @@ void migrate_enable(void)
*/
p->migrate_disable = 0;

- if (p->migrate_disable_update) {
+ /*
+ * Do not apply affinity update on this migrate_enable if task
+ * is preparing to go to sleep for some other reason (eg, task
+ * state == TASK_INTERRUPTIBLE). Instead defer update to a
+ * future migate_enable that is called when task state is again
+ * == TASK_RUNNING.
+ */
+ if (p->migrate_disable_update && p->state == TASK_RUNNING) {
struct rq *rq;
struct rq_flags rf;