[BUG] Corrupted SCHED_DEADLINE bandwidth with cpusets

From: Steven Rostedt
Date: Wed Feb 03 2016 - 13:56:00 EST


There's an accounting issue with the SCHED_DEADLINE and the creation of
cpusets. If a SCHED_DEADLINE task already exists and a new root domain
is created, the calculation of the bandwidth among the root domains
gets corrupted.

For the reproducer, I downloaded Juri's tests:

https://github.com/jlelli/tests.git

For his burn.c file.

https://github.com/jlelli/schedtool-dl.git

For his modified schedtool utility.


I have a kernel with my patches that show the bandwidth:

# grep dl /proc/sched_debug
dl_rq[0]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[1]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[2]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[3]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[4]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[5]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[6]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[7]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0


Note: the sched_rt runtime and period are 950000 and 1000000
respectively, and the bw ratio is (95/100) << 20 == 996147.

This isn't the way I first discovered the issue, but it appears to be
the quickest way to reproduce it.

Make sure there's no other cpusets. As libvirt created some, I had to
remove them first:

# rmdir /sys/fs/cgroup/cpuset/libvirt/{qemu,}


# burn&
# schedtool -E -t 2000000:20000000 $!

# grep dl /proc/sched_debug
dl_rq[0]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[1]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[2]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[3]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[4]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[5]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[6]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857
dl_rq[7]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 104857

Note: (2/20) << 20 == 104857

# echo 0 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

# grep dl /proc/sched_debug
dl_rq[0]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[1]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[2]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[3]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[4]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[5]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[6]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[7]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0

Notice that after removing load_balancing from the main cpuset, all the
totals went to zero.

Let's see what happens when we kill the task.

# killall burn

# grep dl /proc/sched_debug
dl_rq[0]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[1]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[2]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[3]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[4]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[5]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[6]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857
dl_rq[7]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : -104857

They all went negative!

Not good, but we can recover...

# echo 1 > /sys/fs/cgroup/cpuset/cpuset.sched_load_balance

# grep dl /proc/sched_debugdl_rq[0]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[1]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[2]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[3]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[4]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[5]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[6]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0
dl_rq[7]:
.dl_nr_running : 0
.dl_bw->bw : 996147
.dl_bw->total_bw : 0

Playing with this a bit more, I found that it appears that setting
load_balance to 1 in the toplevel cpuset always resets the deadline
bandwidth weather or not it should be. At least that's a way to recover
from things not working anymore, but I still believe this is a bug.

Things can get even worse when adding a bunch of cpusets. But it
appears to always be cleared if you kill all sched_deadline tasks and
set the toplevel cpuset sched_load_balance to 1.


I traced this with the below patch and have this:

schedtool-2279 [004] d... 124.195359: __sched_setscheduler: dl_b=ffff88011a01d040 new=104857 old=0
schedtool-2279 [004] d... 124.195361: __dl_add: (ffff88011a01d040) total=0 add tsk=104857
schedtool-2279 [004] d... 124.195362: __dl_add: new total=104857
schedtool-2279 [004] d... 124.195364: <stack trace>
=> __dl_add
=> __sched_setscheduler
=> SyS_sched_setattr
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917342: rq_attach_root: old_rd refcount=8
bash-2213 [003] d... 142.917344: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917351: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917387: rq_attach_root: old_rd refcount=7
bash-2213 [003] d... 142.917388: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917393: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917420: rq_attach_root: old_rd refcount=6
bash-2213 [003] d... 142.917420: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917425: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917452: rq_attach_root: old_rd refcount=5
bash-2213 [003] d... 142.917452: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917457: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917485: rq_attach_root: old_rd refcount=4
bash-2213 [003] d... 142.917485: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917490: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917518: rq_attach_root: old_rd refcount=3
bash-2213 [003] d... 142.917519: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917524: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917550: rq_attach_root: old_rd refcount=2
bash-2213 [003] d... 142.917550: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917556: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
bash-2213 [003] d... 142.917582: rq_attach_root: old_rd refcount=1
bash-2213 [003] d... 142.917582: rq_attach_root: old rd ffff88011a01d000 (ffff88011a01d040) total=104857
bash-2213 [003] d... 142.917582: rq_attach_root: add new rd ffffffff81fcd9c0 (ffffffff81fcda00) total=0
bash-2213 [003] d... 142.917588: <stack trace>
=> rq_attach_root
=> cpu_attach_domain
=> partition_sched_domains
=> rebuild_sched_domains_locked
=> update_flag
=> cpuset_write_u64
=> cgroup_file_write
=> kernfs_fop_write
=> __vfs_write
=> vfs_write
=> SyS_write
=> entry_SYSCALL_64_fastpath
kworker/0:2-268 [000] d... 153.335522: task_dead_dl: [0:ffffffff81fcda00] total=0 tsk=104857
kworker/0:2-268 [000] d... 153.335523: task_dead_dl: new total=-104857
kworker/0:2-268 [000] d... 153.335528: <stack trace>
=> task_dead_dl
=> finish_task_switch
=> __schedule
=> schedule
=> worker_thread
=> kthread
=> ret_from_fork


Seems to be some disconnect between cgroups and sched deadline root
domains.

-- Steve

---
kernel/sched/core.c | 9 +++++++++
kernel/sched/deadline.c | 5 +++++
kernel/sched/sched.h | 6 ++++++
3 files changed, 20 insertions(+)

Index: linux-trace.git/kernel/sched/core.c
===================================================================
--- linux-trace.git.orig/kernel/sched/core.c 2016-02-03 10:54:55.158659968 -0500
+++ linux-trace.git/kernel/sched/core.c 2016-02-03 13:52:56.213696814 -0500
@@ -2301,6 +2301,7 @@ static int dl_overflow(struct task_struc
if (new_bw == p->dl.dl_bw)
return 0;

+ trace_printk("dl_b=%p new=%lld old=%lld\n", dl_b, new_bw, p->dl.dl_bw);
/*
* Either if a task, enters, leave, or stays -deadline but changes
* its parameters, we may need to update accordingly the total
@@ -5068,6 +5069,7 @@ int task_can_attach(struct task_struct *
if (overflow)
ret = -EBUSY;
else {
+ struct dl_bw *src_dl_b;
/*
* We reserve space for this task in the destination
* root_domain, as we can't fail after this point.
@@ -5075,6 +5077,8 @@ int task_can_attach(struct task_struct *
* later on (see set_cpus_allowed_dl()).
*/
__dl_add(dl_b, p->dl.dl_bw);
+ src_dl_b = dl_bw_of(task_cpu(p));
+ trace_printk("source %p total=%lld\n", src_dl_b, src_dl_b->total_bw);
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
@@ -5636,6 +5640,7 @@ static void rq_attach_root(struct rq *rq

cpumask_clear_cpu(rq->cpu, old_rd->span);

+ trace_printk("old_rd refcount=%d\n", atomic_read(&old_rd->refcount));
/*
* If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
@@ -5646,7 +5651,11 @@ static void rq_attach_root(struct rq *rq
}

atomic_inc(&rd->refcount);
+ if (old_rd)
+ trace_printk("old rd %p (%p) total=%lld\n", rq->rd, &rq->rd->dl_bw, rq->rd->dl_bw.total_bw);
+ trace_printk("add new rd %p (%p) total=%lld\n", rd, &rd->dl_bw, rd->dl_bw.total_bw);
rq->rd = rd;
+ trace_dump_stack(0);

cpumask_set_cpu(rq->cpu, rd->span);
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
Index: linux-trace.git/kernel/sched/deadline.c
===================================================================
--- linux-trace.git.orig/kernel/sched/deadline.c 2016-02-03 10:21:27.140280992 -0500
+++ linux-trace.git/kernel/sched/deadline.c 2016-02-03 13:44:41.321432980 -0500
@@ -66,6 +66,8 @@ void init_dl_bw(struct dl_bw *dl_b)
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
+ trace_printk("clear total_bw %p (was:%lld)\n", dl_b, dl_b->total_bw);
+ trace_dump_stack(0);
dl_b->total_bw = 0;
}

@@ -1224,7 +1226,10 @@ static void task_dead_dl(struct task_str
*/
raw_spin_lock_irq(&dl_b->lock);
/* XXX we should retain the bw until 0-lag */
+ trace_printk("[%d:%p] total=%lld tsk=%lld\n", task_cpu(p), dl_b, dl_b->total_bw, p->dl.dl_bw);
dl_b->total_bw -= p->dl.dl_bw;
+ trace_printk("new total=%lld\n", dl_b->total_bw);
+ trace_dump_stack(0);
raw_spin_unlock_irq(&dl_b->lock);
}

Index: linux-trace.git/kernel/sched/sched.h
===================================================================
--- linux-trace.git.orig/kernel/sched/sched.h 2016-02-03 10:54:55.182659590 -0500
+++ linux-trace.git/kernel/sched/sched.h 2016-02-03 13:44:41.321432980 -0500
@@ -192,13 +192,19 @@ struct dl_bw {
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
{
+ trace_printk("(%p) total=%lld sub tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
dl_b->total_bw -= tsk_bw;
+ trace_printk("new total=%lld\n", dl_b->total_bw);
+ trace_dump_stack(0);
}

static inline
void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
{
+ trace_printk("(%p) total=%lld add tsk=%lld\n", dl_b, dl_b->total_bw, tsk_bw);
dl_b->total_bw += tsk_bw;
+ trace_printk("new total=%lld\n", dl_b->total_bw);
+ trace_dump_stack(0);
}

static inline