[PATCH] sched/{cpuset,core}: restore root_domain status across destructive ops

From: Juri Lelli
Date: Tue Jul 28 2015 - 06:55:51 EST


Hotplug and sched_domains update operations are destructive w.r.t. data
associated with cpusets; in this case we care about root_domains.
SCHED_DEADLINE stores bandwidth information for admitted tasks on
root_domains, and that information is gone once a hotplug or update
operation happens. Also, it is currently not possible to tell which
task(s) the allocated bandwidth belongs to: the root_domain only tracks
the aggregate total_bw, and the per-task link is lost once
sched_setscheduler() succeeds.
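
Just for reference (not part of the patch), the problem can be observed
with a minimal reproducer along the lines of the sketch below: admit a
SCHED_DEADLINE task, then offline and online one of the CPUs of its
root_domain from a shell. Without the fix, the rebuilt root_domain loses
the task's bandwidth, so admission control can subsequently over-commit.
The sched_attr layout and syscall usage follow sched_setattr(2) (a libc
with SYS_sched_setattr is assumed); the runtime/deadline/period values
are arbitrary.

/*
 * Illustrative reproducer sketch, not part of the patch.
 *
 * Run this, then from a shell do something like:
 *   echo 0 > /sys/devices/system/cpu/cpu1/online
 *   echo 1 > /sys/devices/system/cpu/cpu1/online
 */
#include <linux/types.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_DEADLINE	6	/* include/uapi/linux/sched.h */

struct sched_attr {		/* layout as per sched_setattr(2) */
	__u32 size;
	__u32 sched_policy;
	__u64 sched_flags;
	__s32 sched_nice;
	__u32 sched_priority;
	__u64 sched_runtime;
	__u64 sched_deadline;
	__u64 sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10ms */
		.sched_deadline	= 30 * 1000 * 1000,	/* 30ms */
		.sched_period	= 30 * 1000 * 1000,	/* 30ms */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	/* Stay admitted while CPUs get offlined/onlined. */
	for (;;)
		pause();
}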

This patch forces the allocated bandwidth information to be rebuilt at
root_domain level after partition_sched_domains() is done, by walking
the tasks of each cpuset and re-adding their bandwidth. It also ensures
that we don't leave stale information behind in def_root_domain when it
becomes empty (since that root_domain is never freed).
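
To make "allocated bandwidth information" concrete: each admitted
SCHED_DEADLINE task contributes dl_bw = to_ratio(period, runtime), i.e.
runtime/period in fixed point (a 20 bit shift with the current
to_ratio()), and the root_domain's dl_b->total_bw is the sum of those
contributions; that per-root_domain sum is what sched_restore_dl_bw()
below re-adds, task by task. Purely as an illustration of the
arithmetic (plain userspace C, not kernel code):

/*
 * Illustration only: mirrors the shape of the kernel's to_ratio(),
 * where bandwidth is runtime/period in 20 bit fixed point, so
 * 1 << 20 corresponds to 100% of one CPU.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	uint64_t total_bw = 0;

	/* Two admitted tasks: 10ms/30ms and 5ms/100ms. */
	total_bw += to_ratio(30000000ULL, 10000000ULL);
	total_bw += to_ratio(100000000ULL, 5000000ULL);

	/*
	 * This is the per-root_domain sum that a hotplug or
	 * sched_domains update wipes, and that cpuset_update_rd() ->
	 * sched_restore_dl_bw() rebuilds afterwards.
	 */
	printf("total_bw = %llu / %u\n",
	       (unsigned long long)total_bw, 1U << 20);
	return 0;
}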

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Li Zefan <lizefan@xxxxxxxxxx>
Cc: cgroups@xxxxxxxxxxxxxxx
Reported-by: Wanpeng Li <wanpeng.li@xxxxxxxxxxxxxxx>
Reported-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
Signed-off-by: Juri Lelli <juri.lelli@xxxxxxx>
---
 include/linux/sched.h |  2 ++
 kernel/cpuset.c       | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/core.c   | 28 ++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..5f9eeb4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2241,6 +2241,8 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 				     const struct cpumask *trial);
 extern int task_can_attach(struct task_struct *p,
 			   const struct cpumask *cs_cpus_allowed);
+void sched_restore_dl_bw(struct task_struct *task,
+			 const struct cpumask *new_mask);
 #ifdef CONFIG_SMP
 extern void do_set_cpus_allowed(struct task_struct *p,
 				const struct cpumask *new_mask);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e945fc..57078f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -785,6 +785,44 @@ done:
 	return ndoms;
 }
 
+/**
+ * update_tasks_rd - Update tasks' root_domains status.
+ * @cs: the cpuset to which each task's root_domain belongs
+ *
+ * Iterate through each task of @cs updating state of its related
+ * root_domain.
+ */
+static void update_tasks_rd(struct cpuset *cs)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	css_task_iter_start(&cs->css, &it);
+	while ((task = css_task_iter_next(&it)))
+		sched_restore_dl_bw(task, cs->effective_cpus);
+	css_task_iter_end(&it);
+}
+
+static void cpuset_update_rd(void)
+{
+	struct cpuset *cs;
+	struct cgroup_subsys_state *pos_css;
+
+	lockdep_assert_held(&cpuset_mutex);
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+		if (!css_tryget_online(&cs->css))
+			continue;
+		rcu_read_unlock();
+
+		update_tasks_rd(cs);
+
+		rcu_read_lock();
+		css_put(&cs->css);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Rebuild scheduler domains.
  *
@@ -818,6 +856,7 @@ static void rebuild_sched_domains_locked(void)
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
+	cpuset_update_rd();
 out:
 	put_online_cpus();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ce7a8..f9558f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,6 +2277,23 @@ static inline int dl_bw_cpus(int i)
 }
 #endif
 
+void sched_restore_dl_bw(struct task_struct *task,
+			 const struct cpumask *new_mask)
+{
+	struct dl_bw *dl_b;
+	unsigned long flags;
+
+	if (!task_has_dl_policy(task))
+		return;
+
+	rcu_read_lock_sched();
+	dl_b = dl_bw_of(cpumask_any(new_mask));
+	raw_spin_lock_irqsave(&dl_b->lock, flags);
+	dl_b->total_bw += task->dl.dl_bw;
+	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	rcu_read_unlock_sched();
+}
+
 /*
  * We must be sure that accepting a new task (or allowing changing the
  * parameters of an existing one) is consistent with the bandwidth
@@ -5636,6 +5653,17 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
+		if (old_rd == &def_root_domain &&
+		    cpumask_empty(old_rd->span)) {
+			/*
+			 * def_root_domain is never freed, so we have to clean
+			 * it when it becomes empty.
+			 */
+			raw_spin_lock(&old_rd->dl_bw.lock);
+			old_rd->dl_bw.total_bw = 0;
+			raw_spin_unlock(&old_rd->dl_bw.lock);
+		}
+
 		/*
 		 * If we dont want to free the old_rd yet then
 		 * set old_rd to NULL to skip the freeing later
--
2.7.0