[PATCH] sched: Update tasks in core queue when their cgroup tag is changed

From: Tim Chen
Date: Thu Nov 07 2019 - 15:45:24 EST


A task will need to be moved into the core scheduler queue when the cgroup
it belongs to is tagged to run with core scheduling. Similarly, the task
will need to be moved out of the core scheduler queue when the cgroup
is untagged.

Also, when a task is forked, its presence in the core scheduler queue will
need to be updated according to the status of its new cgroup.

Implement these missing core scheduler queue update mechanisms.
Otherwise, the core scheduler will oops the kernel when the cgroup's core
scheduling tag is toggled, due to inconsistent core scheduler queue
status.

Use the stop machine mechanism to update all tasks in a cgroup, so that a
new task cannot sneak into the cgroup and miss the update while we
iterate through all the tasks in the cgroup. A more complicated scheme
could probably avoid the stop machine, but such a scheme would also need
to resolve any inconsistency between a task's cgroup core scheduling tag
and its residency in the core scheduler queue.

We are opting for the simple stop machine mechanism for now to avoid
such complications.
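
Concretely, the tag write path below packages its arguments in a struct
and hands the per-task walk to stop_machine(), whose callback runs while
every other online CPU spins with interrupts disabled. A condensed sketch
of that shape (cookie updates, requeueing and error handling elided; see
the diff for the full version):

	struct write_core_tag {
		struct cgroup_subsys_state *css;
		int val;
	};

	static int __sched_write_tag(void *data)
	{
		struct write_core_tag *tag = data;

		/*
		 * All other online CPUs are stopped here, so no task can
		 * fork into or move into the cgroup while its members are
		 * retagged.
		 */
		css_tg(tag->css)->tagged = !!tag->val;
		/* ... walk the css tasks, update p->core_cookie, requeue ... */
		return 0;
	}

	static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
	{
		struct write_core_tag wtag = { .css = css, .val = val };

		stop_machine(__sched_write_tag, &wtag, NULL);
		return 0;
	}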

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
kernel/sched/core.c | 119 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 106 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4778b5940c30..08e7fdd5972d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -138,6 +138,37 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
}

+static bool sched_core_empty(struct rq *rq)
+{
+ return RB_EMPTY_ROOT(&rq->core_tree);
+}
+
+static bool sched_core_enqueued(struct task_struct *task)
+{
+ return !RB_EMPTY_NODE(&task->core_node);
+}
+
+static struct task_struct *sched_core_first(struct rq *rq)
+{
+ struct task_struct *task;
+
+ task = container_of(rb_first(&rq->core_tree), struct task_struct, core_node);
+ return task;
+}
+
+static void sched_core_flush(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *task;
+
+ while (!sched_core_empty(rq)) {
+ task = sched_core_first(rq);
+ rb_erase(&task->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&task->core_node);
+ }
+ rq->core->core_task_seq++;
+}
+
static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
struct rb_node *parent, **node;
@@ -169,10 +200,11 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
{
rq->core->core_task_seq++;

- if (!p->core_cookie)
+ if (!sched_core_enqueued(p))
return;

rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
}

/*
@@ -236,6 +268,18 @@ static int __sched_core_stopper(void *data)
bool enabled = !!(unsigned long)data;
int cpu;

+ if (!enabled) {
+ for_each_online_cpu(cpu) {
+ /*
+ * All active and migrating tasks will have already been removed
+ * from the core queue when we clear the cgroup tags.
+ * However, dying tasks could still be left in the core queue.
+ * Flush them here.
+ */
+ sched_core_flush(cpu);
+ }
+ }
+
for_each_online_cpu(cpu)
cpu_rq(cpu)->core_enabled = enabled;

@@ -247,7 +291,11 @@ static int sched_core_count;

static void __sched_core_enable(void)
{
- // XXX verify there are no cookie tasks (yet)
+ int cpu;
+
+ /* verify there are no cookie tasks (yet) */
+ for_each_online_cpu(cpu)
+ BUG_ON(!sched_core_empty(cpu_rq(cpu)));

static_branch_enable(&__sched_core_enabled);
stop_machine(__sched_core_stopper, (void *)true, NULL);
@@ -257,8 +305,6 @@ static void __sched_core_enable(void)

static void __sched_core_disable(void)
{
- // XXX verify there are no cookie tasks (left)
-
stop_machine(__sched_core_stopper, (void *)false, NULL);
static_branch_disable(&__sched_core_enabled);

@@ -285,6 +331,7 @@ void sched_core_put(void)

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline bool sched_core_enqueued(struct task_struct *task) { return false; }

#endif /* CONFIG_SCHED_CORE */

@@ -3016,6 +3063,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
+#endif
+#ifdef CONFIG_SCHED_CORE
+ RB_CLEAR_NODE(&p->core_node);
#endif
return 0;
}
@@ -6560,6 +6610,9 @@ void init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
+#ifdef CONFIG_SCHED_CORE
+ RB_CLEAR_NODE(&idle->core_node);
+#endif
}

#ifdef CONFIG_SMP
@@ -7671,7 +7724,12 @@ static void cpu_cgroup_fork(struct task_struct *task)
rq = task_rq_lock(task, &rf);

update_rq_clock(rq);
+ if (sched_core_enqueued(task))
+ sched_core_dequeue(rq, task);
sched_change_group(task, TASK_SET_GROUP);
+ if (sched_core_enabled(rq) && task_on_rq_queued(task) &&
+ task->core_cookie)
+ sched_core_enqueue(rq, task);

task_rq_unlock(rq, task, &rf);
}
@@ -8033,12 +8091,51 @@ static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct cftype
return !!tg->tagged;
}

-static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+struct write_core_tag {
+ struct cgroup_subsys_state *css;
+ int val;
+};
+
+static int __sched_write_tag(void *data)
{
- struct task_group *tg = css_tg(css);
+ struct write_core_tag *tag = (struct write_core_tag *) data;
+ struct cgroup_subsys_state *css = tag->css;
+ int val = tag->val;
+ struct task_group *tg = css_tg(tag->css);
struct css_task_iter it;
struct task_struct *p;

+ tg->tagged = !!val;
+
+ css_task_iter_start(css, 0, &it);
+ /*
+ * Note: css_task_iter_next will skip dying tasks.
+ * Dying tasks could therefore still be left in the core queue
+ * after the loop below finishes when the cgroup tag is cleared.
+ */
+ while ((p = css_task_iter_next(&it))) {
+ p->core_cookie = !!val ? (unsigned long)tg : 0UL;
+
+ if (sched_core_enqueued(p)) {
+ sched_core_dequeue(task_rq(p), p);
+ if (!p->core_cookie)
+ continue;
+ }
+
+ if (p->core_cookie && task_on_rq_queued(p))
+ sched_core_enqueue(task_rq(p), p);
+
+ }
+ css_task_iter_end(&it);
+
+ return 0;
+}
+
+static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+ struct task_group *tg = css_tg(css);
+ struct write_core_tag wtag;
+
if (val > 1)
return -ERANGE;

@@ -8048,16 +8145,12 @@ static int cpu_core_tag_write_u64(struct cgroup_subsys_state *css, struct cftype
if (tg->tagged == !!val)
return 0;

- tg->tagged = !!val;
-
if (!!val)
sched_core_get();

- css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it)))
- p->core_cookie = !!val ? (unsigned long)tg : 0UL;
- css_task_iter_end(&it);
-
+ wtag.css = css;
+ wtag.val = val;
+ stop_machine(__sched_write_tag, (void *) &wtag, NULL);
if (!val)
sched_core_put();

--
2.20.1