[PATCH 2/3] sched,fair: Fix local starvation

From: Peter Zijlstra
Date: Tue May 10 2016 - 13:49:50 EST


Mike reported that the recent commit 3a47d5124a95 ("sched/fair: Fix
fairness issue on migration") broke interactivity and the signal
starve test.

The problem is that I assumed ENQUEUE_WAKING was only set when we do a
cross-cpu wakeup (migration), which isn't true. This means we now
destroy the vruntime history of tasks and wakeup-preemption suffers.

Cure this by making my assumption true, only call
sched_class::task_waking() when we do a cross-cpu wakeup. This avoids
the indirect call in the case we do a local wakeup.

Cc: Pavan Kondeti <pkondeti@xxxxxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Matt Fleming <matt@xxxxxxxxxxxxxxxxxxx>
Cc: Morten Rasmussen <morten.rasmussen@xxxxxxx>
Cc: Paul Turner <pjt@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: byungchul.park@xxxxxxx
Cc: Andrew Hunter <ahh@xxxxxxxxxx>
Fixes: 3a47d5124a95 ("sched/fair: Fix fairness issue on migration")
Reported-by: Mike Galbraith <mgalbraith@xxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 29 +++++++++++++++++++++--------
kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 61 insertions(+), 9 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1709,14 +1709,22 @@ static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct pin_cookie cookie)
{
+ int en_flags = ENQUEUE_WAKEUP;
+
lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
+
+ /*
+ * If we migrated; we must have called sched_class::task_waking().
+ */
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_WAKING;
#endif

- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_activate(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, cookie);
}

@@ -1762,7 +1770,11 @@ void sched_ttwu_pending(void)
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
llist = llist_next(llist);
- ttwu_do_activate(rq, p, 0, cookie);
+ /*
+ * See ttwu_queue(); we only call ttwu_queue_remote() when
+ * its a x-cpu wakeup.
+ */
+ ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
}

lockdep_unpin_lock(&rq->lock, cookie);
@@ -1849,7 +1861,7 @@ bool cpus_share_cache(int this_cpu, int
}
#endif /* CONFIG_SMP */

-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
struct pin_cookie cookie;
@@ -1864,7 +1876,7 @@ static void ttwu_queue(struct task_struc

raw_spin_lock(&rq->lock);
cookie = lockdep_pin_lock(&rq->lock);
- ttwu_do_activate(rq, p, 0, cookie);
+ ttwu_do_activate(rq, p, wake_flags, cookie);
lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
}
@@ -2034,17 +2046,18 @@ try_to_wake_up(struct task_struct *p, un
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;

- if (p->sched_class->task_waking)
- p->sched_class->task_waking(p);
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
+
+ if (p->sched_class->task_waking)
+ p->sched_class->task_waking(p);
+
set_task_cpu(p, cpu);
}
#endif /* CONFIG_SMP */

- ttwu_queue(p, cpu);
+ ttwu_queue(p, cpu, wake_flags);
stat:
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3247,6 +3247,37 @@ static inline void check_schedstat_requi
#endif
}

+
+/*
+ * MIGRATION
+ *
+ * dequeue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ * ->task_waking_fair()
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU and an up-to-date min_vruntime on the destination CPU.
+ */
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -3264,7 +3295,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st

/*
* Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past.
+ * moment in time, instead of some random moment in the past. Being
+ * placed in the past could significantly boost this task to the
+ * fairness detriment of existing tasks.
*/
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
@@ -4811,6 +4844,12 @@ static unsigned long cpu_avg_load_per_ta
return 0;
}

+/*
+ * Called to migrate a waking task; as blocked tasks retain absolute vruntime
+ * the migration needs to deal with this by subtracting the old and adding the
+ * new min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
static void task_waking_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se;