Re: [RFC patch 0/2] sched: dynamically adapt granularity with nr_running

From: Peter Zijlstra
Date: Sun Sep 12 2010 - 08:45:34 EST



You found improved latencies with something like the below as well,
right? Except your proglet needs timers to be special too iirc.

Thomas objected to 'special' wakeups, and I can fully appreciate why,
but maybe we could try it anyway; it's only a reasonably soft hint.

( full series with changelogs at:
programming.kicks-ass.net/sekrit/sched-patches.tar.bz2 )

I'm currently running it on my laptop; the spread is reasonably
controlled and interactivity isn't too sucky, but it's not too hot either.

(I did lower my min_gran to like 1/5th of latency)
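For the record, the hint is nothing more than a per-task nesting counter
bracketed around a user-driven wakeup (the evdev hunk below is the only
real user so far). A rough userspace mock of the pattern, just to show the
bracketing -- the mock_* names and mock_driver_event() are made up for
illustration, only the sched_wake_interactive_*() helpers and the 4-bit
counter come from the patch:

/*
 * Standalone mock of the wake-interactive hint -- illustration only,
 * not kernel code.  The real thing lives in include/linux/sched.h and
 * drivers/input/evdev.c in the patch below.
 */
#include <stdio.h>

struct mock_task {
	/* 4-bit nesting counter, mirrors task_struct::sched_wake_interactive */
	unsigned sched_wake_interactive:4;
};

static struct mock_task mock_current;	/* stand-in for 'current' */

static inline void sched_wake_interactive_enable(void)
{
	mock_current.sched_wake_interactive++;
}

static inline void sched_wake_interactive_disable(void)
{
	mock_current.sched_wake_interactive--;
}

/*
 * A driver's event path brackets its wakeup so the scheduler can tell
 * the wakeup was user driven (hypothetical driver function).
 */
static void mock_driver_event(void)
{
	sched_wake_interactive_enable();
	/* ... queue the event, then wake_up_interruptible(&wait) ... */
	printf("waking readers, hint nesting = %u\n",
	       (unsigned)mock_current.sched_wake_interactive);
	sched_wake_interactive_disable();
}

int main(void)
{
	mock_driver_event();
	return 0;
}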

---
drivers/input/evdev.c | 2 +
include/linux/sched.h | 22 +++++--
kernel/sched.c | 8 +-
kernel/sched_debug.c | 2 -
kernel/sched_fair.c | 160 +++++++++++++++++++++++------------------------
kernel/sched_features.h | 13 ++---
7 files changed, 107 insertions(+), 102 deletions(-)

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 5808731..1c5b626 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -84,6 +84,7 @@ static void evdev_event(struct input_handle *handle,
event.code = code;
event.value = value;

+ sched_wake_interactive_enable();
rcu_read_lock();

client = rcu_dereference(evdev->grab);
@@ -96,6 +97,7 @@ static void evdev_event(struct input_handle *handle,
rcu_read_unlock();

wake_up_interruptible(&evdev->wait);
+ sched_wake_interactive_disable();
}

static int evdev_fasync(int fd, struct file *file, int on)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 53eb33c..dd40801 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1097,7 +1097,6 @@ struct sched_statistics {
u64 block_start;
u64 block_max;
u64 exec_max;
- u64 slice_max;

u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
@@ -1121,7 +1120,8 @@ struct sched_entity {
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
struct list_head group_node;
- unsigned int on_rq;
+ unsigned int on_rq : 1,
+ interactive : 1;

u64 exec_start;
u64 sum_exec_runtime;
@@ -1239,11 +1239,11 @@ struct task_struct {
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
- unsigned in_iowait:1;

-
- /* Revert to default priority/policy when forking */
- unsigned sched_reset_on_fork:1;
+ unsigned sched_in_iowait :1; /* Called io_schedule() */
+ unsigned sched_reset_on_fork :1; /* Revert to default priority/policy
+ * on fork */
+ unsigned sched_wake_interactive:4; /* User driven wakeup */

pid_t pid;
pid_t tgid;
@@ -1506,6 +1506,16 @@ struct task_struct {
#endif
};

+static inline void sched_wake_interactive_enable(void)
+{
+ current->sched_wake_interactive++;
+}
+
+static inline void sched_wake_interactive_disable(void)
+{
+ current->sched_wake_interactive--;
+}
+
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

diff --git a/kernel/sched.c b/kernel/sched.c
index 1ab8394..89ff2c3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5125,9 +5125,9 @@ void __sched io_schedule(void)

delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
- current->in_iowait = 1;
+ current->sched_in_iowait = 1;
schedule();
- current->in_iowait = 0;
+ current->sched_in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
}
@@ -5140,9 +5140,9 @@ long __sched io_schedule_timeout(long timeout)

delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
- current->in_iowait = 1;
+ current->sched_in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->sched_in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
return ret;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d1..c301164 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -76,7 +76,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
PN(se->statistics.sleep_max);
PN(se->statistics.block_max);
PN(se->statistics.exec_max);
- PN(se->statistics.slice_max);
PN(se->statistics.wait_max);
PN(se->statistics.wait_sum);
P(se->statistics.wait_count);
@@ -408,7 +407,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.statistics.sleep_max);
PN(se.statistics.block_max);
PN(se.statistics.exec_max);
- PN(se.statistics.slice_max);
PN(se.statistics.wait_max);
PN(se.statistics.wait_sum);
P(se.statistics.wait_count);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9b5b4f8..a1ad97d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -301,27 +301,6 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
return se->vruntime - cfs_rq->min_vruntime;
}

-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = cfs_rq->min_vruntime;
-
- if (cfs_rq->curr)
- vruntime = cfs_rq->curr->vruntime;
-
- if (cfs_rq->rb_leftmost) {
- struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
- struct sched_entity,
- run_node);
-
- if (!cfs_rq->curr)
- vruntime = se->vruntime;
- else
- vruntime = min_vruntime(vruntime, se->vruntime);
- }
-
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-}
-
/*
* Enqueue an entity into the rb-tree:
*/
@@ -495,6 +474,30 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

+static void update_min_vruntime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
+{
+ struct sched_entity *left = __pick_next_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 new_vruntime, vruntime;
+
+ if (left && curr)
+ vruntime = min_vruntime(left->vruntime, curr->vruntime);
+ else if (left)
+ vruntime = left->vruntime;
+ else if (curr)
+ vruntime = curr->vruntime;
+ else
+ return;
+
+ new_vruntime = cfs_rq->min_vruntime;
+ if (sched_feat(DYN_MIN_VRUNTIME) && delta_exec) {
+ new_vruntime += calc_delta_mine(delta_exec, NICE_0_LOAD,
+ &cfs_rq->load);
+ }
+
+ cfs_rq->min_vruntime = max_vruntime(new_vruntime, vruntime);
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -513,7 +516,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
delta_exec_weighted = calc_delta_fair(delta_exec, curr);

curr->vruntime += delta_exec_weighted;
- update_min_vruntime(cfs_rq);
+ update_min_vruntime(cfs_rq, delta_exec);
}

static void update_curr(struct cfs_rq *cfs_rq)
@@ -688,7 +691,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->statistics.sum_sleep_runtime += delta;

if (tsk) {
- if (tsk->in_iowait) {
+ if (tsk->sched_in_iowait) {
se->statistics.iowait_sum += delta;
se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
@@ -708,6 +711,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
}
#endif
+ se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -718,7 +722,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (d < 0)
d = -d;

- if (d > 3*sysctl_sched_latency)
+ if (d > 3*cfs_rq->nr_running*sysctl_sched_latency)
schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
@@ -738,7 +742,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime += sched_vslice(cfs_rq, se);

/* sleeps up to a single latency don't count. */
- if (!initial) {
+ if (sched_feat(FAIR_SLEEPERS) && !initial) {
unsigned long thresh = sysctl_sched_latency;

/*
@@ -752,9 +756,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}

/* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}

static void
@@ -826,7 +828,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
- update_min_vruntime(cfs_rq);
+ update_min_vruntime(cfs_rq, 0);

/*
* Normalize the entity after updating the min_vruntime because the
@@ -837,44 +839,34 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime -= cfs_rq->min_vruntime;
}

+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- unsigned long ideal_runtime, delta_exec;
+ unsigned long slice = sched_slice(cfs_rq, curr);
+
+ if (curr->sum_exec_runtime - curr->prev_sum_exec_runtime < slice) {
+ struct sched_entity *pse = __pick_next_entity(cfs_rq);
+
+ if (pse && wakeup_preempt_entity(curr, pse) == 1)
+ goto preempt;

- ideal_runtime = sched_slice(cfs_rq, curr);
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_task(rq_of(cfs_rq)->curr);
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
return;
}

/*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
*/
- if (!sched_feat(WAKEUP_PREEMPT))
- return;
-
- if (delta_exec < sysctl_sched_min_granularity)
- return;
+ clear_buddies(cfs_rq, curr);

- if (cfs_rq->nr_running > 1) {
- struct sched_entity *se = __pick_next_entity(cfs_rq);
- s64 delta = curr->vruntime - se->vruntime;
-
- if (delta > ideal_runtime)
- resched_task(rq_of(cfs_rq)->curr);
- }
+preempt:
+ resched_task(rq_of(cfs_rq)->curr);
}

static void
@@ -893,36 +885,21 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
- /*
- * Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
- * when there are only lesser-weight tasks around):
- */
- if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->statistics.slice_max = max(se->statistics.slice_max,
- se->sum_exec_runtime - se->prev_sum_exec_runtime);
- }
-#endif
- se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_next_entity(cfs_rq);
struct sched_entity *left = se;

- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+ se = cfs_rq->last;

/*
- * Prefer last buddy, try to return the CPU to a preempted task.
+ * Prefer the next buddy, only set through the interactivity logic.
*/
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;

clear_buddies(cfs_rq, se);

@@ -931,6 +908,13 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
+ unsigned long slice = sched_slice(cfs_rq, prev);
+
+ prev->interactive = 0;
+
+ if (prev->sum_exec_runtime - prev->prev_sum_exec_runtime >= slice)
+ prev->prev_sum_exec_runtime += slice;
+
/*
* If still on the runqueue then deactivate_task()
* was not called and update_curr() has to be done:
@@ -1652,7 +1636,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
+ /*
+ * The buddy logic doesn't work well when there aren't actually
+ * enough tasks for there to be buddies.
+ */
+ int buddies = (cfs_rq->nr_running >= 2);

if (unlikely(rt_prio(p->prio)))
goto preempt;
@@ -1663,8 +1651,16 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(se == pse))
return;

- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
+ if ((se->interactive || curr->sched_wake_interactive) &&
+ !p->sched_in_iowait)
+ pse->interactive = 1;
+
+ if (!(wake_flags & WF_FORK) && pse->interactive) {
+ clear_buddies(cfs_rq, NULL);
set_next_buddy(pse);
+ update_curr(cfs_rq);
+ goto preempt;
+ }

/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1709,7 +1705,7 @@ preempt:
if (unlikely(!se->on_rq || curr == rq->idle))
return;

- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+ if (sched_feat(LAST_BUDDY) && buddies && entity_is_task(se))
set_last_buddy(se);
}

@@ -3404,11 +3400,13 @@ static void nohz_balancer_kick(int cpu)
}

if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
- struct call_single_data *cp;
-
cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
- cp = &per_cpu(remote_sched_softirq_cb, cpu);
- __smp_call_function_single(ilb_cpu, cp, 0);
+
+ if (ilb_cpu != cpu) {
+ struct call_single_data *cp;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
}
return;
}
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8..33b81f9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -3,13 +3,14 @@
* them to run sooner, but does not allow tons of sleepers to
* rip the spread apart.
*/
+SCHED_FEAT(FAIR_SLEEPERS, 0)
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)

/*
* Place new tasks ahead so that they do not starve already running
* tasks
*/
-SCHED_FEAT(START_DEBIT, 1)
+SCHED_FEAT(START_DEBIT, 0)

/*
* Should wakeups try to preempt running tasks.
@@ -25,13 +26,6 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)

/*
- * Prefer to schedule the task we woke last (assuming it failed
- * wakeup-preemption), since its likely going to consume data we
- * touched, increases cache locality.
- */
-SCHED_FEAT(NEXT_BUDDY, 0)
-
-/*
* Prefer to schedule the task that ran last (when we did
* wake-preempt) as that likely will touch the same data, increases
* cache locality.
@@ -55,6 +49,9 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_SHARES_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)

+SCHED_FEAT(DYN_MIN_VRUNTIME, 1)
+SCHED_FEAT(INTERACTIVE, 1)
+
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
* another cpu -- assumes that when the owner is running, it will soon
