[PATCH] Nick's scheduler policy v12

From: Nick Piggin
Date: Fri Sep 05 2003 - 12:59:21 EST


Hi,
Backboost is gone so X really should be at -10 or even higher.
I'm now using 160 bits of the priority array instead of 140. This
works on x86, but I haven't had a look at other architectures.
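
For reference, the sizing arithmetic (MAX_RT_PRIO is 100): MAX_PRIO goes from
100 + 40 = 140 to 100 + 60 = 160. A throwaway userspace check of the new
BITMAP_SIZE, mine and not part of the patch:

#include <stdio.h>

#define MAX_RT_PRIO 100
#define MAX_PRIO (MAX_RT_PRIO + 60) /* was + 40 */
#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))

int main(void)
{
        /* 5 longs (160 bits) with 32-bit longs; 3 longs (192 bits) with 64-bit longs */
        printf("%lu longs, %lu bits\n", (unsigned long)BITMAP_SIZE,
               (unsigned long)(BITMAP_SIZE * sizeof(long) * 8));
        return 0;
}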

It's still sort of a work in progress, but I thought I should send an update.
It's against 2.6.0-test4 and includes my other assorted scheduler patches.

Scheduling latency is very good now, < 50ms for a nice 0 process doing
nothing except handling a signal once in a while. This is during a make -j3
compile and mad X activity on UP.

I still need to think about how to get X working a bit more nicely
though. I'd like to be able to get good interactivity with large make
loads, but that's at the bottom of the list.
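
In case it helps review, here is a rough userspace rendering of how the new
task_timeslice() scaling works out (my sketch, not part of the patch; it
assumes HZ=1000 so MIN_TIMESLICE is one tick, and "idx" stands for the best
runnable priority on the queue, already clamped to <= prio):

#include <stdio.h>

#define MAX_RT_PRIO 100
#define MAX_PRIO (MAX_RT_PRIO + 60)
#define MAX_USER_PRIO (MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p) ((p) - MAX_RT_PRIO)
#define MIN_TIMESLICE 1 /* 1 tick at HZ=1000 */

static unsigned int sketch_timeslice(int prio, int idx)
{
        int delta = prio - idx;
        unsigned int base, timeslice;

        base = MIN_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
        timeslice = base * (USER_PRIO(idx) + 8) / 24;
        if (timeslice == 0)
                timeslice = MIN_TIMESLICE;
        return timeslice;
}

int main(void)
{
        /* top of the user range, nothing better runnable: 10 ticks */
        printf("%u\n", sketch_timeslice(MAX_RT_PRIO, MAX_RT_PRIO));
        /* bottom of the user range, alone on the queue: 83 ticks */
        printf("%u\n", sketch_timeslice(MAX_PRIO - 1, MAX_PRIO - 1));
        /* bottom of the range with a top-of-range task runnable: 1 tick */
        printf("%u\n", sketch_timeslice(MAX_PRIO - 1, MAX_RT_PRIO));
        return 0;
}

The idea is that a task's slice shrinks as better-priority work shows up and
grows when it is the best thing runnable, so low priority tasks still get long
slices on an otherwise quiet queue.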

linux-2.6-npiggin/fs/proc/array.c | 7
linux-2.6-npiggin/include/linux/sched.h | 10
linux-2.6-npiggin/kernel/fork.c | 30 -
linux-2.6-npiggin/kernel/sched.c | 542 +++++++++++++++++++-------------
4 files changed, 342 insertions(+), 247 deletions(-)

diff -puN fs/proc/array.c~rollup fs/proc/array.c
--- linux-2.6/fs/proc/array.c~rollup 2003-09-06 03:44:21.000000000 +1000
+++ linux-2.6-npiggin/fs/proc/array.c 2003-09-06 03:44:41.000000000 +1000
@@ -154,13 +154,18 @@ static inline char * task_state(struct t
read_lock(&tasklist_lock);
buffer += sprintf(buffer,
"State:\t%s\n"
+ "sleep_avg:\t%lu\n"
+ "sleep_time:\t%lu\n"
+ "total_time:\t%lu\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
- get_task_state(p), p->tgid,
+ get_task_state(p),
+ p->sleep_avg, p->sleep_time, p->total_time,
+ p->tgid,
p->pid, p->pid ? p->real_parent->pid : 0,
p->pid && p->ptrace ? p->parent->pid : 0,
p->uid, p->euid, p->suid, p->fsuid,
diff -puN include/linux/sched.h~rollup include/linux/sched.h
--- linux-2.6/include/linux/sched.h~rollup 2003-09-06 03:44:26.000000000 +1000
+++ linux-2.6-npiggin/include/linux/sched.h 2003-09-06 03:44:41.000000000 +1000
@@ -280,7 +280,7 @@ struct signal_struct {
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO

-#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define MAX_PRIO (MAX_RT_PRIO + 60)

/*
* Some day this will be a full-fledged user tracking system..
@@ -339,12 +339,17 @@ struct task_struct {
struct list_head run_list;
prio_array_t *array;

+ /* Scheduler variables follow. kernel/sched.c */
+ unsigned long array_sequence;
+ unsigned long timestamp;
+
+ unsigned long total_time, sleep_time;
unsigned long sleep_avg;
- unsigned long last_run;

unsigned long policy;
cpumask_t cpus_allowed;
unsigned int time_slice, first_time_slice;
+ unsigned int used_slice;

struct list_head tasks;
struct list_head ptrace_children;
@@ -552,6 +557,7 @@ extern int FASTCALL(wake_up_state(struct
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(sched_fork(task_t * p));
extern void FASTCALL(sched_exit(task_t * p));

asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
diff -puN kernel/fork.c~rollup kernel/fork.c
--- linux-2.6/kernel/fork.c~rollup 2003-09-06 03:44:31.000000000 +1000
+++ linux-2.6-npiggin/kernel/fork.c 2003-09-06 03:44:41.000000000 +1000
@@ -911,33 +911,9 @@ struct task_struct *copy_process(unsigne
p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
p->pdeath_signal = 0;

- /*
- * Share the timeslice between parent and child, thus the
- * total amount of pending timeslices in the system doesn't change,
- * resulting in more scheduling fairness.
- */
- local_irq_disable();
- p->time_slice = (current->time_slice + 1) >> 1;
- /*
- * The remainder of the first timeslice might be recovered by
- * the parent if the child exits early enough.
- */
- p->first_time_slice = 1;
- current->time_slice >>= 1;
- p->last_run = jiffies;
- if (!current->time_slice) {
- /*
- * This case is rare, it happens when the parent has only
- * a single jiffy left from its timeslice. Taking the
- * runqueue lock is not a problem.
- */
- current->time_slice = 1;
- preempt_disable();
- scheduler_tick(0, 0);
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ /* Perform scheduler related accounting */
+ sched_fork(p);
+
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
diff -puN kernel/sched.c~rollup kernel/sched.c
--- linux-2.6/kernel/sched.c~rollup 2003-09-06 03:44:35.000000000 +1000
+++ linux-2.6-npiggin/kernel/sched.c 2003-09-06 03:44:41.000000000 +1000
@@ -60,85 +60,56 @@
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
- * maximum timeslice is 200 msecs. Timeslices get refilled after
- * they expire.
- */
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (200 * HZ / 1000)
-#define CHILD_PENALTY 50
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (10*HZ)
-#define STARVATION_LIMIT (10*HZ)
-#define NODE_THRESHOLD 125
+ * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there
+ * is a maximum priority process runnable. MAX_TIMESLICE is derived from the
+ * formula in task_timeslice. It cannot be changed here. It is the timeslice
+ * that the maximum priority process will get. Larger timeslices are attainable
+ * by low priority processes however.
+ */
+#define MIN_TIMESLICE ((1 * HZ / 1000) ? 1 * HZ / 1000 : 1)
+#define MAX_TIMESLICE (60 * MIN_TIMESLICE) /* do not change */
+
+/* Maximum amount of history that will be used to calculate priority */
+#define MAX_SLEEP (HZ)

/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
+ * Maximum effect that 1 block of activity (run/sleep/etc) can have. This
+ * will moderate or discard freak events (eg. SIGSTOP)
*/
+#define MAX_SLEEP_AFFECT (MAX_SLEEP/4)
+#define MAX_RUN_AFFECT (MAX_SLEEP/4)
+#define MAX_WAIT_AFFECT (MAX_RUN_AFFECT/4)

-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
+/*
+ * The amount of history can be decreased (on fork for example). This puts a
+ * lower bound on it.
+ */
+#define MIN_HISTORY (MAX_SLEEP/2)

-#define DELTA(p) \
- (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
- INTERACTIVE_DELTA)
+/*
+ * SLEEP_FACTOR is a fixed point factor used to scale the history tracking values.
+ * In particular: total_time, sleep_time, sleep_avg.
+ */
+#define SLEEP_FACTOR (1024)

-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
+#define NODE_THRESHOLD 125

/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
+ * The scheduler classifies a process as performing one of the following
+ * activities
*/
+#define STIME_SLEEP 1 /* Sleeping */
+#define STIME_RUN 2 /* Using CPU */
+#define STIME_WAIT 3 /* Waiting for CPU */

-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
- ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
-
-static inline unsigned int task_timeslice(task_t *p)
-{
- return BASE_TIMESLICE(p);
-}
+#define TASK_PREEMPTS_CURR(p, rq) \
+ ( (p)->prio < (rq)->curr->prio )

/*
* These are the runqueue data structures:
*/

-#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
+#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))

typedef struct runqueue runqueue_t;

@@ -157,7 +128,8 @@ struct prio_array {
*/
struct runqueue {
spinlock_t lock;
- unsigned long nr_running, nr_switches, expired_timestamp,
+ unsigned long array_sequence;
+ unsigned long nr_running, nr_switches,
nr_uninterruptible;
task_t *curr, *idle;
struct mm_struct *prev_mm;
@@ -299,34 +271,102 @@ static inline void enqueue_task(struct t
}

/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ * add_task_time updates task @p's history after it has spent @time doing the
+ * specified @type of activity. See STIME_*. This is used for priority calculation.
+ */
+static void add_task_time(task_t *p, unsigned long time, unsigned long type)
+{
+ unsigned long r;
+ unsigned long max_affect;
+
+ if (time == 0)
+ return;
+
+ if (type == STIME_SLEEP)
+ max_affect = MAX_SLEEP_AFFECT;
+ else if (type == STIME_RUN)
+ max_affect = MAX_RUN_AFFECT;
+ else
+ max_affect = MAX_WAIT_AFFECT;
+
+ if (time > max_affect)
+ time = max_affect;
+
+ r = MAX_SLEEP - time;
+ p->total_time = (r*p->total_time + MAX_SLEEP/2) / MAX_SLEEP;
+ p->sleep_time = (r*p->sleep_time + MAX_SLEEP/2) / MAX_SLEEP;
+
+ if (type != STIME_WAIT) {
+ p->total_time += SLEEP_FACTOR * time;
+ if (type == STIME_SLEEP)
+ p->sleep_time += SLEEP_FACTOR * time;
+
+ p->sleep_avg = (SLEEP_FACTOR * p->sleep_time) / p->total_time;
+ }
+
+ if (p->total_time < SLEEP_FACTOR * MIN_HISTORY) {
+ p->total_time = SLEEP_FACTOR * MIN_HISTORY;
+ p->sleep_time = p->total_time * p->sleep_avg / SLEEP_FACTOR;
+ }
+}
+
+/*
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
*
- * Both properties are important to certain workloads.
+ * Timeslices are scaled, so if only low priority processes are running,
+ * they will all get long timeslices.
+ */
+static unsigned int task_timeslice(task_t *p, runqueue_t *rq)
+{
+ int idx, delta;
+ unsigned int base, timeslice;
+
+ if (rt_task(p))
+ return MAX_TIMESLICE;
+
+ idx = min(find_next_bit(rq->active->bitmap, MAX_PRIO, MAX_RT_PRIO),
+ find_next_bit(rq->expired->bitmap, MAX_PRIO, MAX_RT_PRIO));
+ idx = min(idx, p->prio);
+ delta = p->prio - idx;
+
+ /*
+ * This is a bit subtle. The first line establishes a timeslice based
+ * on how far this task is from being the highest priority runnable.
+ * The second line scales this result so low priority tasks will get
+ * big timeslices if higher priority ones are not running.
+ */
+ base = MIN_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
+ timeslice = base * (USER_PRIO(idx) + 8) / 24;
+
+ if (timeslice <= 0)
+ timeslice = 1;
+
+ return timeslice;
+}
+
+/*
+ * task_priority: calculates a task's priority based on previous running
+ * history (see add_task_time). The priority is just a simple linear function
+ * based on sleep_avg and static_prio.
*/
-static int effective_prio(task_t *p)
+static unsigned long task_priority(task_t *p)
{
int bonus, prio;

if (rt_task(p))
return p->prio;

- bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
- MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
+ bonus = ((MAX_USER_PRIO / 3) * p->sleep_avg + (SLEEP_FACTOR / 2)) / SLEEP_FACTOR;
+ prio = USER_PRIO(p->static_prio) + (MAX_USER_PRIO / 3);

- prio = p->static_prio - bonus;
+ prio = MAX_RT_PRIO + prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
if (prio > MAX_PRIO-1)
prio = MAX_PRIO-1;
+
return prio;
}

@@ -347,34 +387,21 @@ static inline void __activate_task(task_
*/
static inline void activate_task(task_t *p, runqueue_t *rq)
{
- long sleep_time = jiffies - p->last_run - 1;
+ unsigned long now = jiffies;
+ unsigned long sleep = now - p->timestamp;
+ p->timestamp = now;

- if (sleep_time > 0) {
- int sleep_avg;
+ add_task_time(p, sleep, STIME_SLEEP);

- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->last_run. The more time a task
- * spends sleeping, the higher the average gets - and the
- * higher the priority boost gets as well.
- */
- sleep_avg = p->sleep_avg + sleep_time;
+ p->prio = task_priority(p);
+
+ /*
+ * If we have slept through an active/expired array switch, restart
+ * our timeslice too.
+ */
+ if (rq->array_sequence != p->array_sequence)
+ p->used_slice = 0;

- /*
- * 'Overflow' bonus ticks go to the waker as well, so the
- * ticks are not lost. This has the effect of further
- * boosting tasks that are related to maximum-interactive
- * tasks.
- */
- if (sleep_avg > MAX_SLEEP_AVG)
- sleep_avg = MAX_SLEEP_AVG;
- if (p->sleep_avg != sleep_avg) {
- p->sleep_avg = sleep_avg;
- p->prio = effective_prio(p);
- }
- }
__activate_task(p, rq);
}

@@ -383,6 +410,7 @@ static inline void activate_task(task_t
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
+ p->array_sequence = rq->array_sequence;
nr_running_dec(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
@@ -426,7 +454,7 @@ static inline void resched_task(task_t *
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
void wait_task_inactive(task_t * p)
{
unsigned long flags;
@@ -497,11 +525,9 @@ repeat_lock_task:
}
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- if (sync)
- __activate_task(p, rq);
- else {
- activate_task(p, rq);
- if (p->prio < rq->curr->prio)
+ activate_task(p, rq);
+ if (!sync) {
+ if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -534,36 +560,90 @@ int wake_up_state(task_t *p, unsigned in
}

/*
+ * Perform scheduler related accounting for a newly forked process @p.
+ * @p is forked by current.
+ */
+void sched_fork(task_t *p)
+{
+ unsigned long ts;
+ unsigned long flags;
+ runqueue_t *rq;
+
+ /*
+ * Share the timeslice between parent and child, thus the
+ * total amount of pending timeslices in the system doesn't change,
+ * resulting in more scheduling fairness.
+ */
+ local_irq_disable();
+ p->timestamp = jiffies;
+ rq = task_rq_lock(current, &flags);
+ ts = task_timeslice(current, rq);
+ task_rq_unlock(rq, &flags);
+
+ /*
+ * Share half our timeslice with the child.
+ */
+ p->used_slice = current->used_slice + (ts - current->used_slice) / 2;
+ current->used_slice += (ts - current->used_slice + 1) / 2;
+
+ /*
+ * The remainder of the first timeslice might be recovered by
+ * the parent if the child exits early enough.
+ */
+ p->first_time_slice = 1;
+ if (current->used_slice >= ts) {
+ /*
+ * This case is rare, it happens when the parent has only
+ * a single jiffy left from its timeslice. Taking the
+ * runqueue lock is not a problem.
+ */
+ current->used_slice = ts - 1;
+ preempt_disable();
+ scheduler_tick(0, 0);
+ local_irq_enable();
+ preempt_enable();
+ } else
+ local_irq_enable();
+}
+
+/*
* wake_up_forked_process - wake up a freshly forked process.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created process.
*/
-void wake_up_forked_process(task_t * p)
+void wake_up_forked_process(task_t *p)
{
unsigned long flags;
runqueue_t *rq = task_rq_lock(current, &flags);

p->state = TASK_RUNNING;
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
- p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
- p->prio = effective_prio(p);
+
set_task_cpu(p, smp_processor_id());

- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- list_add_tail(&p->run_list, &current->run_list);
- p->array = current->array;
- p->array->nr_active++;
- nr_running_inc(rq);
+ /*
+ * Give the child only 1/4 of the parent's history. Limited by MIN_HISTORY.
+ */
+ p->total_time = current->total_time / 4;
+ p->sleep_time = current->sleep_time / 4;
+ p->sleep_avg = current->sleep_avg;
+
+ if (p->total_time < SLEEP_FACTOR * MIN_HISTORY) {
+ p->total_time = SLEEP_FACTOR * MIN_HISTORY;
+ p->sleep_time = p->total_time * p->sleep_avg / SLEEP_FACTOR;
}
+
+ /*
+ * Lose 1/4 sleep_time for forking.
+ */
+ current->sleep_time = 3 * current->sleep_time / 4;
+ if (current->total_time != 0)
+ current->sleep_avg = (SLEEP_FACTOR * current->sleep_time)
+ / current->total_time;
+
+ p->prio = task_priority(p);
+ __activate_task(p, rq);
+
task_rq_unlock(rq, &flags);
}

@@ -581,19 +661,25 @@ void sched_exit(task_t * p)
unsigned long flags;

local_irq_save(flags);
+
+ /* Regain the unused timeslice given to @p by its parent */
if (p->first_time_slice) {
- p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
- p->parent->time_slice = MAX_TIMESLICE;
+ unsigned long flags;
+ runqueue_t *rq;
+ rq = task_rq_lock(p, &flags);
+ p->parent->used_slice -= task_timeslice(p, rq) - p->used_slice;
+ task_rq_unlock(rq, &flags);
+ }
+
+ /* Apply some penalty to @p's parent if @p used a lot of CPU */
+ if (p->sleep_avg < p->parent->sleep_avg) {
+ add_task_time(p->parent,
+ MAX_SLEEP * (p->parent->sleep_avg - p->sleep_avg)
+ / SLEEP_FACTOR / 2,
+ STIME_RUN);
}
+
local_irq_restore(flags);
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
- p->sleep_avg) / (EXIT_WEIGHT + 1);
}

/**
@@ -959,10 +1045,10 @@ static inline runqueue_t *find_busiest_q
if (likely(!busiest))
goto out;

- *imbalance = (max_load - nr_running) / 2;
+ *imbalance = max_load - nr_running;

/* It needs an at least ~25% imbalance to trigger balancing. */
- if (!idle && (*imbalance < (max_load + 3)/4)) {
+ if (!idle && ((*imbalance)*4 < max_load)) {
busiest = NULL;
goto out;
}
@@ -972,7 +1058,7 @@ static inline runqueue_t *find_busiest_q
* Make sure nothing changed since we checked the
* runqueue length.
*/
- if (busiest->nr_running <= nr_running + 1) {
+ if (busiest->nr_running <= nr_running) {
spin_unlock(&busiest->lock);
busiest = NULL;
}
@@ -995,13 +1081,40 @@ static inline void pull_task(runqueue_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- if (p->prio < this_rq->curr->prio)
+ if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
- else {
- if (p->prio == this_rq->curr->prio &&
- p->time_slice > this_rq->curr->time_slice)
- set_need_resched();
- }
+}
+
+/*
+ * can_migrate_task
+ * May task @p from runqueue @rq be migrated to @this_cpu?
+ * @idle: Is this_cpu idle
+ * Returns: 1 if @p may be migrated, 0 otherwise.
+ */
+static inline int
+can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, int idle)
+{
+ unsigned long delta;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 3) are cache-hot on their current CPU.
+ */
+
+ if (task_running(rq, p))
+ return 0;
+
+ if (!cpu_isset(this_cpu, p->cpus_allowed))
+ return 0;
+
+ /* Aggressive migration if we're idle */
+ delta = jiffies - p->timestamp;
+ if (!idle && (delta <= cache_decay_ticks))
+ return 0;
+
+ return 1;
}

/*
@@ -1025,6 +1138,12 @@ static void load_balance(runqueue_t *thi
goto out;

/*
+ * We only want to steal a number of tasks equal to 1/2 the imbalance,
+ * otherwise we'll just shift the imbalance to the new queue:
+ */
+ imbalance /= 2;
+
+ /*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
* be cache-cold, thus switching CPUs has the least effect
@@ -1056,27 +1175,17 @@ skip_bitmap:
skip_queue:
tmp = list_entry(curr, task_t, run_list);

- /*
- * We do not migrate tasks that are:
- * 1) running (obviously), or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
- */
-
-#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
- ((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
- !task_running(rq, p) && \
- cpu_isset(this_cpu, (p)->cpus_allowed))
-
curr = curr->prev;

- if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
if (curr != head)
goto skip_queue;
idx++;
goto skip_bitmap;
}
pull_task(busiest, array, tmp, this_rq, this_cpu);
+
+ /* Only migrate 1 task if we're idle */
if (!idle && --imbalance) {
if (curr != head)
goto skip_queue;
@@ -1171,20 +1280,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks:
- */
-#define EXPIRED_STARVING(rq) \
- (STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1)))
-
-/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -1201,17 +1296,11 @@ void scheduler_tick(int user_ticks, int
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, user_ticks);

- /* note: this timer irq context must be accounted for as well */
- if (hardirq_count() - HARDIRQ_OFFSET) {
- cpustat->irq += sys_ticks;
- sys_ticks = 0;
- } else if (softirq_count()) {
- cpustat->softirq += sys_ticks;
- sys_ticks = 0;
- }
-
if (p == rq->idle) {
- if (atomic_read(&rq->nr_iowait) > 0)
+ /* note: this timer irq context must be accounted for as well */
+ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
+ cpustat->system += sys_ticks;
+ else if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait += sys_ticks;
else
cpustat->idle += sys_ticks;
@@ -1232,43 +1321,39 @@ void scheduler_tick(int user_ticks, int
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
- * time slice counter and the sleep average. Note: we
- * do not update a thread's priority until it either
- * goes to sleep or uses up its timeslice. This makes
- * it possible for interactive tasks to use up their
- * timeslices at their highest priority levels.
+ * time slice counter. Note: we do not update a thread's
+ * priority until it either goes to sleep or uses up its
+ * timeslice.
*/
- if (p->sleep_avg)
- p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
+ if (p->policy == SCHED_RR) {
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
+ p->used_slice = 0;
+ p->first_time_slice = 0;
+ set_tsk_need_resched(p);
+
+ /* put it at the end of the queue: */
+ dequeue_task(p, rq->active);
+ enqueue_task(p, rq->active);
+ }
}
goto out_unlock;
}
- if (!--p->time_slice) {
+
+ p->used_slice++;
+ if (p->used_slice >= task_timeslice(p, rq)) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
+ p->prio = task_priority(p);
+ p->used_slice = 0;
p->first_time_slice = 0;

- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- enqueue_task(p, rq->expired);
- } else
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq->expired);
}
out_unlock:
spin_unlock(&rq->lock);
@@ -1287,6 +1372,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
+ unsigned long now;
+ unsigned long run_time;
int idx;

/*
@@ -1307,7 +1394,11 @@ need_resched:
rq = this_rq();

release_kernel_lock(prev);
- prev->last_run = jiffies;
+ now = jiffies;
+ run_time = now - prev->timestamp;
+
+ add_task_time(prev, run_time, STIME_RUN);
+
spin_lock_irq(&rq->lock);

/*
@@ -1336,7 +1427,6 @@ pick_next_task:
goto pick_next_task;
#endif
next = rq->idle;
- rq->expired_timestamp = 0;
goto switch_tasks;
}

@@ -1345,10 +1435,10 @@ pick_next_task:
/*
* Switch the active and expired arrays.
*/
+ rq->array_sequence++;
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
- rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);
@@ -1360,7 +1450,10 @@ switch_tasks:
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;

+ prev->timestamp = now;
if (likely(prev != next)) {
+ add_task_time(next, now - next->timestamp, STIME_WAIT);
+ next->timestamp = now;
rq->nr_switches++;
rq->curr = next;

@@ -1600,6 +1693,7 @@ void set_user_nice(task_t *p, long nice)
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
+ int old_prio, new_prio, delta;

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -1608,6 +1702,12 @@ void set_user_nice(task_t *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ /*
+ * The RT priorities are set via setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling until the task is
+ * switched back to SCHED_NORMAL:
+ */
if (rt_task(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
@@ -1615,16 +1715,20 @@ void set_user_nice(task_t *p, long nice)
array = p->array;
if (array)
dequeue_task(p, array);
+
+ old_prio = p->prio;
+ new_prio = NICE_TO_PRIO(nice);
+ delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
- p->prio = NICE_TO_PRIO(nice);
+ p->prio += delta;
+
if (array) {
enqueue_task(p, array);
/*
- * If the task is running and lowered its priority,
- * or increased its priority then reschedule its CPU:
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
*/
- if ((NICE_TO_PRIO(nice) < p->static_prio) ||
- task_running(rq, p))
+ if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -2139,6 +2243,8 @@ asmlinkage long sys_sched_rr_get_interva
int retval = -EINVAL;
struct timespec t;
task_t *p;
+ unsigned long flags;
+ runqueue_t *rq;

if (pid < 0)
goto out_nounlock;
@@ -2153,8 +2259,10 @@ asmlinkage long sys_sched_rr_get_interva
if (retval)
goto out_unlock;

+ rq = task_rq_lock(p, &flags);
jiffies_to_timespec(p->policy & SCHED_FIFO ?
- 0 : task_timeslice(p), &t);
+ 0 : task_timeslice(p, rq), &t);
+ task_rq_unlock(rq, &flags);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:

_
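
PS. If anyone wants a feel for the history tracking without booting the
kernel, here is a rough userspace rendering of the sleep_avg bookkeeping and
the priority calculation (again my throwaway sketch, not part of the patch;
it assumes HZ=1000 and the usual NICE_TO_PRIO mapping, so nice 0 means
static_prio 120):

#include <stdio.h>

#define HZ 1000
#define MAX_SLEEP (HZ)
#define MAX_SLEEP_AFFECT (MAX_SLEEP/4)
#define SLEEP_FACTOR 1024
#define MIN_HISTORY (MAX_SLEEP/2)

#define MAX_RT_PRIO 100
#define MAX_PRIO (MAX_RT_PRIO + 60)
#define MAX_USER_PRIO (MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p) ((p) - MAX_RT_PRIO)

static unsigned long total_time, sleep_time, sleep_avg;

/* same decay as add_task_time(), covering STIME_SLEEP/STIME_RUN only */
static void account(unsigned long time, int slept)
{
        unsigned long r;

        if (time > MAX_SLEEP_AFFECT)
                time = MAX_SLEEP_AFFECT;

        r = MAX_SLEEP - time;
        total_time = (r*total_time + MAX_SLEEP/2) / MAX_SLEEP;
        sleep_time = (r*sleep_time + MAX_SLEEP/2) / MAX_SLEEP;

        total_time += SLEEP_FACTOR * time;
        if (slept)
                sleep_time += SLEEP_FACTOR * time;
        sleep_avg = (SLEEP_FACTOR * sleep_time) / total_time;

        if (total_time < SLEEP_FACTOR * MIN_HISTORY) {
                total_time = SLEEP_FACTOR * MIN_HISTORY;
                sleep_time = total_time * sleep_avg / SLEEP_FACTOR;
        }
}

int main(void)
{
        unsigned long bonus, prio, static_prio = 120; /* nice 0 */
        int i;

        /* a task that sleeps 20ms, runs 5ms, over and over */
        for (i = 0; i < 200; i++) {
                account(20, 1);
                account(5, 0);
        }

        /* same maths as task_priority() */
        bonus = ((MAX_USER_PRIO/3) * sleep_avg + SLEEP_FACTOR/2) / SLEEP_FACTOR;
        prio = MAX_RT_PRIO + USER_PRIO(static_prio) + MAX_USER_PRIO/3 - bonus;

        printf("sleep_avg=%lu bonus=%lu prio=%lu\n", sleep_avg, bonus, prio);
        return 0;
}

With a task sleeping 20ms for every 5ms of CPU, sleep_avg settles around 4/5
of SLEEP_FACTOR, which is worth a bonus of about 16 of the 20 available
priority levels.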