[PATCH] Staircase Scheduler v6.3 for 2.6.7-rc2

From: Con Kolivas
Date: Sun Jun 06 2004 - 10:42:38 EST


-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

This is an update of the scheduler policy mechanism rewrite using the
infrastructure of the current O(1) scheduler. The slight changes from the
original design warrant a detailed description: they have abolished all
known corner cases, and cpu distribution is now much better maintained. It
has proven stable in my testing and is ready for more widespread public
testing.


Aims:
- Interactive by design rather than having interactivity bolted on.
- Good scalability.
- Simple predictable design.
- Maintain appropriate cpu distribution and fairness.
- Low scheduling latency for normal policy tasks.
- Low overhead.
- Making renicing processes actually matter for CPU distribution (nice 0 gets
20 times the cpu of nice +19)
- Resistant to priority inversion
- More forgiving of poor locking
- Tunable for a server workload or computational tasks


Description:
- All tasks start at a dynamic priority based on their nice value. They run
for one RR_INTERVAL (nominally 10ms) and then drop one stair (priority). If
they sleep and run again, they get 2 intervals before being demoted a
priority, and so on, until they receive all of their intervals at their best
priority: 20 intervals for nice 0; 1 interval for nice +19. (A small sketch
of this arithmetic follows this list.)


- The staircase elevation mechanism can be disabled and all tasks can simply
descend stairs using the sysctl:

echo 0 > /proc/sys/kernel/interactive

This has the effect of maintaining cpu distribution much more strictly
according to nice, whereas the default mechanism allows bursts of cpu by
interactive tasks before they settle to the appropriate cpu distribution.


- The length of time tasks run while cpu bound can be increased using the
sysctl:

echo 1 > /proc/sys/kernel/compute

which extends the RR_INTERVAL to 100ms and disables the staircase elevation,
improving conditions for purely computational tasks by optimising cache
benefits and decreasing context switching (gains another 1.5% on kernbench).
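
For the curious, below is a small userspace sketch (not part of the patch) of
the interval arithmetic described above. It mirrors the deadline(), slice()
and effective_prio() calculations in the diff further down, counting
RR_INTERVAL in 10ms ticks and using illustrative names only: a freshly
started nice 0 task drops one stair per interval, while one that has earned
its full deadline rank through sleeping spends its entire 20 interval slice
at its best priority. With RR_INTERVAL at 10ms this is also where the 20:1
cpu ratio between nice 0 and nice +19 comes from.

/*
 * Illustrative userspace model of the staircase arithmetic -- a sketch
 * only, not kernel code.  rank(), slice() and prio() mirror deadline(),
 * slice() and effective_prio() from the patch below; RR_INTERVAL is
 * counted in 10ms ticks rather than jiffies.
 */
#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)
#define RR_INTERVAL	1	/* one 10ms tick; ten ticks with "compute" */

static int interactive = 1;	/* 0 == /proc/sys/kernel/interactive disabled */

/* best deadline rank a task can earn: 39 for nice -20 down to 0 for nice +19 */
static unsigned int rank(int nice)
{
	return 39 - (nice + 20);
}

/* length of a slice in intervals; a task loses one rank per expired slice */
static unsigned int slice(int nice)
{
	return RR_INTERVAL + rank(nice) * RR_INTERVAL;
}

/*
 * Dynamic priority after "used" intervals of the current slice, for a task
 * that has so far earned "earned" deadline ranks (0 .. rank(nice)).
 */
static int prio(int nice, unsigned int earned, unsigned int used)
{
	unsigned int first = RR_INTERVAL * (interactive ? earned + 1 : 1);
	int p = MAX_PRIO - 1 - rank(nice);

	if (used < first)
		return p;
	p += 1 + (used - first) / RR_INTERVAL;
	return p < MAX_PRIO - 1 ? p : MAX_PRIO - 1;
}

int main(void)
{
	unsigned int used;

	for (used = 0; used < slice(0); used++)
		printf("nice 0, interval %2u: new task prio %d, elevated task prio %d\n",
		       used, prio(0, 0, used), prio(0, rank(0), used));
	return 0;
}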


Performance:
- All cpu throughput benchmarks show equivalent or better performance than
mainline. Note that disabling the interactive setting actually _worsens_ some
benchmarks because of their dependence on yield(), so I don't recommend
disabling it unless you do a comparison first.
- Interactivity is approximately equivalent to mainline 2.6 but with faster
application startup and no known behavioural quirks.


Comments and testing welcome.

fs/proc/array.c | 2
include/linux/sched.h | 8
include/linux/sysctl.h | 2
kernel/sched.c | 676 +++++++++++++------------------------------------
kernel/sysctl.c | 16 +
5 files changed, 212 insertions(+), 492 deletions(-)

Can be downloaded here:
http://ck.kolivas.org/patches/2.6/2.6.7-rc2/patch-2.6.7-rc2-s6.3

and below
Con
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (GNU/Linux)

iD8DBQFAwzquZUg7+tp6mRURArBHAJ9SIBOKX6MYOdkJzdb+xRNnW82JQgCghLou
wXhrRsBsfY2BIqbLT1tUWcs=
=g1no
-----END PGP SIGNATURE-----
diff -Naurp --exclude-from=dontdiff linux-2.6.7-rc2-base/fs/proc/array.c linux-2.6.7-rc2-s6.3/fs/proc/array.c
--- linux-2.6.7-rc2-base/fs/proc/array.c 2004-05-31 21:29:15.000000000 +1000
+++ linux-2.6.7-rc2-s6.3/fs/proc/array.c 2004-06-07 00:03:56.959133536 +1000
@@ -155,7 +155,6 @@ static inline char * task_state(struct t
read_lock(&tasklist_lock);
buffer += sprintf(buffer,
"State:\t%s\n"
- "SleepAVG:\t%lu%%\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
@@ -163,7 +162,6 @@ static inline char * task_state(struct t
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
- (p->sleep_avg/1024)*100/(1020000000/1024),
p->tgid,
p->pid, p->pid ? p->real_parent->pid : 0,
p->pid && p->ptrace ? p->parent->pid : 0,
diff -Naurp --exclude-from=dontdiff linux-2.6.7-rc2-base/include/linux/sched.h linux-2.6.7-rc2-s6.3/include/linux/sched.h
--- linux-2.6.7-rc2-base/include/linux/sched.h 2004-05-31 21:29:21.000000000 +1000
+++ linux-2.6.7-rc2-s6.3/include/linux/sched.h 2004-06-07 00:10:37.450576503 +1000
@@ -164,6 +164,7 @@ extern void show_stack(struct task_struc

void io_schedule(void);
long io_schedule_timeout(long timeout);
+extern int interactive, compute;

extern void cpu_init (void);
extern void trap_init(void);
@@ -394,14 +395,13 @@ struct task_struct {
struct list_head run_list;
prio_array_t *array;

- unsigned long sleep_avg;
- long interactive_credit;
unsigned long long timestamp;
- int activated;
+ unsigned long runtime, totalrun;
+ unsigned int deadline;

unsigned long policy;
cpumask_t cpus_allowed;
- unsigned int time_slice, first_time_slice;
+ unsigned int slice, time_slice, first_time_slice;

struct list_head tasks;
struct list_head ptrace_children;
diff -Naurp --exclude-from=dontdiff linux-2.6.7-rc2-base/include/linux/sysctl.h linux-2.6.7-rc2-s6.3/include/linux/sysctl.h
--- linux-2.6.7-rc2-base/include/linux/sysctl.h 2004-05-31 21:29:21.000000000 +1000
+++ linux-2.6.7-rc2-s6.3/include/linux/sysctl.h 2004-06-07 00:06:13.007895851 +1000
@@ -133,6 +133,8 @@ enum
KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */
KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
KERN_HZ_TIMER=65, /* int: hz timer on or off */
+ KERN_INTERACTIVE=66, /* interactive tasks to have cpu bursts */
+ KERN_COMPUTE=67, /* adjust timeslices for a compute server */
};


diff -Naurp --exclude-from=dontdiff linux-2.6.7-rc2-base/kernel/sched.c linux-2.6.7-rc2-s6.3/kernel/sched.c
--- linux-2.6.7-rc2-base/kernel/sched.c 2004-05-31 21:29:24.000000000 +1000
+++ linux-2.6.7-rc2-s6.3/kernel/sched.c 2004-06-07 00:36:43.382396246 +1000
@@ -16,7 +16,10 @@
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
- */
+ * 2004-03-19. New staircase scheduling policy by Con Kolivas with help
+ * from Zwane Mwaikambo and useful suggestions by
+ * William Lee Irwin III.
+*/

#include <linux/mm.h>
#include <linux/module.h>
@@ -66,8 +69,6 @@
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\
- (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1)))

/*
* Some helpers for converting nanosecond timing to jiffy resolution
@@ -75,110 +76,18 @@
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))

-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 10 msecs, default timeslice is 100 msecs,
- * maximum timeslice is 200 msecs. Timeslices get refilled after
- * they expire.
- */
-#define MIN_TIMESLICE ( 10 * HZ / 1000)
-#define MAX_TIMESLICE (200 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT 30
-#define CHILD_PENALTY 95
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT (MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
-#define CREDIT_LIMIT 100
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
- MAX_SLEEP_AVG)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
- num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
- (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define HIGH_CREDIT(p) \
- ((p)->interactive_credit > CREDIT_LIMIT)
-
-#define LOW_CREDIT(p) \
- ((p)->interactive_credit < -CREDIT_LIMIT)
+int compute = 0;
+/*
+ *This is the time all tasks within the same priority round robin.
+ *compute setting is reserved for dedicated computational scheduling
+ *and has ten times larger intervals.
+ */
+#define _RR_INTERVAL ((10 * HZ / 1000) ? : 1)
+#define RR_INTERVAL (_RR_INTERVAL * (1 + 9 * compute))

#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)

-/*
- * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
- * to time slice values.
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- *
- * task_timeslice() is the interface that is used by the scheduler.
- */
-
-#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
- ((MAX_TIMESLICE - MIN_TIMESLICE) * \
- (MAX_PRIO-1 - (p)->static_prio) / (MAX_USER_PRIO-1)))
-
-static unsigned int task_timeslice(task_t *p)
-{
- return BASE_TIMESLICE(p);
-}
-
#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)

/*
@@ -192,7 +101,7 @@ typedef struct runqueue runqueue_t;
struct prio_array {
unsigned int nr_active;
unsigned long bitmap[BITMAP_SIZE];
- struct list_head queue[MAX_PRIO];
+ struct list_head queue[MAX_PRIO + 1];
};

/*
@@ -214,12 +123,11 @@ struct runqueue {
unsigned long cpu_load;
#endif
unsigned long long nr_switches;
- unsigned long expired_timestamp, nr_uninterruptible;
+ unsigned long nr_uninterruptible;
unsigned long long timestamp_last_tick;
task_t *curr, *idle;
struct mm_struct *prev_mm;
- prio_array_t *active, *expired, arrays[2];
- int best_expired_prio;
+ prio_array_t array;
atomic_t nr_iowait;

#ifdef CONFIG_SMP
@@ -300,16 +208,18 @@ static inline void rq_unlock(runqueue_t
/*
* Adding/removing a task to/from a priority array:
*/
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, runqueue_t *rq)
{
+ prio_array_t* array = &rq->array;
array->nr_active--;
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
}

-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, runqueue_t *rq)
{
+ prio_array_t* array = &rq->array;
list_add_tail(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
array->nr_active++;
@@ -321,8 +231,9 @@ static void enqueue_task(struct task_str
* remote queue so we want these tasks to show up at the head of the
* local queue:
*/
-static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+static inline void enqueue_task_head(struct task_struct *p, runqueue_t *rq)
{
+ prio_array_t* array = &rq->array;
list_add(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
array->nr_active++;
@@ -330,42 +241,11 @@ static inline void enqueue_task_head(str
}

/*
- * effective_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
- */
-static int effective_prio(task_t *p)
-{
- int bonus, prio;
-
- if (rt_task(p))
- return p->prio;
-
- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
-
- prio = p->static_prio - bonus;
- if (prio < MAX_RT_PRIO)
- prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
- return prio;
-}
-
-/*
* __activate_task - move a task to the runqueue.
*/
static inline void __activate_task(task_t *p, runqueue_t *rq)
{
- enqueue_task(p, rq->active);
+ enqueue_task(p, rq);
rq->nr_running++;
}

@@ -374,95 +254,112 @@ static inline void __activate_task(task_
*/
static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
{
- enqueue_task_head(p, rq->active);
+ enqueue_task_head(p, rq);
rq->nr_running++;
}

-static void recalc_task_prio(task_t *p, unsigned long long now)
+// deadline - the best deadline rank a task can have.
+static unsigned int deadline(task_t *p)
{
- unsigned long long __sleep_time = now - p->timestamp;
- unsigned long sleep_time;
-
- if (__sleep_time > NS_MAX_SLEEP_AVG)
- sleep_time = NS_MAX_SLEEP_AVG;
+ unsigned int task_user_prio;
+ if (rt_task(p))
+ return p->deadline;
+ task_user_prio = TASK_USER_PRIO(p);
+ if (likely(task_user_prio < 40))
+ return 39 - task_user_prio;
else
- sleep_time = (unsigned long)__sleep_time;
+ return 0;
+}

- if (likely(sleep_time > 0)) {
- /*
- * User tasks that sleep a long time are categorised as
- * idle and will get just interactive status to stay active &
- * prevent them suddenly becoming cpu hogs and starving
- * other processes.
- */
- if (p->mm && p->activated != -1 &&
- sleep_time > INTERACTIVE_SLEEP(p)) {
- p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
- AVG_TIMESLICE);
- if (!HIGH_CREDIT(p))
- p->interactive_credit++;
- } else {
- /*
- * The lower the sleep avg a task has the more
- * rapidly it will rise with sleep time.
- */
- sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
+static void inc_deadline(task_t *p)
+{
+ unsigned int best_deadline;
+ best_deadline = deadline(p);
+ if (!p->mm || rt_task(p) || !best_deadline)
+ return;
+ if (p->deadline < best_deadline)
+ p->deadline++;
+}

- /*
- * Tasks with low interactive_credit are limited to
- * one timeslice worth of sleep avg bonus.
- */
- if (LOW_CREDIT(p) &&
- sleep_time > JIFFIES_TO_NS(task_timeslice(p)))
- sleep_time = JIFFIES_TO_NS(task_timeslice(p));
+static void dec_deadline(task_t *p)
+{
+ if (!p->mm || rt_task(p))
+ return;
+ if (p->deadline)
+ p->deadline--;
+}

- /*
- * Non high_credit tasks waking from uninterruptible
- * sleep are limited in their sleep_avg rise as they
- * are likely to be cpu hogs waiting on I/O
- */
- if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) {
- if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
- sleep_time = 0;
- else if (p->sleep_avg + sleep_time >=
- INTERACTIVE_SLEEP(p)) {
- p->sleep_avg = INTERACTIVE_SLEEP(p);
- sleep_time = 0;
- }
- }
+// slice - the duration a task runs before losing a deadline rank.
+static unsigned int slice(task_t *p)
+{
+ unsigned int slice = RR_INTERVAL;
+ if (!rt_task(p))
+ slice += deadline(p) * RR_INTERVAL;
+ return slice;
+}

- /*
- * This code gives a bonus to interactive tasks.
- *
- * The boost works by updating the 'average sleep time'
- * value here, based on ->timestamp. The more time a
- * task spends sleeping, the higher the average gets -
- * and the higher the priority boost gets as well.
- */
- p->sleep_avg += sleep_time;
+// interactive - interactive tasks get longer intervals at best priority
+int interactive = 1;

- if (p->sleep_avg > NS_MAX_SLEEP_AVG) {
- p->sleep_avg = NS_MAX_SLEEP_AVG;
- if (!HIGH_CREDIT(p))
- p->interactive_credit++;
- }
+/*
+ * effective_prio - dynamic priority dependent on deadline rank.
+ * With 0 deadline ranks the priority decreases by one each RR_INTERVAL.
+ * As the deadline rank increases the priority stays at the top "stair" or
+ * value for longer.
+ */
+static int effective_prio(task_t *p)
+{
+ int prio;
+ unsigned int full_slice, used_slice, first_slice;
+ unsigned int best_deadline;
+ if (rt_task(p))
+ return p->prio;
+
+ best_deadline = deadline(p);
+ full_slice = slice(p);
+ used_slice = full_slice - p->slice;
+ if (p->deadline > best_deadline || !p->mm)
+ p->deadline = best_deadline;
+ first_slice = RR_INTERVAL;
+ if (interactive && !compute)
+ first_slice *= (p->deadline + 1);
+ prio = MAX_PRIO - 1 - best_deadline;
+
+ if (used_slice < first_slice)
+ return prio;
+ if (p->mm)
+ prio += 1 + (used_slice - first_slice) / RR_INTERVAL;
+ if (prio > MAX_PRIO - 1)
+ prio = MAX_PRIO - 1;
+ return prio;
+}
+
+static void recalc_task_prio(task_t *p, unsigned long long now)
+{
+ unsigned long sleep_time = now - p->timestamp;
+ unsigned long run_time = NS_TO_JIFFIES(p->runtime);
+ unsigned long total_run = NS_TO_JIFFIES(p->totalrun) + run_time;
+ if (!run_time && NS_TO_JIFFIES(p->runtime + sleep_time) < RR_INTERVAL) {
+ if (p->slice - total_run < 1) {
+ p->totalrun = 0;
+ dec_deadline(p);
+ } else {
+ p->totalrun += p->runtime;
+ p->slice -= NS_TO_JIFFIES(p->totalrun);
}
+ } else {
+ inc_deadline(p);
+ p->runtime = 0;
+ p->totalrun = 0;
}
-
- p->prio = effective_prio(p);
}

/*
* activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
*/
static void activate_task(task_t *p, runqueue_t *rq, int local)
{
- unsigned long long now;
-
- now = sched_clock();
+ unsigned long long now = sched_clock();
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
@@ -471,33 +368,14 @@ static void activate_task(task_t *p, run
+ rq->timestamp_last_tick;
}
#endif
-
- recalc_task_prio(p, now);
-
- /*
- * This checks to make sure it's not an uninterruptible task
- * that is now waking up.
- */
- if (!p->activated) {
- /*
- * Tasks which were woken up by interrupts (ie. hw events)
- * are most likely of interactive nature. So we give them
- * the credit of extending their sleep time to the period
- * of time they spend on the runqueue, waiting for execution
- * on a CPU, first time around:
- */
- if (in_interrupt())
- p->activated = 2;
- else {
- /*
- * Normal first-time wakeups get a credit too for
- * on-runqueue time, but it will be weighted down:
- */
- p->activated = 1;
- }
- }
+ p->slice = slice(p);
+ if (!rt_task(p) && p->mm)
+ recalc_task_prio(p, now);
+ else
+ inc_deadline(p);
+ p->prio = effective_prio(p);
+ p->time_slice = RR_INTERVAL;
p->timestamp = now;
-
__activate_task(p, rq);
}

@@ -507,9 +385,12 @@ static void activate_task(task_t *p, run
static void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
rq->nr_running--;
- if (p->state == TASK_UNINTERRUPTIBLE)
+ if (p->state == TASK_UNINTERRUPTIBLE) {
rq->nr_uninterruptible++;
- dequeue_task(p, p->array);
+ if (p->deadline > (deadline(p) - 2) && p->deadline && p->mm)
+ p->deadline--;
+ }
+ dequeue_task(p, rq);
p->array = NULL;
}

@@ -813,14 +694,8 @@ out_set_cpu:

out_activate:
#endif /* CONFIG_SMP */
- if (old_state == TASK_UNINTERRUPTIBLE) {
+ if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- /*
- * Tasks on involuntary sleep don't earn
- * sleep_avg beyond just interactive state.
- */
- p->activated = -1;
- }

/*
* Sync wakeups (i.e. those types of wakeups where the waker
@@ -890,12 +765,14 @@ void fastcall sched_fork(task_t *p)
*/
local_irq_disable();
p->time_slice = (current->time_slice + 1) >> 1;
+ p->slice = (current->slice + 1) >> 1;
/*
* The remainder of the first timeslice might be recovered by
* the parent if the child exits early enough.
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
+ current->slice >>= 1;
p->timestamp = sched_clock();
if (!current->time_slice) {
/*
@@ -923,33 +800,14 @@ void fastcall wake_up_forked_process(tas
unsigned long flags;
runqueue_t *rq = task_rq_lock(current, &flags);

+ // Forked process gets a lower deadline rank to prevent fork bombs.
+ p->deadline = 0;
BUG_ON(p->state != TASK_RUNNING);

- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
- p->interactive_credit = 0;
-
- p->prio = effective_prio(p);
set_task_cpu(p, smp_processor_id());

- if (unlikely(!current->array))
- __activate_task(p, rq);
- else {
- p->prio = current->prio;
- list_add_tail(&p->run_list, &current->run_list);
- p->array = current->array;
- p->array->nr_active++;
- rq->nr_running++;
- }
+ p->prio = effective_prio(p);
+ __activate_task(p, rq);
task_rq_unlock(rq, &flags);
}

@@ -970,19 +828,11 @@ void fastcall sched_exit(task_t * p)
local_irq_save(flags);
if (p->first_time_slice) {
p->parent->time_slice += p->time_slice;
- if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
- p->parent->time_slice = MAX_TIMESLICE;
+ if (unlikely(p->parent->time_slice > slice(p->parent)))
+ p->parent->time_slice = slice(p->parent);
}
local_irq_restore(flags);
- /*
- * If the child was a (relative-) CPU hog then decrease
- * the sleep_avg of the parent as well.
- */
rq = task_rq_lock(p->parent, &flags);
- if (p->sleep_avg < p->parent->sleep_avg)
- p->parent->sleep_avg = p->parent->sleep_avg /
- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
- (EXIT_WEIGHT + 1);
task_rq_unlock(rq, &flags);
}

@@ -1246,18 +1096,6 @@ lock_again:
double_rq_unlock(this_rq, rq);
goto lock_again;
}
- /*
- * We decrease the sleep average of forking parents
- * and children as well, to keep max-interactive tasks
- * from forking tasks that are max-interactive.
- */
- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
- p->interactive_credit = 0;

p->prio = effective_prio(p);
set_task_cpu(p, cpu);
@@ -1369,15 +1207,15 @@ static void double_lock_balance(runqueue
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
-static inline
-void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
- runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+static inline
+void pull_task(runqueue_t *src_rq, task_t *p,
+ runqueue_t *this_rq, int this_cpu)
{
- dequeue_task(p, src_array);
+ dequeue_task(p, src_rq);
src_rq->nr_running--;
set_task_cpu(p, this_cpu);
this_rq->nr_running++;
- enqueue_task(p, this_array);
+ enqueue_task(p, this_rq);
p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+ this_rq->timestamp_last_tick;
/*
@@ -1427,29 +1265,16 @@ static int move_tasks(runqueue_t *this_r
unsigned long max_nr_move, struct sched_domain *sd,
enum idle_type idle)
{
- prio_array_t *array, *dst_array;
+ prio_array_t* array, *dst_array;
struct list_head *head, *curr;
int idx, pulled = 0;
task_t *tmp;

if (max_nr_move <= 0 || busiest->nr_running <= 1)
goto out;
+ array = &busiest->array;
+ dst_array = &this_rq->array;

- /*
- * We first consider expired tasks. Those will likely not be
- * executed in the near future, and they are most likely to
- * be cache-cold, thus switching CPUs has the least effect
- * on them.
- */
- if (busiest->expired->nr_active) {
- array = busiest->expired;
- dst_array = this_rq->expired;
- } else {
- array = busiest->active;
- dst_array = this_rq->active;
- }
-
-new_array:
/* Start searching at priority 0: */
idx = 0;
skip_bitmap:
@@ -1457,14 +1282,8 @@ skip_bitmap:
idx = sched_find_first_bit(array->bitmap);
else
idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
- if (idx >= MAX_PRIO) {
- if (array == busiest->expired && busiest->active->nr_active) {
- array = busiest->active;
- dst_array = this_rq->active;
- goto new_array;
- }
+ if (idx >= MAX_PRIO)
goto out;
- }

head = array->queue + idx;
curr = head->prev;
@@ -1479,7 +1298,7 @@ skip_queue:
idx++;
goto skip_bitmap;
}
- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+ pull_task(busiest, tmp, this_rq, this_cpu);
pulled++;

/* We only want to steal up to the prescribed number of tasks. */
@@ -1938,22 +1757,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat
EXPORT_PER_CPU_SYMBOL(kstat);

/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-#define EXPIRED_STARVING(rq) \
- ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
- ((rq)->curr->static_prio > (rq)->best_expired_prio))
-
-/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -1998,76 +1801,38 @@ void scheduler_tick(int user_ticks, int
cpustat->system += sys_ticks;

/* Task might have expired already, but not scheduled off yet */
- if (p->array != rq->active) {
+ if (p->array != &rq->array) {
set_tsk_need_resched(p);
goto out;
}
spin_lock(&rq->lock);
- /*
- * The task was running during this tick - update the
- * time slice counter. Note: we do not update a thread's
- * priority until it either goes to sleep or uses up its
- * timeslice. This makes it possible for interactive tasks
- * to use up their timeslices at their highest priority levels.
- */
- if (unlikely(rt_task(p))) {
- /*
- * RR tasks need a special form of timeslice management.
- * FIFO tasks have no timeslices.
- */
- if ((p->policy == SCHED_RR) && !--p->time_slice) {
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
- set_tsk_need_resched(p);
-
- /* put it at the end of the queue: */
- dequeue_task(p, rq->active);
- enqueue_task(p, rq->active);
- }
+
+ // SCHED_FIFO tasks never run out of timeslice.
+ if (unlikely(p->policy == SCHED_FIFO))
+ goto out_unlock;
+ // Tasks lose a deadline rank each time they use up a full slice().
+ if (!--p->slice) {
+ set_tsk_need_resched(p);
+ dequeue_task(p, rq);
+ dec_deadline(p);
+ p->slice = slice(p);
+ p->prio = effective_prio(p);
+ p->time_slice = RR_INTERVAL;
+ enqueue_task(p, rq);
+ p->first_time_slice = 0;
goto out_unlock;
}
+ /*
+ * Tasks that run out of time_slice but still have slice left get
+ * requeued with a lower priority && RR_INTERVAL time_slice.
+ */
if (!--p->time_slice) {
- dequeue_task(p, rq->active);
set_tsk_need_resched(p);
+ dequeue_task(p, rq);
p->prio = effective_prio(p);
- p->time_slice = task_timeslice(p);
- p->first_time_slice = 0;
-
- if (!rq->expired_timestamp)
- rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
- enqueue_task(p, rq->expired);
- if (p->static_prio < rq->best_expired_prio)
- rq->best_expired_prio = p->static_prio;
- } else
- enqueue_task(p, rq->active);
- } else {
- /*
- * Prevent a too long timeslice allowing a task to monopolize
- * the CPU. We do this by splitting up the timeslice into
- * smaller pieces.
- *
- * Note: this does not mean the task's timeslices expire or
- * get lost in any way, they just might be preempted by
- * another task of equal priority. (one with higher
- * priority would have preempted this task already.) We
- * requeue this task to the end of the list on this priority
- * level, which is in essence a round-robin of tasks with
- * equal priority.
- *
- * This only applies to tasks in the interactive
- * delta range with at least TIMESLICE_GRANULARITY to requeue.
- */
- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
- p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
- (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
- (p->array == rq->active)) {
-
- dequeue_task(p, rq->active);
- set_tsk_need_resched(p);
- p->prio = effective_prio(p);
- enqueue_task(p, rq->active);
- }
+ p->time_slice = RR_INTERVAL;
+ enqueue_task(p, rq);
+ goto out_unlock;
}
out_unlock:
spin_unlock(&rq->lock);
@@ -2131,8 +1896,8 @@ static inline int dependent_sleeper(int
* task from using an unfair proportion of the
* physical cpu's resources. -ck
*/
- if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(p) || rt_task(smt_curr)) &&
+ if (((smt_curr->slice * (100 - sd->per_cpu_gain) / 100) >
+ slice(p) || rt_task(smt_curr)) &&
p->mm && smt_curr->mm && !rt_task(p))
ret = 1;

@@ -2141,8 +1906,8 @@ static inline int dependent_sleeper(int
* or wake it up if it has been put to sleep for priority
* reasons.
*/
- if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(smt_curr) || rt_task(p)) &&
+ if ((((p->slice * (100 - sd->per_cpu_gain) / 100) >
+ slice(smt_curr) || rt_task(p)) &&
smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
(smt_curr == smt_rq->idle && smt_rq->nr_running))
resched_task(smt_curr);
@@ -2168,10 +1933,9 @@ asmlinkage void __sched schedule(void)
long *switch_count;
task_t *prev, *next;
runqueue_t *rq;
- prio_array_t *array;
+ prio_array_t* array;
struct list_head *queue;
unsigned long long now;
- unsigned long run_time;
int cpu, idx;

/*
@@ -2193,19 +1957,8 @@ need_resched:

release_kernel_lock(prev);
now = sched_clock();
- if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
- run_time = now - prev->timestamp;
- else
- run_time = NS_MAX_SLEEP_AVG;
-
- /*
- * Tasks with interactive credits get charged less run_time
- * at high sleep_avg to delay them losing their interactive
- * status
- */
- if (HIGH_CREDIT(prev))
- run_time /= (CURRENT_BONUS(prev) ? : 1);

+ prev->runtime = now - prev->timestamp;
spin_lock_irq(&rq->lock);

/*
@@ -2227,56 +1980,22 @@ need_resched:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
- rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu, rq);
goto switch_tasks;
}
}

- array = rq->active;
- if (unlikely(!array->nr_active)) {
- /*
- * Switch the active and expired arrays.
- */
- rq->active = rq->expired;
- rq->expired = array;
- array = rq->active;
- rq->expired_timestamp = 0;
- rq->best_expired_prio = MAX_PRIO;
- }
-
+ array = &rq->array;
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);

- if (dependent_sleeper(cpu, rq, next)) {
+ if (dependent_sleeper(cpu, rq, next))
next = rq->idle;
- goto switch_tasks;
- }
-
- if (!rt_task(next) && next->activated > 0) {
- unsigned long long delta = now - next->timestamp;
-
- if (next->activated == 1)
- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
- array = next->array;
- dequeue_task(next, array);
- recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
- }
- next->activated = 0;
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;
-
- prev->sleep_avg -= run_time;
- if ((long)prev->sleep_avg <= 0) {
- prev->sleep_avg = 0;
- if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
- prev->interactive_credit--;
- }
prev->timestamp = now;

if (likely(prev != next)) {
@@ -2542,7 +2261,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
void set_user_nice(task_t *p, long nice)
{
unsigned long flags;
- prio_array_t *array;
+ prio_array_t* array;
runqueue_t *rq;
int old_prio, new_prio, delta;

@@ -2565,7 +2284,7 @@ void set_user_nice(task_t *p, long nice)
}
array = p->array;
if (array)
- dequeue_task(p, array);
+ dequeue_task(p, rq);

old_prio = p->prio;
new_prio = NICE_TO_PRIO(nice);
@@ -2574,7 +2293,7 @@ void set_user_nice(task_t *p, long nice)
p->prio += delta;

if (array) {
- enqueue_task(p, array);
+ enqueue_task(p, rq);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -2696,7 +2415,7 @@ static int setscheduler(pid_t pid, int p
struct sched_param lp;
int retval = -EINVAL;
int oldprio;
- prio_array_t *array;
+ prio_array_t* array;
unsigned long flags;
runqueue_t *rq;
task_t *p;
@@ -2965,28 +2684,17 @@ out_unlock:
/**
* sys_sched_yield - yield the current processor to other threads.
*
- * this function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
* CPU then this function will return.
*/
asmlinkage long sys_sched_yield(void)
{
runqueue_t *rq = this_rq_lock();
- prio_array_t *array = current->array;
- prio_array_t *target = rq->expired;

- /*
- * We implement yielding by moving the task into the expired
- * queue.
- *
- * (special rule: RT tasks will just roundrobin in the active
- * array.)
- */
- if (unlikely(rt_task(current)))
- target = rq->active;
-
- dequeue_task(current, array);
- enqueue_task(current, target);
+ dequeue_task(current, rq);
+ current->slice = slice(current);
+ current->time_slice = current->slice;
+ current->prio = effective_prio(current);
+ enqueue_task(current, rq);

/*
* Since we are going to call schedule() anyway, there's
@@ -3125,7 +2833,7 @@ long sys_sched_rr_get_interval(pid_t pid
goto out_unlock;

jiffies_to_timespec(p->policy & SCHED_FIFO ?
- 0 : task_timeslice(p), &t);
+ 0 : slice(p), &t);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:
@@ -3246,6 +2954,7 @@ void __devinit init_idle(task_t *idle, i
idle->array = NULL;
idle->prio = MAX_PRIO;
idle->state = TASK_RUNNING;
+ idle->deadline = 0;
set_task_cpu(idle, cpu);
double_rq_unlock(idle_rq, rq);
set_tsk_need_resched(idle);
@@ -3878,7 +3587,7 @@ int in_sched_functions(unsigned long add
void __init sched_init(void)
{
runqueue_t *rq;
- int i, j, k;
+ int i, j;

#ifdef CONFIG_SMP
/* Set up an initial dummy domain for early boot */
@@ -3899,13 +3608,10 @@ void __init sched_init(void)
#endif

for (i = 0; i < NR_CPUS; i++) {
- prio_array_t *array;
+ prio_array_t* array;

rq = cpu_rq(i);
spin_lock_init(&rq->lock);
- rq->active = rq->arrays;
- rq->expired = rq->arrays + 1;
- rq->best_expired_prio = MAX_PRIO;

#ifdef CONFIG_SMP
rq->sd = &sched_domain_init;
@@ -3916,16 +3622,14 @@ void __init sched_init(void)
INIT_LIST_HEAD(&rq->migration_queue);
#endif
atomic_set(&rq->nr_iowait, 0);
+ array = &rq->array;

- for (j = 0; j < 2; j++) {
- array = rq->arrays + j;
- for (k = 0; k < MAX_PRIO; k++) {
- INIT_LIST_HEAD(array->queue + k);
- __clear_bit(k, array->bitmap);
- }
- // delimiter for bitsearch
- __set_bit(MAX_PRIO, array->bitmap);
+ for (j = 0; j <= MAX_PRIO; j++) {
+ INIT_LIST_HEAD(array->queue + j);
+ __clear_bit(j, array->bitmap);
}
+ // delimiter for bitsearch
+ __set_bit(MAX_PRIO, array->bitmap);
}
/*
* We have to do a little magic to get the first
diff -Naurp --exclude-from=dontdiff linux-2.6.7-rc2-base/kernel/sysctl.c linux-2.6.7-rc2-s6.3/kernel/sysctl.c
--- linux-2.6.7-rc2-base/kernel/sysctl.c 2004-05-31 21:29:24.000000000 +1000
+++ linux-2.6.7-rc2-s6.3/kernel/sysctl.c 2004-06-07 00:09:21.513441965 +1000
@@ -636,6 +636,22 @@ static ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = KERN_INTERACTIVE,
+ .procname = "interactive",
+ .data = &interactive,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = KERN_COMPUTE,
+ .procname = "compute",
+ .data = &compute,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};