QNX-style scheduling v1.06

Joerg Lehrke (Joerg.Lehrke@ecrc.de)
Sat, 16 Aug 1997 17:36:26 +0200


Hi!

I have ported Adam McKee's QNX-style scheduler v1.06 to 2.1.50.
It has some nice features that are worth drawing attention to: the single
run-queue is replaced by 32 run-queues with multi-level feedback
(SCHED_OTHER becomes SCHED_ADAPTIVE), a task that uses up its whole
timeslice is demoted one queue, a task that has slept is promoted back up,
and children of processes that fork heavily start one queue lower.
Documentation can be found in Adam's README accompanying the original
patch against 2.1.31-pre*.
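
For a feel of the mechanism without digging through the diff: the
scheduler keeps one circular run-queue per priority level (32 of them)
and always serves the best non-empty queue, round-robin within it.
Demotion (a task that burns its whole timeslice drops one queue),
promotion (roughly one queue up per 100 ms slept, plus a maintenance
pass every 200 ms that lifts starving tasks), and the usual goodness()
check are layered on top of that.  Here is a rough user-space sketch of
the selection idea only -- struct task, enqueue() and pick_next() are my
names, not the patch's:

#include <stdio.h>
#include <stddef.h>

#define NR_RUN_QUEUES 32        /* one circular list per priority level */
#define DEF_RUN_QUEUE 15        /* where a nice-0 task normally lives */

struct task {
        const char *comm;       /* name, only for the demo printout */
        struct task *next_run;  /* circular list inside one run-queue */
        unsigned int run_q;     /* current queue: 0 is best, 31 is worst */
};

static struct task *run_q[NR_RUN_QUEUES];

/* Append t to its queue; run_q[q] points at the "tail" of the circle. */
static void enqueue(struct task *t)
{
        if (run_q[t->run_q] == NULL) {
                t->next_run = t;
        } else {
                t->next_run = run_q[t->run_q]->next_run;
                run_q[t->run_q]->next_run = t;
        }
        run_q[t->run_q] = t;
}

/* Scan the queues from best to worst and take the first non-empty one,
   rotating it so that equal-priority tasks share the CPU round-robin. */
static struct task *pick_next(void)
{
        unsigned int q;

        for (q = 0; q < NR_RUN_QUEUES; q++) {
                if (run_q[q] == NULL)
                        continue;
                run_q[q] = run_q[q]->next_run;  /* rotate the queue */
                return run_q[q];
        }
        return NULL;    /* nothing runnable: the idle task would run */
}

int main(void)
{
        struct task a = { "editor", NULL, DEF_RUN_QUEUE - 3 };
        struct task b = { "cc1", NULL, DEF_RUN_QUEUE };
        struct task c = { "make", NULL, DEF_RUN_QUEUE };
        int i;

        enqueue(&a); enqueue(&b); enqueue(&c);

        /* The better-placed task wins until it sleeps or gets demoted. */
        for (i = 0; i < 4; i++)
                printf("run %s\n", pick_next()->comm);
        return 0;
}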

-- 
  ``---         Jörg Lehrke                             Tel. +49 89 926 99185
     ||_        European Computer Research Centre       email: jlehrke@ECRC.de
ECRC || |@  ,,  Arabellastr 17, 81925 Munich, Germany   Yes, I use PGP.
BUSINESS-NET    Protect your freedom!  http://www.fsf.org/

[Attachment: QNX-scheduler-2.1.50.patch]

diff -u --recursive --new-file linux-2.1.50/arch/i386/kernel/smp.c linux/arch/i386/kernel/smp.c
--- linux-2.1.50/arch/i386/kernel/smp.c Sat Aug 16 19:14:08 1997
+++ linux/arch/i386/kernel/smp.c Sat Aug 16 17:27:38 1997
@@ -1353,7 +1353,7 @@
                                 p->counter = 0;
                                 need_resched = 1;
                         }
-                        if (p->priority < DEF_PRIORITY)
+                        if (p->run_q > DEF_RUN_QUEUE)
                                 kstat.cpu_nice += user;
                         else
                                 kstat.cpu_user += user;
diff -u --recursive --new-file linux-2.1.50/fs/proc/array.c linux/fs/proc/array.c
--- linux-2.1.50/fs/proc/array.c Sat Aug 16 19:14:14 1997
+++ linux/fs/proc/array.c Sat Aug 16 17:27:38 1997
@@ -722,14 +722,36 @@
         else
                 tty_pgrp = -1;
 
-        /* scale priority and nice values from timeslices to -20..20 */
+        /* scale priority and nice values from run-queue # to -20..20 */
         /* to make it look like a "normal" unix priority/nice value */
-        priority = tsk->counter;
-        priority = 20 - (priority * 10 + DEF_PRIORITY / 2) / DEF_PRIORITY;
-        nice = tsk->priority;
-        nice = 20 - (nice * 20 + DEF_PRIORITY / 2) / DEF_PRIORITY;
+        priority = tsk->run_q;
+        if (priority < DEF_RUN_QUEUE) {
+                priority *= 19;
+                priority /= (DEF_RUN_QUEUE - 1);
+                priority = -20 + priority;
+        } else if (priority == DEF_RUN_QUEUE) {
+                priority = 0;
+        } else {
+                priority -= (DEF_RUN_QUEUE + 1);
+                priority *= 19;
+                priority /= (NR_RUN_QUEUES - DEF_RUN_QUEUE - 2);
+                priority += 1;
+        }
+        nice = tsk->run_q_min;
+        if (nice < DEF_RUN_QUEUE) {
+                nice *= 19;
+                nice /= (DEF_RUN_QUEUE - 1);
+                nice = -20 + nice;
+        } else if (nice == DEF_RUN_QUEUE) {
+                nice = 0;
+        } else {
+                nice -= (DEF_RUN_QUEUE + 1);
+                nice *= 19;
+                nice /= (NR_RUN_QUEUES - DEF_RUN_QUEUE - 2);
+                nice += 1;
+        }
 
-        return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+        return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu\n",
                 pid,
diff -u --recursive --new-file linux-2.1.50/include/linux/sched.h linux/include/linux/sched.h
--- linux-2.1.50/include/linux/sched.h Sat Aug 16 19:14:23 1997
+++ linux/include/linux/sched.h Sat Aug 16 18:38:44 1997
@@ -78,10 +78,13 @@
 #define TASK_STOPPED 4
 #define TASK_SWAPPING 5
 
+#define NR_RUN_QUEUES 32 /* the scheduler uses 32 run-queues */
+#define DEF_RUN_QUEUE 15 /* default run-queue */
+
 /*
  * Scheduling policies
  */
-#define SCHED_OTHER 0
+#define SCHED_ADAPTIVE 0
 #define SCHED_FIFO 1
 #define SCHED_RR 2
 
@@ -256,6 +259,9 @@
         struct mm_struct *mm;
 /* signal handlers */
         struct signal_struct *sig;
+/* QNX-style scheduler */
+        unsigned long last_run, nr_forks;
+        unsigned int run_q, run_q_min, run_q_sw;
 /* SMP state */
         int has_cpu;
         int processor;
@@ -320,7 +326,7 @@
 /* tarray */ &task[0], \
 /* chld wait */ NULL, \
 /* uid etc */ 0,0,0,0,0,0,0,0, \
-/* timeout */ 0,SCHED_OTHER,0,0,0,0,0,0,0, \
+/* timeout */ 0,SCHED_ADAPTIVE,0,0,0,0,0,0,0, \
 /* timer */ { NULL, NULL, 0, 0, it_real_fn }, \
 /* utime */ {0,0,0,0},0, \
 /* flt */ 0,0,0,0,0,0, \
@@ -336,6 +342,7 @@
 /* files */ &init_files, \
 /* mm */ &init_mm, \
 /* signals */ &init_signals, \
+/* QNX sched */ 0, 0, DEF_RUN_QUEUE, DEF_RUN_QUEUE, NR_RUN_QUEUES, \
 /* SMP */ 0,0,0,0, \
 /* locks */ INIT_LOCKS \
 }
diff -u --recursive --new-file linux-2.1.50/kernel/fork.c linux/kernel/fork.c
--- linux-2.1.50/kernel/fork.c Sat Aug 16 19:15:08 1997
+++ linux/kernel/fork.c Sat Aug 16 17:27:38 1997
@@ -431,6 +431,22 @@
 #endif
         p->lock_depth = 0;
         p->start_time = jiffies;
+        current->nr_forks++;
+        p->nr_forks = 0;
+        /*
+         * If a process forks a lot, its children will start out on a higher
+         * run-queue.  Perhaps useful for defending fork-bombs, and helps the
+         * scheduler to deal with tasks like "make".
+         */
+        if ((p->policy == SCHED_ADAPTIVE) &&
+            (p->run_q < NR_RUN_QUEUES - 1)) {
+                unsigned long t = current->times.tms_utime +
+                        current->times.tms_stime;
+                t /= 15*HZ;
+                if (!t || (current->nr_forks / t))
+                        p->run_q++;
+        }
+        p->run_q_sw = NR_RUN_QUEUES;
         p->tarray_ptr = &task[nr];
         *p->tarray_ptr = p;
         SET_LINKS(p);
diff -u --recursive --new-file linux-2.1.50/kernel/sched.c linux/kernel/sched.c
--- linux-2.1.50/kernel/sched.c Sat Aug 16 19:14:24 1997
+++ linux/kernel/sched.c Sat Aug 16 18:27:59 1997
@@ -7,6 +7,7 @@
  * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
  *            make semaphores SMP safe
  * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
+ * 1997-08-01 Modified by Adam McKee to use multi-level feedback scheduling.
  */
 
 /*
@@ -57,6 +58,8 @@
 DECLARE_TASK_QUEUE(tq_immediate);
 DECLARE_TASK_QUEUE(tq_scheduler);
 
+static struct task_struct * run_q[NR_RUN_QUEUES];
+
 /*
  * phase-lock loop variables
  */
@@ -104,40 +107,95 @@
 static inline void add_to_runqueue(struct task_struct * p)
 {
-        if (p->counter > current->counter + 3)
+        struct task_struct *q_head;
+        unsigned long t;
+
+        /* based on how long p slept, reset p->run_q */
+        if (p->run_q > p->run_q_min) {
+                if (jiffies < p->last_run)
+                        t = ULONG_MAX - p->last_run + jiffies;
+                else
+                        t = jiffies - p->last_run;
+                t /= (100*HZ/1000);
+                if (t <= p->run_q - p->run_q_min)
+                        p->run_q -= t;
+                else
+                        p->run_q = p->run_q_min;
+        }
+
+        q_head = run_q[p->run_q];
+        if ((p->run_q < current->run_q) ||
+            ((p->run_q == current->run_q) &&
+             (p->counter > current->counter + 3)))
                 need_resched = 1;
         nr_running++;
-        (p->prev_run = init_task.prev_run)->next_run = p;
-        p->next_run = &init_task;
-        init_task.prev_run = p;
+
+        if (q_head == NULL) {
+                run_q[p->run_q] = p;
+                p->prev_run = p;
+                p->next_run = p;
+        } else {
+                p->prev_run = q_head->prev_run;
+                p->next_run = q_head;
+                q_head->prev_run->next_run = p;
+                q_head->prev_run = p;
+        }
+        p->last_run = jiffies;
 }
 
 static inline void del_from_runqueue(struct task_struct * p)
 {
         struct task_struct *next = p->next_run;
         struct task_struct *prev = p->prev_run;
+        struct task_struct *q_head = run_q[p->run_q];
 
         nr_running--;
         next->prev_run = prev;
         prev->next_run = next;
         p->next_run = NULL;
         p->prev_run = NULL;
+
+        /* if p was the q_head, reset q_head */
+        if (p == q_head) {
+                if (next == p)
+                        run_q[p->run_q] = NULL;
+                else
+                        run_q[p->run_q] = next;
+        }
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+static inline void switch_runqueues(struct task_struct * p, unsigned int new_run_q)
 {
         struct task_struct *next = p->next_run;
         struct task_struct *prev = p->prev_run;
+        struct task_struct *q_head = run_q[p->run_q];
 
         /* remove from list */
         next->prev_run = prev;
         prev->next_run = next;
-        /* add back to list */
-        p->next_run = &init_task;
-        prev = init_task.prev_run;
-        init_task.prev_run = p;
-        p->prev_run = prev;
-        prev->next_run = p;
+
+        /* if p was the q_head, reset q_head */
+        if (p == q_head) {
+                if (next == p)
+                        run_q[p->run_q] = NULL;
+                else
+                        run_q[p->run_q] = next;
+        }
+
+        p->run_q = new_run_q;
+        q_head = run_q[p->run_q];
+        if (q_head == NULL) {
+                run_q[p->run_q] = p;
+                p->prev_run = p;
+                p->next_run = p;
+        } else {
+                p->prev_run = q_head->prev_run;
+                p->next_run = q_head;
+                q_head->prev_run->next_run = p;
+                q_head->prev_run = p;
+        }
+        p->last_run = jiffies;
+        p->counter = DEF_PRIORITY;
 }
 
 #ifdef __SMP__
@@ -202,12 +260,14 @@
         int weight;
 
         /*
-         * Realtime process, select the first one on the
-         * runqueue (taking priorities within processes
-         * into account).
+         * Special treatment of SCHED_FIFO and SCHED_RR
          */
-        if (p->policy != SCHED_OTHER)
-                return 1000 + p->rt_priority;
+        if (p->policy != SCHED_ADAPTIVE) {
+                if (p->policy == SCHED_FIFO || p->counter)
+                        return 1000 + p->rt_priority;
+                else
+                        return 0;
+        }
 
         /*
          * Give the process a first-approximation goodness value
@@ -364,10 +424,8 @@
 #endif
 
 /*
- * 'schedule()' is the scheduler function. It's a very simple and nice
- * scheduler: it's not perfect, but certainly works for most things.
- *
- * The goto is "interesting".
+ * 'schedule()' implements QNX-style multi-level feedback scheduling.
+ * It does run-queue maintenance and selects a process to run.
  *
  * NOTE!! Task 0 is the 'idle' task, which gets called when no other
  * tasks can run. It can not be killed, and it cannot sleep. The 'state'
@@ -376,6 +434,7 @@
 asmlinkage void schedule(void)
 {
         int lock_depth;
+        static unsigned long next_maintenance = 0;
         struct task_struct * prev, * next;
         unsigned long timeout;
         int this_cpu;
@@ -394,8 +453,8 @@
         /* move an exhausted RR process to be last.. */
         if (!prev->counter && prev->policy == SCHED_RR) {
-                prev->counter = prev->priority;
-                move_last_runqueue(prev);
+                prev->counter = DEF_PRIORITY;
+                run_q[prev->run_q] = prev->next_run;
         }
         timeout = 0;
         switch (prev->state) {
@@ -412,53 +471,87 @@
                         }
                 default:
                         del_from_runqueue(prev);
+                        prev->counter = DEF_PRIORITY;
+                        prev->last_run = jiffies;
                 case TASK_RUNNING:
         }
         {
-                struct task_struct * p = init_task.next_run;
+                struct task_struct * p;
                 /*
-                 * This is subtle.
-                 * Note how we can enable interrupts here, even
-                 * though interrupts can add processes to the run-
-                 * queue. This is because any new processes will
-                 * be added to the front of the queue, so "p" above
-                 * is a safe starting point.
-                 * run-queue deletion and re-ordering is protected by
-                 * the scheduler lock
+                 * Take care of re-niced processes, promote starving processes
                  */
-                spin_unlock_irq(&runqueue_lock);
+                if (jiffies >= next_maintenance) {
+                        for_each_task(p) {
+                                if (p->run_q_sw < NR_RUN_QUEUES) {
+                                        if (p->next_run) {
+                                                p->run_q_min = p->run_q_sw;
+                                                if (p->run_q != p->run_q_sw)
+                                                        switch_runqueues(p, p->run_q_sw);
+                                        } else
+                                                p->run_q_min = p->run_q = p->run_q_sw;
+                                        p->run_q_sw = NR_RUN_QUEUES;
+                                } else {
+                                        unsigned long t;
+                                        if (jiffies < p->last_run)
+                                                t = ULONG_MAX - p->last_run + jiffies;
+                                        else
+                                                t = jiffies - p->last_run;
+                                        if (p->next_run && (t >= HZ) &&
+                                            (p->run_q > p->run_q_min + 1) &&
+                                            prev->pid)
+                                                switch_runqueues(p, p->run_q - 1);
+                                }
+                        }
+                        next_maintenance = jiffies + (200*HZ/1000);
+                }
+
 #ifdef __SMP__
                 prev->has_cpu = 0;
 #endif
-/*
- * Note! there may appear new tasks on the run-queue during this, as
- * interrupts are enabled. However, they will be put on front of the
- * list, so our list starting at "p" is essentially fixed.
- */
 /* this is the scheduler proper: */
                 {
-                        int c = -1000;
+                        int q, c = -1000;
                         next = idle_task;
-                        while (p != &init_task) {
-                                if (can_schedule(p)) {
+                        for (q = 0; q < NR_RUN_QUEUES; q++) {
+                                p = run_q[q];
+                                if (!p) continue;
+                                if (!p->pid) {
+                                        p = p->next_run;
+                                        if (!p->pid) continue;
+                                }
+                                do {
                                         int weight = goodness(p, prev, this_cpu);
-                                        if (weight > c)
-                                                c = weight, next = p;
+                                        if (weight > c) {
+                                                c = weight;
+                                                next = p;
+                                        }
+                                        p = p->next_run;
+                                } while (p != run_q[q]);
+                                if (!c) {
+                                        p = run_q[q];
+                                        do {
+                                                p->counter = DEF_PRIORITY;
+                                                p = p->next_run;
+                                        } while (p != run_q[q]);
                                 }
-                                p = p->next_run;
+                                if (c > -1000) break;
                         }
+                        next->last_run = jiffies;
 
-                        /* Do we need to re-calculate counters? */
-                        if (!c) {
-                                struct task_struct *p;
-                                read_lock(&tasklist_lock);
-                                for_each_task(p)
-                                        p->counter = (p->counter >> 1) + p->priority;
-                                read_unlock(&tasklist_lock);
-                        }
+                        /* Possibly demote the previous task */
+                        if (prev->pid && next->pid &&
+                            (next != prev) &&
+                            prev->next_run &&
+                            !prev->counter &&
+                            (q == prev->run_q) &&
+                            (q < NR_RUN_QUEUES - 1) &&
+                            (prev->policy == SCHED_ADAPTIVE))
+                                switch_runqueues(prev, prev->run_q + 1);
+                }
         }
-        }
+
+        spin_unlock_irq(&runqueue_lock);
 
 #ifdef __SMP__
         next->has_cpu = 1;
@@ -1306,17 +1399,17 @@
         if (policy < 0)
                 policy = p->policy;
         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                 policy != SCHED_OTHER)
+                 policy != SCHED_ADAPTIVE)
                 return -EINVAL;
 
         /*
          * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid
-         * priority for SCHED_OTHER is 0.
+         * priority for SCHED_ADAPTIVE is 0.
          */
         if (lp.sched_priority < 0 || lp.sched_priority > 99)
                 return -EINVAL;
-        if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
-                return -EINVAL;
+        if ((policy == SCHED_ADAPTIVE) != (lp.sched_priority == 0))
+                return -EINVAL;
         if ((policy == SCHED_FIFO || policy == SCHED_RR) && !suser())
                 return -EPERM;
@@ -1326,10 +1419,11 @@
         p->policy = policy;
         p->rt_priority = lp.sched_priority;
+        p->run_q_sw = p->run_q_min;
         spin_lock(&scheduler_lock);
         spin_lock_irq(&runqueue_lock);
         if (p->next_run)
-                move_last_runqueue(p);
+                run_q[p->run_q] = p->next_run;
         spin_unlock_irq(&runqueue_lock);
         spin_unlock(&scheduler_lock);
         need_resched = 1;
@@ -1386,7 +1480,7 @@
         current->counter = 0;
         spin_lock(&scheduler_lock);
         spin_lock_irq(&runqueue_lock);
-        move_last_runqueue(current);
+        run_q[current->run_q] = current->next_run;
         spin_unlock_irq(&runqueue_lock);
         spin_unlock(&scheduler_lock);
         need_resched = 1;
@@ -1402,7 +1496,7 @@
         case SCHED_RR:
                 ret = 99;
                 break;
-        case SCHED_OTHER:
+        case SCHED_ADAPTIVE:
                 ret = 0;
                 break;
         }
@@ -1418,7 +1512,7 @@
         case SCHED_RR:
                 ret = 1;
                 break;
-        case SCHED_OTHER:
+        case SCHED_ADAPTIVE:
                 ret = 0;
         }
         return ret;
@@ -1470,7 +1564,7 @@
         if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
-            current->policy != SCHED_OTHER)
+            current->policy != SCHED_ADAPTIVE)
         {
                 /*
                  * Short delay requests up to 2 ms will be handled with
@@ -1568,7 +1662,7 @@
          *      process right in SMP mode.
          */
         int cpu=hard_smp_processor_id();
-        int nr = NR_TASKS;
+        int q, nr = NR_TASKS;
 
         init_task.processor=cpu;
@@ -1582,4 +1676,9 @@
         init_bh(TIMER_BH, timer_bh);
         init_bh(TQUEUE_BH, tqueue_bh);
         init_bh(IMMEDIATE_BH, immediate_bh);
+
+        for (q = 0; q < NR_RUN_QUEUES; q++)
+                run_q[q] = NULL;
+        run_q[DEF_RUN_QUEUE] = &init_task;
+        printk("QNX-style scheduling v1.06 <amckee@poboxes.com>\n");
 }
diff -u --recursive --new-file linux-2.1.50/kernel/sys.c linux/kernel/sys.c
--- linux-2.1.50/kernel/sys.c Mon Jul 14 06:20:11 1997
+++ linux/kernel/sys.c Sat Aug 16 17:27:39 1997
@@ -88,25 +88,29 @@
 asmlinkage int sys_setpriority(int which, int who, int niceval)
 {
         struct task_struct *p;
-        unsigned int priority;
+        unsigned int run_q;
         int error;
 
         if (which > 2 || which < 0)
                 return -EINVAL;
 
-        /* normalize: avoid signed division (rounding problems) */
         error = ESRCH;
-        priority = niceval;
-        if (niceval < 0)
-                priority = -niceval;
-        if (priority > 20)
-                priority = 20;
-        priority = (priority * DEF_PRIORITY + 10) / 20 + DEF_PRIORITY;
-
-        if (niceval >= 0) {
-                priority = 2*DEF_PRIORITY - priority;
-                if (!priority)
-                        priority = 1;
+        if (niceval < -20)
+                niceval = -20;
+        else if (niceval > 20)
+                niceval = 20;
+        if (niceval < 0) {
+                run_q = -niceval - 1;
+                run_q *= (DEF_RUN_QUEUE - 1);
+                run_q /= 19;
+                run_q = DEF_RUN_QUEUE - 1 - run_q;
+        } else if (!niceval) {
+                run_q = DEF_RUN_QUEUE;
+        } else {
+                run_q = niceval - 1;
+                run_q *= (NR_RUN_QUEUES - DEF_RUN_QUEUE - 2);
+                run_q /= 19;
+                run_q = DEF_RUN_QUEUE + 1 + run_q;
         }
 
         read_lock(&tasklist_lock);
@@ -120,10 +124,10 @@
                 }
                 if (error == ESRCH)
                         error = 0;
-                if (priority > p->priority && !suser())
-                        error = EACCES;
+                if (run_q < p->run_q_min && !suser())
+                        error = EACCES;
                 else
-                        p->priority = priority;
+                        p->run_q_sw = run_q;
         }
         read_unlock(&tasklist_lock);
@@ -138,7 +142,7 @@
 asmlinkage int sys_getpriority(int which, int who)
 {
         struct task_struct *p;
-        long max_prio = -ESRCH;
+        unsigned int run_q_min = NR_RUN_QUEUES;
 
         if (which > 2 || which < 0)
                 return -EINVAL;
@@ -147,15 +151,28 @@
         for_each_task (p) {
                 if (!proc_sel(p, which, who))
                         continue;
-                if (p->priority > max_prio)
-                        max_prio = p->priority;
+                if (p->run_q_min < run_q_min)
+                        run_q_min = p->run_q_min;
         }
         read_unlock(&tasklist_lock);
 
-        /* scale the priority from timeslice to 0..40 */
-        if (max_prio > 0)
-                max_prio = (max_prio * 20 + DEF_PRIORITY/2) / DEF_PRIORITY;
-        return max_prio;
+        if (run_q_min == NR_RUN_QUEUES)
+                return -ESRCH;
+
+        /* scale the run_q to 0..40 */
+        if (run_q_min < DEF_RUN_QUEUE) {
+                run_q_min *= 19;
+                run_q_min /= (DEF_RUN_QUEUE - 1);
+        } else if (run_q_min == DEF_RUN_QUEUE) {
+                run_q_min = 20;
+        } else {
+                run_q_min -= (DEF_RUN_QUEUE + 1);
+                run_q_min *= 19;
+                run_q_min /= (NR_RUN_QUEUES - DEF_RUN_QUEUE - 2);
+                run_q_min += 21;
+        }
+        run_q_min = 40 - run_q_min;
+        return run_q_min;
 }
 
 #ifndef __alpha__
diff -u --recursive --new-file linux-2.1.50/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.1.50/mm/vmscan.c Sat Aug 16 19:13:30 1997
+++ linux/mm/vmscan.c Sat Aug 16 17:27:39 1997
@@ -447,6 +447,7 @@
         current->priority = 32; /* Fixme --- we need to standardise our
                                    namings for POSIX.4 realtime scheduling
                                    priorities. */
+        current->run_q_sw = 0;
 
         init_swap_timer();
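
A note on nice values: with this scheduler, setpriority() no longer
scales a timeslice, it selects the starting run-queue.  The following is
the arithmetic from sys_setpriority() in the patch, pulled out into a
small user-space program so the mapping is easy to check (the function
name nice_to_run_q() and the main() loop are mine):

#include <stdio.h>

#define NR_RUN_QUEUES 32
#define DEF_RUN_QUEUE 15

/* nice -> run-queue mapping as done by sys_setpriority() in the patch */
static unsigned int nice_to_run_q(int niceval)
{
        unsigned int run_q;

        if (niceval < -20)
                niceval = -20;
        else if (niceval > 20)
                niceval = 20;
        if (niceval < 0) {
                run_q = -niceval - 1;
                run_q *= (DEF_RUN_QUEUE - 1);
                run_q /= 19;
                run_q = DEF_RUN_QUEUE - 1 - run_q;
        } else if (!niceval) {
                run_q = DEF_RUN_QUEUE;
        } else {
                run_q = niceval - 1;
                run_q *= (NR_RUN_QUEUES - DEF_RUN_QUEUE - 2);
                run_q /= 19;
                run_q = DEF_RUN_QUEUE + 1 + run_q;
        }
        return run_q;
}

int main(void)
{
        int nice;

        /* nice -20..-1 covers queues 0..14, nice 0 stays on queue 15
           (DEF_RUN_QUEUE), nice 1..20 covers queues 16..31; /proc then
           scales the queue number back to the usual -20..20 range. */
        for (nice = -20; nice <= 20; nice += 5)
                printf("nice %3d -> run-queue %2u\n", nice, nice_to_run_q(nice));
        return 0;
}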
