NUMA scheduler: smoothed node loads and switchable node-balance constants.

1. find_busiest_node() ranks nodes by a decaying load,
   load_t = load_{t-1}/2 + nr_running_t, remembered per runqueue in
   prev_node_load[], instead of the instantaneous node_nr_running[] value.
2. NODE_BALANCE_RATIO is replaced by NODE_BALANCE_MIN / NODE_BALANCE_MAX;
   internode_lb[node] records which of the two currently applies to a node
   (the code reading internode_lb[] is not part of the hunks below).
3. The task-pulling loop at skip_queue: divides the remaining imbalance by
   remote_steal_factor(), which is 2 when the busiest runqueue's current
   task is on a different node than the calling CPU.

diff -urNp 2.5.58-ms-ilb-nb/kernel/sched.c 2.5.58-ms-ilb-nb-sm-var/kernel/sched.c
--- 2.5.58-ms-ilb-nb/kernel/sched.c	2003-01-14 17:10:37.000000000 +0100
+++ 2.5.58-ms-ilb-nb-sm-var/kernel/sched.c	2003-01-14 17:12:35.000000000 +0100
@@ -67,8 +67,9 @@
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(2*HZ)
 #define STARVATION_LIMIT	(2*HZ)
-#define NODE_BALANCE_RATIO	10
 #define NODE_THRESHOLD          125
+#define NODE_BALANCE_MIN	10
+#define NODE_BALANCE_MAX	40
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -158,6 +159,7 @@ struct runqueue {
 #ifdef CONFIG_NUMA
 	atomic_t * node_ptr;
 	unsigned int nr_balanced;
+	int prev_node_load[MAX_NUMNODES];
 #endif
 	task_t *migration_thread;
 	struct list_head migration_queue;
@@ -185,6 +187,8 @@ static struct runqueue runqueues[NR_CPUS
 #if CONFIG_NUMA
 static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
 	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+static int internode_lb[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
+	{[0 ...MAX_NUMNODES-1] = NODE_BALANCE_MAX};
 
 static inline void nr_running_inc(runqueue_t *rq)
 {
@@ -725,25 +729,65 @@ void sched_balance_exec(void)
 	}
 }
 
+/*
+ * Find the busiest node. All previous node loads contribute with a
+ * geometrically decaying weight to the load measure:
+ *	load_{t} = load_{t-1}/2 + nr_node_running_{t}
+ * This way sudden load peaks are flattened out a bit.
+ *
+ * As a side effect internode_lb[this_node] is switched to
+ * NODE_BALANCE_MIN while this node's load is below half the average
+ * node load, and back to NODE_BALANCE_MAX otherwise.
+ * Returns the busiest node, or -1 if no node qualifies.
+ */
 static int find_busiest_node(int this_node)
 {
 	int i, node = -1, load, this_load, maxload;
+	int avg_load;
 
-	this_load = maxload = atomic_read(&node_nr_running[this_node]);
+	this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
+		+ atomic_read(&node_nr_running[this_node]);
+	this_rq()->prev_node_load[this_node] = this_load;
+	avg_load = this_load;
 	for (i = 0; i < numnodes; i++) {
 		if (i == this_node)
 			continue;
-		load = atomic_read(&node_nr_running[i]);
+		load = (this_rq()->prev_node_load[i] >> 1)
+			+ atomic_read(&node_nr_running[i]);
+		avg_load += load;
+		this_rq()->prev_node_load[i] = load;
 		if (load > maxload &&
 		    (100*load > ((NODE_THRESHOLD*100*this_load)/100))) {
 			maxload = load;
 			node = i;
 		}
 	}
-	if (maxload <= 4)
+	avg_load = avg_load / (numnodes ? numnodes : 1);
+	if (this_load < (avg_load / 2)) {
+		if (internode_lb[this_node] == NODE_BALANCE_MAX)
+			internode_lb[this_node] = NODE_BALANCE_MIN;
+	} else if (internode_lb[this_node] == NODE_BALANCE_MIN) {
+		internode_lb[this_node] = NODE_BALANCE_MAX;
+	}
+	/* 4+2+1 is the decayed load a node settles at with 4 running tasks */
+	if (maxload <= 4+2+1 || this_load >= avg_load)
 		node = -1;
 	return node;
 }
+
+/*
+ * Divisor applied to the remaining imbalance while pulling tasks from
+ * rq: 1 if the task currently running on rq is on the same node as the
+ * calling CPU, 2 if it is on another node.
+ */
+static inline int remote_steal_factor(runqueue_t *rq)
+{
+	int node = __cpu_to_node(task_cpu(rq->curr));
+
+	return (node == __cpu_to_node(smp_processor_id())) ? 1 : 2;
+}
+#else
+#define remote_steal_factor(rq) 1
 #endif /* CONFIG_NUMA */
 
 #if CONFIG_SMP
@@ -956,7 +1000,7 @@ skip_queue:
 		goto skip_bitmap;
 	}
 	pull_task(busiest, array, tmp, this_rq, this_cpu);
-	if (!idle && --imbalance) {
+	if (!idle && ((--imbalance)/remote_steal_factor(busiest))) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;