[RFC PATCH 03/10] sched: Select a better task to pull across node using iterations

From: Srikar Dronamraju
Date: Tue Jul 30 2013 - 03:49:24 EST


While selecting a task to pull across a node, try to choose a task that
improves locality, i.e., choose a task that has more affinity to the
destination node.

To achieve this, parse the list of tasks in multiple iterations. For now
choose just two iterations. In the first iteration, a task is chosen to
move if and only if moving such a task helps improve node locality. In
the last iteration, choose the default behaviour, i.e., a task is chosen
irrespective of whether it improves node locality or not (the behaviour
before this change). This iteration logic is only for cross node
migration and with CONFIG_NUMA_BALANCING enabled.

So if there are two tasks in a runq, both eligible to be migrated to
another runq belonging to a different node, then this change tries to
choose a task among the two that improves locality.

Similar logic was first used in Peter Zijlstra's numa core.

Signed-off-by: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx>
---
kernel/sched/fair.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3df7f76..8fcbf96 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,6 +3906,7 @@ struct lb_env {
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
+ unsigned int iterations;
};

/*
@@ -4030,6 +4031,21 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 1;
}

+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * preferred_node - should @p be considered for migration in this pass?
+ *
+ * env->iterations == 0 marks the locality-only first pass: @p qualifies
+ * only when the domain has SD_NUMA set and can_numa_migrate_task()
+ * returns 1 for @p with the destination and source runqueues.
+ *
+ * Any non-zero env->iterations is the final (default) pass, where every
+ * task qualifies. Without this, callers that skip a task whenever this
+ * helper returns false would never move anything in non-NUMA domains or
+ * on the retry pass.
+ */
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+	/* Final pass: do not filter on locality. */
+	if (env->iterations)
+		return true;
+
+	if (!(env->sd->flags & SD_NUMA))
+		return false;
+
+	return (can_numa_migrate_task(p, env->dst_rq, env->src_rq) == 1);
+}
+#else
+/*
+ * No NUMA balancing: there is no locality information to filter on, so
+ * a task qualifies whenever the caller is in its final pass (non-zero
+ * env->iterations, which is what the visible callers initialise it to).
+ */
+static bool preferred_node(struct task_struct *p, struct lb_env *env)
+{
+	return env->iterations != 0;
+}
+#endif
+
/*
* move_one_task tries to move exactly one task from busiest to this_rq, as
* part of active balancing operations within "domain".
@@ -4041,7 +4057,11 @@ static int move_one_task(struct lb_env *env)
{
struct task_struct *p, *n;

+again:
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (!preferred_node(p, env))
+ continue;
+
if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
continue;

@@ -4049,6 +4069,7 @@ static int move_one_task(struct lb_env *env)
continue;

move_task(p, env);
+
/*
* Right now, this is only the second place move_task()
* is called, so we can safely collect move_task()
@@ -4057,6 +4078,9 @@ static int move_one_task(struct lb_env *env)
schedstat_inc(env->sd, lb_gained[env->idle]);
return 1;
}
+ if (!env->iterations++)
+ goto again;
+
return 0;
}

@@ -4096,6 +4120,9 @@ static int move_tasks(struct lb_env *env)
break;
}

+ if (!preferred_node(p, env))
+ goto next;
+
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
goto next;

@@ -5099,6 +5126,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
+ .iterations = 1,
};

cpumask_copy(cpus, cpu_active_mask);
@@ -5130,6 +5158,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
ld_moved = 0;
lb_iterations = 1;
if (busiest->nr_running > 1) {
+#ifdef CONFIG_NUMA_BALANCING
+ if (sd->flags & SD_NUMA) {
+ if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+ env.iterations = 0;
+ }
+#endif
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
@@ -5160,6 +5194,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
goto more_balance;
}

+ if (!ld_moved && !env.iterations++) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ goto more_balance;
+ }
+
/*
* some other cpu did the load balance for us.
*/
@@ -5407,8 +5447,16 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
+ .iterations = 1,
};

+#ifdef CONFIG_NUMA_BALANCING
+ if ((sd->flags & SD_NUMA)) {
+ if (cpu_to_node(env.dst_cpu) != cpu_to_node(env.src_cpu))
+ env.iterations = 0;
+ }
+#endif
+
schedstat_inc(sd, alb_count);

if (move_one_task(&env))
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/