[tip:sched/numa] sched/numa: Provide sysctl knob to disable numa scheduling and turn it off by default

From: tip-bot for Peter Zijlstra
Date: Fri May 18 2012 - 06:48:14 EST


Commit-ID: bcdf5162b92dfc0999b0e0ecf25d778733cc4c4d
Gitweb: http://git.kernel.org/tip/bcdf5162b92dfc0999b0e0ecf25d778733cc4c4d
Author: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
AuthorDate: Thu, 17 May 2012 15:07:31 +0200
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Fri, 18 May 2012 09:48:59 +0200

sched/numa: Provide sysctl knob to disable numa scheduling and turn it off by default

Provide a knob to make all this numa scheduling go away.

Also provide a Kconfig entry to set the default for this new knob.

Requested-by: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Link: http://lkml.kernel.org/n/tip-lz8zudea6tqgbxduk9mcs7x3@xxxxxxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
include/linux/sched.h | 13 +++++++
init/Kconfig | 18 ++++++++++
kernel/sched/numa.c | 89 +++++++++++++++++++++++++++++++++++++++---------
kernel/sysctl.c | 11 ++++++
4 files changed, 114 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 024a5f9..4879103 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -90,6 +90,7 @@ struct sched_param {
#include <linux/latencytop.h>
#include <linux/cred.h>
#include <linux/llist.h>
+#include <linux/jump_label.h>

#include <asm/processor.h>

@@ -1584,9 +1585,14 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

+extern struct static_key sched_numa_disabled;
+
static inline int tsk_home_node(struct task_struct *p)
{
#ifdef CONFIG_NUMA
+ if (static_key_false(&sched_numa_disabled))
+ return -1;
+
return p->node;
#else
return -1;
@@ -2058,6 +2064,13 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif

+#ifdef CONFIG_NUMA
+extern int sysctl_sched_numa;
+int sched_numa_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/init/Kconfig b/init/Kconfig
index e4e84f2..2f6bfc1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -865,6 +865,24 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.

+config SCHED_NUMA_DEFAULT
+ bool "Enable NUMA scheduling by default"
+ depends on NUMA
+ default n
+ help
+ This option selects the default enablement of a scheduler feature
+ that gives each process a home-node and allocates all its memory
+ from there and tries to schedule all the process tasks on that node
+ (or as near to it as possible) while trying to maintain fairness.
+
+ Without this feature memory is allocated on whatever node a task
+ happens to run on and the scheduler is free to migrate tasks around
+ at will -- which can result in significant cross-node memory
+ traffic.
+
+ Regardless of this setting, NUMA scheduling can be enabled or
+ disabled at runtime by writing to /proc/sys/kernel/sched_numa.
+
config MM_OWNER
def_bool NUMA

diff --git a/kernel/sched/numa.c b/kernel/sched/numa.c
index 7b74a15..b98338b 100644
--- a/kernel/sched/numa.c
+++ b/kernel/sched/numa.c
@@ -18,6 +18,9 @@

#include "sched.h"

+struct static_key sched_numa_disabled = STATIC_KEY_INIT_FALSE;
+static DEFINE_MUTEX(sched_numa_mutex);
+int sysctl_sched_numa = IS_ENABLED(CONFIG_SCHED_NUMA_DEFAULT);

static const int numa_balance_interval = 2 * HZ; /* 2 seconds */

@@ -137,7 +140,7 @@ bool account_numa_enqueue(struct task_struct *p)

void account_numa_dequeue(struct task_struct *p)
{
- int home_node = tsk_home_node(p);
+ int home_node = p->node; /* ignore sched_numa_disabled */
struct numa_cpu_load *nl;
struct rq *rq;

@@ -444,7 +447,7 @@ void select_task_node(struct task_struct *p, struct mm_struct *mm, int sd_flags)
{
int node;

- if (!sched_feat(NUMA_SELECT)) {
+ if (!sched_feat(NUMA_SELECT) || !sysctl_sched_numa) {
p->node = -1;
return;
}
@@ -766,13 +769,74 @@ static int numad_thread(void *data)
return 0;
}

+static int numad_create(struct node_queue *nq)
+{
+ struct task_struct *numad;
+
+ if (!sysctl_sched_numa)
+ return 0;
+
+ numad = kthread_create_on_node(numad_thread,
+ nq, nq->node, "numad/%d", nq->node);
+ if (IS_ERR(numad))
+ return PTR_ERR(numad);
+
+ nq->numad = numad;
+ nq->next_schedule = jiffies + HZ;
+
+ return 0;
+}
+
+static void numad_destroy(struct node_queue *nq)
+{
+ kthread_stop(nq->numad);
+ nq->numad = NULL;
+}
+
+int sched_numa_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old, new, ret, node;
+
+ mutex_lock(&sched_numa_mutex);
+ get_online_cpus();
+
+ old = sysctl_sched_numa;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ new = sysctl_sched_numa;
+
+ if (old == new)
+ goto unlock;
+
+ if (new)
+ static_key_slow_dec(&sched_numa_disabled);
+ else
+ static_key_slow_inc(&sched_numa_disabled);
+
+ for_each_online_node(node) {
+ struct node_queue *nq = nq_of(node);
+
+ if (new && !nq->numad) {
+ if (!numad_create(nq))
+ wake_up_process(nq->numad);
+ } else if (!new && nq->numad)
+ numad_destroy(nq);
+ }
+
+unlock:
+ put_online_cpus();
+ mutex_unlock(&sched_numa_mutex);
+
+ return ret;
+}
+
static int __cpuinit
numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu)
{
int cpu = (long)hcpu;
int node = cpu_to_node(cpu);
struct node_queue *nq = nq_of(node);
- struct task_struct *numad;
int err = 0;

switch (action & ~CPU_TASKS_FROZEN) {
@@ -780,19 +844,12 @@ numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu)
if (nq->numad)
break;

- numad = kthread_create_on_node(numad_thread,
- nq, node, "numad/%d", node);
- if (IS_ERR(numad)) {
- err = PTR_ERR(numad);
- break;
- }
-
- nq->numad = numad;
- nq->next_schedule = jiffies + HZ; // XXX sync-up?
+ err = numad_create(nq);
break;

case CPU_ONLINE:
- wake_up_process(nq->numad);
+ if (nq->numad)
+ wake_up_process(nq->numad);
break;

case CPU_DEAD:
@@ -801,10 +858,8 @@ numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu)
break;

if (cpumask_any_and(cpu_online_mask,
- cpumask_of_node(node)) >= nr_cpu_ids) {
- kthread_stop(nq->numad);
- nq->numad = NULL;
- }
+ cpumask_of_node(node)) >= nr_cpu_ids)
+ numad_destroy(nq);
break;
}

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab1187..40ecba2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -373,6 +373,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#ifdef CONFIG_NUMA
+ {
+ .procname = "sched_numa",
+ .data = &sysctl_sched_numa,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sched_numa_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/