Re: [PATCH 26/31] sched, numa, mm: Add fault driven placement andmigration policy

From: Mel Gorman
Date: Thu Nov 01 2012 - 11:40:26 EST


On Thu, Oct 25, 2012 at 02:16:43PM +0200, Peter Zijlstra wrote:
> As per the problem/design document Documentation/scheduler/numa-problem.txt
> implement 3ac & 4.
>
> ( A pure 3a was found too unstable, I did briefly try 3bc
> but found no significant improvement. )
>
> Implement a per-task memory placement scheme relying on a regular
> PROT_NONE 'migration' fault to scan the memory space of the procress
> and uses a two stage migration scheme to reduce the invluence of
> unlikely usage relations.
>
> It relies on the assumption that the compute part is tied to a
> paticular task and builds a task<->page relation set to model the
> compute<->data relation.
>
> In the previous patch we made memory migrate towards where the task
> is running, here we select the node on which most memory is located
> as the preferred node to run on.
>
> This creates a feed-back control loop between trying to schedule a
> task on a node and migrating memory towards the node the task is
> scheduled on.
>

Ok.

> Suggested-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Suggested-by: Rik van Riel <riel@xxxxxxxxxx>
> Fixes-by: David Rientjes <rientjes@xxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
> ---
> include/linux/mm_types.h | 4 +
> include/linux/sched.h | 35 +++++++--
> kernel/sched/core.c | 16 ++++
> kernel/sched/fair.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
> kernel/sched/features.h | 1
> kernel/sched/sched.h | 31 +++++---
> kernel/sysctl.c | 31 +++++++-
> mm/huge_memory.c | 7 +
> mm/memory.c | 4 -
> 9 files changed, 282 insertions(+), 22 deletions(-)
> Index: tip/include/linux/mm_types.h
> ===================================================================
> --- tip.orig/include/linux/mm_types.h
> +++ tip/include/linux/mm_types.h
> @@ -403,6 +403,10 @@ struct mm_struct {
> #ifdef CONFIG_CPUMASK_OFFSTACK
> struct cpumask cpumask_allocation;
> #endif
> +#ifdef CONFIG_SCHED_NUMA
> + unsigned long numa_next_scan;

comment.

> + int numa_scan_seq;

comment! at least the other one is easy to guess. This thing looks like
it's preventing multiple threads in a process space scanning and
updating PTEs at the same time. Effectively it's a type of barrier but
without a comment I'm not sure if what it's doing is what you expect it
to be doing or something else entirely.

> +#endif
> struct uprobes_state uprobes_state;
> };
>
> Index: tip/include/linux/sched.h
> ===================================================================
> --- tip.orig/include/linux/sched.h
> +++ tip/include/linux/sched.h
> @@ -1481,9 +1481,16 @@ struct task_struct {
> short pref_node_fork;
> #endif
> #ifdef CONFIG_SCHED_NUMA
> - int node;
> + int node; /* task home node */
> + int numa_scan_seq;
> + int numa_migrate_seq;
> + unsigned int numa_scan_period;
> + u64 node_stamp; /* migration stamp */
> unsigned long numa_contrib;
> -#endif
> + unsigned long *numa_faults;
> + struct callback_head numa_work;
> +#endif /* CONFIG_SCHED_NUMA */
> +
> struct rcu_head rcu;
>
> /*
> @@ -1558,15 +1565,24 @@ struct task_struct {
> /* Future-safe accessor for struct task_struct's cpus_allowed. */
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> +#ifdef CONFIG_SCHED_NUMA
> static inline int tsk_home_node(struct task_struct *p)
> {
> -#ifdef CONFIG_SCHED_NUMA
> return p->node;
> +}
> +
> +extern void task_numa_fault(int node, int pages);
> #else
> +static inline int tsk_home_node(struct task_struct *p)
> +{
> return -1;
> -#endif
> }
>
> +static inline void task_numa_fault(int node, int pages)
> +{
> +}
> +#endif /* CONFIG_SCHED_NUMA */
> +
> /*
> * Priority of a process goes from 0..MAX_PRIO-1, valid RT
> * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
> @@ -2004,6 +2020,10 @@ enum sched_tunable_scaling {
> };
> extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
>
> +extern unsigned int sysctl_sched_numa_scan_period_min;
> +extern unsigned int sysctl_sched_numa_scan_period_max;
> +extern unsigned int sysctl_sched_numa_settle_count;
> +
> #ifdef CONFIG_SCHED_DEBUG
> extern unsigned int sysctl_sched_migration_cost;
> extern unsigned int sysctl_sched_nr_migrate;
> @@ -2014,18 +2034,17 @@ extern unsigned int sysctl_sched_shares_
> int sched_proc_update_handler(struct ctl_table *table, int write,
> void __user *buffer, size_t *length,
> loff_t *ppos);
> -#endif
> -#ifdef CONFIG_SCHED_DEBUG
> +
> static inline unsigned int get_sysctl_timer_migration(void)
> {
> return sysctl_timer_migration;
> }
> -#else
> +#else /* CONFIG_SCHED_DEBUG */
> static inline unsigned int get_sysctl_timer_migration(void)
> {
> return 1;
> }
> -#endif
> +#endif /* CONFIG_SCHED_DEBUG */
> extern unsigned int sysctl_sched_rt_period;
> extern int sysctl_sched_rt_runtime;
>
> Index: tip/kernel/sched/core.c
> ===================================================================
> --- tip.orig/kernel/sched/core.c
> +++ tip/kernel/sched/core.c
> @@ -1533,6 +1533,21 @@ static void __sched_fork(struct task_str
> #ifdef CONFIG_PREEMPT_NOTIFIERS
> INIT_HLIST_HEAD(&p->preempt_notifiers);
> #endif
> +
> +#ifdef CONFIG_SCHED_NUMA
> + if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
> + p->mm->numa_next_scan = jiffies;
> + p->mm->numa_scan_seq = 0;
> + }
> +
> + p->node = -1;
> + p->node_stamp = 0ULL;
> + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
> + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
> + p->numa_faults = NULL;
> + p->numa_scan_period = sysctl_sched_numa_scan_period_min;
> + p->numa_work.next = &p->numa_work;
> +#endif /* CONFIG_SCHED_NUMA */
> }
>
> /*
> @@ -1774,6 +1789,7 @@ static void finish_task_switch(struct rq
> if (mm)
> mmdrop(mm);
> if (unlikely(prev_state == TASK_DEAD)) {
> + task_numa_free(prev);
> /*
> * Remove function-return probe instances associated with this
> * task and put them back on the free list.
> Index: tip/kernel/sched/fair.c
> ===================================================================
> --- tip.orig/kernel/sched/fair.c
> +++ tip/kernel/sched/fair.c
> @@ -27,6 +27,8 @@
> #include <linux/profile.h>
> #include <linux/interrupt.h>
> #include <linux/random.h>
> +#include <linux/mempolicy.h>
> +#include <linux/task_work.h>
>
> #include <trace/events/sched.h>
>
> @@ -775,6 +777,21 @@ update_stats_curr_start(struct cfs_rq *c
>
> /**************************************************
> * Scheduling class numa methods.
> + *
> + * The purpose of the NUMA bits are to maintain compute (task) and data
> + * (memory) locality. We try and achieve this by making tasks stick to
> + * a particular node (their home node) but if fairness mandates they run
> + * elsewhere for long enough, we let the memory follow them.
> + *
> + * Tasks start out with their home-node unset (-1) this effectively means
> + * they act !NUMA until we've established the task is busy enough to bother
> + * with placement.
> + *
> + * We keep a home-node per task and use periodic fault scans to try and
> + * estalish a task<->page relation. This assumes the task<->page relation is a
> + * compute<->data relation, this is false for things like virt. and n:m
> + * threading solutions but its the best we can do given the information we
> + * have.
> */
>
> #ifdef CONFIG_SMP
> @@ -805,6 +822,157 @@ static void account_numa_dequeue(struct
> } else
> rq->onnode_running--;
> }
> +
> +/*
> + * numa task sample period in ms: 5s
> + */
> +unsigned int sysctl_sched_numa_scan_period_min = 5000;
> +unsigned int sysctl_sched_numa_scan_period_max = 5000*16;
> +
> +/*
> + * Wait for the 2-sample stuff to settle before migrating again
> + */
> +unsigned int sysctl_sched_numa_settle_count = 2;
> +
> +static void task_numa_placement(struct task_struct *p)
> +{
> + unsigned long faults, max_faults = 0;
> + int node, max_node = -1;
> + int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
> +
> + if (p->numa_scan_seq == seq)
> + return;
> +
> + p->numa_scan_seq = seq;
> +
> + for (node = 0; node < nr_node_ids; node++) {
> + faults = p->numa_faults[node];
> +
> + if (faults > max_faults) {
> + max_faults = faults;
> + max_node = node;
> + }
> +
> + p->numa_faults[node] /= 2;
> + }

No comments explaining the logic behind the decaying average. It can be
inferred if someone reads Documentation/scheduler/numa-problem.txt and
point 3c carefully enough. At the very least point them at it.


> +
> + if (max_node == -1)
> + return;
> +
> + if (p->node != max_node) {
> + p->numa_scan_period = sysctl_sched_numa_scan_period_min;
> + if (sched_feat(NUMA_SETTLE) &&
> + (seq - p->numa_migrate_seq) <= (int)sysctl_sched_numa_settle_count)
> + return;
> + p->numa_migrate_seq = seq;
> + sched_setnode(p, max_node);

Ok, so at a guess even if we do ping-pong it will only take effect every
10 seconds which could be far worse.

> + } else {
> + p->numa_scan_period = min(sysctl_sched_numa_scan_period_max,
> + p->numa_scan_period * 2);
> + }
> +}
> +
> +/*
> + * Got a PROT_NONE fault for a page on @node.
> + */
> +void task_numa_fault(int node, int pages)
> +{
> + struct task_struct *p = current;
> +
> + if (unlikely(!p->numa_faults)) {
> + int size = sizeof(unsigned long) * nr_node_ids;
> +
> + p->numa_faults = kzalloc(size, GFP_KERNEL);
> + if (!p->numa_faults)
> + return;
> + }
> +

On a maximally configured machine this will be an order-4 allocation and
you need at least 512 nodes before it's an order-1 allocation. As unlikely
as it is, should this be GFP_NOWARN?

> + task_numa_placement(p);
> +
> + p->numa_faults[node] += pages;
> +}
> +
> +/*
> + * The expensive part of numa migration is done from task_work context.
> + * Triggered from task_tick_numa().
> + */
> +void task_numa_work(struct callback_head *work)
> +{
> + unsigned long migrate, next_scan, now = jiffies;
> + struct task_struct *p = current;
> + struct mm_struct *mm = p->mm;
> +
> + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
> +
> + work->next = work; /* protect against double add */
> + /*
> + * Who cares about NUMA placement when they're dying.
> + *
> + * NOTE: make sure not to dereference p->mm before this check,
> + * exit_task_work() happens _after_ exit_mm() so we could be called
> + * without p->mm even though we still had it when we enqueued this
> + * work.
> + */
> + if (p->flags & PF_EXITING)
> + return;
> +
> + /*
> + * Enforce maximal scan/migration frequency..
> + */
> + migrate = mm->numa_next_scan;
> + if (time_before(now, migrate))
> + return;
> +
> + next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_scan_period_min);
> + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
> + return;
> +
> + ACCESS_ONCE(mm->numa_scan_seq)++;
> + {
> + struct vm_area_struct *vma;
> +
> + down_write(&mm->mmap_sem);
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if (!vma_migratable(vma))
> + continue;
> + change_protection(vma, vma->vm_start, vma->vm_end, vma_prot_none(vma), 0);
> + }
> + up_write(&mm->mmap_sem);
> + }
> +}

Ok, I like the idea of the scanning cost being incurred by the process.
I was going to complain though that for very large processes that the length
time it takes to complete this scan could be considerable. However,
a quick glance forward indicates that you cope with this problem later by
limiting how much is scanned each time.

> +
> +/*
> + * Drive the periodic memory faults..
> + */
> +void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> + struct callback_head *work = &curr->numa_work;
> + u64 period, now;
> +
> + /*
> + * We don't care about NUMA placement if we don't have memory.
> + */
> + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
> + return;
> +
> + /*
> + * Using runtime rather than walltime has the dual advantage that
> + * we (mostly) drive the selection from busy threads and that the
> + * task needs to have done some actual work before we bother with
> + * NUMA placement.
> + */

Makes sense.

> + now = curr->se.sum_exec_runtime;
> + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
> +
> + if (now - curr->node_stamp > period) {
> + curr->node_stamp = now;
> +
> + if (!time_before(jiffies, curr->mm->numa_next_scan)) {
> + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
> + task_work_add(curr, work, true);
> + }
> + }
> +}
> #else
> #ifdef CONFIG_SMP
> static struct list_head *account_numa_enqueue(struct rq *rq, struct task_struct *p)
> @@ -816,6 +984,10 @@ static struct list_head *account_numa_en
> static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
> {
> }
> +
> +static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> +{
> +}
> #endif /* CONFIG_SCHED_NUMA */
>
> /**************************************************
> @@ -5265,6 +5437,9 @@ static void task_tick_fair(struct rq *rq
> cfs_rq = cfs_rq_of(se);
> entity_tick(cfs_rq, se, queued);
> }
> +
> + if (sched_feat_numa(NUMA))
> + task_tick_numa(rq, curr);
> }
>
> /*
> Index: tip/kernel/sched/features.h
> ===================================================================
> --- tip.orig/kernel/sched/features.h
> +++ tip/kernel/sched/features.h
> @@ -69,5 +69,6 @@ SCHED_FEAT(NUMA_TTWU_BIAS, false)
> SCHED_FEAT(NUMA_TTWU_TO, false)
> SCHED_FEAT(NUMA_PULL, true)
> SCHED_FEAT(NUMA_PULL_BIAS, true)
> +SCHED_FEAT(NUMA_SETTLE, true)
> #endif
>
> Index: tip/kernel/sched/sched.h
> ===================================================================
> --- tip.orig/kernel/sched/sched.h
> +++ tip/kernel/sched/sched.h
> @@ -3,6 +3,7 @@
> #include <linux/mutex.h>
> #include <linux/spinlock.h>
> #include <linux/stop_machine.h>
> +#include <linux/slab.h>
>
> #include "cpupri.h"
>
> @@ -476,15 +477,6 @@ struct rq {
> #endif
> };
>
> -static inline struct list_head *offnode_tasks(struct rq *rq)
> -{
> -#ifdef CONFIG_SCHED_NUMA
> - return &rq->offnode_tasks;
> -#else
> - return NULL;
> -#endif
> -}
> -
> static inline int cpu_of(struct rq *rq)
> {
> #ifdef CONFIG_SMP
> @@ -502,6 +494,27 @@ DECLARE_PER_CPU(struct rq, runqueues);
> #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
> #define raw_rq() (&__raw_get_cpu_var(runqueues))
>
> +#ifdef CONFIG_SCHED_NUMA
> +static inline struct list_head *offnode_tasks(struct rq *rq)
> +{
> + return &rq->offnode_tasks;
> +}
> +
> +static inline void task_numa_free(struct task_struct *p)
> +{
> + kfree(p->numa_faults);
> +}
> +#else /* CONFIG_SCHED_NUMA */
> +static inline struct list_head *offnode_tasks(struct rq *rq)
> +{
> + return NULL;
> +}
> +
> +static inline void task_numa_free(struct task_struct *p)
> +{
> +}
> +#endif /* CONFIG_SCHED_NUMA */
> +
> #ifdef CONFIG_SMP
>
> #define rcu_dereference_check_sched_domain(p) \
> Index: tip/kernel/sysctl.c
> ===================================================================
> --- tip.orig/kernel/sysctl.c
> +++ tip/kernel/sysctl.c
> @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 10
> static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
> static int min_wakeup_granularity_ns; /* 0 usecs */
> static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
> +#ifdef CONFIG_SMP
> static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
> static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
> -#endif
> +#endif /* CONFIG_SMP */
> +#endif /* CONFIG_SCHED_DEBUG */
>
> #ifdef CONFIG_COMPACTION
> static int min_extfrag_threshold;
> @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
> .extra1 = &min_wakeup_granularity_ns,
> .extra2 = &max_wakeup_granularity_ns,
> },
> +#ifdef CONFIG_SMP
> {
> .procname = "sched_tunable_scaling",
> .data = &sysctl_sched_tunable_scaling,
> @@ -347,7 +350,31 @@ static struct ctl_table kern_table[] = {
> .extra1 = &zero,
> .extra2 = &one,
> },
> -#endif
> +#endif /* CONFIG_SMP */
> +#ifdef CONFIG_SCHED_NUMA
> + {
> + .procname = "sched_numa_scan_period_min_ms",
> + .data = &sysctl_sched_numa_scan_period_min,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sched_numa_scan_period_max_ms",
> + .data = &sysctl_sched_numa_scan_period_max,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> + {
> + .procname = "sched_numa_settle_count",
> + .data = &sysctl_sched_numa_settle_count,
> + .maxlen = sizeof(unsigned int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec,
> + },
> +#endif /* CONFIG_SCHED_NUMA */
> +#endif /* CONFIG_SCHED_DEBUG */
> {
> .procname = "sched_rt_period_us",
> .data = &sysctl_sched_rt_period,
> Index: tip/mm/huge_memory.c
> ===================================================================
> --- tip.orig/mm/huge_memory.c
> +++ tip/mm/huge_memory.c
> @@ -774,9 +774,10 @@ fixup:
>
> unlock:
> spin_unlock(&mm->page_table_lock);
> - if (page)
> + if (page) {
> + task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
> put_page(page);
> -
> + }
> return;
>
> migrate:
> @@ -845,6 +846,8 @@ migrate:
>
> put_page(page); /* Drop the rmap reference */
>
> + task_numa_fault(node, HPAGE_PMD_NR);
> +
> if (lru)
> put_page(page); /* drop the LRU isolation reference */
>
> Index: tip/mm/memory.c
> ===================================================================
> --- tip.orig/mm/memory.c
> +++ tip/mm/memory.c
> @@ -3512,8 +3512,10 @@ out_pte_upgrade_unlock:
> out_unlock:
> pte_unmap_unlock(ptep, ptl);
> out:
> - if (page)
> + if (page) {
> + task_numa_fault(page_nid, 1);
> put_page(page);
> + }
>
> return 0;
>
>
>

--
Mel Gorman
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/