[PATCH] mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy

From: Peter Zijlstra
Date: Thu Jul 25 2013 - 06:46:57 EST

Subject: mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Mon Jul 22 10:42:38 CEST 2013

Just an idea; the rest of the code doesn't work well enough yet for this to
matter, and there is also something sick in it, since it makes my box explode.
But I wanted to put the idea out there anyway.

Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
 include/linux/mempolicy.h |    5 +-
 kernel/sched/fair.c       |   44 +++++++++++++++++++++
 kernel/sched/features.h   |    1
 mm/huge_memory.c          |   28 +++++++------
 mm/memory.c               |   33 ++++++++++------
 mm/mempolicy.c            |   94 +++++++++++++++++++++++++++++-----------------
 6 files changed, 145 insertions(+), 60 deletions(-)
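
For reference, the selection rule in task_numa_mempol() below is simple: a node
joins the per-task interleave set when its accumulated hinting-fault count
exceeds half of the busiest node's count. The following userspace sketch mirrors
just that rule; it is not kernel code, and the node count and fault numbers are
made up for illustration:

#include <stdio.h>

#define NR_NODES 4			/* made-up node count */

int main(void)
{
	/* stand-in for the per-node totals derived from p->numa_faults[] */
	long faults[NR_NODES] = { 1200, 900, 150, 40 };
	unsigned long nodes = 0;	/* bitmask standing in for nodemask_t */
	long max_faults = 0;
	int node;

	for (node = 0; node < NR_NODES; node++) {
		if (faults[node] > max_faults)
			max_faults = faults[node];
	}

	/* same rule as task_numa_mempol(): keep nodes above half the maximum */
	for (node = 0; node < NR_NODES; node++) {
		if (faults[node] > max_faults / 2)
			nodes |= 1UL << node;
	}

	/* nodes 0 and 1 qualify here: 1200 and 900 both exceed 600 */
	printf("interleave nodemask: 0x%lx\n", nodes);
	return 0;
}

The resulting mask is what the two mpol_rebind_task() calls push into the
task's MPOL_INTERLEAVE policy.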

--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -60,6 +60,7 @@ struct mempolicy {
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
*/

+extern struct mempolicy *__mpol_new(unsigned short, unsigned short);
extern void __mpol_put(struct mempolicy *pol);
static inline void mpol_put(struct mempolicy *pol)
{
@@ -187,7 +188,7 @@ static inline int vma_migratable(struct
return 1;
}

-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long, int *);

#else

@@ -307,7 +308,7 @@ static inline int mpol_to_str(char *buff
}

static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, int *account_node)
{
return -1; /* no node preference */
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -893,6 +893,47 @@ static inline unsigned long task_faults(
return p->numa_faults[2*nid] + p->numa_faults[2*nid+1];
}

+/*
+ * Create/Update p->mempolicy MPOL_INTERLEAVE to match p->numa_faults[].
+ */
+static void task_numa_mempol(struct task_struct *p, long max_faults)
+{
+ struct mempolicy *pol = p->mempolicy, *new = NULL;
+ nodemask_t nodes = NODE_MASK_NONE;
+ int node;
+
+ if (!pol) {
+ new = __mpol_new(MPOL_INTERLEAVE, MPOL_F_MOF | MPOL_F_MORON);
+ if (IS_ERR(new))
+ return;
+ }
+
+ task_lock(p);
+
+ pol = p->mempolicy; /* lock forces a re-read */
+ if (!pol) {
+ pol = p->mempolicy = new;
+ new = NULL;
+ }
+
+ if (!(pol->flags & MPOL_F_MORON))
+ goto unlock;
+
+ for_each_node(node) {
+ if (task_faults(p, node) > max_faults/2)
+ node_set(node, nodes);
+ }
+
+ mpol_rebind_task(p, &nodes, MPOL_REBIND_STEP1);
+ mpol_rebind_task(p, &nodes, MPOL_REBIND_STEP2);
+
+unlock:
+ task_unlock(p);
+
+ if (new)
+ __mpol_put(new);
+}
+
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
@@ -1106,6 +1147,9 @@ static void task_numa_placement(struct t
}
}

+ if (sched_feat(NUMA_INTERLEAVE))
+ task_numa_mempol(p, max_faults);
+
/* Preferred node as the node with the most faults */
if (max_faults && max_nid != p->numa_preferred_nid) {

--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -72,4 +72,5 @@ SCHED_FEAT(NUMA_FORCE, false)
SCHED_FEAT(NUMA_BALANCE, true)
SCHED_FEAT(NUMA_FAULTS_UP, true)
SCHED_FEAT(NUMA_FAULTS_DOWN, true)
+SCHED_FEAT(NUMA_INTERLEAVE, false)
#endif
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1292,7 +1292,7 @@ int do_huge_pmd_numa_page(struct mm_stru
{
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = -1, account_nid = -1, this_nid = numa_node_id();
int target_nid, last_nidpid;
bool migrated = false;

@@ -1301,7 +1301,6 @@ int do_huge_pmd_numa_page(struct mm_stru
goto out_unlock;

page = pmd_page(pmd);
- get_page(page);

/*
* Do not account for faults against the huge zero page. The read-only
@@ -1317,13 +1316,12 @@ int do_huge_pmd_numa_page(struct mm_stru
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);

last_nidpid = page_nidpid_last(page);
- target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
- put_page(page);
+ target_nid = mpol_misplaced(page, vma, haddr, &account_nid);
+ if (target_nid == -1)
goto clear_pmdnuma;
- }

/* Acquire the page lock to serialise THP migrations */
+ get_page(page);
spin_unlock(&mm->page_table_lock);
lock_page(page);

@@ -1332,6 +1330,7 @@ int do_huge_pmd_numa_page(struct mm_stru
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ account_nid = page_nid = -1; /* someone else took our fault */
goto out_unlock;
}
spin_unlock(&mm->page_table_lock);
@@ -1339,17 +1338,20 @@ int do_huge_pmd_numa_page(struct mm_stru
/* Migrate the THP to the requested node */
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (migrated)
- page_nid = target_nid;
- else
+ if (!migrated) {
+ account_nid = -1; /* account against the old page */
goto check_same;
+ }

+ page_nid = target_nid;
goto out;

check_same:
spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ page_nid = -1; /* someone else took our fault */
goto out_unlock;
+ }
clear_pmdnuma:
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
@@ -1359,8 +1361,10 @@ int do_huge_pmd_numa_page(struct mm_stru
spin_unlock(&mm->page_table_lock);

out:
- if (page_nid != -1)
- task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated);
+ if (account_nid == -1)
+ account_nid = page_nid;
+ if (account_nid != -1)
+ task_numa_fault(last_nidpid, account_nid, HPAGE_PMD_NR, migrated);

return 0;
}
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3516,16 +3516,17 @@ static int do_nonlinear_fault(struct mm_
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}

-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int current_nid)
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid,
+ int *account_nid)
{
get_page(page);

count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);

- return mpol_misplaced(page, vma, addr);
+ return mpol_misplaced(page, vma, addr, account_nid);
}

int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3533,7 +3534,7 @@ int do_numa_page(struct mm_struct *mm, s
{
struct page *page = NULL;
spinlock_t *ptl;
- int page_nid = -1;
+ int page_nid = -1, account_nid = -1;
int target_nid, last_nidpid;
bool migrated = false;

@@ -3570,7 +3571,7 @@ int do_numa_page(struct mm_struct *mm, s

last_nidpid = page_nidpid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid, &account_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
put_page(page);
@@ -3583,8 +3584,10 @@ int do_numa_page(struct mm_struct *mm, s
page_nid = target_nid;

out:
- if (page_nid != -1)
- task_numa_fault(last_nidpid, page_nid, 1, migrated);
+ if (account_nid == -1)
+ account_nid = page_nid;
+ if (account_nid != -1)
+ task_numa_fault(last_nidpid, account_nid, 1, migrated);

return 0;
}
@@ -3623,7 +3626,7 @@ static int do_pmd_numa_page(struct mm_st
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
- int page_nid = -1;
+ int page_nid = -1, account_nid = -1;
int target_nid;
bool migrated = false;

@@ -3648,19 +3651,25 @@ static int do_pmd_numa_page(struct mm_st
last_nidpid = page_nidpid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, addr,
- page_nid);
+ page_nid, &account_nid);
pte_unmap_unlock(pte, ptl);

if (target_nid != -1) {
migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated)
page_nid = target_nid;
+ else
+ account_nid = -1;
} else {
put_page(page);
}

- if (page_nid != -1)
- task_numa_fault(last_nidpid, page_nid, 1, migrated);
+ if (account_nid == -1)
+ account_nid = page_nid;
+ if (account_nid != -1)
+ task_numa_fault(last_nidpid, account_nid, 1, migrated);
+
+ cond_resched();

pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -118,22 +118,18 @@ static struct mempolicy default_policy =
.flags = MPOL_F_LOCAL,
};

-static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+static struct mempolicy numa_policy = {
+ .refcnt = ATOMIC_INIT(1), /* never free it */
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL | MPOL_F_MOF | MPOL_F_MORON,
+};

static struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
- int node;

- if (!pol) {
- node = numa_node_id();
- if (node != NUMA_NO_NODE)
- pol = &preferred_node_policy[node];
-
- /* preferred_node_policy is not initialised early in boot */
- if (!pol->mode)
- pol = NULL;
- }
+ if (!pol)
+ pol = &numa_policy;

return pol;
}
@@ -248,6 +244,20 @@ static int mpol_set_nodemask(struct memp
return ret;
}

+struct mempolicy *__mpol_new(unsigned short mode, unsigned short flags)
+{
+ struct mempolicy *policy;
+
+ policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!policy)
+ return ERR_PTR(-ENOMEM);
+ atomic_set(&policy->refcnt, 1);
+ policy->mode = mode;
+ policy->flags = flags;
+
+ return policy;
+}
+
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
@@ -255,8 +265,6 @@ static int mpol_set_nodemask(struct memp
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
- struct mempolicy *policy;
-
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

@@ -284,14 +292,8 @@ static struct mempolicy *mpol_new(unsign
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
- if (!policy)
- return ERR_PTR(-ENOMEM);
- atomic_set(&policy->refcnt, 1);
- policy->mode = mode;
- policy->flags = flags;

- return policy;
+ return __mpol_new(mode, flags);
}

/* Slow path of a mpol destructor. */
@@ -2234,12 +2236,13 @@ static void sp_free(struct sp_node *n)
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr, int *account_node)
{
struct mempolicy *pol;
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
+ int thisnid = numa_node_id();
int polnid = -1;
int ret = -1;

@@ -2261,7 +2264,7 @@ int mpol_misplaced(struct page *page, st

case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
- polnid = numa_node_id();
+ polnid = thisnid;
else
polnid = pol->v.preferred_node;
break;
@@ -2276,7 +2279,7 @@ int mpol_misplaced(struct page *page, st
if (node_isset(curnid, pol->v.nodes))
goto out;
(void)first_zones_zonelist(
- node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes, &zone);
polnid = zone->node;
@@ -2291,8 +2294,7 @@ int mpol_misplaced(struct page *page, st
int last_nidpid;
int this_nidpid;

- polnid = numa_node_id();
- this_nidpid = nid_pid_to_nidpid(polnid, current->pid);;
+ this_nidpid = nid_pid_to_nidpid(thisnid, current->pid);

/*
* Multi-stage node selection is used in conjunction
@@ -2318,6 +2320,39 @@ int mpol_misplaced(struct page *page, st
last_nidpid = page_nidpid_xchg_last(page, this_nidpid);
if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid)
goto out;
+
+ /*
+ * Preserve interleave pages while allowing useful
+ * ->numa_faults[] statistics.
+ *
+ * When migrating into an interleave set, migrate to
+ * the correct interleaved node but account against the
+ * current node (where the task is running).
+ *
+ * Not doing this would result in ->numa_faults[] being
+ * flat across the interleaved nodes, making it
+ * impossible to shrink the node list even when all
+ * tasks are running on a single node.
+ *
+ *   src  dst      migrate      account
+ *    0    0   --  this_node    $page_node
+ *    0    1   --  policy_node  this_node
+ *    1    0   --  this_node    $page_node
+ *    1    1   --  policy_node  this_node
+ *
+ */
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ if (node_isset(thisnid, pol->v.nodes)) {
+ if (account_node)
+ *account_node = thisnid;
+ }
+ break;
+
+ default:
+ polnid = thisnid;
+ break;
+ }
}

if (curnid != polnid)
@@ -2580,15 +2615,6 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);

- for_each_node(nid) {
- preferred_node_policy[nid] = (struct mempolicy) {
- .refcnt = ATOMIC_INIT(1),
- .mode = MPOL_PREFERRED,
- .flags = MPOL_F_MOF | MPOL_F_MORON,
- .v = { .preferred_node = nid, },
- };
- }
-
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
--
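
For reference, the migrate/account split that the mpol_misplaced() change above
introduces for MPOL_F_MORON tasks can be modelled in isolation as below. This is
a simplified userspace illustration, not kernel code; moron_decision() and its
parameters are invented for the sketch, and an account node of -1 stands for
"fall back to the page's node", which is what the fault handlers above do when
*account_node is left untouched.

#include <stdio.h>
#include <stdbool.h>

struct decision {
	int migrate_nid;	/* node the page should be moved toward */
	int account_nid;	/* node charged in ->numa_faults[]; -1: use the page's node */
};

/*
 * Simplified model of the MPOL_F_MORON handling: with an interleave policy
 * the page keeps chasing its interleave target, but the fault is accounted
 * to the node the task is running on (when that node is in the interleave
 * set), so ->numa_faults[] still concentrates on the nodes the task actually
 * executes on rather than going flat across the interleave set.
 */
static struct decision moron_decision(bool interleave_policy,
				      bool this_node_in_set,
				      int policy_nid, int this_nid)
{
	struct decision d = { .migrate_nid = this_nid, .account_nid = -1 };

	if (!interleave_policy)
		return d;	/* plain MORON: pull the page toward this node */

	d.migrate_nid = policy_nid;	/* preserve the interleave placement */
	if (this_node_in_set)
		d.account_nid = this_nid;

	return d;
}

int main(void)
{
	struct decision d = moron_decision(true, true, 2, 0);

	printf("migrate toward node %d, account against node %d\n",
	       d.migrate_nid, d.account_nid);
	return 0;
}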