Re: [PATCH] mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy

From: Peter Zijlstra
Date: Mon Aug 26 2013 - 12:10:56 EST

Next message: Dan Carpenter: "Re: [PATCH 09/24] Staging: winbond: reg: fixed some lines over 80characters"
Previous message: Dan Carpenter: "Re: [PATCH 05/24] Staging: winbond: phy_calibration: fixed somelines over 80 characters"
Next in thread: Peter Zijlstra: "Re: [PATCH] mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

OK, here's one that actually works and doesn't have magical crashes.

---
Subject: mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy
From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Date: Mon Jul 22 10:42:38 CEST 2013

For those tasks belonging to groups that span nodes -- for whatever
reason -- we want to interleave their memory allocations to minimize
their performance penalty.

There's a subtlety to interleaved memory allocations though, once you
establish an interleave mask a measurement of where the actual memory
is is completely flat across those nodes. Therefore we'll never
actually shrink the interleave mask, even if at some point all tasks
can/do run on a single node again.

To fix this issue, change the accounting so that when we find a page
part of the interleave mask, we still account it against the current
node, not the node the page is really at.

Finally, simplify the 'default' numa policy. It used a per-node
preferred node policy and always picked the current node, this can be
written with a single MPOL_F_LOCAL policy.

Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
include/linux/mempolicy.h | 5 +-
kernel/sched/fair.c | 56 +++++++++++++++++++++++++++
kernel/sched/features.h | 1
mm/huge_memory.c | 28 +++++++------
mm/memory.c | 33 ++++++++++-----
mm/mempolicy.c | 95 +++++++++++++++++++++++++++++-----------------
6 files changed, 158 insertions(+), 60 deletions(-)

--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -60,6 +60,7 @@ struct mempolicy {
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
*/

+extern struct mempolicy *__mpol_new(unsigned short, unsigned short);
extern void __mpol_put(struct mempolicy *pol);
static inline void mpol_put(struct mempolicy *pol)
{
@@ -187,7 +188,7 @@ static inline int vma_migratable(struct
return 1;
}

-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long, int *);

#else

@@ -307,7 +308,7 @@ static inline int mpol_to_str(char *buff
}

static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, int *account_node)
{
return -1; /* no node preference */
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -893,6 +893,59 @@ static inline unsigned long task_faults(
return p->numa_faults[2*nid] + p->numa_faults[2*nid+1];
}

+/*
+ * Create/Update p->mempolicy MPOL_INTERLEAVE to match p->numa_faults[].
+ */
+static void task_numa_mempol(struct task_struct *p, long max_faults)
+{
+ struct mempolicy *pol = p->mempolicy, *new = NULL;
+ nodemask_t nodes = NODE_MASK_NONE;
+ int node;
+
+ if (!max_faults)
+ return;
+
+ if (!pol) {
+ new = __mpol_new(MPOL_INTERLEAVE, MPOL_F_MOF | MPOL_F_MORON);
+ if (IS_ERR(new))
+ return;
+ }
+
+ task_lock(p);
+
+ pol = p->mempolicy; /* lock forces a re-read */
+ if (!pol)
+ pol = new;
+
+ if (!(pol->flags & MPOL_F_MORON))
+ goto unlock;
+
+ for_each_node(node) {
+ if (task_faults(p, node) > max_faults/2)
+ node_set(node, nodes);
+ }
+
+ if (pol == new) {
+ /*
+ * XXX 'borrowed' from do_set_mempolicy()
+ */
+ pol->v.nodes = nodes;
+ p->mempolicy = pol;
+ p->flags |= PF_MEMPOLICY;
+ p->il_next = first_node(nodes);
+ new = NULL;
+ } else {
+ mpol_rebind_task(p, &nodes, MPOL_REBIND_STEP1);
+ mpol_rebind_task(p, &nodes, MPOL_REBIND_STEP2);
+ }
+
+unlock:
+ task_unlock(p);
+
+ if (new)
+ __mpol_put(new);
+}
+
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
@@ -1106,6 +1159,9 @@ static void task_numa_placement(struct t
}
}

+ if (sched_feat(NUMA_INTERLEAVE))
+ task_numa_mempol(p, max_faults);
+
/* Preferred node as the node with the most faults */
if (max_faults && max_nid != p->numa_preferred_nid) {

--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -72,4 +72,5 @@ SCHED_FEAT(NUMA_FORCE, false)
SCHED_FEAT(NUMA_BALANCE, true)
SCHED_FEAT(NUMA_FAULTS_UP, true)
SCHED_FEAT(NUMA_FAULTS_DOWN, true)
+SCHED_FEAT(NUMA_INTERLEAVE, false)
#endif
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1292,7 +1292,7 @@ int do_huge_pmd_numa_page(struct mm_stru
{
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = -1, account_nid = -1, this_nid = numa_node_id();
int target_nid, last_nidpid;
bool migrated = false;

@@ -1301,7 +1301,6 @@ int do_huge_pmd_numa_page(struct mm_stru
goto out_unlock;

page = pmd_page(pmd);
- get_page(page);

/*
* Do not account for faults against the huge zero page. The read-only
@@ -1317,13 +1316,12 @@ int do_huge_pmd_numa_page(struct mm_stru
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);

last_nidpid = page_nidpid_last(page);
- target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
- put_page(page);
+ target_nid = mpol_misplaced(page, vma, haddr, &account_nid);
+ if (target_nid == -1)
goto clear_pmdnuma;
- }

/* Acquire the page lock to serialise THP migrations */
+ get_page(page);
spin_unlock(&mm->page_table_lock);
lock_page(page);

@@ -1332,6 +1330,7 @@ int do_huge_pmd_numa_page(struct mm_stru
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ account_nid = page_nid = -1; /* someone else took our fault */
goto out_unlock;
}
spin_unlock(&mm->page_table_lock);
@@ -1339,17 +1338,20 @@ int do_huge_pmd_numa_page(struct mm_stru
/* Migrate the THP to the requested node */
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (migrated)
- page_nid = target_nid;
- else
+ if (!migrated) {
+ account_nid = -1; /* account against the old page */
goto check_same;
+ }

+ page_nid = target_nid;
goto out;

check_same:
spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ page_nid = -1; /* someone else took our fault */
goto out_unlock;
+ }
clear_pmdnuma:
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
@@ -1359,8 +1361,10 @@ int do_huge_pmd_numa_page(struct mm_stru
spin_unlock(&mm->page_table_lock);

out:
- if (page_nid != -1)
- task_numa_fault(last_nidpid, page_nid, HPAGE_PMD_NR, migrated);
+ if (account_nid == -1)
+ account_nid = page_nid;
+ if (account_nid != -1)
+ task_numa_fault(last_nidpid, account_nid, HPAGE_PMD_NR, migrated);

return 0;
}
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3529,16 +3529,17 @@ static int do_nonlinear_fault(struct mm_
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}

-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int current_nid)
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid,
+ int *account_nid)
{
get_page(page);

count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);

- return mpol_misplaced(page, vma, addr);
+ return mpol_misplaced(page, vma, addr, account_nid);
}

int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -3546,7 +3547,7 @@ int do_numa_page(struct mm_struct *mm, s
{
struct page *page = NULL;
spinlock_t *ptl;
- int page_nid = -1;
+ int page_nid = -1, account_nid = -1;
int target_nid, last_nidpid;
bool migrated = false;

@@ -3583,7 +3584,7 @@ int do_numa_page(struct mm_struct *mm, s

last_nidpid = page_nidpid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid, &account_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
put_page(page);
@@ -3596,8 +3597,10 @@ int do_numa_page(struct mm_struct *mm, s
page_nid = target_nid;

out:
- if (page_nid != -1)
- task_numa_fault(last_nidpid, page_nid, 1, migrated);
+ if (account_nid == -1)
+ account_nid = page_nid;
+ if (account_nid != -1)
+ task_numa_fault(last_nidpid, account_nid, 1, migrated);

return 0;
}
@@ -3636,7 +3639,7 @@ static int do_pmd_numa_page(struct mm_st
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
- int page_nid = -1;
+ int page_nid = -1, account_nid = -1;
int target_nid;
bool migrated = false;

@@ -3661,19 +3664,25 @@ static int do_pmd_numa_page(struct mm_st
last_nidpid = page_nidpid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, addr,
- page_nid);
+ page_nid, &account_nid);
pte_unmap_unlock(pte, ptl);

if (target_nid != -1) {
migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated)
page_nid = target_nid;
+ else
+ account_nid = -1;
} else {
put_page(page);
}

- if (page_nid != -1)
- task_numa_fault(last_nidpid, page_nid, 1, migrated);
+ if (account_nid == -1)
+ account_nid = page_nid;
+ if (account_nid != -1)
+ task_numa_fault(last_nidpid, account_nid, 1, migrated);
+
+ cond_resched();

pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -118,22 +118,18 @@ static struct mempolicy default_policy =
.flags = MPOL_F_LOCAL,
};

-static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+static struct mempolicy numa_policy = {
+ .refcnt = ATOMIC_INIT(1), /* never free it */
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_LOCAL | MPOL_F_MOF | MPOL_F_MORON,
+};

static struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
- int node;

- if (!pol) {
- node = numa_node_id();
- if (node != NUMA_NO_NODE)
- pol = &preferred_node_policy[node];
-
- /* preferred_node_policy is not initialised early in boot */
- if (!pol->mode)
- pol = NULL;
- }
+ if (!pol)
+ pol = &numa_policy;

return pol;
}
@@ -248,6 +244,20 @@ static int mpol_set_nodemask(struct memp
return ret;
}

+struct mempolicy *__mpol_new(unsigned short mode, unsigned short flags)
+{
+ struct mempolicy *policy;
+
+ policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!policy)
+ return ERR_PTR(-ENOMEM);
+ atomic_set(&policy->refcnt, 1);
+ policy->mode = mode;
+ policy->flags = flags;
+
+ return policy;
+}
+
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
@@ -255,8 +265,6 @@ static int mpol_set_nodemask(struct memp
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
- struct mempolicy *policy;
-
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

@@ -284,14 +292,8 @@ static struct mempolicy *mpol_new(unsign
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
- if (!policy)
- return ERR_PTR(-ENOMEM);
- atomic_set(&policy->refcnt, 1);
- policy->mode = mode;
- policy->flags = flags;

- return policy;
+ return __mpol_new(mode, flags);
}

/* Slow path of a mpol destructor. */
@@ -2242,12 +2244,13 @@ static void sp_free(struct sp_node *n)
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr, int *account_node)
{
struct mempolicy *pol;
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
+ int thisnid = numa_node_id();
int polnid = -1;
int ret = -1;

@@ -2269,7 +2272,7 @@ int mpol_misplaced(struct page *page, st

case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
- polnid = numa_node_id();
+ polnid = thisnid;
else
polnid = pol->v.preferred_node;
break;
@@ -2284,7 +2287,7 @@ int mpol_misplaced(struct page *page, st
if (node_isset(curnid, pol->v.nodes))
goto out;
(void)first_zones_zonelist(
- node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes, &zone);
polnid = zone->node;
@@ -2299,8 +2302,7 @@ int mpol_misplaced(struct page *page, st
int last_nidpid;
int this_nidpid;

- polnid = numa_node_id();
- this_nidpid = nid_pid_to_nidpid(polnid, current->pid);;
+ this_nidpid = nid_pid_to_nidpid(thisnid, current->pid);

/*
* Multi-stage node selection is used in conjunction
@@ -2326,6 +2328,40 @@ int mpol_misplaced(struct page *page, st
last_nidpid = page_nidpid_xchg_last(page, this_nidpid);
if (!nidpid_pid_unset(last_nidpid) && nidpid_to_nid(last_nidpid) != polnid)
goto out;
+
+ /*
+ * Preserve interleave pages while allowing useful
+ * ->numa_faults[] statistics.
+ *
+ * When migrating into an interleave set, migrate to
+ * the correct interleaved node but account against the
+ * current node (where the task is running).
+ *
+ * Not doing this would result in ->numa_faults[] being
+ * flat across the interleaved nodes, making it
+ * impossible to shrink the node list even when all
+ * tasks are running on a single node.
+ *
+ * src dst migrate account
+ * 0 0 -- this_node $page_node
+ * 0 1 -- policy_node this_node
+ * 1 0 -- this_node $page_node
+ * 1 1 -- policy_node this_node
+ *
+ */
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ if (node_isset(thisnid, pol->v.nodes)) {
+ if (account_node)
+ *account_node = thisnid;
+ break;
+ }
+ /* fall-through for nodes outside the set */
+
+ default:
+ polnid = thisnid;
+ break;
+ }
}

if (curnid != polnid)
@@ -2588,15 +2624,6 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);

- for_each_node(nid) {
- preferred_node_policy[nid] = (struct mempolicy) {
- .refcnt = ATOMIC_INIT(1),
- .mode = MPOL_PREFERRED,
- .flags = MPOL_F_MOF | MPOL_F_MORON,
- .v = { .preferred_node = nid, },
- };
- }
-
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Dan Carpenter: "Re: [PATCH 09/24] Staging: winbond: reg: fixed some lines over 80characters"
Previous message: Dan Carpenter: "Re: [PATCH 05/24] Staging: winbond: phy_calibration: fixed somelines over 80 characters"
Next in thread: Peter Zijlstra: "Re: [PATCH] mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]