[tip:numa/core] sched/numa/mm: Improve migration

From: tip-bot for Peter Zijlstra
Date: Thu Oct 18 2012 - 13:06:02 EST


Commit-ID: 713f937655c4b15131b5a0eae4610918a4febe17
Gitweb: http://git.kernel.org/tip/713f937655c4b15131b5a0eae4610918a4febe17
Author: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
AuthorDate: Fri, 12 Oct 2012 19:30:14 +0200
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Mon, 15 Oct 2012 14:18:40 +0200

sched/numa/mm: Improve migration

Add THP (transparent hugepage) migration for PROT_NONE (NUMA) faults, and
extend task_numa_fault() to absorb THP faults by accounting them as
HPAGE_PMD_NR base pages.
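
For illustration (this is not part of the patch), the extended interface is
meant to be used so that a fault on a regular page still accounts one page,
while a fault on a huge pmd accounts HPAGE_PMD_NR base pages in a single
call. The helper names below are made up:

/* Hypothetical callers, showing only the accounting units. */
static void small_page_numa_fault(struct page *page)
{
	task_numa_placement();
	task_numa_fault(page_to_nid(page), 1);
}

static void huge_page_numa_fault(struct page *page)
{
	task_numa_placement();
	task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
}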

[ It would be nice if the gents on Cc: expressed their opinion about
this change. Missing details might include cgroup page accounting,
plus the fact that some architectures might cache PMD_NONE pmds
in their TLBs, needing some extra TLB magic beyond what we already
do here? ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Link: http://lkml.kernel.org/n/tip-yv9vbiz2s455zxq1ffzx3fye@xxxxxxxxxxxxxx
[ Significant fixes ]
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
 include/linux/sched.h |   4 +-
 kernel/sched/fair.c   |   4 +-
 mm/huge_memory.c      | 142 +++++++++++++++++++++++++++++++++++++++---------
 mm/migrate.c          |   2 +-
 4 files changed, 120 insertions(+), 32 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 22be2d6..2c3009b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1609,7 +1609,7 @@ static inline int tsk_home_node(struct task_struct *p)
}

extern void task_numa_placement(void);
-extern void task_numa_fault(int node);
+extern void task_numa_fault(int node, int pages);
#else
static inline int tsk_home_node(struct task_struct *p)
{
@@ -1620,7 +1620,7 @@ static inline void task_numa_placement(void)
{
}

-static inline void task_numa_fault(int node)
+static inline void task_numa_fault(int node, int pages)
{
}
#endif /* CONFIG_SCHED_NUMA */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d522d0..df35c8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -835,7 +835,7 @@ unsigned int sysctl_sched_numa_settle_count = 2;
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int node)
+void task_numa_fault(int node, int pages)
{
struct task_struct *p = current;

@@ -846,7 +846,7 @@ void task_numa_fault(int node)
return;
}

- p->numa_faults[node]++;
+ p->numa_faults[node] += pages;
}

void task_numa_placement(void)
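
The per-node counters accumulated here are consumed by task_numa_placement(),
which is not part of this diff. Assuming placement simply prefers the node
with the most recorded faults, the idea is roughly this (a sketch, not the
actual implementation):

/* Sketch only: pick the node where the task has faulted the most. */
static int sketch_preferred_node(struct task_struct *p)
{
	unsigned long max_faults = 0;
	int nid, best_nid = -1;

	for_each_online_node(nid) {
		if (p->numa_faults[nid] > max_faults) {
			max_faults = p->numa_faults[nid];
			best_nid = nid;
		}
	}
	return best_nid;
}
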
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d14c8b2..2b65116 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -767,11 +767,13 @@ void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pmd_t entry)
{
unsigned long haddr = address & HPAGE_PMD_MASK;
+ struct page *new_page = NULL;
struct page *page = NULL;
+ int node, lru;

spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, entry)))
- goto out_unlock;
+ goto unlock;

if (unlikely(pmd_trans_splitting(entry))) {
spin_unlock(&mm->page_table_lock);
@@ -779,44 +781,130 @@ void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
return;
}

-#ifdef CONFIG_NUMA
page = pmd_page(entry);
- VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ if (page) {
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));

- get_page(page);
+ get_page(page);
+ node = mpol_misplaced(page, vma, haddr);
+ if (node != -1)
+ goto migrate;
+ }
+
+fixup:
+ /* change back to regular protection */
+ entry = pmd_modify(entry, vma->vm_page_prot);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+
+unlock:
spin_unlock(&mm->page_table_lock);
+ if (page) {
+ task_numa_placement();
+ task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
+ put_page(page);
+ }
+ return;

- /*
- * XXX should we serialize against split_huge_page ?
- */
+migrate:
+ WARN_ON(!(((unsigned long)page->mapping & PAGE_MAPPING_ANON)));
+ WARN_ON((((unsigned long)page->mapping & PAGE_MAPPING_KSM)));
+ BUG_ON(PageSwapCache(page));
+
+ spin_unlock(&mm->page_table_lock);

- if (mpol_misplaced(page, vma, haddr) == -1)
- goto do_fixup;
+ lock_page(page);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+ put_page(page);
+ return;
+ }
+ spin_unlock(&mm->page_table_lock);

- /*
- * Due to lacking code to migrate thp pages, we'll split
- * (which preserves the special PROT_NONE) and re-take the
- * fault on the normal pages.
- */
- split_huge_page(page);
- put_page(page);
- return;
+ task_numa_placement();
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~(__GFP_NO_KSWAPD | __GFP_WAIT),
+ HPAGE_PMD_ORDER);
+
+ WARN_ON(PageLRU(new_page));
+
+ if (!new_page)
+ goto alloc_fail;
+
+ lru = PageLRU(page);
+
+ if (lru && isolate_lru_page(page)) /* does an implicit get_page() */
+ goto alloc_fail;
+
+ if (!trylock_page(new_page))
+ BUG();
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+
+ migrate_page_copy(new_page, page);
+
+ WARN_ON(PageLRU(new_page));

-do_fixup:
spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(*pmd, entry)))
- goto out_unlock;
-#endif
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
+ if (lru)
+ putback_lru_page(page);

- /* change back to regular protection */
- entry = pmd_modify(entry, vma->vm_page_prot);
- if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
- update_mmu_cache(vma, address, entry);
+ unlock_page(new_page);
+ ClearPageActive(new_page); /* Set by migrate_page_copy() */
+ new_page->mapping = NULL;
+ put_page(new_page); /* Free it */

-out_unlock:
+ unlock_page(page);
+ put_page(page); /* Drop the local reference */
+
+ return;
+ }
+
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+ page_add_new_anon_rmap(new_page, vma, haddr);
+
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+ page_remove_rmap(page);
spin_unlock(&mm->page_table_lock);
- if (page)
+
+ put_page(page); /* Drop the rmap reference */
+
+ task_numa_fault(node, HPAGE_PMD_NR);
+
+ if (lru)
+ put_page(page); /* drop the LRU isolation reference */
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the local reference */
+
+ return;
+
+alloc_fail:
+ if (new_page)
+ put_page(new_page);
+
+ task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
+ unlock_page(page);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
put_page(page);
+ page = NULL;
+ goto unlock;
+ }
+ goto fixup;
}

int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
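
Taken together, the rewritten do_huge_pmd_prot_none() above boils down to the
following flow. This is a simplified sketch for readability: the page lock,
LRU isolation, reference counting, update_mmu_cache() calls and the repeated
!pmd_same() rechecks are all elided, and the function name is invented:

/* Rough outline of the new huge-pmd PROT_NONE fault path (illustrative). */
static void huge_numa_fault_outline(struct mm_struct *mm,
				    struct vm_area_struct *vma,
				    unsigned long haddr, pmd_t *pmd, pmd_t entry)
{
	struct page *page = pmd_page(entry);
	struct page *new_page;
	int node;

	node = mpol_misplaced(page, vma, haddr);
	if (node == -1)
		goto fixup;	/* already on the right node */

	/* Try to get a huge page on the node the policy prefers. */
	new_page = alloc_pages_node(node,
			(GFP_TRANSHUGE | GFP_THISNODE) &
				~(__GFP_NO_KSWAPD | __GFP_WAIT),
			HPAGE_PMD_ORDER);
	if (!new_page)
		goto fixup;	/* allocation failed: keep the old page */

	/* Copy contents and flags, then point the pmd at the new page. */
	migrate_page_copy(new_page, page);
	entry = mk_pmd(new_page, vma->vm_page_prot);
	entry = pmd_mkhuge(maybe_pmd_mkwrite(pmd_mkdirty(entry), vma));
	page_add_new_anon_rmap(new_page, vma, haddr);
	set_pmd_at(mm, haddr, pmd, entry);
	page_remove_rmap(page);

	task_numa_fault(node, HPAGE_PMD_NR);
	return;

fixup:
	/* Restore the normal protections so the access can proceed. */
	entry = pmd_modify(entry, vma->vm_page_prot);
	set_pmd_at(mm, haddr, pmd, entry);
	task_numa_fault(page_to_nid(page), HPAGE_PMD_NR);
}
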
diff --git a/mm/migrate.c b/mm/migrate.c
index e03ed0b..e3cff03 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -417,7 +417,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- if (PageHuge(page))
+ if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
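
The mm/migrate.c change routes transparent huge pages through copy_huge_page()
just like hugetlbfs pages. Conceptually that only means copying every
constituent base page instead of a single one, along these lines (a sketch,
not the kernel's actual copy_huge_page()):

/* Sketch: a compound page is copied one base page at a time. */
static void sketch_copy_compound_page(struct page *dst, struct page *src)
{
	int i, nr = 1 << compound_order(src);	/* HPAGE_PMD_NR for a THP */

	for (i = 0; i < nr; i++) {
		cond_resched();
		copy_highpage(dst + i, src + i);
	}
}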