[PATCH 4/4] mm: prepare page_referenced() and page_idle to new THP refcounting

From: Kirill A. Shutemov
Date: Tue Nov 03 2015 - 10:27:47 EST


I've missed two similar codepaths which need some preparation to work well
with the reworked THP refcounting.

Both page_referenced() and page_idle_clear_pte_refs_one() assume that
THP can only be mapped with PMD, so there's no reason to look at PTEs
for PageTransHuge() pages. That's not true anymore: THP can be mapped
with PTEs too.

The patch removes the PageTransHuge() test from the functions and
open-codes the page table check.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
---
include/linux/huge_mm.h | 4 --
include/linux/mm.h | 19 ++++++++
mm/huge_memory.c | 54 ----------------------
mm/page_idle.c | 64 ++++++++++++++++++++++----
mm/rmap.c | 118 +++++++++++++++++++++++++++++++++---------------
5 files changed, 155 insertions(+), 104 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f7c3f13f3a9c..5c7b00e88236 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -51,10 +51,6 @@ enum transparent_hugepage_flag {
#endif
};

-extern pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- spinlock_t **ptl);
extern int pmd_freeable(pmd_t pmd);

#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b4cd988a794a..a36f9fa4e4cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -432,6 +432,25 @@ static inline int page_mapcount(struct page *page)
return ret;
}

+static inline int total_mapcount(struct page *page)
+{
+ int i, ret;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ if (likely(!PageCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
+
+ ret = compound_mapcount(page);
+ if (PageHuge(page))
+ return ret;
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ ret += atomic_read(&page[i]._mapcount) + 1;
+ if (PageDoubleMap(page))
+ ret -= HPAGE_PMD_NR;
+ return ret;
+}
+
static inline int page_count(struct page *page)
{
return atomic_read(&compound_head(page)->_count);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3700981f8035..14cbbad54a3e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1713,46 +1713,6 @@ bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
return false;
}

-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
-pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- spinlock_t **ptl)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
-
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
- if (pmd_page(*pmd) != page)
- goto unlock;
- if (pmd_trans_huge(*pmd))
- return pmd;
-unlock:
- spin_unlock(*ptl);
- return NULL;
-}
-
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)

int hugepage_madvise(struct vm_area_struct *vma,
@@ -3169,20 +3129,6 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
}
}

-static int total_mapcount(struct page *page)
-{
- int i, ret;
-
- ret = compound_mapcount(page);
- for (i = 0; i < HPAGE_PMD_NR; i++)
- ret += atomic_read(&page[i]._mapcount) + 1;
-
- if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
-
- return ret;
-}
-
static int __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1c245d9027e3..2c9ebe12b40d 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -56,23 +56,69 @@ static int page_idle_clear_pte_refs_one(struct page *page,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
bool referenced = false;

- if (unlikely(PageTransHuge(page))) {
- pmd = page_check_address_pmd(page, mm, addr, &ptl);
- if (pmd) {
- referenced = pmdp_clear_young_notify(vma, addr, pmd);
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ return SWAP_AGAIN;
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ return SWAP_AGAIN;
+ pmd = pmd_offset(pud, addr);
+
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
+ goto map_pte;
}
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
+ spin_unlock(ptl);
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return SWAP_AGAIN;
} else {
- pte = page_check_address(page, mm, addr, &ptl, 0);
- if (pte) {
- referenced = ptep_clear_young_notify(vma, addr, pte);
- pte_unmap_unlock(pte, ptl);
- }
+ pmd_t pmde = *pmd;
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ return SWAP_AGAIN;
+
+ }
+map_pte:
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return SWAP_AGAIN;
}
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }
+
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap_unlock(pte, ptl);
+found:
if (referenced) {
clear_page_idle(page);
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index ad9af8b3a381..0837487d3737 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -812,60 +812,104 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
spinlock_t *ptl;
int referenced = 0;
struct page_referenced_arg *pra = arg;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;

- if (unlikely(PageTransHuge(page))) {
- pmd_t *pmd;
-
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address_pmd().
- */
- pmd = page_check_address_pmd(page, mm, address, &ptl);
- if (!pmd)
+ if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
return SWAP_AGAIN;

- if (vma->vm_flags & VM_LOCKED) {
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ goto check_pte;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return SWAP_AGAIN;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return SWAP_AGAIN;
+ pmd = pmd_offset(pud, address);
+
+ if (pmd_trans_huge(*pmd)) {
+ int ret = SWAP_AGAIN;
+
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
+ goto map_pte;
+ }
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ if (vma->vm_flags & VM_LOCKED) {
pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
+ ret = SWAP_FAIL; /* To break the loop */
+ goto unlock_pmd;
}

if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
-
spin_unlock(ptl);
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return ret;
} else {
- pte_t *pte;
-
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address().
- */
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
+ pmd_t pmde = *pmd;
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
return SWAP_AGAIN;
+ }
+map_pte:
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return SWAP_AGAIN;
+ }

- if (vma->vm_flags & VM_LOCKED) {
- pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ ptl = pte_lockptr(mm, pmd);
+check_pte:
+ spin_lock(ptl);

- if (ptep_clear_flush_young_notify(vma, address, pte)) {
- /*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+ pte_unmap_unlock(pte, ptl);
+ return SWAP_AGAIN;
+ }

+ if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}

+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+ referenced++;
+ }
+ pte_unmap_unlock(pte, ptl);
+
+found:
if (referenced)
clear_page_idle(page);
if (test_and_clear_page_young(page))
@@ -912,7 +956,7 @@ int page_referenced(struct page *page,
int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
- .mapcount = page_mapcount(page),
+ .mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
--
2.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/