[RFC 2/4] Add non-swap page flag to mark pages that will not be swapped

From: Hui Zhu
Date: Mon Aug 22 2016 - 04:43:54 EST


After a swap driver marks a page with the non-swap flag, the page is added
to the unevictable LRU list and accounted in a new "NonSwap" line of
/proc/meminfo.  While mapped, its PTE is kept write-protected, so the page
stays in this state until its data is changed: the write fault clears the
flag and moves the page back to an evictable LRU list.
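
For illustration, a minimal sketch (not part of this patch) of how a swap
driver could mark a page with the new flag.  The example_swap_store() hook
is hypothetical; only the PageNonSwap()/SetPageNonSwap() accessors come
from this patch:

	/*
	 * Hypothetical driver hook, for illustration only.  The only
	 * interfaces assumed from this patch are the PG_non_swap accessors
	 * generated by PAGEFLAG(NonSwap, non_swap, PF_NO_TAIL).
	 */
	static int example_swap_store(struct page *page)
	{
		/*
		 * Once the page goes back onto an LRU list,
		 * add_page_to_lru_list() routes it to LRU_UNEVICTABLE and
		 * accounts it in NR_NON_SWAP.
		 */
		if (!PageNonSwap(page))
			SetPageNonSwap(page);

		/* ... write the page to the backing store here ... */

		return 0;
	}

The driver does not need to undo the marking itself: on the first write to
such a page, do_wp_page()/do_swap_page() clear the flag and move the page
back to an evictable LRU list.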

Signed-off-by: Hui Zhu <zhuhui@xxxxxxxxxx>
---
fs/proc/meminfo.c | 6 ++++++
include/linux/mm_inline.h | 20 ++++++++++++++++++--
include/linux/mmzone.h | 3 +++
include/linux/page-flags.h | 8 ++++++++
include/trace/events/mmflags.h | 9 ++++++++-
kernel/events/uprobes.c | 16 +++++++++++++++-
mm/Kconfig | 5 +++++
mm/memory.c | 34 ++++++++++++++++++++++++++++++++++
mm/migrate.c | 4 ++++
mm/mprotect.c | 8 ++++++++
mm/vmscan.c | 41 ++++++++++++++++++++++++++++++++++++++++-
11 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b9a8c81..5c79b2e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -79,6 +79,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
"SwapTotal: %8lu kB\n"
"SwapFree: %8lu kB\n"
+#ifdef CONFIG_NON_SWAP
+ "NonSwap: %8lu kB\n"
+#endif
"Dirty: %8lu kB\n"
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
@@ -138,6 +141,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
K(i.totalswap),
K(i.freeswap),
+#ifdef CONFIG_NON_SWAP
+ K(global_page_state(NR_NON_SWAP)),
+#endif
K(global_node_page_state(NR_FILE_DIRTY)),
K(global_node_page_state(NR_WRITEBACK)),
K(global_node_page_state(NR_ANON_MAPPED)),
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 71613e8..92298ce 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -46,15 +46,31 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
- update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
+ int nr_pages = hpage_nr_pages(page);
+ enum zone_type zid = page_zonenum(page);
+#ifdef CONFIG_NON_SWAP
+ if (PageNonSwap(page)) {
+ lru = LRU_UNEVICTABLE;
+ update_lru_size(lruvec, NR_NON_SWAP, zid, nr_pages);
+ }
+#endif
+ update_lru_size(lruvec, lru, zid, nr_pages);
list_add(&page->lru, &lruvec->lists[lru]);
}

static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
+ int nr_pages = hpage_nr_pages(page);
+ enum zone_type zid = page_zonenum(page);
+#ifdef CONFIG_NON_SWAP
+ if (PageNonSwap(page)) {
+ lru = LRU_UNEVICTABLE;
+ update_lru_size(lruvec, NR_NON_SWAP, zid, -nr_pages);
+ }
+#endif
list_del(&page->lru);
- update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
+ update_lru_size(lruvec, lru, zid, -nr_pages);
}

/**
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d572b78..da08d20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,9 @@ enum zone_stat_item {
NUMA_OTHER, /* allocation from other node */
#endif
NR_FREE_CMA_PAGES,
+#ifdef CONFIG_NON_SWAP
+ NR_NON_SWAP,
+#endif
NR_VM_ZONE_STAT_ITEMS };

enum node_stat_item {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda..0cd80db9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -105,6 +105,9 @@ enum pageflags {
PG_young,
PG_idle,
#endif
+#ifdef CONFIG_NON_SWAP
+ PG_non_swap,
+#endif
__NR_PAGEFLAGS,

/* Filesystems */
@@ -303,6 +306,11 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)

+#ifdef CONFIG_NON_SWAP
+PAGEFLAG(NonSwap, non_swap, PF_NO_TAIL)
+ TESTSCFLAG(NonSwap, non_swap, PF_NO_TAIL)
+#endif
+
#ifdef CONFIG_HIGHMEM
/*
* Must use a macro here due to header dependency issues. page_zone() is not
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab4..1c0ccc9 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -79,6 +79,12 @@
#define IF_HAVE_PG_IDLE(flag,string)
#endif

+#ifdef CONFIG_NON_SWAP
+#define IF_HAVE_PG_NON_SWAP(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_NON_SWAP(flag,string)
+#endif
+
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
{1UL << PG_error, "error" }, \
@@ -104,7 +110,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
-IF_HAVE_PG_IDLE(PG_idle, "idle" )
+IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
+IF_HAVE_PG_NON_SWAP(PG_non_swap, "non_swap" )

#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525a..a7e4153 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -160,6 +160,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
+ pte_t pte;
+#ifdef CONFIG_NON_SWAP
+ bool non_swap;
+#endif

err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg,
false);
@@ -176,6 +180,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
goto unlock;

get_page(kpage);
+#ifdef CONFIG_NON_SWAP
+ non_swap = TestClearPageNonSwap(page);
+ if (non_swap)
+ SetPageNonSwap(kpage);
+#endif
page_add_new_anon_rmap(kpage, vma, addr, false);
mem_cgroup_commit_charge(kpage, memcg, false, false);
lru_cache_add_active_or_unevictable(kpage, vma);
@@ -187,7 +196,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,

flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+ pte = mk_pte(kpage, vma->vm_page_prot);
+#ifdef CONFIG_NON_SWAP
+ if (non_swap)
+ pte = pte_wrprotect(pte);
+#endif
+ set_pte_at_notify(mm, addr, ptep, pte);

page_remove_rmap(page, false);
if (!page_mapped(page))
diff --git a/mm/Kconfig b/mm/Kconfig
index 57ecdb3..d8d4b41 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -708,3 +708,8 @@ config ARCH_HAS_PKEYS
config LATE_UNMAP
bool
depends on SWAP
+
+config NON_SWAP
+ bool
+ depends on SWAP
+ select LATE_UNMAP
diff --git a/mm/memory.c b/mm/memory.c
index 83be99d..2448004 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,7 @@
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
+#include <linux/mm_inline.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -2338,6 +2339,26 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
}

+#ifdef CONFIG_NON_SWAP
+static void
+clear_page_non_swap(struct page *page)
+{
+ struct zone *zone;
+ struct lruvec *lruvec;
+
+ if (!PageLRU(page) || !page_evictable(page))
+ return;
+
+ zone = page_zone(page);
+ spin_lock_irq(zone_lru_lock(zone));
+ __dec_zone_page_state(page, NR_NON_SWAP);
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+ add_page_to_lru_list(page, lruvec, page_lru(page));
+ spin_unlock_irq(zone_lru_lock(zone));
+}
+#endif
+
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
@@ -2400,6 +2421,10 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
put_page(old_page);
}
if (reuse_swap_page(old_page, &total_mapcount)) {
+#ifdef CONFIG_NON_SWAP
+ if (unlikely(TestClearPageNonSwap(old_page)))
+ clear_page_non_swap(old_page);
+#endif
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2581,6 +2606,11 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
goto out_release;
}

+#ifdef CONFIG_NON_SWAP
+ if ((fe->flags & FAULT_FLAG_WRITE) && unlikely(TestClearPageNonSwap(page)))
+ clear_page_non_swap(page);
+#endif
+
/*
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
* release the swapcache from under us. The page pin, and pte_same
@@ -2638,6 +2668,10 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
pte = pte_mksoft_dirty(pte);
+#ifdef CONFIG_NON_SWAP
+ if (!(fe->flags & FAULT_FLAG_WRITE) && PageNonSwap(page))
+ pte = pte_wrprotect(pte);
+#endif
set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, fe->address, exclusive);
diff --git a/mm/migrate.c b/mm/migrate.c
index f7ee04a..46ac926 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -640,6 +640,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
SetPageChecked(newpage);
if (PageMappedToDisk(page))
SetPageMappedToDisk(newpage);
+#ifdef CONFIG_NON_SWAP
+ if (TestClearPageNonSwap(page))
+ SetPageNonSwap(newpage);
+#endif

/* Move dirty on pages not done by migrate_page_move_mapping() */
if (PageDirty(page))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a4830f0..6539c6e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -79,6 +79,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (pte_present(oldpte)) {
pte_t ptent;
bool preserve_write = prot_numa && pte_write(oldpte);
+#ifdef CONFIG_NON_SWAP
+ struct page *page;
+#endif

/*
* Avoid trapping faults against the zero or KSM
@@ -107,6 +110,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
!(vma->vm_flags & VM_SOFTDIRTY))) {
ptent = pte_mkwrite(ptent);
}
+#ifdef CONFIG_NON_SWAP
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page && PageNonSwap(page))
+ ptent = pte_wrprotect(ptent);
+#endif
ptep_modify_prot_commit(mm, addr, pte, ptent);
pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32fef7d..14d49cd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -758,14 +758,38 @@ redo:
ClearPageUnevictable(page);

if (page_evictable(page)) {
+#ifdef CONFIG_NON_SWAP
+ bool added = false;
+
+ if (unlikely(PageNonSwap(page))) {
+ struct zone *zone = page_zone(page);
+
+ BUG_ON(irqs_disabled());
+
+ spin_lock_irq(zone_lru_lock(zone));
+ if (likely(PageNonSwap(page))) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page,
+ zone->zone_pgdat);
+ SetPageLRU(page);
+ add_page_to_lru_list(page, lruvec,
+ LRU_UNEVICTABLE);
+ added = true;
+ }
+ spin_unlock_irq(zone_lru_lock(zone));
+ }
+
/*
* For evictable pages, we can use the cache.
* In event of a race, worst case is we end up with an
* unevictable page on [in]active list.
* We know how to handle that.
*/
+ if (!added)
+#endif
+ lru_cache_add(page);
is_unevictable = false;
- lru_cache_add(page);
} else {
/*
* Put unevictable pages directly on zone's unevictable
@@ -1199,6 +1223,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page))
goto keep_locked;

+#ifdef CONFIG_NON_SWAP
+ if (PageNonSwap(page)) {
+ try_to_free_swap(page);
+ unlock_page(page);
+ goto non_swap_keep;
+ }
+#endif
+
if (page_mapped(page) && mapping)
TRY_TO_UNMAP(page, ttu_flags);
}
@@ -1281,6 +1313,9 @@ cull_mlocked:
if (PageSwapCache(page))
try_to_free_swap(page);
unlock_page(page);
+#ifdef CONFIG_NON_SWAP
+ ClearPageNonSwap(page);
+#endif
list_add(&page->lru, &ret_pages);
continue;

@@ -1294,6 +1329,10 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
+#ifdef CONFIG_NON_SWAP
+ ClearPageNonSwap(page);
+non_swap_keep:
+#endif
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
--
1.9.1