Re: [PATCH v5 2/6] ksm: support unsharing zero pages placed by KSM

From: David Hildenbrand
Date: Wed Jan 18 2023 - 09:28:09 EST


On 30.12.22 02:13, yang.yang29@xxxxxxxxxx wrote:
From: xu xin <xu.xin16@xxxxxxxxxx>

use_zero_pages may be very useful, not just because of cache colouring
as described in doc, but also because use_zero_pages can accelerate
merging empty pages when there are plenty of empty pages (full of zeros)
as the time of page-by-page comparisons (unstable_tree_search_insert) is
saved.

But when enabling use_zero_pages, madvise(addr, len, MADV_UNMERGEABLE) and
other ways (like write 2 to /sys/kernel/mm/ksm/run) to trigger unsharing
will *not* actually unshare the shared zeropage as placed by KSM (which is
against the MADV_UNMERGEABLE documentation). As these KSM-placed zero pages
are out of the control of KSM, the related counts of ksm pages don't expose
how many zero pages are placed by KSM (these special zero pages are different
from those initially mapped zero pages, because the zero pages mapped to
MADV_UNMERGEABLE areas are expected to be a complete and unshared page)

To not blindly unshare all shared zero_pages in applicable VMAs, the patch
introduces a dedicated flag ZERO_PAGE_FLAG to mark the rmap_items of those
shared zero_pages. and guarantee that these rmap_items will be not freed
during the time of zero_pages not being writing, so we can only unshare
the *KSM-placed* zero_pages.

The patch will not degrade the performance of use_zero_pages as it doesn't
change the way of merging empty pages in use_zero_pages's feature.

Signed-off-by: xu xin <xu.xin16@xxxxxxxxxx>
Reported-by: David Hildenbrand <david@xxxxxxxxxx>
Cc: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
Cc: Xuexin Jiang <jiang.xuexin@xxxxxxxxxx>
Reviewed-by: Xiaokai Ran <ran.xiaokai@xxxxxxxxxx>
Reviewed-by: Yang Yang <yang.yang29@xxxxxxxxxx>

[...]


/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -420,6 +421,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}

+enum break_ksm_pmd_entry_return_flag {
+ HAVE_KSM_PAGE = 1,
+ HAVE_ZERO_PAGE
+};

Why use flags if they both conditions are mutually exclusive?

+
static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
@@ -427,6 +433,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
spinlock_t *ptl;
pte_t *pte;
int ret;
+ bool is_zero_page = false;

if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return 0;
@@ -434,6 +441,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (pte_present(*pte)) {
page = vm_normal_page(walk->vma, addr, *pte);
+ if (!page)
+ is_zero_page = is_zero_pfn(pte_pfn(*pte));
} else if (!pte_none(*pte)) {
swp_entry_t entry = pte_to_swp_entry(*pte);

@@ -444,7 +453,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
if (is_migration_entry(entry))
page = pfn_swap_entry_to_page(entry);
}
- ret = page && PageKsm(page);
+
+ if (page && PageKsm(page))
+ ret = HAVE_KSM_PAGE;
+ else if (is_zero_page)
+ ret = HAVE_ZERO_PAGE;
+ else
+ ret = 0;
+
pte_unmap_unlock(pte, ptl);
return ret;
}
@@ -466,19 +482,22 @@ static const struct mm_walk_ops break_ksm_ops = {
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
+ bool unshare_zero_page)
{
vm_fault_t ret = 0;

do {
- int ksm_page;
+ int walk_result;

cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1,
+ walk_result = walk_page_range_vma(vma, addr, addr + 1,
&break_ksm_ops, NULL);
- if (WARN_ON_ONCE(ksm_page < 0))
- return ksm_page;
- if (!ksm_page)
+ if (WARN_ON_ONCE(walk_result < 0))
+ return walk_result;
+ if (!walk_result)
+ return 0;
+ if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page)
return 0;
ret = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
@@ -539,7 +558,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr);
+ break_ksm(vma, addr, false);
mmap_read_unlock(mm);
}

@@ -764,6 +783,30 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
return NULL;
}

+/*
+ * Cleaning the rmap_item's ZERO_PAGE_FLAG
+ * This function will be called when unshare or writing on zero pages.
+ */
+static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item)
+{
+ if (rmap_item->address & ZERO_PAGE_FLAG)
+ rmap_item->address &= PAGE_MASK;
+}
+
+/* Only called when rmap_item is going to be freed */
+static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item)
+{
+ struct vm_area_struct *vma;
+
+ if (rmap_item->address & ZERO_PAGE_FLAG) {
+ vma = vma_lookup(rmap_item->mm, rmap_item->address);
+ if (vma && !ksm_test_exit(rmap_item->mm))
+ break_ksm(vma, rmap_item->address, true);
+ }
+ /* Put at last. */
+ clean_rmap_item_zero_flag(rmap_item);
+}
+
/*
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
@@ -824,6 +867,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
struct ksm_rmap_item *rmap_item = *rmap_list;
*rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
+ unshare_zero_pages(rmap_item);
free_rmap_item(rmap_item);
}
}
@@ -853,7 +897,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
if (signal_pending(current))
err = -ERESTARTSYS;
else
- err = break_ksm(vma, addr);
+ err = break_ksm(vma, addr, false);
}

MADV_UNMERGEABLE -> unmerge_ksm_pages() will never unshare the shared zeropage? I thought the patch description mentions that that is one of the goals?

--
Thanks,

David / dhildenb