Re: [PATCH 6/11] ksm: remove old stable nodes more thoroughly

From: Simon Jeons
Date: Sun Jan 27 2013 - 21:12:13 EST


On Fri, 2013-01-25 at 18:01 -0800, Hugh Dickins wrote:
> Switching merge_across_nodes after running KSM is liable to oops on stale
> nodes still left over from the previous stable tree. It's not something

Since this patch solve the problem, so the description of
merge_across_nodes(Value can be changed only when there is no ksm shared
pages in system) should be changed in this patch.

> that people will often want to do, but it would be lame to demand a reboot
> when they're trying to determine which merge_across_nodes setting is best.
>
> How can this happen? We only permit switching merge_across_nodes when
> pages_shared is 0, and usually set run 2 to force that beforehand, which
> ought to unmerge everything: yet oopses still occur when you then run 1.
>
> Three causes:
>
> 1. The old stable tree (built according to the inverse merge_across_nodes)
> has not been fully torn down. A stable node lingers until get_ksm_page()
> notices that the page it references no longer references it: but the page
> is not necessarily freed as soon as expected, particularly when swapcache.
>
> Fix this with a pass through the old stable tree, applying get_ksm_page()
> to each of the remaining nodes (most found stale and removed immediately),
> with forced removal of any left over. Unless the page is still mapped:
> I've not seen that case, it shouldn't occur, but better to WARN_ON_ONCE
> and EBUSY than BUG.
>
> 2. __ksm_enter() has a nice little optimization, to insert the new mm
> just behind ksmd's cursor, so there's a full pass for it to stabilize
> (or be removed) before ksmd addresses it. Nice when ksmd is running,
> but not so nice when we're trying to unmerge all mms: we were missing
> those mms forked and inserted behind the unmerge cursor. Easily fixed
> by inserting at the end when KSM_RUN_UNMERGE.
>
> 3. It is possible for a KSM page to be faulted back from swapcache into
> an mm, just after unmerge_and_remove_all_rmap_items() scanned past it.
> Fix this by copying on fault when KSM_RUN_UNMERGE: but that is private
> to ksm.c, so dissolve the distinction between ksm_might_need_to_copy()
> and ksm_does_need_to_copy(), doing it all in the one call into ksm.c.
>
> A long outstanding, unrelated bugfix sneaks in with that third fix:
> ksm_does_need_to_copy() would copy from a !PageUptodate page (implying
> I/O error when read in from swap) to a page which it then marks Uptodate.
> Fix this case by not copying, letting do_swap_page() discover the error.
>
> Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
> ---
> include/linux/ksm.h | 18 ++-------
> mm/ksm.c | 83 +++++++++++++++++++++++++++++++++++++++---
> mm/memory.c | 19 ++++-----
> 3 files changed, 92 insertions(+), 28 deletions(-)
>
> --- mmotm.orig/include/linux/ksm.h 2013-01-25 14:27:58.220193250 -0800
> +++ mmotm/include/linux/ksm.h 2013-01-25 14:37:00.764206145 -0800
> @@ -16,9 +16,6 @@
> struct stable_node;
> struct mem_cgroup;
>
> -struct page *ksm_does_need_to_copy(struct page *page,
> - struct vm_area_struct *vma, unsigned long address);
> -
> #ifdef CONFIG_KSM
> int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
> unsigned long end, int advice, unsigned long *vm_flags);
> @@ -73,15 +70,8 @@ static inline void set_page_stable_node(
> * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
> * but what if the vma was unmerged while the page was swapped out?
> */
> -static inline int ksm_might_need_to_copy(struct page *page,
> - struct vm_area_struct *vma, unsigned long address)
> -{
> - struct anon_vma *anon_vma = page_anon_vma(page);
> -
> - return anon_vma &&
> - (anon_vma->root != vma->anon_vma->root ||
> - page->index != linear_page_index(vma, address));
> -}
> +struct page *ksm_might_need_to_copy(struct page *page,
> + struct vm_area_struct *vma, unsigned long address);
>
> int page_referenced_ksm(struct page *page,
> struct mem_cgroup *memcg, unsigned long *vm_flags);
> @@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_
> return 0;
> }
>
> -static inline int ksm_might_need_to_copy(struct page *page,
> +static inline struct page *ksm_might_need_to_copy(struct page *page,
> struct vm_area_struct *vma, unsigned long address)
> {
> - return 0;
> + return page;
> }
>
> static inline int page_referenced_ksm(struct page *page,
> --- mmotm.orig/mm/ksm.c 2013-01-25 14:36:58.856206099 -0800
> +++ mmotm/mm/ksm.c 2013-01-25 14:37:00.768206145 -0800
> @@ -644,6 +644,57 @@ static int unmerge_ksm_pages(struct vm_a
> /*
> * Only called through the sysfs control interface:
> */
> +static int remove_stable_node(struct stable_node *stable_node)
> +{
> + struct page *page;
> + int err;
> +
> + page = get_ksm_page(stable_node, true);
> + if (!page) {
> + /*
> + * get_ksm_page did remove_node_from_stable_tree itself.
> + */
> + return 0;
> + }
> +
> + if (WARN_ON_ONCE(page_mapped(page)))
> + err = -EBUSY;
> + else {
> + /*
> + * This page might be in a pagevec waiting to be freed,
> + * or it might be PageSwapCache (perhaps under writeback),
> + * or it might have been removed from swapcache a moment ago.
> + */
> + set_page_stable_node(page, NULL);
> + remove_node_from_stable_tree(stable_node);
> + err = 0;
> + }
> +
> + unlock_page(page);
> + put_page(page);
> + return err;
> +}
> +
> +static int remove_all_stable_nodes(void)
> +{
> + struct stable_node *stable_node;
> + int nid;
> + int err = 0;
> +
> + for (nid = 0; nid < nr_node_ids; nid++) {
> + while (root_stable_tree[nid].rb_node) {
> + stable_node = rb_entry(root_stable_tree[nid].rb_node,
> + struct stable_node, node);
> + if (remove_stable_node(stable_node)) {
> + err = -EBUSY;
> + break; /* proceed to next nid */
> + }
> + cond_resched();
> + }
> + }
> + return err;
> +}
> +
> static int unmerge_and_remove_all_rmap_items(void)
> {
> struct mm_slot *mm_slot;
> @@ -691,6 +742,8 @@ static int unmerge_and_remove_all_rmap_i
> }
> }
>
> + /* Clean up stable nodes, but don't worry if some are still busy */
> + remove_all_stable_nodes();
> ksm_scan.seqnr = 0;
> return 0;
>
> @@ -1586,11 +1639,19 @@ int __ksm_enter(struct mm_struct *mm)
> spin_lock(&ksm_mmlist_lock);
> insert_to_mm_slots_hash(mm, mm_slot);
> /*
> - * Insert just behind the scanning cursor, to let the area settle
> + * When KSM_RUN_MERGE (or KSM_RUN_STOP),
> + * insert just behind the scanning cursor, to let the area settle
> * down a little; when fork is followed by immediate exec, we don't
> * want ksmd to waste time setting up and tearing down an rmap_list.
> + *
> + * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
> + * scanning cursor, otherwise KSM pages in newly forked mms will be
> + * missed: then we might as well insert at the end of the list.
> */
> - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
> + if (ksm_run & KSM_RUN_UNMERGE)
> + list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
> + else
> + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
> spin_unlock(&ksm_mmlist_lock);
>
> set_bit(MMF_VM_MERGEABLE, &mm->flags);
> @@ -1640,11 +1701,25 @@ void __ksm_exit(struct mm_struct *mm)
> }
> }
>
> -struct page *ksm_does_need_to_copy(struct page *page,
> +struct page *ksm_might_need_to_copy(struct page *page,
> struct vm_area_struct *vma, unsigned long address)
> {
> + struct anon_vma *anon_vma = page_anon_vma(page);
> struct page *new_page;
>
> + if (PageKsm(page)) {
> + if (page_stable_node(page) &&
> + !(ksm_run & KSM_RUN_UNMERGE))
> + return page; /* no need to copy it */
> + } else if (!anon_vma) {
> + return page; /* no need to copy it */
> + } else if (anon_vma->root == vma->anon_vma->root &&
> + page->index == linear_page_index(vma, address)) {
> + return page; /* still no need to copy it */
> + }
> + if (!PageUptodate(page))
> + return page; /* let do_swap_page report the error */
> +
> new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
> if (new_page) {
> copy_user_highpage(new_page, page, address, vma);
> @@ -2024,7 +2099,7 @@ static ssize_t merge_across_nodes_store(
>
> mutex_lock(&ksm_thread_mutex);
> if (ksm_merge_across_nodes != knob) {
> - if (ksm_pages_shared)
> + if (ksm_pages_shared || remove_all_stable_nodes())
> err = -EBUSY;
> else
> ksm_merge_across_nodes = knob;
> --- mmotm.orig/mm/memory.c 2013-01-25 14:27:58.220193250 -0800
> +++ mmotm/mm/memory.c 2013-01-25 14:37:00.768206145 -0800
> @@ -2994,17 +2994,16 @@ static int do_swap_page(struct mm_struct
> if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
> goto out_page;
>
> - if (ksm_might_need_to_copy(page, vma, address)) {
> - swapcache = page;
> - page = ksm_does_need_to_copy(page, vma, address);
> -
> - if (unlikely(!page)) {
> - ret = VM_FAULT_OOM;
> - page = swapcache;
> - swapcache = NULL;
> - goto out_page;
> - }
> + swapcache = page;
> + page = ksm_might_need_to_copy(page, vma, address);
> + if (unlikely(!page)) {
> + ret = VM_FAULT_OOM;
> + page = swapcache;
> + swapcache = NULL;
> + goto out_page;
> }
> + if (page == swapcache)
> + swapcache = NULL;
>
> if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
> ret = VM_FAULT_OOM;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/