Re: patch cow-swapin [was Re: Very bad swap bug -- 2.0, 2.1 at least]

Andrea Arcangeli (andrea@e-mind.com)
Tue, 29 Sep 1998 16:18:36 +0200 (CEST)


On Sun, 27 Sep 1998, Andrea Arcangeli wrote:

>Unfortunately the swapin-parent for 2.0 has a bug (I am not sure if it

Reading the 2.0 swap code I think I have spotted some bugs (another bugfix
was also suggested by Stephen (mmap_sem)). Could you reverse my last
swapin-parent stuff against 2.0 and apply this new one? Note that I am not
sure it will be stable, so apply it at your own risk! It is surely more
stable than the last patch though ;-). I never got lockups or fs
corruption or worse even with the last buggy one, but don't apply the
patch if it's a production machine. The only problems I had were some
processes segfaulting because they couldn't be swapped in, so I
think that it's not very dangerous, but who knows ;-)... Hopefully
everything should work fine now.

If you apply the patch and it works fine for 5-6 days with many
processes like cron, autofs and friends swapped out, let me know (let me
know also if there are problems, of course ;-).

Patch against 2.0.36-pre9 (I have not downloaded a newer 36-pre yet, but
it should apply everywhere).

BTW, I repeat again to be sure that people will understand right: THIS
CODE IS _NOT_ STABLE. This code is EXPERIMENTAL right now.

Note also that this patch doesn't work well with swapins caused by read-only
access, because I no longer touch the read case, to avoid
increasing the page count if the page is in the swap cache (I will
eventually improve the patch in the way it has to be done for 2.0 if it
turns out to be stable enough as it is now).

diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/CREDITS linux/CREDITS
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/CREDITS Tue Sep 29 15:15:51 1998
+++ linux/CREDITS Tue Sep 29 14:55:48 1998
@@ -42,6 +42,7 @@
D: Fixed a 2.0.33 mm bug that corrupts memory in linux/mm/vmalloc.c
D: Author of lil (Linux Interrupt Latency benchmark)
D: Fixed the shm swap deallocation at swapoff time
+D: Developed linux/mm/swapin_parent.c (avoids continuous swapin of cow pages)
S: Via Ciaclini 26
S: Imola 40026
S: Italy
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/include/linux/swap.h linux/include/linux/swap.h
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/include/linux/swap.h Thu Jun 18 23:48:21 1998
+++ linux/include/linux/swap.h Tue Sep 29 14:55:48 1998
@@ -65,6 +65,9 @@
extern void swap_in(struct task_struct *, struct vm_area_struct *,
pte_t *, unsigned long, int);

+/* linux/mm/swapin_parent.c */
+extern void swapin_parent(struct task_struct *, unsigned long,
+ pte_t *, unsigned long, unsigned int);

/* linux/mm/swap_state.c */
extern void show_swap_cache_info(void);
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/Makefile linux/mm/Makefile
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/Makefile Fri Mar 22 11:56:56 1996
+++ linux/mm/Makefile Tue Sep 29 14:55:48 1998
@@ -9,7 +9,7 @@

O_TARGET := mm.o
O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
- kmalloc.o vmalloc.o \
+ kmalloc.o vmalloc.o swapin_parent.o \
swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o

include $(TOPDIR)/Rules.make
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/memory.c linux/mm/memory.c
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/memory.c Wed Sep 11 16:57:19 1996
+++ linux/mm/memory.c Tue Sep 29 14:55:48 1998
@@ -843,6 +843,8 @@

if (!vma->vm_ops || !vma->vm_ops->swapin) {
swap_in(tsk, vma, page_table, pte_val(entry), write_access);
+ swapin_parent(tsk, address, page_table, pte_val(entry),
+ write_access);
flush_page_to_ram(pte_page(*page_table));
return;
}
diff -urN /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/swapin_parent.c linux/mm/swapin_parent.c
--- /home/andrea/devel/kernel-tree/linux-2.0.36-pre9/mm/swapin_parent.c Thu Jan 1 01:00:00 1970
+++ linux/mm/swapin_parent.c Tue Sep 29 15:51:22 1998
@@ -0,0 +1,182 @@
+/*
+ * swapin_parent: join mem between swapped-in children and a swapped-out parent
+ * Copyright (C) 1998 Andrea Arcangeli
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * You can reach Andrea Arcangeli at <andrea@e-mind.com>.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/swap.h>
+
+#include <asm/pgtable.h>
+
+static __inline__ unsigned long duplicate(unsigned long old_page)
+{
+ /*
+ * Copy old_page into a new page; yields 0 if none could be allocated.
+ * Avoiding a swapin must not swap out something more recently used.
+ */
+ unsigned long new_page = __get_free_page(GFP_ATOMIC);
+ if (new_page)
+ memcpy((void *) new_page, (void *) old_page, PAGE_SIZE);
+ return new_page;
+}
+
+#define pte_mkcow(new_page, vma) \
+ pte_mkwrite(pte_mkdirty(mk_pte((new_page), (vma)->vm_page_prot))) /* pte for new_page with vma's protection, marked dirty and writable */
+
+static void do_swapin_parent(pte_t *new_pte, pte_t *pte,
+ unsigned long entry, struct vm_area_struct *vma,
+ struct task_struct *parent, unsigned long address)
+{ /* Install a copy of the page mapped by *new_pte into the parent's *pte, then free swap entry. */
+ struct page *page;
+ unsigned long map_nr;
+ pte_t __new_pte;
+ unsigned long new_page;
+
+ __new_pte= *new_pte; /* snapshot of the child's already swapped-in pte */
+ map_nr = MAP_NR(pte_page(__new_pte));
+ page = &mem_map[map_nr]; /* NOTE(review): assigned but never read in this function */
+
+ if (PageReserved(mem_map+map_nr))
+ {
+ printk(KERN_ERR "do_swapin_parent: "
+ "swapped in page was reserved!\n");
+ return;
+ }
+
+ if (!pte_write(__new_pte))
+ {
+ printk(KERN_WARNING "do_swapin_parent: swapin after "
+ "writefault marked the page not writable\n");
+ return;
+ }
+
+ new_page = duplicate(pte_page(__new_pte));
+ if (!new_page) /* no page available: leave the parent's pte untouched */
+ return;
+
+ flush_page_to_ram(pte_page(__new_pte));
+ flush_cache_page(vma, address);
+ set_pte(pte, pte_mkcow(new_page, vma)); /* parent now maps its own copy */
+ flush_tlb_page(vma, address);
+
+ ++vma->vm_mm->rss;
+ ++parent->maj_flt;
+ swap_free(entry); /* the overwritten pte held this swap entry (checked by the caller) */
+}
+
+static void try_to_swapin_parent(struct task_struct *parent,
+ unsigned long address,
+ pte_t *new_pte, unsigned long entry)
+{ /* Walk the parent's page tables for address; act only if its pte still equals entry. */
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ struct vm_area_struct *vma;
+ struct semaphore * mmap_sem;
+
+ vma = find_vma(parent->mm, address); /* NOTE(review): looked up before mmap_sem is taken */
+ if (!vma)
+ {
+ printk(KERN_WARNING "try_to_swapin_parent: NULL vma!\n");
+ return;
+ }
+
+ mmap_sem = &vma->vm_mm->mmap_sem;
+
+ down(mmap_sem); /* released at "out" on every path below */
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (pgd_none(*pgd))
+ goto out;
+ if (pgd_bad(*pgd)) {
+ printk(KERN_ERR "try_to_swapin_parent: bad pgd (%08lx)\n",
+ pgd_val(*pgd));
+ pgd_clear(pgd);
+ goto out;
+ }
+
+ pmd = pmd_offset(pgd, address);
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_bad(*pmd))
+ {
+ printk(KERN_ERR "try_to_swapin_parent: bad pmd (%08lx)\n",
+ pmd_val(*pmd));
+ pmd_clear(pmd);
+ goto out;
+ }
+
+ pte = pte_offset(pmd, address);
+
+ if (pte_val(*pte) != entry) /* parent's pte is not this swap entry: nothing to do */
+ goto out;
+
+ do_swapin_parent(new_pte, pte, entry, vma, parent, address);
+
+ out:
+ up(mmap_sem);
+}
+
+void swapin_parent(struct task_struct *child, unsigned long address,
+ pte_t *new_pte, unsigned long entry, unsigned int write)
+{ /* Entry point: validate the child's pte, then try to update child->p_pptr's pte for address. */
+ struct task_struct *parent;
+
+ if (!write || child->did_exec) /* skip non-write faults and children with did_exec set */
+ return;
+
+ /*
+ * A bit of paranoia: sanity-check the child's pte before using it.
+ */
+ if (pte_val(*new_pte) == entry)
+ {
+ printk(KERN_WARNING "swapin_parent: child not yet swapped "
+ "in\n");
+ return;
+ }
+ if (pte_val(*new_pte) == pte_val(BAD_PAGE))
+ {
+ printk(KERN_WARNING "swapin_parent: swapped in page is BAD\n");
+ return;
+ }
+ if (pte_none(*new_pte))
+ {
+ printk(KERN_ERR "swapin_parent: child page table NULL!\n");
+ return;
+ }
+ if (!pte_present(*new_pte))
+ {
+ printk(KERN_ERR "swapin_parent: child wrong swap entry!\n");
+ return;
+ }
+
+ parent = child->p_pptr;
+ if (!parent)
+ {
+ printk(KERN_ERR "swapin_parent: parent NULL!\n");
+ return;
+ }
+#ifdef __SMP__
+ if (parent->processor != NO_PROC_ID) /* presumably: parent is currently on a CPU -- TODO confirm */
+ return;
+#endif
+ try_to_swapin_parent(parent, address, new_pte, entry);
+}

Andrea[s] Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/