Re: [patch] arca-vm-6, killed kswapd [Re: [patch] new-vm improvement , [Re: 2.2.0 Bug summary]]

Andrea Arcangeli (andrea@e-mind.com)
Tue, 5 Jan 1999 01:32:17 +0100 (CET)


On Mon, 4 Jan 1999, Andrea Arcangeli wrote:

> I am going to do something like that right now...

Here is a new patch (arca-vm-7). It practically removes the kswapd wakeups
from all places except ATOMIC memory allocations, and even there kswapd is
only woken if there are no processes already freeing memory on their own.

It also reverts to the stock/arca-vm-3 shrink_mmap() (even though it seems
slower).
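
To make the control flow easier to see before diving into the diff, here is
a minimal userspace C model of the new policy. It is a sketch, not kernel
code: the thresholds, the alloc_slow_path() function and the stubs are
invented for illustration; only the names nr_tasks_freeing_memory and
freepages mirror the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative thresholds, standing in for the kernel's freepages_t. */
static struct { int min, low, high; } freepages = { 64, 128, 256 };
static int nr_free_pages = 40;

/* Mirrors the patch: how many tasks are freeing memory on their own. */
static atomic_int nr_tasks_freeing_memory;

static void kswapd_wakeup(void) { puts("kswapd woken"); }	/* stub */
static bool try_to_free_pages(int count) { return count > 0; }	/* stub */

/*
 * Sketch of the allocator slow path after arca-vm-7: a sleeping
 * allocation frees memory itself, and kswapd is only woken for atomic
 * allocations, when nobody else is already freeing memory.  (In the
 * patch the nr_tasks_freeing_memory test lives inside kswapd_wakeup()
 * itself.)
 */
static bool alloc_slow_path(bool can_sleep, int order)
{
	bool ok = true;

	if (can_sleep) {
		atomic_fetch_add(&nr_tasks_freeing_memory, 1);
		ok = try_to_free_pages(freepages.high - nr_free_pages
				       + (1 << order));
		atomic_fetch_sub(&nr_tasks_freeing_memory, 1);
	} else if (nr_free_pages < freepages.min &&
		   atomic_load(&nr_tasks_freeing_memory) == 0) {
		kswapd_wakeup();
	}
	return ok;
}

int main(void)
{
	alloc_slow_path(true, 0);	/* frees memory synchronously */
	alloc_slow_path(false, 0);	/* prints "kswapd woken" */
	return 0;
}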

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.9 linux/mm/vmscan.c:1.1.1.1.2.64
--- linux/mm/vmscan.c:1.1.1.9 Sat Jan 2 15:46:20 1999
+++ linux/mm/vmscan.c Tue Jan 5 01:02:43 1999
@@ -10,6 +10,14 @@
* Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
*/

+/*
+ * Developed the balanced page freeing algorithm (do_free_user_and_cache).
+ * Developed a smart mechanism to handle the swapout weight.
+ * Allowed the process to swap out asynchronously and only then collect
+ * the credit from the bank. This has doubled swapout performance and
+ * smoothness.
+ * Copyright (C) 1998 Andrea Arcangeli
+ */
+
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
@@ -21,12 +29,15 @@
#include <asm/pgtable.h>

/*
+ * The number of tasks that are currently freeing memory on their own:
+ */
+static atomic_t nr_tasks_freeing_memory = ATOMIC_INIT(0);
+
+/*
* The wait queue for waking up the pageout daemon:
*/
static struct task_struct * kswapd_task = NULL;

-static void init_swap_timer(void);
-
/*
* The swap-out functions return 1 if they successfully
* threw something out, and we got a free page. It returns
@@ -163,7 +174,7 @@
* cache. */
if (PageSwapCache(page_map)) {
__free_page(page_map);
- return (atomic_read(&page_map->count) == 0);
+ return 1;
}
add_to_swap_cache(page_map, entry);
/* We checked we were unlocked way up above, and we
@@ -195,7 +206,7 @@
flush_tlb_page(vma, address);
swap_duplicate(entry);
__free_page(page_map);
- return (atomic_read(&page_map->count) == 0);
+ return 1;
}
/*
* A clean page to be discarded? Must be mmap()ed from
@@ -210,9 +221,8 @@
flush_cache_page(vma, address);
pte_clear(page_table);
flush_tlb_page(vma, address);
- entry = (atomic_read(&page_map->count) == 1);
__free_page(page_map);
- return entry;
+ return 1;
}

/*
@@ -230,7 +240,7 @@
*/

static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
- pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+ pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
{
pte_t * pte;
unsigned long pmd_end;
@@ -251,18 +261,20 @@

do {
int result;
- tsk->swap_address = address + PAGE_SIZE;
result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
+ address += PAGE_SIZE;
+ tsk->swap_address = address;
if (result)
return result;
- address += PAGE_SIZE;
+ if (!--*counter)
+ return 0;
pte++;
} while (address < end);
return 0;
}

static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+ pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, unsigned long * counter)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -282,9 +294,11 @@
end = pgd_end;

do {
- int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask);
+ int result = swap_out_pmd(tsk, vma, pmd, address, end, gfp_mask, counter);
if (result)
return result;
+ if (!*counter)
+ return 0;
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
@@ -292,7 +306,7 @@
}

static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
- unsigned long address, int gfp_mask)
+ unsigned long address, int gfp_mask, unsigned long * counter)
{
pgd_t *pgdir;
unsigned long end;
@@ -306,16 +320,19 @@

end = vma->vm_end;
while (address < end) {
- int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask);
+ int result = swap_out_pgd(tsk, vma, pgdir, address, end, gfp_mask, counter);
if (result)
return result;
+ if (!*counter)
+ return 0;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
}
return 0;
}

-static int swap_out_process(struct task_struct * p, int gfp_mask)
+static int swap_out_process(struct task_struct * p, int gfp_mask,
+ unsigned long * counter)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -334,9 +351,12 @@
address = vma->vm_start;

for (;;) {
- int result = swap_out_vma(p, vma, address, gfp_mask);
+ int result = swap_out_vma(p, vma, address, gfp_mask,
+ counter);
if (result)
return result;
+ if (!*counter)
+ return 0;
vma = vma->vm_next;
if (!vma)
break;
@@ -350,6 +370,25 @@
return 0;
}

+static inline unsigned long calc_swapout_weight(int priority)
+{
+ struct task_struct * p;
+ unsigned long total_vm = 0;
+
+ read_lock(&tasklist_lock);
+ for_each_task(p)
+ {
+ if (!p->swappable)
+ continue;
+ if (p->mm->rss == 0)
+ continue;
+ total_vm += p->mm->total_vm;
+ }
+ read_unlock(&tasklist_lock);
+
+ return total_vm / (priority+1);
+}
+
/*
* Select the task with maximal swap_cnt and try to swap out a page.
* N.B. This function returns only 0 or 1. Return values != 1 from
@@ -358,7 +397,10 @@
static int swap_out(unsigned int priority, int gfp_mask)
{
struct task_struct * p, * pbest;
- int counter, assign, max_cnt;
+ int assign;
+ unsigned long counter, max_cnt;
+
+ counter = calc_swapout_weight(priority);

/*
* We make one or two passes through the task list, indexed by
@@ -374,23 +416,17 @@
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = nr_tasks / (priority+1);
- if (counter < 1)
- counter = 1;
- if (counter > nr_tasks)
- counter = nr_tasks;
-
- for (; counter >= 0; counter--) {
+ while (counter != 0) {
assign = 0;
max_cnt = 0;
pbest = NULL;
select:
read_lock(&tasklist_lock);
- p = init_task.next_task;
- for (; p != &init_task; p = p->next_task) {
+ for_each_task(p)
+ {
if (!p->swappable)
continue;
- if (p->mm->rss <= 0)
+ if (p->mm->rss == 0)
continue;
/* Refresh swap_cnt? */
if (assign)
@@ -410,10 +446,11 @@
}

/*
- * Nonzero means we cleared out something, but only "1" means
- * that we actually free'd up a page as a result.
+ * Nonzero means we cleared out something, and "1" means
+ * that we actually moved a page from the process memory
+ * to the swap cache (it's not been freed yet).
*/
- if (swap_out_process(pbest, gfp_mask) == 1)
+ if (swap_out_process(pbest, gfp_mask, &counter))
return 1;
}
out:
@@ -441,39 +478,62 @@
printk ("Starting kswapd v%.*s\n", i, s);
}

-#define free_memory(fn) \
- count++; do { if (!--count) goto done; } while (fn)
+static int do_free_user_and_cache(int priority, int gfp_mask)
+{
+ if (shrink_mmap(priority, gfp_mask))
+ return 1;

-static int kswapd_free_pages(int kswapd_state)
+ /*
+ * NOTE: here we also allow the process to do async swapout,
+ * because the swapout is really only a credit at the bank of
+ * free memory right now, so we don't need the page _now_.
+ * Allowing async I/O dramatically improves swapout
+ * performance -arca (discovered this afternoon ;) 980105
+ */
+ if (swap_out(priority, gfp_mask & ~__GFP_WAIT))
+ /*
+ * We made at least some swapping progress, so return 1 in
+ * this case. -arca
+ */
+ return 1;
+
+ return 0;
+}
+
+static int do_free_page(int * state, int gfp_mask)
{
- unsigned long end_time;
+ int priority = 8;

- /* Always trim SLAB caches when memory gets low. */
- kmem_cache_reap(0);
+ switch (*state) {
+ do {
+ case 0:
+ if (do_free_user_and_cache(priority, gfp_mask))
+ return 1;
+ *state = 1;
+ case 1:
+ if (shm_swap(priority, gfp_mask))
+ return 1;
+ *state = 0;

- /* max one hundreth of a second */
- end_time = jiffies + (HZ-1)/100;
- do {
- int priority = 8;
- int count = pager_daemon.swap_cluster;
+ shrink_dcache_memory(priority, gfp_mask);
+ kmem_cache_reap(gfp_mask);
+ } while (--priority >= 0);
+ }
+ return 0;
+}

- switch (kswapd_state) {
- do {
- default:
- free_memory(shrink_mmap(priority, 0));
- free_memory(swap_out(priority, 0));
- kswapd_state++;
- case 1:
- free_memory(shm_swap(priority, 0));
- shrink_dcache_memory(priority, 0);
- kswapd_state = 0;
- } while (--priority >= 0);
- return kswapd_state;
- }
-done:
- if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
+static int kswapd_free_pages(int kswapd_state)
+{
+ for(;;)
+ {
+ do_free_page(&kswapd_state, 0);
+ if (nr_free_pages > freepages.high)
+ break;
+ if (atomic_read(&nr_tasks_freeing_memory))
break;
- } while (time_before_eq(jiffies,end_time));
+ if (kswapd_task->need_resched)
+ schedule();
+ }
return kswapd_state;
}

@@ -496,13 +556,6 @@
lock_kernel();

/*
- * Set the base priority to something smaller than a
- * regular process. We will scale up the priority
- * dynamically depending on how much memory we need.
- */
- current->priority = (DEF_PRIORITY * 2) / 3;
-
- /*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "try_to_free_pages()"). "kswapd" should
@@ -516,7 +569,6 @@
*/
current->flags |= PF_MEMALLOC;

- init_swap_timer();
kswapd_task = current;
while (1) {
int state = 0;
@@ -543,107 +595,37 @@
* if we need more memory as part of a swap-out effort we
* will just silently return "success" to tell the page
* allocator to accept the allocation.
- *
- * We want to try to free "count" pages, and we need to
- * cluster them so that we get good swap-out behaviour. See
- * the "free_memory()" macro for details.
*/
int try_to_free_pages(unsigned int gfp_mask, int count)
{
- int retval;
-
+ int retval = 1;
lock_kernel();

- /* Always trim SLAB caches when memory gets low. */
- kmem_cache_reap(gfp_mask);
-
- retval = 1;
if (!(current->flags & PF_MEMALLOC)) {
- int priority;
-
current->flags |= PF_MEMALLOC;
-
- priority = 8;
- do {
- free_memory(shrink_mmap(priority, gfp_mask));
- free_memory(shm_swap(priority, gfp_mask));
- free_memory(swap_out(priority, gfp_mask));
- shrink_dcache_memory(priority, gfp_mask);
- } while (--priority >= 0);
- retval = 0;
-done:
+ atomic_inc(&nr_tasks_freeing_memory);
+ while (count--)
+ {
+ static int state = 0;
+ if (!do_free_page(&state, gfp_mask))
+ {
+ retval = 0;
+ break;
+ }
+ }
+ atomic_dec(&nr_tasks_freeing_memory);
current->flags &= ~PF_MEMALLOC;
}
unlock_kernel();

return retval;
}
-
-/*
- * Wake up kswapd according to the priority
- * 0 - no wakeup
- * 1 - wake up as a low-priority process
- * 2 - wake up as a normal process
- * 3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(struct task_struct *p, int priority)
-{
- if (priority) {
- p->counter = p->priority << priority;
- wake_up_process(p);
- }
-}

-/*
- * The swap_tick function gets called on every clock tick.
- */
-void swap_tick(void)
+void kswapd_wakeup(void)
{
- struct task_struct *p = kswapd_task;
-
- /*
- * Only bother to try to wake kswapd up
- * if the task exists and can be woken.
- */
- if (p && (p->state & TASK_INTERRUPTIBLE)) {
- unsigned int pages;
- int want_wakeup;
-
- /*
- * Schedule for wakeup if there isn't lots
- * of free memory or if there is too much
- * of it used for buffers or pgcache.
- *
- * "want_wakeup" is our priority: 0 means
- * not to wake anything up, while 3 means
- * that we'd better give kswapd a realtime
- * priority.
- */
- want_wakeup = 0;
- pages = nr_free_pages;
- if (pages < freepages.high)
- want_wakeup = 1;
- if (pages < freepages.low)
- want_wakeup = 2;
- if (pages < freepages.min)
- want_wakeup = 3;
-
- kswapd_wakeup(p,want_wakeup);
- }
-
- timer_active |= (1<<SWAP_TIMER);
-}
+ struct task_struct * p = kswapd_task;

-/*
- * Initialise the swap timer
- */
-
-void init_swap_timer(void)
-{
- timer_table[SWAP_TIMER].expires = jiffies;
- timer_table[SWAP_TIMER].fn = swap_tick;
- timer_active |= (1<<SWAP_TIMER);
+ if (p && (p->state & TASK_INTERRUPTIBLE) &&
+ !atomic_read(&nr_tasks_freeing_memory))
+ wake_up_process(p);
}
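
A side note on the do_free_page() rewrite in the hunk above: it uses the
classic C idiom of placing case labels inside a do/while nested in a
switch, so that a state variable held by the caller makes the function
resume at the pass after the one that last ran. Here is a standalone
sketch of just that control flow (pass_a/pass_b are invented stand-ins
for do_free_user_and_cache() and shm_swap()):

#include <stdio.h>

/* Invented stand-ins for the freeing passes; each "makes progress"
 * a fixed number of times just to drive the demo. */
static int budget_a = 2, budget_b = 1;
static int pass_a(int prio) { (void) prio; return budget_a-- > 0; }
static int pass_b(int prio) { (void) prio; return budget_b-- > 0; }

/*
 * Resumable state machine in the style of do_free_page(): return 1
 * as soon as any pass makes progress, recording in *state which pass
 * the next call should try first.  The case labels inside the
 * do/while are what let a fresh call jump back into the loop.
 */
static int free_one(int *state)
{
	int priority = 8;

	switch (*state) {
		do {
	case 0:
			if (pass_a(priority))
				return 1;
			*state = 1;
			/* fall through */
	case 1:
			if (pass_b(priority))
				return 1;
			*state = 0;
		} while (--priority >= 0);
	}
	return 0;	/* every pass failed at every priority */
}

int main(void)
{
	int state = 0;

	while (free_one(&state))
		printf("made progress, resume state = %d\n", state);
	puts("nothing left to free");
	return 0;
}
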
Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.5 linux/mm/page_alloc.c:1.1.1.1.2.15
--- linux/mm/page_alloc.c:1.1.1.5 Sun Jan 3 20:42:44 1999
+++ linux/mm/page_alloc.c Tue Jan 5 01:13:00 1999
@@ -151,7 +151,6 @@
if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
if (PageSwapCache(page))
panic ("Freeing swap cache page");
- page->flags &= ~(1 << PG_referenced);
free_pages_ok(page->map_nr, 0);
return;
}
@@ -173,7 +172,6 @@
if (atomic_dec_and_test(&map->count)) {
if (PageSwapCache(map))
panic ("Freeing swap cache pages");
- map->flags &= ~(1 << PG_referenced);
free_pages_ok(map_nr, order);
return;
}
@@ -260,7 +258,8 @@
if (nr_free_pages > freepages.min) {
if (!current->trashing_memory)
goto ok_to_allocate;
- if (nr_free_pages > freepages.low) {
+ if (!(current->flags & PF_MEMALLOC) &&
+ nr_free_pages > freepages.low) {
current->trashing_memory = 0;
goto ok_to_allocate;
}
@@ -271,8 +270,11 @@
* memory.
*/
current->trashing_memory = 1;
- if (!try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+ if (!try_to_free_pages(gfp_mask, freepages.high - nr_free_pages + (1 << order)) && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
goto nopage;
+ } else {
+ if (nr_free_pages < freepages.min)
+ kswapd_wakeup();
}
ok_to_allocate:
spin_lock_irqsave(&page_alloc_lock, flags);
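
The page_alloc.c change above also alters how much memory a sleeping
allocation asks try_to_free_pages() to recover: rather than a fixed
SWAP_CLUSTER_MAX, it requests the deficit up to freepages.high plus the
1<<order pages of the current request. A tiny arithmetic sketch, with
invented values:

#include <stdio.h>

/*
 * Target passed to try_to_free_pages() by a sleeping allocation in
 * arca-vm-7: the deficit up to freepages.high, plus the 1<<order
 * pages of the allocation itself.  The parentheses around 1 << order
 * matter: `+` binds tighter than `<<` in C.
 */
static int free_target(int nr_free_pages, int high, unsigned order)
{
	return high - nr_free_pages + (1 << order);
}

int main(void)
{
	/* e.g. freepages.high = 256, 40 pages free, an order-2
	 * (4-page) request: 256 - 40 + 4 = 220 pages to free. */
	printf("%d\n", free_target(40, 256, 2));
	return 0;
}
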
Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.2.13
--- linux/include/linux/mm.h:1.1.1.3 Sat Jan 2 15:24:18 1999
+++ linux/include/linux/mm.h Mon Jan 4 18:42:52 1999
@@ -118,7 +118,6 @@
unsigned long offset;
struct page *next_hash;
atomic_t count;
- unsigned int unused;
unsigned long flags; /* atomic flags, some possibly updated asynchronously */
struct wait_queue *wait;
struct page **pprev_hash;
@@ -295,8 +294,7 @@

/* filemap.c */
extern void remove_inode_page(struct page *);
-extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int FASTCALL(shrink_mmap(int, int));
extern void truncate_inode_pages(struct inode *, unsigned long);
extern unsigned long get_cached_page(struct inode *, unsigned long, int);
extern void put_cached_page(unsigned long);
Index: linux/mm/swap.c
diff -u linux/mm/swap.c:1.1.1.5 linux/mm/swap.c:1.1.1.1.2.8
--- linux/mm/swap.c:1.1.1.5 Sat Jan 2 15:24:40 1999
+++ linux/mm/swap.c Sat Jan 2 21:40:13 1999
@@ -64,13 +64,13 @@
swapstat_t swapstats = {0};

buffer_mem_t buffer_mem = {
- 2, /* minimum percent buffer */
+ 5, /* minimum percent buffer */
10, /* borrow percent buffer */
60 /* maximum percent buffer */
};

buffer_mem_t page_cache = {
- 2, /* minimum percent page cache */
+ 5, /* minimum percent page cache */
15, /* borrow percent page cache */
75 /* maximum */
};
Index: linux/include/linux/swap.h
diff -u linux/include/linux/swap.h:1.1.1.4 linux/include/linux/swap.h:1.1.1.1.2.10
--- linux/include/linux/swap.h:1.1.1.4 Tue Dec 29 01:39:03 1998
+++ linux/include/linux/swap.h Tue Jan 5 01:12:59 1999
@@ -83,6 +83,7 @@

/* linux/mm/vmscan.c */
extern int try_to_free_pages(unsigned int gfp_mask, int count);
+extern void kswapd_wakeup(void);

/* linux/mm/page_io.c */
extern void rw_swap_page(int, unsigned long, char *, int);
@@ -167,9 +168,11 @@
count = atomic_read(&page->count);
if (PageSwapCache(page))
{
+#if 0
/* PARANOID */
if (page->inode != &swapper_inode)
panic("swap cache page has wrong inode\n");
+#endif
count += swap_count(page->offset) - 2;
}
if (PageFreeAfter(page))
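
For completeness, here is a standalone model of the calc_swapout_weight()
idea from the vmscan.c hunk: the swapout scan budget becomes a number of
pages proportional to the total swappable virtual memory, divided by
priority+1, instead of the old nr_tasks-based counter. The task list and
fields below are simplified stand-ins for the kernel's structures:

#include <stdio.h>

struct task { int swappable; unsigned long rss, total_vm; };

/*
 * Scan budget in pages, as in calc_swapout_weight(): tasks that are
 * unswappable or have no resident pages don't contribute; priority 8
 * scans 1/9th of the remaining VM, priority 0 scans all of it.
 */
static unsigned long calc_swapout_weight(const struct task *tasks,
					 int ntasks, int priority)
{
	unsigned long total_vm = 0;
	int i;

	for (i = 0; i < ntasks; i++) {
		if (!tasks[i].swappable || tasks[i].rss == 0)
			continue;
		total_vm += tasks[i].total_vm;
	}
	return total_vm / (priority + 1);
}

int main(void)
{
	struct task tasks[] = {
		{ 1, 300, 2048 },	/* counted */
		{ 1,   0,  512 },	/* skipped: rss == 0 */
		{ 0, 100, 4096 },	/* skipped: not swappable */
	};

	/* Only the first task counts: 2048 / 9 = 227 pages at prio 8. */
	printf("%lu\n", calc_swapout_weight(tasks, 3, 8));
	return 0;
}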

Andrea Arcangeli
