Time for a pre-patch-2.0.31-3? (buffer/cache patch included)

Dr. Werner Fink (werner@suse.de)
Wed, 2 Jul 1997 00:32:30 +0200


Hi,

here is the collected patch against pre-patch-2.0.31-2. The buffer code
is based on the NEW buffer code of 2.0.30. I've added some older changes of
my own and from other people (see http://www.linuxhq.com/patch/20-p0595.html):

-----
mm/vmscan.c
calculate can_do_io once in try_to_free_page() to avoid unnecessary
repeated evaluations of `priority != GFP_BUFFER' in swap_out() (e.g. for
nr_tasks = 100 and priority = 0 we get a counter of 800 ... so we would
get 800 evaluations of `priority != GFP_BUFFER').
pass can_do_io from try_to_free_page() on to shrink_mmap()
change try_to_free_page() in some places to get a more aggressive
swapping strategy in low-memory situations.
change in swap_tick(): wake up kswapd more often in low-memory
situations (adapted from 2.1.xx)
change in kswapd(): call try_to_free_page() with wait = 1 in low-memory
situations
mm/filemap.c
change in shrink_mmap() to avoid unnecessarily freeing buffers on a
buffer request
mm/page_alloc.c
change in __get_free_pages(): allow calling try_to_free_page() on a
buffer request, but only if we are really in a low-memory situation.
set the absolute minimum of min_free_pages to 24
-----

This patch also includes some further changes:

mm/filemap.c
change in shrink_mmap(): do not mark a visited page as
referenced, because shrink_mmap() does not use that mark
... instead, age the page if it is a non-buffer page.

fs/buffer.c
fix in sync_old_buffers() (thanks to Pavel Krauz <kra@elanor.cz>)
fix in can_reclaim() (thanks to Bill Hawes <whawes@star.net>)

Still missing in shrink_mmap() of mm/filemap.c:
A better aging condition based on the age of the page table entries.
(thanks to Mark Hemment <markhe@sco.COM>)

I've tested the following patch under really high load, big buffer requests,
big cache requests, nfs, X-Servers on two consoles, netscape, a big grok
database, starting gnu/x emacs, reading/writing (r)mails, ...

But what about big news and mail servers (Hallo Matthias) and other
machines with other hardware and configurations ... who knows.

Therefore IMHO a third pre-patch for a running 2.0.31 _and_ testers are
needed :-))

Werner

-----------------------------------------------------------------------------
diff -urN -x *~[12] linux-2.0.31-clean/fs/buffer.c linux/fs/buffer.c
--- linux-2.0.31-clean/fs/buffer.c Tue Jun 10 12:58:46 1997
+++ linux/fs/buffer.c Tue Jul 1 20:14:24 1997
@@ -548,7 +548,8 @@

if (mem_map[MAP_NR((unsigned long) bh->b_data)].count != 1 ||
buffer_dirty(bh)) {
- refile_buffer(bh);
+ /* WSH: don't attempt to refile here! */
+ /* refile_buffer(bh); */
return 0;
}

@@ -660,20 +661,15 @@
goto repeat;
}

- /* Too bad, that was not enough. Try a little harder to grow some. */
-
- if (nr_free_pages > min_free_pages + 5) {
- if (grow_buffers(GFP_BUFFER, size)) {
- needed -= PAGE_SIZE;
- goto repeat;
- };
- }
+ /* Too bad, that was not enough. Try a little harder to grow some.
+ * and repeat until we find something good
+ */

- /* and repeat until we find something good */
- if (grow_buffers(GFP_ATOMIC, size))
+ if (grow_buffers(GFP_BUFFER, size))
needed -= PAGE_SIZE;
else
wakeup_bdflush(1);
+
goto repeat;
}

@@ -931,11 +927,16 @@
* This is critical. We can't swap out pages to get
* more buffer heads, because the swap-out may need
* more buffer-heads itself. Thus GFP_ATOMIC.
+ *
+ * This is no longer true, it is GFP_BUFFER again, the
+ * swapping code now knows not to perform I/O when that
+ * GFP level is specified... -DaveM
*/
+
/* we now use kmalloc() here instead of gfp as we want
to be able to easily release buffer heads - they
took up quite a bit of memory (tridge) */
- bh = (struct buffer_head *) kmalloc(sizeof(*bh),GFP_ATOMIC);
+ bh = (struct buffer_head *) kmalloc(sizeof(*bh),GFP_BUFFER);
if (bh) {
put_unused_buffer_head(bh);
nr_buffer_heads++;
@@ -1534,6 +1535,7 @@
next->b_count--;
}
}
+ run_task_queue(&tq_disk);
#ifdef DEBUG
if (ncount) printk("sync_old_buffers: %d dirty buffers not on dirty list\n", ncount);
printk("Wrote %d/%d buffers\n", nwritten, ndirty);
diff -urN -x *~[12] linux-2.0.31-clean/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.0.31-clean/include/linux/mm.h Sat Mar 29 01:08:17 1997
+++ linux/include/linux/mm.h Tue Jul 1 20:04:59 1997
@@ -295,7 +295,7 @@

/* filemap.c */
extern unsigned long page_unuse(unsigned long);
-extern int shrink_mmap(int, int);
+extern int shrink_mmap(int, int, int);
extern void truncate_inode_pages(struct inode *, unsigned long);

#define GFP_BUFFER 0x00
diff -urN -x *~[12] linux-2.0.31-clean/mm/filemap.c linux/mm/filemap.c
--- linux-2.0.31-clean/mm/filemap.c Tue Jun 10 12:58:48 1997
+++ linux/mm/filemap.c Tue Jul 1 14:05:21 1997
@@ -21,6 +21,7 @@
#include <linux/string.h>
#include <linux/malloc.h>
#include <linux/fs.h>
+#include <linux/swapctl.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
@@ -127,7 +128,7 @@
}
}

-int shrink_mmap(int priority, int dma)
+int shrink_mmap(int priority, int dma, int can_do_io)
{
static int clock = 0;
struct page * page;
@@ -183,13 +184,15 @@
}

/* is it a buffer cache page? */
- if (bh && try_to_free_buffer(bh, &bh, 6))
+ if (can_do_io && bh && try_to_free_buffer(bh, &bh, 6))
return 1;
break;

default:
/* more than one users: we can't throw it away */
- set_bit(PG_referenced, &page->flags);
+ /* if we visit a non buffer page we should age it */
+ if (!bh)
+ age_page(page);
/* fall through */
case 0:
/* nothing */
diff -urN -x *~[12] linux-2.0.31-clean/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.0.31-clean/mm/page_alloc.c Sat Aug 17 20:19:29 1996
+++ linux/mm/page_alloc.c Tue Jun 17 13:43:33 1997
@@ -202,9 +202,14 @@
priority = GFP_ATOMIC;
}
}
- reserved_pages = 5;
- if (priority != GFP_NFS)
- reserved_pages = min_free_pages;
+ switch (priority) {
+ case GFP_NFS:
+ case GFP_BUFFER:
+ reserved_pages = 5;
+ break;
+ default:
+ reserved_pages = min_free_pages;
+ }
save_flags(flags);
repeat:
cli();
@@ -214,7 +219,7 @@
return 0;
}
restore_flags(flags);
- if (priority != GFP_BUFFER && try_to_free_page(priority, dma, 1))
+ if (try_to_free_page(priority, dma, 1))
goto repeat;
return 0;
}
@@ -264,11 +269,11 @@

/*
* select nr of pages we try to keep free for important stuff
- * with a minimum of 16 pages. This is totally arbitrary
+ * with a minimum of 24 pages. This is totally arbitrary
*/
i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
- if (i < 16)
- i = 16;
+ if (i < 24)
+ i = 24;
min_free_pages = i;
free_pages_low = i + (i>>1);
free_pages_high = i + i;
diff -urN -x *~[12] linux-2.0.31-clean/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.0.31-clean/mm/vmscan.c Sat Dec 14 13:24:31 1996
+++ linux/mm/vmscan.c Tue Jun 24 20:15:29 1997
@@ -68,7 +68,7 @@
* have died while we slept).
*/
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
- unsigned long address, pte_t * page_table, int dma, int wait)
+ unsigned long address, pte_t * page_table, int dma, int wait, int can_do_io)
{
pte_t pte;
unsigned long entry;
@@ -100,6 +100,8 @@
if (page_map->age)
return 0;
if (pte_dirty(pte)) {
+ if(!can_do_io)
+ return 0;
if (vma->vm_ops && vma->vm_ops->swapout) {
pid_t pid = tsk->pid;
vma->vm_mm->rss--;
@@ -157,7 +159,8 @@
*/

static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
- pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+ pmd_t *dir, unsigned long address, unsigned long end, int dma, int wait,
+ int can_do_io)
{
pte_t * pte;
unsigned long pmd_end;
@@ -179,7 +182,7 @@
do {
int result;
tsk->swap_address = address + PAGE_SIZE;
- result = try_to_swap_out(tsk, vma, address, pte, dma, wait);
+ result = try_to_swap_out(tsk, vma, address, pte, dma, wait, can_do_io);
if (result)
return result;
address += PAGE_SIZE;
@@ -189,7 +192,8 @@
}

static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait)
+ pgd_t *dir, unsigned long address, unsigned long end, int dma, int wait,
+ int can_do_io)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -209,7 +213,7 @@
end = pgd_end;

do {
- int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait);
+ int result = swap_out_pmd(tsk, vma, pmd, address, end, dma, wait, can_do_io);
if (result)
return result;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -219,7 +223,7 @@
}

static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
- pgd_t *pgdir, unsigned long start, int dma, int wait)
+ pgd_t *pgdir, unsigned long start, int dma, int wait, int can_do_io)
{
unsigned long end;

@@ -230,7 +234,7 @@

end = vma->vm_end;
while (start < end) {
- int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait);
+ int result = swap_out_pgd(tsk, vma, pgdir, start, end, dma, wait, can_do_io);
if (result)
return result;
start = (start + PGDIR_SIZE) & PGDIR_MASK;
@@ -239,7 +243,7 @@
return 0;
}

-static int swap_out_process(struct task_struct * p, int dma, int wait)
+static int swap_out_process(struct task_struct * p, int dma, int wait, int can_do_io)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -260,7 +264,7 @@
address = vma->vm_start;

for (;;) {
- int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait);
+ int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, dma, wait, can_do_io);
if (result)
return result;
vma = vma->vm_next;
@@ -272,7 +276,7 @@
return 0;
}

-static int swap_out(unsigned int priority, int dma, int wait)
+static int swap_out(unsigned int priority, int dma, int wait, int can_do_io)
{
static int swap_task;
int loop, counter;
@@ -311,7 +315,7 @@
}
if (!--p->swap_cnt)
swap_task++;
- switch (swap_out_process(p, dma, wait)) {
+ switch (swap_out_process(p, dma, wait, can_do_io)) {
case 0:
if (p->swap_cnt)
swap_task++;
@@ -330,37 +334,65 @@
* to be. This works out OK, because we now do proper aging on page
* contents.
*/
+
int try_to_free_page(int priority, int dma, int wait)
{
- static int state = 0;
- int i=6;
- int stop;
-
- /* we don't try as hard if we're not waiting.. */
- stop = 3;
- if (wait)
+ static int common_state = 0;
+ static int single_state = 0;
+ int state;
+ int i=6, stop = 3, can_do_io = (priority != GFP_BUFFER);
+
+ /* We don't try as hard if we're not waiting.
+ * If we are called with wait, we really need a page ... therefore we switch
+ * to the next unused field. This in addition has a better chance to get a
+ * page as fast as possible. Werner <werner@suse.de>
+ */
+ state = common_state;
+ if (wait) {
+ state = single_state;
stop = 0;
+ }
+ single_state = common_state;
+
switch (state) {
do {
case 0:
- if (shrink_mmap(i, dma))
+ barrier();
+ /* Don't worry here for the GFP_BUFFER case, shrink_mmap never
+ * tries to write dirty things out...
+ */
+ if (shrink_mmap(i, dma, can_do_io)) {
+ single_state = 1;
return 1;
- state = 1;
+ }
+ common_state = 1;
+ barrier();
case 1:
- if (shm_swap(i, dma))
+ barrier();
+ /* shm_swap must always perform some I/O if it succeeds
+ * in finding things to free up, so don't waste any time
+ * if we are trying to get some buffer heads...
+ */
+ if (can_do_io && shm_swap(i, dma)) {
+ single_state = 2;
return 1;
- state = 2;
+ }
+ common_state = 2;
+ barrier();
default:
- if (swap_out(i, dma, wait))
+ barrier();
+ if (swap_out(i, dma, wait, can_do_io)) {
+ single_state = 0;
return 1;
- state = 0;
+ }
+ common_state = 0;
+ barrier();
i--;
} while ((i - stop) >= 0);
}
return 0;
}

-
/*
* The background pageout daemon.
* Started as a kernel thread from the init process.
@@ -410,7 +442,7 @@
swapstats.wakeups++;
/* Do the background pageout: */
for (i=0; i < kswapd_ctl.maxpages; i++)
- try_to_free_page(GFP_KERNEL, 0, 0);
+ try_to_free_page(GFP_KERNEL, 0, (nr_free_pages < min_free_pages));
}
}

@@ -440,7 +472,11 @@
wake_up(&kswapd_wait);
need_resched = 1;
}
- next_swap_jiffies = jiffies + swapout_interval;
+ /* low on memory, we need to start swapping soon */
+ if(last_wakeup_low)
+ next_swap_jiffies = jiffies;
+ else
+ next_swap_jiffies = jiffies + swapout_interval;
}
timer_active |= (1<<SWAP_TIMER);
}