[PATCH] Provide an interface to limit total page cache.

From: Roy Huang
Date: Mon Jan 15 2007 - 04:40:23 EST


This patch provides an interface to limit the total page cache via
/proc/sys/vm/pagecache_ratio. The default value is 90 percent. Any
feedback is appreciated.

-Roy

diff -urp a/include/linux/pagemap.h b/include/linux/pagemap.h
--- a/include/linux/pagemap.h 2006-11-30 05:57:37.000000000 +0800
+++ b/include/linux/pagemap.h 2007-01-15 17:03:09.000000000 +0800
@@ -12,6 +12,12 @@
#include <asm/uaccess.h>
#include <linux/gfp.h>

+extern int pagecache_ratio;	/* percent; set via the pagecache_ratio sysctl */
+extern long pagecache_limit;	/* limit computed from pagecache_ratio */
+
+int pagecache_ratio_sysctl_handler(struct ctl_table *, int,
+ struct file *, void __user *, size_t *, loff_t *);	/* proc_handler of that sysctl */
+
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
* allocation mode flags.
diff -urp a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h 2007-01-15 17:18:46.000000000 +0800
+++ b/include/linux/sysctl.h 2007-01-15 17:03:09.000000000 +0800
@@ -202,6 +202,7 @@ enum
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_PAGECACHE_RATIO=36, /* Percent memory is used as page cache */
};


diff -urp a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c 2007-01-15 17:18:46.000000000 +0800
+++ b/kernel/sysctl.c 2007-01-15 17:03:09.000000000 +0800
@@ -1035,6 +1035,18 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
},
#endif
+	{
+		.ctl_name	= VM_PAGECACHE_RATIO,
+		.procname	= "pagecache_ratio",
+		.data		= &pagecache_ratio,
+		.maxlen		= sizeof(pagecache_ratio),
+		.mode		= 0644,
+		.proc_handler	= &pagecache_ratio_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		/* The value is a percentage: give both handlers a 0..100 range. */
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
{ .ctl_name = 0 }
};

diff -urp a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c 2007-01-15 17:18:46.000000000 +0800
+++ b/mm/filemap.c 2007-01-15 17:03:09.000000000 +0800
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
+#include <linux/sysctl.h>
#include "filemap.h"
#include "internal.h"

@@ -108,6 +109,82 @@ generic_file_direct_IO(int rw, struct ki
*/

/*
+ * Start releasing page cache (via kswapd) once NR_FILE_PAGES exceeds this
+ * percentage of nr_free_pagecache_pages().
+ */
+int pagecache_ratio __read_mostly = 90;
+
+/* Limit derived from pagecache_ratio by setup_pagecache_limit(). */
+long pagecache_limit = 0;
+
+/*
+ * Recompute pagecache_limit from the current pagecache_ratio, which is
+ * expected to lie within 0..100.  Always returns 0; the int return type
+ * lets the function be registered with module_init() below.
+ */
+int setup_pagecache_limit(void)
+{
+	/* Widen first to keep the product from overflowing a narrower type. */
+	unsigned long pages = nr_free_pagecache_pages();
+
+	pagecache_limit = pages * pagecache_ratio / 100;
+	return 0;
+}
+
+/*
+ * Handler for the "pagecache_ratio" sysctl.
+ *
+ * Returns the error from proc_dointvec_minmax(), if any.  After a
+ * successful write the new ratio must lie within 0..100: if it does not,
+ * the previous value is restored and -EINVAL is returned; otherwise
+ * pagecache_limit is recomputed.  Reads leave pagecache_limit untouched.
+ */
+int pagecache_ratio_sysctl_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer,
+		size_t *length, loff_t *ppos)
+{
+	int old_ratio = pagecache_ratio;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (ret || !write)
+		return ret;
+
+	if (pagecache_ratio < 0 || pagecache_ratio > 100) {
+		pagecache_ratio = old_ratio;
+		return -EINVAL;
+	}
+
+	setup_pagecache_limit();
+	return 0;
+}
+
+/*
+ * Called at the end of the buffered read and write paths.  If the number
+ * of file pages exceeds pagecache_limit, wake kswapd for every zone of
+ * every online node.  Always returns 0.
+ */
+static inline int balance_pagecache(void)
+{
+	int nid, j;
+
+	if (global_page_state(NR_FILE_PAGES) <= pagecache_limit)
+		return 0;
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		for (j = 0; j < MAX_NR_ZONES; j++)
+			wakeup_kswapd(pgdat->node_zones + j, 0);
+	}
+
+	return 0;
+}
+
+/* Compute the initial pagecache_limit at init time. */
+module_init(setup_pagecache_limit)
+
+/*
* Remove a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold a write_lock on the mapping's tree_lock.
@@ -1085,6 +1128,8 @@ out:
page_cache_release(cached_page);
if (filp)
file_accessed(filp);
+
+ balance_pagecache();
}
EXPORT_SYMBOL(do_generic_mapping_read);

@@ -2212,6 +2257,8 @@ zero_length_segment:
status = filemap_write_and_wait(mapping);

pagevec_lru_add(&lru_pvec);
+ balance_pagecache();
+
return written ? written : status;
}
EXPORT_SYMBOL(generic_file_buffered_write);
diff -urp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c 2007-01-15 17:18:46.000000000 +0800
+++ b/mm/vmscan.c 2007-01-15 17:03:09.000000000 +0800
@@ -1316,6 +1316,7 @@ static int kswapd(void *p)
order = 0;
for ( ; ; ) {
unsigned long new_order;
+ long over_limit;

try_to_freeze();

@@ -1335,6 +1336,9 @@ static int kswapd(void *p)
finish_wait(&pgdat->kswapd_wait, &wait);

balance_pgdat(pgdat, order);
+ over_limit = global_page_state(NR_FILE_PAGES) - pagecache_limit;
+ if (over_limit > 0)
+ shrink_all_memory(over_limit);
}
return 0;
}
@@ -1350,8 +1354,10 @@ void wakeup_kswapd(struct zone *zone, in
return;

pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
- return;
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) {
+ if (global_page_state(NR_FILE_PAGES) < pagecache_limit)
+ return;
+ }
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1361,7 +1367,6 @@ void wakeup_kswapd(struct zone *zone, in
wake_up_interruptible(&pgdat->kswapd_wait);
}

-#ifdef CONFIG_PM
/*
* Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
* from LRU lists system-wide, for given pass and priority, and returns the
@@ -1510,7 +1515,6 @@ out:

return ret;
}
-#endif

/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/