[RFC, PATCH] Slab counter troubles with swap prefetch?

From: Christoph Lameter
Date: Thu Nov 10 2005 - 16:55:09 EST


Currently the slab allocator uses a page_state counter called nr_slab.
The VM swap prefetch code assumes that this describes the number of pages
used on a node by the slab allocator. However, that is not really true.

Currently nr_slab is the number of total pages allocated which may
be local or remote pages. Remote allocations may artificially inflate
nr_slab and therefore disable swap prefetching.

This patch splits the counter into the nr_local_slab which reflects
slab pages allocated from the local zones (and this number is useful
at least as a guidance for the VM) and the remotely allocated pages.

However, there is currently no counter reflecting the number of pages
used in a zone/node for the slab although the proc statistics give
an impression otherwise.

We cannot update counters from other nodes since these counters are per cpu.
A counter could be put into struct zone however that would require to
take a spinlock for each update.

Signed-off-by: Christoph Lameter <clameter@xxxxxxx>

Index: linux-2.6.14-mm1/mm/slab.c
===================================================================
--- linux-2.6.14-mm1.orig/mm/slab.c 2005-11-10 12:59:49.000000000 -0800
+++ linux-2.6.14-mm1/mm/slab.c 2005-11-10 13:00:11.000000000 -0800
@@ -1201,7 +1201,12 @@ static void *kmem_getpages(kmem_cache_t
i = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
atomic_add(i, &slab_reclaim_pages);
- add_page_state(nr_slab, i);
+
+ if (page_to_nid(page) == numa_node_id())
+ add_page_state(nr_local_slab, i);
+ else
+ add_page_state(nr_remote_slab, i);
+
while (i--) {
SetPageSlab(page);
page++;
@@ -1223,7 +1228,10 @@ static void kmem_freepages(kmem_cache_t
BUG();
page++;
}
- sub_page_state(nr_slab, nr_freed);
+ if (page_to_nid(page) == numa_node_id())
+ sub_page_state(nr_local_slab, nr_freed);
+ else
+ sub_page_state(nr_remote_slab, nr_freed);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
free_pages((unsigned long)addr, cachep->gfporder);
Index: linux-2.6.14-mm1/drivers/base/node.c
===================================================================
--- linux-2.6.14-mm1.orig/drivers/base/node.c 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14-mm1/drivers/base/node.c 2005-11-10 13:00:11.000000000 -0800
@@ -55,8 +55,10 @@ static ssize_t node_read_meminfo(struct
ps.nr_writeback = 0;
if ((long)ps.nr_mapped < 0)
ps.nr_mapped = 0;
- if ((long)ps.nr_slab < 0)
- ps.nr_slab = 0;
+ if ((long)ps.nr_local_slab < 0)
+ ps.nr_local_slab = 0;
+ if ((long)ps.nr_remote_slab < 0)
+ ps.nr_remote_slab = 0;

n = sprintf(buf, "\n"
"Node %d MemTotal: %8lu kB\n"
@@ -71,7 +73,8 @@ static ssize_t node_read_meminfo(struct
"Node %d Dirty: %8lu kB\n"
"Node %d Writeback: %8lu kB\n"
"Node %d Mapped: %8lu kB\n"
- "Node %d Slab: %8lu kB\n",
+ "Node %d LocalSlab: %8lu kB\n"
+ "Node %d RemoteSlab: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
nid, K(i.totalram - i.freeram),
@@ -84,7 +87,8 @@ static ssize_t node_read_meminfo(struct
nid, K(ps.nr_dirty),
nid, K(ps.nr_writeback),
nid, K(ps.nr_mapped),
- nid, K(ps.nr_slab));
+ nid, K(ps.nr_local_slab),
+ nid, K(ps.nr_remote_slab));
n += hugetlb_report_node_meminfo(nid, buf + n);
return n;
}
Index: linux-2.6.14-mm1/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.14-mm1.orig/fs/proc/proc_misc.c 2005-11-09 10:47:35.000000000 -0800
+++ linux-2.6.14-mm1/fs/proc/proc_misc.c 2005-11-10 13:00:11.000000000 -0800
@@ -168,7 +168,8 @@ static int meminfo_read_proc(char *page,
"Dirty: %8lu kB\n"
"Writeback: %8lu kB\n"
"Mapped: %8lu kB\n"
- "Slab: %8lu kB\n"
+ "LocalSlab: %8lu kB\n"
+ "RemoteSlab: %8lu kB\n"
"CommitLimit: %8lu kB\n"
"Committed_AS: %8lu kB\n"
"PageTables: %8lu kB\n"
@@ -191,7 +192,8 @@ static int meminfo_read_proc(char *page,
K(ps.nr_dirty),
K(ps.nr_writeback),
K(ps.nr_mapped),
- K(ps.nr_slab),
+ K(ps.nr_local_slab),
+ K(ps.nr_remote_slab),
K(allowed),
K(committed),
K(ps.nr_page_table_pages),
Index: linux-2.6.14-mm1/mm/swap_prefetch.c
===================================================================
--- linux-2.6.14-mm1.orig/mm/swap_prefetch.c 2005-11-10 11:33:03.000000000 -0800
+++ linux-2.6.14-mm1/mm/swap_prefetch.c 2005-11-10 13:00:11.000000000 -0800
@@ -327,7 +327,7 @@ static int prefetch_suitable(void)
* >2/3 of the ram is mapped or swapcache, we need some free for
* pagecache
*/
- limit = ps.nr_mapped + ps.nr_slab + pending_writes +
+ limit = ps.nr_mapped + ps.nr_local_slab + pending_writes +
total_swapcache_pages;
if (limit > mapped_limit)
goto out;
Index: linux-2.6.14-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.14-mm1.orig/mm/page_alloc.c 2005-11-09 10:47:37.000000000 -0800
+++ linux-2.6.14-mm1/mm/page_alloc.c 2005-11-10 13:00:11.000000000 -0800
@@ -1423,14 +1423,15 @@ void show_free_areas(void)
K(nr_free_highpages()));

printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ "unstable:%lu free:%u localslab:%lu remoteslab:%lu mapped:%lu pagetables:%lu\n",
active,
inactive,
ps.nr_dirty,
ps.nr_writeback,
ps.nr_unstable,
nr_free_pages(),
- ps.nr_slab,
+ ps.nr_local_slab,
+ ps.nr_remote_slab,
ps.nr_mapped,
ps.nr_page_table_pages);

@@ -2312,7 +2313,8 @@ static char *vmstat_text[] = {
"nr_unstable",
"nr_page_table_pages",
"nr_mapped",
- "nr_slab",
+ "nr_local_slab",
+ "nr_remote_slab",

"pgpgin",
"pgpgout",
Index: linux-2.6.14-mm1/include/linux/page-flags.h
===================================================================
--- linux-2.6.14-mm1.orig/include/linux/page-flags.h 2005-11-09 10:47:29.000000000 -0800
+++ linux-2.6.14-mm1/include/linux/page-flags.h 2005-11-10 13:00:11.000000000 -0800
@@ -87,8 +87,9 @@ struct page_state {
unsigned long nr_unstable; /* NFS unstable pages */
unsigned long nr_page_table_pages;/* Pages used for pagetables */
unsigned long nr_mapped; /* mapped into pagetables */
- unsigned long nr_slab; /* In slab */
-#define GET_PAGE_STATE_LAST nr_slab
+ unsigned long nr_local_slab; /* local slab pages */
+ unsigned long nr_remote_slab; /* remote slab pages */
+#define GET_PAGE_STATE_LAST nr_remote_slab

/*
* The below are zeroed by get_page_state(). Use get_full_page_state()
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/