Increase page fault rate by prezeroing V1 [2/3]: zeroing and scrubd

From: Christoph Lameter
Date: Tue Dec 21 2004 - 15:10:38 EST


o Add page zeroing
o Add scrub daemon
o Add the ability to view the amount of zeroed memory in /proc/meminfo
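
As background for reviewers, here is a minimal, hypothetical sketch of how a
driver for a hardware block-zeroing engine might hook into the zero_driver
interface that this patch introduces in include/linux/scrub.h. The my_* names
are placeholders only and are not part of the patch:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/scrub.h>

/* Start an asynchronous zeroing transfer; return 0 if the hardware accepted it. */
static int my_zero_start(void *addr, unsigned length)
{
	return -1;	/* placeholder: no real hardware in this sketch */
}

/* Return nonzero once the outstanding transfer has completed. */
static int my_zero_check(void)
{
	return 1;
}

static struct zero_driver my_zero_driver = {
	.start	= my_zero_start,
	.check	= my_zero_check,
	.rate	= 1000,		/* advertised zeroing bandwidth in MB/sec */
};

static int __init my_zero_init(void)
{
	register_zero_driver(&my_zero_driver);
	return 0;
}

static void __exit my_zero_exit(void)
{
	unregister_zero_driver(&my_zero_driver);
}

module_init(my_zero_init);
module_exit(my_zero_exit);
MODULE_LICENSE("GPL");

kscrubd (mm/scrubd.c below) walks the registered drivers for each block it
takes off the not-zeroed free lists, starts a transfer, sleeps roughly
size/rate ticks and then polls check() until the transfer completes; if no
driver accepts the block it falls back to zeroing with the cpu via zero_page().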

Index: linux-2.6.9/mm/page_alloc.c
===================================================================
--- linux-2.6.9.orig/mm/page_alloc.c 2004-12-21 10:19:37.000000000 -0800
+++ linux-2.6.9/mm/page_alloc.c 2004-12-21 11:01:40.000000000 -0800
@@ -12,6 +12,7 @@
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ * Support for page zeroing, Christoph Lameter, SGI, Dec 2004
*/

#include <linux/config.h>
@@ -32,6 +33,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/nodemask.h>
+#include <linux/scrub.h>

#include <asm/tlbflush.h>

@@ -179,7 +181,7 @@
* -- wli
*/

-static inline void __free_pages_bulk (struct page *page, struct page *base,
+static inline int __free_pages_bulk (struct page *page, struct page *base,
struct zone *zone, struct free_area *area, unsigned int order)
{
unsigned long page_idx, index, mask;
@@ -192,11 +194,10 @@
BUG();
index = page_idx >> (1 + order);

- zone->free_pages += 1 << order;
while (order < MAX_ORDER-1) {
struct page *buddy1, *buddy2;

- BUG_ON(area >= zone->free_area + MAX_ORDER);
+ BUG_ON(area >= zone->free_area[ZEROED] + MAX_ORDER);
if (!__test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
@@ -216,6 +217,7 @@
page_idx &= mask;
}
list_add(&(base + page_idx)->lru, &area->free_list);
+ return order;
}

static inline void free_pages_check(const char *function, struct page *page)
@@ -258,7 +260,7 @@
int ret = 0;

base = zone->zone_mem_map;
- area = zone->free_area + order;
+ area = zone->free_area[NOT_ZEROED] + order;
spin_lock_irqsave(&zone->lock, flags);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
@@ -266,7 +268,10 @@
page = list_entry(list->prev, struct page, lru);
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
- __free_pages_bulk(page, base, zone, area, order);
+ zone->free_pages += 1 << order;
+ if (__free_pages_bulk(page, base, zone, area, order)
+ >= sysctl_scrub_start)
+ wakeup_kscrubd(zone);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -288,6 +293,21 @@
free_pages_bulk(page_zone(page), 1, &list, order);
}

+void end_zero_page(struct page *page)
+{
+ unsigned long flags;
+ int order = page->index;
+ struct zone * zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ zone->zero_pages += 1 << order;
+ __free_pages_bulk(page, zone->zone_mem_map, zone, zone->free_area[ZEROED] + order, order);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+
#define MARK_USED(index, order, area) \
__change_bit((index) >> (1+(order)), (area)->map)

@@ -366,25 +386,46 @@
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static void inline rmpage(struct page *page, struct zone *zone, struct free_area *area, int order)
+{
+ list_del(&page->lru);
+ if (order != MAX_ORDER-1)
+ MARK_USED(page - zone->zone_mem_map, order, area);
+}
+
+struct page *scrubd_rmpage(struct zone *zone, struct free_area *area, int order)
+{
+ unsigned long flags;
+ struct page *page = NULL;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ if (!list_empty(&area->free_list)) {
+ page = list_entry(area->free_list.next, struct page, lru);
+
+ rmpage(page, zone, area, order);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return page;
+}
+
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int zero)
{
struct free_area * area;
unsigned int current_order;
struct page *page;
- unsigned int index;

for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ area = zone->free_area[zero] + current_order;
if (list_empty(&area->free_list))
continue;

page = list_entry(area->free_list.next, struct page, lru);
- list_del(&page->lru);
- index = page - zone->zone_mem_map;
- if (current_order != MAX_ORDER-1)
- MARK_USED(index, current_order, area);
+ rmpage(page, zone, area, current_order);
zone->free_pages -= 1UL << order;
- return expand(zone, page, index, order, current_order, area);
+ if (zero)
+ zone->zero_pages -= 1UL << order;
+ return expand(zone, page, page - zone->zone_mem_map, order, current_order, area);
}

return NULL;
@@ -396,7 +437,7 @@
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list, int zero)
{
unsigned long flags;
int i;
@@ -405,7 +446,7 @@

spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, zero);
if (page == NULL)
break;
allocated++;
@@ -546,7 +587,9 @@
{
unsigned long flags;
struct page *page = NULL;
- int cold = !!(gfp_flags & __GFP_COLD);
+ int nr_pages = 1 << order;
+ int zero = !!((gfp_flags & __GFP_ZERO) && zone->zero_pages >= nr_pages);
+ int cold = !!(gfp_flags & __GFP_COLD) + 2*zero;

if (order == 0) {
struct per_cpu_pages *pcp;
@@ -555,7 +598,7 @@
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list, zero);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -567,19 +610,30 @@

if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+
+ page = __rmqueue(zone, order, zero);
+
+ /*
+ * If we failed to obtain a zero and/or unzeroed page
+ * then we may still be able to obtain the other
+ * type of page.
+ */
+ if (!page) {
+ page = __rmqueue(zone, order, !zero);
+ zero = 0;
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
}

if (page != NULL) {
BUG_ON(bad_range(zone, page));
- mod_page_state_zone(zone, pgalloc, 1 << order);
- prep_new_page(page, order);
+ mod_page_state_zone(zone, pgalloc, nr_pages);

- if (gfp_flags & __GFP_ZERO) {
+ if ((gfp_flags & __GFP_ZERO) && !zero) {
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page)) {
- int n = 1 << order;
+ int n = nr_pages;

while (n-- >0)
clear_highpage(page + n);
@@ -587,6 +641,7 @@
#endif
zero_page(page_address(page), order);
}
+ prep_new_page(page, order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
}
@@ -974,7 +1029,7 @@
}

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat)
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat)
{
struct zone *zones = pgdat->node_zones;
int i;
@@ -982,27 +1037,31 @@
*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
*active += zones[i].nr_active;
*inactive += zones[i].nr_inactive;
*free += zones[i].free_pages;
+ *zero += zones[i].zero_pages;
}
}

void get_zone_counts(unsigned long *active,
- unsigned long *inactive, unsigned long *free)
+ unsigned long *inactive, unsigned long *free, unsigned long *zero)
{
struct pglist_data *pgdat;

*active = 0;
*inactive = 0;
*free = 0;
+ *zero = 0;
for_each_pgdat(pgdat) {
- unsigned long l, m, n;
- __get_zone_counts(&l, &m, &n, pgdat);
+ unsigned long l, m, n, o;
+ __get_zone_counts(&l, &m, &n, &o, pgdat);
*active += l;
*inactive += m;
*free += n;
+ *zero += o;
}
}

@@ -1039,6 +1098,7 @@

#define K(x) ((x) << (PAGE_SHIFT-10))

+const char *temp[3] = { "hot", "cold", "zero" };
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -1051,6 +1111,7 @@
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;
struct zone *zone;

for_each_zone(zone) {
@@ -1071,10 +1132,10 @@

pageset = zone->pageset + cpu;

- for (temperature = 0; temperature < 2; temperature++)
+ for (temperature = 0; temperature < 3; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
cpu,
- temperature ? "cold" : "hot",
+ temp[temperature],
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch);
@@ -1082,20 +1143,21 @@
}

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

printk("\nFree pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));

printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
- "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+ "unstable:%lu free:%u zero:%lu slab:%lu mapped:%lu pagetables:%lu\n",
active,
inactive,
ps.nr_dirty,
ps.nr_writeback,
ps.nr_unstable,
nr_free_pages(),
+ zero,
ps.nr_slab,
ps.nr_mapped,
ps.nr_page_table_pages);
@@ -1146,7 +1208,7 @@
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
nr = 0;
- list_for_each(elem, &zone->free_area[order].free_list)
+ list_for_each(elem, &zone->free_area[NOT_ZEROED][order].free_list)
++nr;
total += nr << order;
printk("%lu*%lukB ", nr, K(1UL) << order);
@@ -1470,14 +1532,18 @@
for (order = 0; ; order++) {
unsigned long bitmap_size;

- INIT_LIST_HEAD(&zone->free_area[order].free_list);
+ INIT_LIST_HEAD(&zone->free_area[NOT_ZEROED][order].free_list);
+ INIT_LIST_HEAD(&zone->free_area[ZEROED][order].free_list);
if (order == MAX_ORDER-1) {
- zone->free_area[order].map = NULL;
+ zone->free_area[NOT_ZEROED][order].map = NULL;
+ zone->free_area[ZEROED][order].map = NULL;
break;
}

bitmap_size = pages_to_bitmap_size(order, size);
- zone->free_area[order].map =
+ zone->free_area[NOT_ZEROED][order].map =
+ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+ zone->free_area[ZEROED][order].map =
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
@@ -1503,6 +1569,7 @@

pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->kscrubd_wait);

for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -1525,6 +1592,7 @@
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->zero_pages = 0;

zone->temp_priority = zone->prev_priority = DEF_PRIORITY;

@@ -1558,6 +1626,13 @@
pcp->high = 2 * batch;
pcp->batch = 1 * batch;
INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &zone->pageset[cpu].pcp[2]; /* zero pages */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1687,7 +1762,7 @@
unsigned long nr_bufs = 0;
struct list_head *elem;

- list_for_each(elem, &(zone->free_area[order].free_list))
+ list_for_each(elem, &(zone->free_area[NOT_ZEROED][order].free_list))
++nr_bufs;
seq_printf(m, "%6lu ", nr_bufs);
}
Index: linux-2.6.9/include/linux/mmzone.h
===================================================================
--- linux-2.6.9.orig/include/linux/mmzone.h 2004-12-17 14:40:16.000000000 -0800
+++ linux-2.6.9/include/linux/mmzone.h 2004-12-21 11:01:15.000000000 -0800
@@ -51,7 +51,7 @@
};

struct per_cpu_pageset {
- struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+ struct per_cpu_pages pcp[3]; /* 0: hot, 1: cold, 2: cold zeroed pages */
#ifdef CONFIG_NUMA
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
@@ -107,10 +107,14 @@
* ZONE_HIGHMEM > 896 MB only page cache and user processes
*/

+#define NOT_ZEROED 0
+#define ZEROED 1
+
struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+ unsigned long zero_pages;
/*
* protection[] is a pre-calculated number of extra pages that must be
* available in a zone in order for __alloc_pages() to allocate memory
@@ -131,7 +135,7 @@
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
+ struct free_area free_area[2][MAX_ORDER];


ZONE_PADDING(_pad1_)
@@ -265,6 +269,9 @@
struct pglist_data *pgdat_next;
wait_queue_head_t kswapd_wait;
struct task_struct *kswapd;
+
+ wait_queue_head_t kscrubd_wait;
+ struct task_struct *kscrubd;
} pg_data_t;

#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -274,9 +281,9 @@
extern struct pglist_data *pgdat_list;

void __get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free, struct pglist_data *pgdat);
+ unsigned long *free, unsigned long *zero, struct pglist_data *pgdat);
void get_zone_counts(unsigned long *active, unsigned long *inactive,
- unsigned long *free);
+ unsigned long *free, unsigned long *zero);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone);

Index: linux-2.6.9/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.9.orig/fs/proc/proc_misc.c 2004-12-17 14:40:15.000000000 -0800
+++ linux-2.6.9/fs/proc/proc_misc.c 2004-12-21 11:01:15.000000000 -0800
@@ -158,13 +158,14 @@
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;
unsigned long vmtot;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;

get_page_state(&ps);
- get_zone_counts(&active, &inactive, &free);
+ get_zone_counts(&active, &inactive, &free, &zero);

/*
* display in kilobytes.
@@ -187,6 +188,7 @@
len = sprintf(page,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemZero: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -210,6 +212,7 @@
"VmallocChunk: %8lu kB\n",
K(i.totalram),
K(i.freeram),
+ K(zero),
K(i.bufferram),
K(get_page_cache_size()-total_swapcache_pages-i.bufferram),
K(total_swapcache_pages),
Index: linux-2.6.9/mm/readahead.c
===================================================================
--- linux-2.6.9.orig/mm/readahead.c 2004-10-18 14:53:11.000000000 -0700
+++ linux-2.6.9/mm/readahead.c 2004-12-21 11:01:15.000000000 -0800
@@ -570,7 +570,8 @@
unsigned long active;
unsigned long inactive;
unsigned long free;
+ unsigned long zero;

- __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id()));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(numa_node_id()));
return min(nr, (inactive + free) / 2);
}
Index: linux-2.6.9/drivers/base/node.c
===================================================================
--- linux-2.6.9.orig/drivers/base/node.c 2004-10-18 14:53:22.000000000 -0700
+++ linux-2.6.9/drivers/base/node.c 2004-12-21 11:01:15.000000000 -0800
@@ -41,13 +41,15 @@
unsigned long inactive;
unsigned long active;
unsigned long free;
+ unsigned long zero;

si_meminfo_node(&i, nid);
- __get_zone_counts(&active, &inactive, &free, NODE_DATA(nid));
+ __get_zone_counts(&active, &inactive, &free, &zero, NODE_DATA(nid));

n = sprintf(buf, "\n"
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
+ "Node %d MemZero: %8lu kB\n"
"Node %d MemUsed: %8lu kB\n"
"Node %d Active: %8lu kB\n"
"Node %d Inactive: %8lu kB\n"
@@ -57,6 +59,7 @@
"Node %d LowFree: %8lu kB\n",
nid, K(i.totalram),
nid, K(i.freeram),
+ nid, K(zero),
nid, K(i.totalram - i.freeram),
nid, K(active),
nid, K(inactive),
Index: linux-2.6.9/include/linux/sched.h
===================================================================
--- linux-2.6.9.orig/include/linux/sched.h 2004-12-17 14:40:16.000000000 -0800
+++ linux-2.6.9/include/linux/sched.h 2004-12-21 11:01:15.000000000 -0800
@@ -715,6 +715,7 @@
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
+#define PF_KSCRUBD 0x00800000 /* I am kscrubd */

#ifdef CONFIG_SMP
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
Index: linux-2.6.9/mm/Makefile
===================================================================
--- linux-2.6.9.orig/mm/Makefile 2004-10-18 14:54:37.000000000 -0700
+++ linux-2.6.9/mm/Makefile 2004-12-21 11:01:15.000000000 -0800
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o scrubd.o

obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o prio_tree.o \
Index: linux-2.6.9/mm/scrubd.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.9/mm/scrubd.c 2004-12-21 11:01:15.000000000 -0800
@@ -0,0 +1,148 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/suspend.h>
+#include <linux/sysctl.h>
+#include <linux/scrub.h>
+
+unsigned int sysctl_scrub_start = MAX_ORDER; /* Off */
+unsigned int sysctl_scrub_stop = 2; /* Minimum order of page to zero */
+
+/*
+ * sysctl handler for /proc/sys/vm/scrub_start
+ */
+int scrub_start_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ if (sysctl_scrub_start < MAX_ORDER) {
+ struct zone *zone;
+
+ for_each_zone(zone)
+ wakeup_kscrubd(zone);
+ }
+ return 0;
+}
+
+
+
+LIST_HEAD(zero_drivers);
+
+/*
+ * zero_highest_order_page takes a page off the freelist
+ * and then hands it off to block zeroing agents.
+ * The cleared pages are added to the back of
+ * the freelist where the page allocator may pick them up.
+ */
+int zero_highest_order_page(struct zone *z)
+{
+ int order;
+
+ for(order = MAX_ORDER-1; order >= sysctl_scrub_stop; order--) {
+ struct free_area *area = z->free_area[NOT_ZEROED] + order;
+ if (!list_empty(&area->free_list)) {
+ struct page *page = scrubd_rmpage(z, area, order);
+ struct list_head *l;
+
+ if (!page)
+ continue;
+
+ page->index = order;
+
+ list_for_each(l, &zero_drivers) {
+ struct zero_driver *driver = list_entry(l, struct zero_driver, list);
+ unsigned long size = PAGE_SIZE << order;
+
+ if (driver->start(page_address(page), size) == 0) {
+
+ unsigned ticks = (size*HZ)/driver->rate;
+ if (ticks) {
+ /* Wait the minimum time of the transfer */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(ticks);
+ }
+ /* Then keep on checking until transfer is complete */
+ while (!driver->check())
+ schedule();
+ goto out;
+ }
+ }
+
+ /* Unable to find a zeroing device that would
+ * deal with this page so just do it on our own.
+ * This will likely thrash the cpu caches.
+ */
+ cond_resched();
+ zero_page(page_address(page), order);
+out:
+ end_zero_page(page);
+ cond_resched();
+ return 1 << order;
+ }
+ }
+ return 0;
+}
+
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+ int i;
+ unsigned long pages_zeroed;
+
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ pages_zeroed = 0;
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ pages_zeroed += zero_highest_order_page(zone);
+ }
+ } while (pages_zeroed);
+}
+
+/*
+ * The background scrub daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kscrubd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+ DEFINE_WAIT(wait);
+ cpumask_t cpumask;
+
+ daemonize("kscrubd%d", pgdat->node_id);
+ cpumask = node_to_cpumask(pgdat->node_id);
+ if (!cpus_empty(cpumask))
+ set_cpus_allowed(tsk, cpumask);
+
+ tsk->flags |= PF_MEMALLOC | PF_KSCRUBD;
+
+ for ( ; ; ) {
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_FREEZE);
+ prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&pgdat->kscrubd_wait, &wait);
+
+ scrub_pgdat(pgdat);
+ }
+ return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+ pg_data_t *pgdat;
+ for_each_pgdat(pgdat)
+ pgdat->kscrubd
+ = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+ return 0;
+}
+
+module_init(kscrubd_init)
Index: linux-2.6.9/include/linux/scrub.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.9/include/linux/scrub.h 2004-12-21 11:01:15.000000000 -0800
@@ -0,0 +1,48 @@
+#ifndef _LINUX_SCRUB_H
+#define _LINUX_SCRUB_H
+
+/*
+ * Definitions for scrubbing of memory include an interface
+ * for drivers that may allow the zeroing of memory
+ * without invalidating the caches.
+ *
+ * Christoph Lameter, December 2004.
+ */
+
+struct zero_driver {
+ int (*start)(void *, unsigned length); /* Start bzero transfer */
+ int (*check)(void); /* Check if bzero is complete */
+ int rate; /* bzero rate in MB/sec */
+ struct list_head list;
+};
+
+extern struct list_head zero_drivers;
+
+extern unsigned int sysctl_scrub_start;
+extern unsigned int sysctl_scrub_stop;
+
+/* Registering and unregistering zero drivers */
+static inline void register_zero_driver(struct zero_driver *z)
+{
+ list_add(&z->list, &zero_drivers);
+}
+
+static inline void unregister_zero_driver(struct zero_driver *z)
+{
+ list_del(&z->list);
+}
+
+extern struct page *scrubd_rmpage(struct zone *zone, struct free_area *area, int order);
+
+static void inline wakeup_kscrubd(struct zone *zone)
+{
+ if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait))
+ return;
+ wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait);
+}
+
+int scrub_start_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+extern void end_zero_page(struct page *page);
+#endif
Index: linux-2.6.9/kernel/sysctl.c
===================================================================
--- linux-2.6.9.orig/kernel/sysctl.c 2004-12-17 14:40:17.000000000 -0800
+++ linux-2.6.9/kernel/sysctl.c 2004-12-21 11:01:15.000000000 -0800
@@ -40,6 +40,7 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/scrub.h>
#include <linux/syscalls.h>

#include <asm/uaccess.h>
@@ -816,6 +817,24 @@
.strategy = &sysctl_jiffies,
},
#endif
+ {
+ .ctl_name = VM_SCRUB_START,
+ .procname = "scrub_start",
+ .data = &sysctl_scrub_start,
+ .maxlen = sizeof(sysctl_scrub_start),
+ .mode = 0644,
+ .proc_handler = &scrub_start_handler,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = VM_SCRUB_STOP,
+ .procname = "scrub_stop",
+ .data = &sysctl_scrub_stop,
+ .maxlen = sizeof(sysctl_scrub_stop),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
{ .ctl_name = 0 }
};

Index: linux-2.6.9/include/linux/sysctl.h
===================================================================
--- linux-2.6.9.orig/include/linux/sysctl.h 2004-12-17 14:40:16.000000000 -0800
+++ linux-2.6.9/include/linux/sysctl.h 2004-12-21 11:01:15.000000000 -0800
@@ -168,6 +168,8 @@
VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+ VM_SCRUB_START=30, /* order of free block at which kscrubd starts zeroing */
+ VM_SCRUB_STOP=31, /* lowest order of free block that kscrubd will zero */
};


