--- .orig/include/linux/swap.h	2006-04-05 20:29:09.000000000 -0400
+++ 01-mapped-bias/include/linux/swap.h	2006-04-03 19:25:15.000000000 -0400
@@ -175,6 +175,7 @@
 extern int try_to_free_pages(struct zone **, gfp_t);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
+extern int vm_mapped_bias;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
--- .orig/include/linux/sysctl.h	2006-04-05 20:29:09.000000000 -0400
+++ 01-mapped-bias/include/linux/sysctl.h	2006-04-03 19:25:04.000000000 -0400
@@ -186,6 +186,7 @@
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
+	VM_MAPPED_BIAS=33,	/* bias reclaim towards preserving mapped memory */
 };
--- .orig/kernel/sysctl.c	2006-04-05 20:29:09.000000000 -0400
+++ 01-mapped-bias/kernel/sysctl.c	2006-04-03 19:40:49.000000000 -0400
@@ -916,6 +916,16 @@
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= VM_MAPPED_BIAS,
+		.procname	= "mapped_bias",
+		.data		= &vm_mapped_bias,
+		.maxlen		= sizeof(vm_mapped_bias),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
--- .orig/mm/filemap.c	2006-04-05 20:29:09.000000000 -0400
+++ 01-mapped-bias/mm/filemap.c	2006-04-03 19:31:07.000000000 -0400
@@ -2020,7 +2020,14 @@
 			if (status >= 0)
 				status = -EFAULT;
 		unlock_page(page);
-		mark_page_accessed(page);
+
+		/*
+		 * Only mark page accessed for partial write
+		 * when mapped bias is in effect.
+		 */
+		if (!vm_mapped_bias || offset + bytes != PAGE_CACHE_SIZE)
+			mark_page_accessed(page);
+
 		page_cache_release(page);
 		if (status < 0)
 			break;
--- .orig/mm/page_alloc.c	2006-04-05 20:29:10.000000000 -0400
+++ 01-mapped-bias/mm/page_alloc.c	2006-04-03 20:00:22.000000000 -0400
@@ -360,7 +360,6 @@
 			1 << PG_private |
 			1 << PG_locked	|
 			1 << PG_active	|
-			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_swapcache |
 			1 << PG_writeback |
@@ -518,7 +517,6 @@
 			1 << PG_locked	|
 			1 << PG_active	|
 			1 << PG_dirty	|
-			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_swapcache |
 			1 << PG_writeback |
@@ -534,7 +532,8 @@
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_checked | 1 << PG_mappedtodisk);
+			1 << PG_checked | 1 << PG_mappedtodisk |
+			1 << PG_reclaim);
 	set_page_private(page, 0);
 	set_page_refs(page, order);
 	kernel_map_pages(page, 1 << order, 1);
--- .orig/mm/swap.c	2006-04-05 20:29:10.000000000 -0400
+++ 01-mapped-bias/mm/swap.c	2006-04-03 19:28:55.000000000 -0400
@@ -103,12 +103,23 @@
 {
 	struct zone *zone = page_zone(page);
 
+	if (unlikely(PageReclaim(page)))
+		ClearPageReclaim(page);
+
 	spin_lock_irq(&zone->lru_lock);
 	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(zone, page);
-		SetPageActive(page);
-		add_page_to_active_list(zone, page);
-		inc_page_state(pgactivate);
+		/*
+		 * Never activate an unmapped page when
+		 * mapped bias is in effect.
+		 */
+		if (!vm_mapped_bias || page_mapped(page)) {
+			del_page_from_inactive_list(zone, page);
+			SetPageActive(page);
+			add_page_to_active_list(zone, page);
+			inc_page_state(pgactivate);
+		} else if (page->lru.prev != &zone->inactive_list) {
+			list_move(&page->lru, &zone->inactive_list);
+		}
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
@@ -127,6 +138,8 @@
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
+		if (PageReclaim(page))
+			ClearPageReclaim(page);
 	}
 }
--- .orig/mm/vmscan.c	2006-04-05 20:29:10.000000000 -0400
+++ 01-mapped-bias/mm/vmscan.c	2006-04-03 20:08:36.000000000 -0400
@@ -79,6 +79,12 @@
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	/* Should we reclaim mapped memory? */
+	int reclaim_mapped;
+
+	/* LRU pass */
+	int pass;
 };
@@ -128,6 +134,30 @@
 int vm_swappiness = 60;
 static long total_memory;
 
+/*
+ * When non-zero, place all unmapped pages on
+ * the inactive list and do not reclaim any mapped
+ * pages unless mapped memory exceeds the threshold
+ * implied by swappiness above.
+ */
+int vm_mapped_bias = 0;
+static int mapped_bias = 0;
+static atomic_t scanner_running = ATOMIC_INIT(-1);
+
+static inline void scanner_start(void)
+{
+	/*
+	 * Re-sync mapped bias on first run.
+	 */
+	if (atomic_inc_and_test(&scanner_running))
+		mapped_bias = vm_mapped_bias;
+}
+
+static inline void scanner_stop(void)
+{
+	atomic_dec(&scanner_running);
+}
+
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -199,7 +229,17 @@
 		unsigned long total_scan;
 		unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
 
-		delta = (4 * scanned) / shrinker->seeks;
+		/*
+		 * With mapped bias in effect, we only count
+		 * inactive pages as part of lru_pages causing
+		 * the default algorithm to be quite aggressive.
+		 * To remedy that, we scan the slabs at a fraction
+		 * of the LRU scan rate.
+		 */
+		if (!mapped_bias)
+			delta = (4 * scanned) / shrinker->seeks;
+		else
+			delta = scanned / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
 		shrinker->nr += delta;
@@ -444,15 +484,28 @@
 
 		sc->nr_scanned++;
 
+		/*
+		 * Do not reclaim a mapped page unless
+		 * necessary when mapped bias is in effect.
+		 */
+		if (mapped_bias && !sc->reclaim_mapped && page_mapped(page))
+			goto do_activate_locked;
+
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
 
 		/* Double the slab pressure for mapped and swapcache pages */
-		if (page_mapped(page) || PageSwapCache(page))
+		if (!mapped_bias && (page_mapped(page) || PageSwapCache(page)))
 			sc->nr_scanned++;
 
-		if (PageWriteback(page))
+		if (PageWriteback(page)) {
+			/*
+			 * Set reclaim bit when mapped bias is in effect.
+			 */
+			if (mapped_bias)
+				SetPageReclaim(page);
 			goto keep_locked;
+		}
 
 		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable? Activate it. */
@@ -567,8 +620,15 @@
 		continue;
 
 activate_locked:
-		SetPageActive(page);
-		pgactivate++;
+		/*
+		 * Do not activate an unmapped page if
+		 * mapped bias is in effect.
+		 */
+		if (!mapped_bias || page_mapped(page)) {
+do_activate_locked:
+			SetPageActive(page);
+			pgactivate++;
+		}
 keep_locked:
 		unlock_page(page);
 keep:
@@ -1200,48 +1260,6 @@
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-
-	if (unlikely(sc->may_swap)) {
-		long mapped_ratio;
-		long distress;
-		long swap_tendency;
-
-		/*
-		 * `distress' is a measure of how much trouble we're having
-		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-		 */
-		distress = 100 >> zone->prev_priority;
-
-		/*
-		 * The point of this algorithm is to decide when to start
-		 * reclaiming mapped memory instead of just pagecache. Work out
-		 * how much memory
-		 * is mapped.
-		 */
-		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-		/*
-		 * Now decide how much we really want to unmap some pages.  The
-		 * mapped ratio is downgraded - just because there's a lot of
-		 * mapped memory doesn't necessarily mean that page reclaim
-		 * isn't succeeding.
-		 *
-		 * The distress ratio is important - we don't want to start
-		 * going oom.
-		 *
-		 * A 100% value of vm_swappiness overrides this algorithm
-		 * altogether.
-		 */
-		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-		/*
-		 * Now use this metric to decide whether to start moving mapped
-		 * memory onto the inactive list.
-		 */
-		if (swap_tendency >= 100)
-			reclaim_mapped = 1;
-	}
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
@@ -1256,7 +1274,7 @@
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			if (!reclaim_mapped ||
+			if (!sc->reclaim_mapped ||
 			    (total_swap_pages == 0 && PageAnon(page)) ||
 			    page_referenced(page, 0)) {
 				list_add(&page->lru, &l_active);
@@ -1333,6 +1351,51 @@
 	unsigned long nr_active;
 	unsigned long nr_inactive;
 
+	sc->reclaim_mapped = 0;
+	if (unlikely(sc->may_swap)) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+		 *
+		 * When mapped bias is in effect, only apply distress in the
+		 * last pass.
+		 */
+		distress = (sc->pass == 0) ? 100 >> zone->prev_priority : 0;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache. Work out
+		 * how much memory
+		 * is mapped.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how much we really want to unmap some pages.  The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			sc->reclaim_mapped = 1;
+	}
+
 	atomic_inc(&zone->reclaim_in_progress);
 
 	/*
@@ -1437,10 +1500,14 @@
 	struct scan_control sc;
 	unsigned long lru_pages = 0;
 	int i;
+	int max_priority;
+
+	scanner_start();
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = !laptop_mode;
 	sc.may_swap = 1;
+	sc.pass = !!mapped_bias;
 
 	inc_page_state(allocstall);
@@ -1451,10 +1518,22 @@
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
-		lru_pages += zone->nr_active + zone->nr_inactive;
+
+		/*
+		 * When mapped bias is in effect,
+		 * do not count active pages.
+		 */
+		lru_pages += zone->nr_inactive;
+		if (!mapped_bias)
+			lru_pages += zone->nr_active;
 	}
 
+again:
+	/*
+	 * Only scan down to 0 on the last pass.
+	 */
+	max_priority = (sc.pass == 0) ? 0 : 1;
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	for (priority = DEF_PRIORITY; priority >= max_priority; priority--) {
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		sc.nr_reclaimed = 0;
@@ -1491,6 +1570,8 @@
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
+	if (--sc.pass >= 0)
+		goto again;
 out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
@@ -1500,6 +1581,7 @@
 		zone->prev_priority = zone->temp_priority;
 	}
+	scanner_stop();
 	return ret;
 }
@@ -1531,12 +1613,15 @@
 static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
 {
 	int to_free = nr_pages;
-	int all_zones_ok;
+	int all_zones_ok = 1;
 	int priority;
 	int i;
 	int total_scanned, total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc;
+	int max_priority;
+
+	scanner_start();
 
 loop_again:
 	total_scanned = 0;
@@ -1545,6 +1630,7 @@
 	sc.may_writepage = !laptop_mode;
 	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.pass = (nr_pages == 0) ? !!mapped_bias : 0;
 
 	inc_page_state(pageoutrun);
@@ -1553,8 +1639,13 @@
 		zone->temp_priority = DEF_PRIORITY;
 	}
 
-
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+again:
+	/*
+	 * Only scan down to 0 on the last pass.
+	 */
+	max_priority = (sc.pass == 0) ? 0 : 1;
+
+	for (priority = DEF_PRIORITY; priority >= max_priority; priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
@@ -1563,7 +1654,7 @@
 		disable_swap_token();
 
 		all_zones_ok = 1;
-
+
 		if (nr_pages == 0) {
 			/*
 			 * Scan in the highmem->dma direction for the highest
@@ -1593,7 +1684,13 @@
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
-			lru_pages += zone->nr_active + zone->nr_inactive;
+			/*
+			 * When mapped bias is in effect,
+			 * do not count active pages.
+			 */
+			lru_pages += zone->nr_inactive;
+			if (!mapped_bias)
+				lru_pages += zone->nr_active;
 		}
 
 		/*
@@ -1637,7 +1734,8 @@
 			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				    (zone->nr_active + zone->nr_inactive) * 4)
+				    (zone->nr_active + zone->nr_inactive) * 4 *
+				    (mapped_bias + 1))
 				zone->all_unreclaimable = 1;
 			/*
 			 * If we've done a decent amount of scanning and
@@ -1651,7 +1749,7 @@
 		if (nr_pages && to_free > total_reclaimed)
 			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
-			break;		/* kswapd: all done */
+			goto out;	/* kswapd: all done */
 		/*
 		 * OK, kswapd is getting into trouble.  Take a nap, then take
 		 * another pass across the zones.
@@ -1666,8 +1764,10 @@
 		 * on zone->*_priority.
 		 */
 		if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
-			break;
+			goto out;
 	}
+	if (--sc.pass >= 0)
+		goto again;
 out:
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -1678,6 +1778,7 @@
 		cond_resched();
 		goto loop_again;
 	}
 
+	scanner_stop();
 	return total_reclaimed;
 }
@@ -1898,6 +1999,8 @@
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;
 
+	scanner_start();
+
 	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
 	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
 	sc.nr_scanned = 0;
@@ -1905,6 +2008,7 @@
 	sc.priority = ZONE_RECLAIM_PRIORITY + 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 	sc.gfp_mask = gfp_mask;
+	sc.pass = 0;
 
 	disable_swap_token();
@@ -1952,6 +2056,8 @@
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
+	scanner_stop();
+
 	return sc.nr_reclaimed >= nr_pages;
 }
 #endif
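
A note on the threshold math: with the default vm_swappiness of 60 shown above,
sc->reclaim_mapped is only set once mapped_ratio / 2 + distress reaches 40, and
with mapped bias enabled the distress term stays zero until the final LRU pass,
so mapped pages are deferred noticeably longer than in the stock algorithm.

The knob itself is registered with procname "mapped_bias" and proc_dointvec, so
on a kernel carrying this patch it should show up as /proc/sys/vm/mapped_bias
(vm.mapped_bias via sysctl(8)). A minimal userspace sketch for toggling it is
below; the helper name and error handling are illustrative only, not part of
the patch:

	/* Toggle the mapped_bias sysctl added by this patch (illustrative). */
	#include <stdio.h>
	#include <stdlib.h>

	/* Hypothetical helper, not part of the patch. */
	static int set_mapped_bias(int value)
	{
		FILE *f = fopen("/proc/sys/vm/mapped_bias", "w");

		if (!f) {
			perror("/proc/sys/vm/mapped_bias");
			return -1;
		}
		fprintf(f, "%d\n", value);
		return fclose(f);
	}

	int main(int argc, char **argv)
	{
		/* Default to 1: bias reclaim towards preserving mapped memory. */
		int value = (argc > 1) ? atoi(argv[1]) : 1;

		return set_mapped_bias(value) ? EXIT_FAILURE : EXIT_SUCCESS;
	}

Since scanner_start() only re-reads vm_mapped_bias when no other scanner is
running, a value written here takes effect on a subsequent reclaim cycle rather
than immediately.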