Re: [patch] my latest oom stuff

Andrea Arcangeli (andrea@e-mind.com)
Mon, 26 Oct 1998 15:25:00 +0100 (CET)


On Mon, 26 Oct 1998, Andrea Arcangeli wrote:

>Here a little _incremental_ patch though:

I changed try_to_free_pages(). Now it doesn't stop if
do_try_to_free_pages() fails, and instead it calls schedule() if need_resched
is true. Doing that, the kernel doesn't deadlock and we can continue to
do_try_to_free_pages(). I also changed the calculation of the retval (I should
check whether try_to_free_pages() == 0 can happen if PF_MEMALLOC is not
set, but theoretically it makes sense).

I also added a dynamic priority to take care of the
buffer/pgcache_over_max case. In such a case we don't want kswapd to run
with a high priority, since we just have tons of memory free. This patch
also causes kswapd to be more aggressive against other normal processes when
we are low on memory.

This is the _whole_ new patch against pre-2.1.127-1 (the one against
2.1.126 is here: ftp://e-mind.com/pub/linux/oom-16...).

It seems to be the best solution so far here.

Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.3 linux/mm/page_alloc.c:1.1.1.1.18.4
--- linux/mm/page_alloc.c:1.1.1.3 Sun Oct 25 01:28:52 1998
+++ linux/mm/page_alloc.c Sat Oct 24 20:25:17 1998
@@ -237,43 +237,29 @@
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
+ int again = 0;
+ int wait = gfp_mask & __GFP_WAIT;

if (order >= NR_MEM_LISTS)
goto nopage;

- if (gfp_mask & __GFP_WAIT) {
- if (in_interrupt()) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- __builtin_return_address(0));
- }
- goto nopage;
- }
-
- if (freepages.min > nr_free_pages) {
- int freed;
- freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
- /*
- * Low priority (user) allocations must not
- * succeed if we didn't have enough memory
- * and we couldn't get more..
- */
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
- }
+ if (wait && in_interrupt()) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ __builtin_return_address(0));
+ goto nopage;
}
+ again:
spin_lock_irqsave(&page_alloc_lock, flags);
RMQUEUE(order, (gfp_mask & GFP_DMA));
spin_unlock_irqrestore(&page_alloc_lock, flags);
+
+ if (!again && wait)
+ {
+ again = 1;
+ if (try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX))
+ goto again;
+ }

- /*
- * If we failed to find anything, we'll return NULL, but we'll
- * wake up kswapd _now_ and even wait for it synchronously if
- * we can.. This way we'll at least make some forward progress
- * over time.
- */
- kswapd_notify(gfp_mask);
nopage:
return 0;
}
Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.4 linux/mm/vmscan.c:1.1.1.2.4.17
--- linux/mm/vmscan.c:1.1.1.4 Sun Oct 25 01:28:52 1998
+++ linux/mm/vmscan.c Mon Oct 26 14:55:32 1998
@@ -442,39 +442,43 @@
static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
- int i=6;
- int stop;
+ int from_prio, to_prio;

/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);

/* We try harder if we are waiting .. */
- stop = 3;
if (gfp_mask & __GFP_WAIT)
- stop = 0;
+ {
+ from_prio = 3;
+ to_prio = 0;
+ } else {
+ from_prio = 6;
+ to_prio = 3;
+ }

if (buffer_over_borrow() || pgcache_over_borrow())
- shrink_mmap(i, gfp_mask);
+ state = 0;

switch (state) {
do {
case 0:
- if (shrink_mmap(i, gfp_mask))
+ if (shrink_mmap(from_prio, gfp_mask))
return 1;
state = 1;
case 1:
- if (shm_swap(i, gfp_mask))
+ if (shm_swap(from_prio, gfp_mask))
return 1;
state = 2;
case 2:
- if (swap_out(i, gfp_mask))
+ if (swap_out(from_prio, gfp_mask))
return 1;
state = 3;
case 3:
- shrink_dcache_memory(i, gfp_mask);
+ shrink_dcache_memory(from_prio, gfp_mask);
state = 0;
- i--;
- } while ((i - stop) >= 0);
+ from_prio--;
+ } while (from_prio >= to_prio);
}
return 0;
}
@@ -517,13 +521,6 @@
lock_kernel();

/*
- * Set the base priority to something smaller than a
- * regular process. We will scale up the priority
- * dynamically depending on how much memory we need.
- */
- current->priority = (DEF_PRIORITY * 2) / 3;
-
- /*
* Tell the memory management that we're a "memory allocator",
* and that if we need more memory we should get access to it
* regardless (see "try_to_free_pages()"). "kswapd" should
@@ -540,44 +537,28 @@
init_swap_timer();
kswapd_task = current;
while (1) {
- int tries;
+ unsigned long stop;

- current->state = TASK_INTERRUPTIBLE;
+/* run_task_queue(&tq_disk); */
flush_signals(current);
- run_task_queue(&tq_disk);
+ /*
+ * Remeber to enable up the swap tick before go to sleep.
+ */
+ timer_active |= 1<<SWAP_TIMER;
+ current->state = TASK_INTERRUPTIBLE;
schedule();
swapstats.wakeups++;

/*
- * Do the background pageout: be
- * more aggressive if we're really
- * low on free memory.
- *
- * We try page_daemon.tries_base times, divided by
- * an 'urgency factor'. In practice this will mean
- * a value of pager_daemon.tries_base / 8 or 4 = 64
- * or 128 pages at a time.
- * This gives us 64 (or 128) * 4k * 4 (times/sec) =
- * 1 (or 2) MB/s swapping bandwidth in low-priority
- * background paging. This number rises to 8 MB/s
- * when the priority is highest (but then we'll be
- * woken up more often and the rate will be even
- * higher).
+ * Do the pageout for at most some jiffy.
*/
- tries = pager_daemon.tries_base;
- tries >>= 4*free_memory_available();
-
+ stop = jiffies + (2 - free_memory_available());
do {
do_try_to_free_page(0);
- /*
- * Syncing large chunks is faster than swapping
- * synchronously (less head movement). -- Rik.
- */
- if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster)
- run_task_queue(&tq_disk);
- if (free_memory_available() > 1)
+ if (free_memory_available() == 2 && buffer_under_max()
+ && pgcache_under_max())
break;
- } while (--tries > 0);
+ } while ((long)(stop - jiffies) >= 0);
}
/* As if we could ever get here - maybe we want to make this killable */
kswapd_task = NULL;
@@ -592,81 +573,41 @@
*
* The "PF_MEMALLOC" flag protects us against recursion:
* if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
+ * will just silently return "fail" to tell the page
+ * allocator that we are recursing.
*/
int try_to_free_pages(unsigned int gfp_mask, int count)
{
- int retval = 1;
+ int retval = 0;

lock_kernel();
if (!(current->flags & PF_MEMALLOC)) {
+ retval = 1;
current->flags |= PF_MEMALLOC;
- do {
- retval = do_try_to_free_page(gfp_mask);
- if (!retval)
+ while (count--)
+ if (!do_try_to_free_page(gfp_mask))
break;
- count--;
- } while (count > 0);
current->flags &= ~PF_MEMALLOC;
}
unlock_kernel();
return retval;
}

-/*
- * Wake up kswapd according to the priority
- * 0 - no wakeup
- * 1 - wake up as a low-priority process
- * 2 - wake up as a normal process
- * 3 - wake up as an almost real-time process
- *
- * This plays mind-games with the "goodness()"
- * function in kernel/sched.c.
- */
-static inline void kswapd_wakeup(int priority)
-{
- if (priority) {
- struct task_struct *p = kswapd_task;
- if (p) {
- p->counter = p->priority << priority;
- wake_up_process(p);
- }
- }
-}
-
/*
* The swap_tick function gets called on every clock tick.
*/
void swap_tick(void)
{
- unsigned int pages;
- int want_wakeup;
-
+ int free_memory = free_memory_available();
/*
* Schedule for wakeup if there isn't lots
* of free memory or if there is too much
* of it used for buffers or pgcache.
- *
- * "want_wakeup" is our priority: 0 means
- * not to wake anything up, while 3 means
- * that we'd better give kswapd a realtime
- * priority.
*/
- want_wakeup = 0;
- if (buffer_over_max() || pgcache_over_max())
- want_wakeup = 1;
- pages = nr_free_pages;
- if (pages < freepages.high)
- want_wakeup = 1;
- if (pages < freepages.low)
- want_wakeup = 2;
- if (pages < freepages.min)
- want_wakeup = 3;
-
- kswapd_wakeup(want_wakeup);
-
- timer_active |= (1<<SWAP_TIMER);
+ if (free_memory != 2 || buffer_over_max() || pgcache_over_max())
+ kswapd_wakeup(free_memory);
+ else
+ timer_active |= (1<<SWAP_TIMER);
}

/*
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.2 linux/kernel/fork.c:1.1.1.2.4.3
--- linux/kernel/fork.c:1.1.1.2 Fri Oct 9 17:44:09 1998
+++ linux/kernel/fork.c Mon Oct 26 14:54:48 1998
@@ -292,10 +292,16 @@
void mmput(struct mm_struct *mm)
{
if (atomic_dec_and_test(&mm->count)) {
+ int free_memory;
+
release_segments(mm);
exit_mmap(mm);
free_page_tables(mm);
kmem_cache_free(mm_cachep, mm);
+ free_memory = free_memory_available();
+
+ if (free_memory != 2)
+ kswapd_wakeup(free_memory);
}
}

Index: linux/include/linux/mm.h
diff -u linux/include/linux/mm.h:1.1.1.3 linux/include/linux/mm.h:1.1.1.1.16.3
--- linux/include/linux/mm.h:1.1.1.3 Sun Oct 25 01:28:37 1998
+++ linux/include/linux/mm.h Mon Oct 26 14:54:48 1998
@@ -330,14 +330,25 @@
extern int free_memory_available(void);
extern struct task_struct * kswapd_task;

-extern inline void kswapd_notify(unsigned int gfp_mask)
+static inline void kswapd_wakeup(int free_memory)
{
- if (kswapd_task) {
- wake_up_process(kswapd_task);
- if (gfp_mask & __GFP_WAIT) {
- current->policy |= SCHED_YIELD;
- schedule();
+ struct task_struct *p = kswapd_task;
+ if (p)
+ {
+ long priority;
+ switch (free_memory)
+ {
+ case 0:
+ priority = DEF_PRIORITY << 1;
+ break;
+ case 2:
+ priority = DEF_PRIORITY >> 1;
+ break;
+ default:
+ priority = DEF_PRIORITY;
}
+ p->priority = priority;
+ wake_up_process(p);
}
}

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/