Out of memory problem

Rauli Ruohonen (raulir@fishy.pp.sci.fi)
Wed, 2 Jul 1997 12:58:52 +0300 (EET DST)


Here's a patch I came up with when I wanted saner handling
of out-of-memory conditions. Do not use it in a production system: it's
not "smart" yet, and it probably contains many bugs, as this is the first
patch I've ever tried to write. The algorithm that picks the process to be
killed is NOT smart; I just wanted to do something to make it possible to
*have* an algorithm instead of just semi-randomly killing processes.

There are many places where I'm unsure whether my changes are correct,
and the "there's-always-memory" policy this patch uses is incorrect. It
would be nice if somebody more experienced in kernel hacking could finish
this, so that *then* people could start arguing about the best algorithm(s),
which could then be picked at compile time or run time. It could be completely
tunable by user-mode programs, or use the original "algorithm", whatever.

This is against 2.0.31-1, but it may apply to later kernels as well.
If this kind of thing ever gets into the kernel, it should go into 2.1.x, though.

diff -u -d -r /home/linux.old/fs/exec.c /usr/src/linux/fs/exec.c
--- /home/linux.old/fs/exec.c Mon Sep 9 21:04:57 1996
+++ /usr/src/linux/fs/exec.c Wed Jun 18 21:05:00 1997
@@ -366,15 +366,16 @@
/*
* The clear_page_tables done later on exec does the right thing
* to the page directory when shared, except for graceful abort
- * (the oom is wrong there, too, IMHO)
*/
if (current->mm->count > 1) {
- struct mm_struct *mm = kmalloc(sizeof(*mm), GFP_KERNEL);
- if (!mm) {
- /* this is wrong, I think. */
- oom(current);
- return;
- }
+ struct mm_struct *mm;
+ int try;
+
+ for(try=0;!(mm=kmalloc(sizeof(*mm),GFP_KERNEL));try++)
+ if(!oom_kill(current,try)) {
+ oom(current);
+ return;
+ }
*mm = *current->mm;
mm->def_flags = 0; /* should future lockings be kept? */
mm->count = 1;
diff -u -d -r /home/linux.old/fs/proc/array.c /usr/src/linux/fs/proc/array.c
--- /home/linux.old/fs/proc/array.c Sun Jun 1 02:19:16 1997
+++ /usr/src/linux/fs/proc/array.c Wed Jun 18 14:41:37 1997
@@ -694,7 +694,7 @@

return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d\n",
pid,
tsk->comm,
state,
@@ -731,7 +731,9 @@
sigcatch,
wchan,
tsk->nswap,
- tsk->cnswap);
+ tsk->cnswap,
+ calc_proc_score(tsk),
+ calc_user_score(tsk->euid));
}

static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
diff -u -d -r /home/linux.old/include/linux/mm.h /usr/src/linux/include/linux/mm.h
--- /home/linux.old/include/linux/mm.h Sat Jun 14 16:21:04 1997
+++ /usr/src/linux/include/linux/mm.h Wed Jun 18 21:13:51 1997
@@ -273,6 +273,9 @@
extern void mem_init(unsigned long start_mem, unsigned long end_mem);
extern void show_mem(void);
extern void oom(struct task_struct * tsk);
+extern int oom_kill(struct task_struct *cause,int try);
+extern int calc_proc_score(struct task_struct *task);
+extern int calc_user_score(uid_t uid);
extern void si_meminfo(struct sysinfo * val);

/* vmalloc.c */
diff -u -d -r /home/linux.old/ipc/shm.c /usr/src/linux/ipc/shm.c
--- /home/linux.old/ipc/shm.c Sun Jun 1 02:03:29 1997
+++ /usr/src/linux/ipc/shm.c Wed Jun 18 21:05:25 1997
@@ -651,11 +651,14 @@

pte_val(pte) = shp->shm_pages[idx];
if (!pte_present(pte)) {
- unsigned long page = get_free_page(GFP_KERNEL);
- if (!page) {
- oom(current);
- return BAD_PAGE;
- }
+ unsigned long page;
+ int try;
+
+ for(try=0;!(page=get_free_page(GFP_KERNEL));try++)
+ if(!oom_kill(current,try)) {
+ oom(current);
+ return BAD_PAGE;
+ }
pte_val(pte) = shp->shm_pages[idx];
if (pte_present(pte)) {
free_page (page); /* doesn't sleep */
diff -u -d -r /home/linux.old/mm/memory.c /usr/src/linux/mm/memory.c
--- /home/linux.old/mm/memory.c Mon Sep 9 21:04:57 1996
+++ /usr/src/linux/mm/memory.c Thu Jun 19 01:38:49 1997
@@ -44,6 +44,7 @@
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/resource.h>

#include <asm/system.h>
#include <asm/segment.h>
@@ -82,6 +83,94 @@
send_sig(SIGKILL,task,1);
}

+/* Badness of one task; oom_kill() kills the highest scorer.  Returns -INT_MAX
+ * (never chosen) for zombies, state -1 and pids below 2, else 5000 * task vm /
+ * (RAM + swap), lowered by 100 + 5*rt_priority for non-SCHED_OTHER tasks. */
+int calc_proc_score(struct task_struct *task)
+{
+	unsigned long vm_pages=0,total_pages;
+	struct sysinfo inf;
+	int score=0;
+
+	/* Cheap exemption tests first: oom_kill() calls this for every task. */
+	if(task->state==TASK_ZOMBIE||task->state==-1) return -INT_MAX;
+	if(task->pid<2) return -INT_MAX;
+	if(task->policy!=SCHED_OTHER) score-=100+(5*task->rt_priority);
+	if(task->mm&&task->mm!=&init_mm) vm_pages=task->mm->total_vm;
+	si_meminfo(&inf);
+	si_swapinfo(&inf);
+	/* Work in pages and unsigned long: the old int 5000*kB product overflowed
+	 * near 420MB of vm.  Totals assumed to be bytes, as the old >>10 implied. */
+	total_pages=(inf.totalram>>PAGE_SHIFT)+(inf.totalswap>>PAGE_SHIFT);
+	if(total_pages) score+=(int)((5000UL*vm_pages)/total_pages);
+	return score;
+}
+
+/* Per-user score: -1000 for uid 0, -2 if a task has no RSS limit, 0 while the
+ * user's tasks fit the largest RSS limit, else the percent overshoot. */
+int calc_user_score(uid_t uid)
+{
+	unsigned long vm_pages=0,lim_pages;
+	long lim=0,val;
+	struct task_struct *p;
+
+	if(!uid) return -1000;
+#ifndef RLIMIT_RSS
+	return -1;
+#else
+	for_each_task(p) {
+		if(p->euid!=uid) continue;
+		if(p->mm&&p->mm!=&init_mm) vm_pages+=p->mm->total_vm;
+		if((val=p->rlim[RLIMIT_RSS].rlim_max)>lim) {
+			if(val==RLIM_INFINITY) return -2;
+			lim=val;
+		}
+	}
+	lim_pages=(unsigned long)lim>>PAGE_SHIFT;
+	if(vm_pages<=lim_pages) return 0; /* also covers no tasks / no vm */
+	/* Pages keep 100*excess in range (bytes overflowed a 32-bit long past ~21MB). */
+	if(!lim_pages) lim_pages=1; /* sub-page limit: avoid dividing by zero */
+	return (int)((100*(vm_pages-lim_pages))/lim_pages); /* 2x use = +100 */
+#endif
+}
+
+/*
+ * Out of memory: kill the task with the highest combined process + user
+ * score rather than the task that happened to fault.  'try' is the caller's
+ * retry count.  Returns 1 after signalling a victim (the caller then retries
+ * its allocation), 0 if the caller should fall back to plain oom().
+ */
+int oom_kill(struct task_struct *cause,int try)
+{
+	struct task_struct *p,*victim=NULL;
+	int score,tmp,hscore=-INT_MAX;
+
+	/* Give up after five attempts; test before the scan, which is O(n*n). */
+	if(try>4) return 0;
+	for_each_task(p) {
+		score=calc_proc_score(p);
+		if(score!=INT_MAX&&score!=-INT_MAX) {
+			/* euid, matching calc_user_score()'s own task filter */
+			tmp=calc_user_score(p->euid);
+			if(tmp!=INT_MAX&&tmp!=-INT_MAX) score+=tmp;
+			else score=tmp;
+		}
+		if(score>hscore) { victim=p; hscore=score; }
+	}
+	/* No killable task, or the best victim is the requester itself. */
+	if(!victim||hscore==-INT_MAX||cause==victim) return 0;
+	printk("\nOut of memory for %s, killing %s (pid %d, score %d).\n",
+	       cause->comm,victim->comm,victim->pid,hscore);
+	/* Reset the SIGKILL disposition and unblock it before sending. */
+	victim->sig->action[SIGKILL-1].sa_handler = NULL;
+	victim->blocked &= ~(1<<(SIGKILL-1));
+	send_sig(SIGKILL,victim,1);
+	victim->policy=SCHED_FIFO; /* presumably so the victim runs soon */
+	victim->rt_priority=101;
+	schedule(); /* The signal should be processed.. */
+	return 1;
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -550,24 +639,25 @@
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
+ int try;

if (page >= high_memory)
printk("put_dirty_page: trying to put page %08lx at %08lx\n",page,address);
if (mem_map[MAP_NR(page)].count != 1)
printk("mem_map disagrees with %08lx at %08lx\n",page,address);
pgd = pgd_offset(tsk->mm,address);
- pmd = pmd_alloc(pgd, address);
- if (!pmd) {
- free_page(page);
- oom(tsk);
- return 0;
- }
- pte = pte_alloc(pmd, address);
- if (!pte) {
- free_page(page);
- oom(tsk);
- return 0;
- }
+ for(try=0;!(pmd=pmd_alloc(pgd,address));try++)
+ if(!oom_kill(tsk,try)) {
+ free_page(page);
+ oom(tsk);
+ return 0;
+ }
+ for(try=0;!(pte=pte_alloc(pmd,address));try++)
+ if(!oom_kill(tsk,try)) {
+ free_page(page);
+ oom(tsk);
+ return 0;
+ }
if (!pte_none(*pte)) {
printk("put_dirty_page: page already exists\n");
free_page(page);
@@ -629,6 +719,10 @@
* Do we need to copy?
*/
if (mem_map[MAP_NR(old_page)].count != 1) {
+ int try;
+
+ for(try=0;!new_page;try++,new_page=__get_free_page(GFP_KERNEL))
+ if(!oom_kill(tsk,try)) break;
if (new_page) {
if (PageReserved(mem_map + MAP_NR(old_page)))
++vma->vm_mm->rss;
@@ -877,14 +971,13 @@
pte_t * page_table;
pte_t entry;
unsigned long page;
+ int try,tmp;

pgd = pgd_offset(tsk->mm, address);
- pmd = pmd_alloc(pgd, address);
- if (!pmd)
- goto no_memory;
- page_table = pte_alloc(pmd, address);
- if (!page_table)
- goto no_memory;
+ for(try=0;!(pmd=pmd_alloc(pgd,address));try++)
+ if(!oom_kill(tsk,try)) goto no_memory;
+ for(try=0;!(page_table = pte_alloc(pmd, address));try++)
+ if(!oom_kill(tsk,try)) goto no_memory;
entry = *page_table;
if (pte_present(entry))
goto is_present;
@@ -898,10 +991,12 @@
* to copy, not share the page even if sharing is possible. It's
* essentially an early COW detection
*/
- page = vma->vm_ops->nopage(vma, address,
- (vma->vm_flags & VM_SHARED)?0:write_access);
- if (!page)
- goto sigbus;
+ tmp=(vma->vm_flags&VM_SHARED)?0:write_access;
+ for(try=0;!(page=vma->vm_ops->nopage(vma,address,tmp));try++)
+ if(!oom_kill(current,try)) {
+ oom(current);
+ goto sigbus;
+ }
++tsk->maj_flt;
++vma->vm_mm->rss;
/*
@@ -927,9 +1022,13 @@
anonymous_page:
entry = pte_wrprotect(mk_pte(ZERO_PAGE, vma->vm_page_prot));
if (write_access) {
- unsigned long page = __get_free_page(GFP_KERNEL);
- if (!page)
- goto sigbus;
+ unsigned long page;
+
+ for(try=0;!(page=__get_free_page(GFP_KERNEL));try++)
+ if(!oom_kill(current,try)) {
+ oom(current);
+ goto sigbus;
+ }
memset((void *) page, 0, PAGE_SIZE);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
vma->vm_mm->rss++;
@@ -940,7 +1039,6 @@
return;

sigbus:
- force_sig(SIGBUS, current);
put_page(page_table, BAD_PAGE);
/* no need to invalidate, wasn't present */
return;
@@ -993,11 +1091,11 @@
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
+ int try;

pgd = pgd_offset(vma->vm_mm, address);
- pmd = pmd_alloc(pgd, address);
- if (!pmd)
- goto no_memory;
+ for(try=0;!(pmd=pmd_alloc(pgd,address));try++)
+ if(!oom_kill(current,try)) goto no_memory;
pte = pte_alloc(pmd, address);
if (!pte)
goto no_memory;
diff -u -d -r /home/linux.old/mm/mmap.c /usr/src/linux/mm/mmap.c
--- /home/linux.old/mm/mmap.c Sun Jun 1 02:06:28 1997
+++ /usr/src/linux/mm/mmap.c Wed Jun 18 23:56:09 1997
@@ -64,7 +64,8 @@
freepages += nr_swap_pages;
freepages += nr_swap_cache_pages;
freepages -= MAP_NR(high_memory) >> 4;
- return freepages > pages;
+/* return freepages > pages;*/
+ return 1;
}

asmlinkage unsigned long sys_brk(unsigned long brk)
diff -u -d -r /home/linux.old/mm/page_alloc.c /usr/src/linux/mm/page_alloc.c
--- /home/linux.old/mm/page_alloc.c Wed Jun 11 03:09:25 1997
+++ /usr/src/linux/mm/page_alloc.c Wed Jun 18 21:10:57 1997
@@ -309,17 +309,19 @@
pte_t * page_table, unsigned long entry, int write_access)
{
unsigned long page = __get_free_page(GFP_KERNEL);
+ int try;

if (pte_val(*page_table) != entry) {
free_page(page);
return;
}
- if (!page) {
- set_pte(page_table, BAD_PAGE);
- swap_free(entry);
- oom(tsk);
- return;
- }
+ for(try=0;!page;try++,page=__get_free_page(GFP_KERNEL))
+ if(!oom_kill(tsk,try)) {
+ set_pte(page_table, BAD_PAGE);
+ swap_free(entry);
+ oom(tsk);
+ return;
+ }
read_swap_page(entry, (char *) page);
if (pte_val(*page_table) != entry) {
free_page(page);