[RFC][PATCH] oom_kill: avoid depends on total_vm and use realRSS/swap value for oom_score (Re: Memory overcommit

From: KAMEZAWA Hiroyuki
Date: Tue Oct 27 2009 - 03:48:07 EST


On Tue, 27 Oct 2009 15:55:26 +0900
Minchan Kim <minchan.kim@xxxxxxxxx> wrote:

> >> Hmm.
> >> I wonder why we consider VM size for OOM kiling.
> >> How about RSS size?
> >>
> >
> > Maybe the current code assumes "Tons of swap have been generated, already" if
> > oom-kill is invoked. Then, just using mm->anon_rss will not be correct.
> >
> > Hm, should we count # of swap entries reference from mm ?....
>
> In Vedran case, he didn't use swap. So, Only considering vm is the problem.
> I think it would be better to consider both RSS + # of swap entries as
> Kosaki mentioned.
>
Then, maybe this kind of patch is necessary.
This is on 2.6.31...then I may have to rebase this to mmotom.
Added more CCs.

Vedran, I'm glad if you can test this patch.


==
Now, oom-killer's score uses mm->total_vm as its base value.
But, in these days, applications like GUI program tend to use
much shared libraries and total_vm grows too high even when
pages are not fully mapped.

For example, running a program "mmap" which allocates 1 GBbytes of
anonymous memory, oom_score top 10 on system will be..

score PID name
89924 3938 mixer_applet2
90210 3942 tomboy
94753 3936 clock-applet
101994 3919 pulseaudio
113525 4028 gnome-terminal
127340 1 init
128177 3871 nautilus
151003 11515 bash
256944 11653 mmap <-----------------use 1G of anon
425561 3829 gnome-session

No one believes gnome-session is more guilty than "mmap".

Instead of total_vm, we should use anon/file/swap usage of a process, I think.
This patch adds mm->swap_usage and calculate oom_score based on
anon_rss + file_rss + swap_usage.
Considering usual applications, this will be much better information than
total_vm. After this patch, the score on my desktop is

score PID name
4033 3176 gnome-panel
4077 3113 xinit
4526 3190 python
4820 3161 gnome-settings-
4989 3289 gnome-terminal
7105 3271 tomboy
8427 3177 nautilus
17549 3140 gnome-session
128501 3299 bash
256106 3383 mmap

This order is not bad, I think.

Note: This adss new counter...then new cost is added.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
include/linux/mm_types.h | 1 +
mm/memory.c | 29 +++++++++++++++++++++--------
mm/oom_kill.c | 12 +++++++++---
mm/rmap.c | 1 +
mm/swapfile.c | 1 +
5 files changed, 33 insertions(+), 11 deletions(-)

Index: linux-2.6.31/include/linux/mm_types.h
===================================================================
--- linux-2.6.31.orig/include/linux/mm_types.h
+++ linux-2.6.31/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
*/
mm_counter_t _file_rss;
mm_counter_t _anon_rss;
+ mm_counter_t _swap_usage;

unsigned long hiwater_rss; /* High-watermark of RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
Index: linux-2.6.31/mm/memory.c
===================================================================
--- linux-2.6.31.orig/mm/memory.c
+++ linux-2.6.31/mm/memory.c
@@ -361,12 +361,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
return 0;
}

-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline
+void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swaps)
{
if (file_rss)
add_mm_counter(mm, file_rss, file_rss);
if (anon_rss)
add_mm_counter(mm, anon_rss, anon_rss);
+ if (swaps)
+ add_mm_counter(mm, swap_usage, swaps);
}

/*
@@ -562,6 +565,8 @@ copy_one_pte(struct mm_struct *dst_mm, s
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
+ if (!is_migration_entry(entry))
+ rss[2]++;
if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
@@ -611,10 +616,10 @@ static int copy_pte_range(struct mm_stru
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[3];

again:
- rss[1] = rss[0] = 0;
+ rss[2] = rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
@@ -645,7 +650,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
- add_mm_rss(dst_mm, rss[0], rss[1]);
+ add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
pte_unmap_unlock(dst_pte - 1, dst_ptl);
cond_resched();
if (addr != end)
@@ -769,6 +774,7 @@ static unsigned long zap_pte_range(struc
spinlock_t *ptl;
int file_rss = 0;
int anon_rss = 0;
+ int swaps = 0;

pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -838,13 +844,19 @@ static unsigned long zap_pte_range(struc
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
- } else if
- (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
- print_bad_pte(vma, addr, ptent, NULL);
+ } else {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ if (!is_migration_entry(entry))
+ swaps++;
+
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ }
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

- add_mm_rss(mm, file_rss, anon_rss);
+ add_mm_rss(mm, file_rss, anon_rss, swaps);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);

@@ -2573,6 +2585,7 @@ static int do_swap_page(struct mm_struct
*/

inc_mm_counter(mm, anon_rss);
+ dec_mm_counter(mm, swap_usage);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: linux-2.6.31/mm/rmap.c
===================================================================
--- linux-2.6.31.orig/mm/rmap.c
+++ linux-2.6.31/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
spin_unlock(&mmlist_lock);
}
dec_mm_counter(mm, anon_rss);
+ inc_mm_counter(mm, swap_usage);
} else if (PAGE_MIGRATION) {
/*
* Store the pfn of the page in a special migration
Index: linux-2.6.31/mm/swapfile.c
===================================================================
--- linux-2.6.31.orig/mm/swapfile.c
+++ linux-2.6.31/mm/swapfile.c
@@ -830,6 +830,7 @@ static int unuse_pte(struct vm_area_stru
}

inc_mm_counter(vma->vm_mm, anon_rss);
+ dec_mm_counter(vma->vm_mm, swap_usage);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6.31/mm/oom_kill.c
===================================================================
--- linux-2.6.31.orig/mm/oom_kill.c
+++ linux-2.6.31/mm/oom_kill.c
@@ -69,7 +69,8 @@ unsigned long badness(struct task_struct
/*
* The memory size of the process is the basis for the badness.
*/
- points = mm->total_vm;
+ points = get_mm_counter(mm, anon_rss) + get_mm_counter(mm, file_rss)
+ + get_mm_counter(mm, swap_usage);

/*
* After this unlock we can no longer dereference local variable `mm'
@@ -92,8 +93,13 @@ unsigned long badness(struct task_struct
*/
list_for_each_entry(child, &p->children, sibling) {
task_lock(child);
- if (child->mm != mm && child->mm)
- points += child->mm->total_vm/2 + 1;
+ if (child->mm != mm && child->mm) {
+ unsigned long cpoint;
+ /* At considering child, we don't count swap */
+ cpoint = get_mm_counter(child->mm, anon_rss) +
+ get_mm_counter(child->mm, file_rss);
+ points += cpoint/2 + 1;
+ }
task_unlock(child);
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/