[patch] my latest oom stuff

Andrea Arcangeli (andrea@e-mind.com)
Sat, 24 Oct 1998 20:10:15 +0200 (CEST)


Since it got automagically included in the diff, I am also including my mmap
patch, which improves performance a lot when shared mmaps are synced. This
patch updates the page tables of all processes that are mapping the same
shared memory region.

I have not hit a single problem with this patch so far.
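
For reference, here is a minimal userspace sketch (not part of the patch; the
file name and the hardcoded 4096 page size are arbitrary) of the kind of
workload this helps: two processes map the same file MAP_SHARED and one of
them calls msync(), which ends up in filemap_sync_pte() where the new
update_shared_mappings() is called.

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/shared-test", O_RDWR | O_CREAT, 0600);
        char *map;
        pid_t pid;

        if (fd < 0 || ftruncate(fd, 4096) < 0)
                return 1;
        map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        pid = fork();
        if (pid == 0) {                 /* child keeps the region mapped */
                strcpy(map, "dirtied by the child");
                sleep(2);
                _exit(0);
        }
        sleep(1);                       /* give the child time to dirty the page */
        msync(map, 4096, MS_SYNC);      /* with the patch the child's pte
                                           gets marked clean here too */
        waitpid(pid, NULL, 0);
        munmap(map, 4096);
        close(fd);
        return 0;
}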

Index: linux/mm/filemap.c
diff -u linux/mm/filemap.c:1.1.1.2 linux/mm/filemap.c:1.1.1.1.14.2
--- linux/mm/filemap.c:1.1.1.2 Sat Oct 24 15:42:51 1998
+++ linux/mm/filemap.c Sat Oct 24 16:50:08 1998
@@ -5,6 +5,10 @@
*/

/*
+ * update_shared_mappings(), 1998 Andrea Arcangeli
+ */
+
+/*
* This file handles the generic file mmap semantics used by
* most "normal" filesystems (but you don't /have/ to use this:
* the NFS filesystem used to do this differently, for example)
@@ -1222,6 +1226,75 @@
return mk_pte(page,vma->vm_page_prot);
}

+static void update_one_shared_mapping(struct vm_area_struct *shared,
+                                      unsigned long address, pte_t orig_pte)
+{
+        pgd_t *pgd;
+        pmd_t *pmd;
+        pte_t *pte;
+        struct semaphore * mmap_sem = &shared->vm_mm->mmap_sem;
+
+        down(mmap_sem);
+
+        pgd = pgd_offset(shared->vm_mm, address);
+        if (pgd_none(*pgd))
+                goto out;
+        if (pgd_bad(*pgd)) {
+                printk(KERN_ERR "update_shared_mappings: bad pgd (%08lx)\n",
+                       pgd_val(*pgd));
+                pgd_clear(pgd);
+                goto out;
+        }
+
+        pmd = pmd_offset(pgd, address);
+        if (pmd_none(*pmd))
+                goto out;
+        if (pmd_bad(*pmd))
+        {
+                printk(KERN_ERR "update_shared_mappings: bad pmd (%08lx)\n",
+                       pmd_val(*pmd));
+                pmd_clear(pmd);
+                goto out;
+        }
+
+        pte = pte_offset(pmd, address);
+
+        if (pte_val(pte_mkclean(pte_mkyoung(*pte))) !=
+            pte_val(pte_mkclean(pte_mkyoung(orig_pte))))
+                goto out;
+
+        flush_page_to_ram(pte_page(*pte));
+        flush_cache_page(shared, address);
+        set_pte(pte, pte_mkclean(*pte));
+        flush_tlb_page(shared, address);
+
+ out:
+        up(mmap_sem);
+}
+
+static void update_shared_mappings(struct vm_area_struct *this,
+                                   unsigned long address,
+                                   pte_t orig_pte)
+{
+        if (this->vm_flags & VM_SHARED)
+        {
+                struct file * filp = this->vm_file;
+                if (filp)
+                {
+                        struct inode * inode = filp->f_dentry->d_inode;
+                        struct vm_area_struct * shared;
+
+                        for (shared = inode->i_mmap; shared;
+                             shared = shared->vm_next_share)
+                        {
+                                if (shared == this)
+                                        continue;
+                                update_one_shared_mapping(shared, address,
+                                                          orig_pte);
+                        }
+                }
+        }
+}

static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@@ -1239,6 +1312,7 @@
flush_cache_page(vma, address);
set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
+ update_shared_mappings(vma, address, pte);
page = pte_page(pte);
atomic_inc(&mem_map[MAP_NR(page)].count);
} else {
Index: linux/mm/mmap.c
diff -u linux/mm/mmap.c:1.1.1.1 linux/mm/mmap.c:1.1.1.1.14.1
--- linux/mm/mmap.c:1.1.1.1 Fri Oct 2 19:22:39 1998
+++ linux/mm/mmap.c Tue Oct 20 01:00:08 1998
@@ -498,9 +498,6 @@
free = free->vm_next;
freed = 1;

- mm->map_count--;
- remove_shared_vm_struct(mpnt);
-
st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
end = addr+len;
end = end > mpnt->vm_end ? mpnt->vm_end : end;
@@ -508,6 +505,9 @@

if (mpnt->vm_ops && mpnt->vm_ops->unmap)
mpnt->vm_ops->unmap(mpnt, st, size);
+
+ mm->map_count--;
+ remove_shared_vm_struct(mpnt);

flush_cache_range(mm, st, end);
zap_page_range(mm, st, size);

And here is my stuff rediffed against 2.1.126. The main change in
__get_free_pages() below is that the allocation is tried first, and on
failure kswapd gets woken up and (for __GFP_WAIT callers) some pages are
freed synchronously before retrying once.

Index: linux/mm/page_alloc.c
diff -u linux/mm/page_alloc.c:1.1.1.2 linux/mm/page_alloc.c:1.1.1.1.18.3
--- linux/mm/page_alloc.c:1.1.1.2 Sat Oct 24 15:42:51 1998
+++ linux/mm/page_alloc.c Sat Oct 24 17:42:01 1998
@@ -237,45 +237,31 @@
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
unsigned long flags;
+ int again = 0;
+ int wait = gfp_mask & __GFP_WAIT;

if (order >= NR_MEM_LISTS)
goto nopage;

- if (gfp_mask & __GFP_WAIT) {
- if (in_interrupt()) {
- static int count = 0;
- if (++count < 5) {
- printk("gfp called nonatomically from interrupt %p\n",
- __builtin_return_address(0));
- }
- goto nopage;
- }
-
- if (freepages.min > nr_free_pages) {
- int freed;
- freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
- /*
- * Low priority (user) allocations must not
- * succeed if we didn't have enough memory
- * and we couldn't get more..
- */
- if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
- goto nopage;
- }
+ if (wait && in_interrupt()) {
+ printk("gfp called nonatomically from interrupt %p\n",
+ __builtin_return_address(0));
+ goto nopage;
}
+ again:
spin_lock_irqsave(&page_alloc_lock, flags);
RMQUEUE(order, (gfp_mask & GFP_DMA));
spin_unlock_irqrestore(&page_alloc_lock, flags);
+
+ kswapd_wakeup();
+
+ if (!again && wait)
+ {
+ again = 1;
+ if (try_to_free_pages(gfp_mask, 1 << order))
+ goto again;
+ }

- /*
- * If we failed to find anything, we'll return NULL, but we'll
- * wake up kswapd _now_ ad even wait for it synchronously if
- * we can.. This way we'll at least make some forward progress
- * over time.
- */
- wake_up(&kswapd_wait);
- if (gfp_mask & __GFP_WAIT)
- schedule();
nopage:
return 0;
}

My little vmalloc patch got automagically included too. I am using it
all the time. I know it does not make any interesting difference, but...
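
For context: if I remember right, get_vm_area() reserves one unmapped guard
page after every area and records it in ->size, so the old free path was
passing one page too much to vmfree_area_pages(). Roughly like this (quoted
from memory, so double check against mm/vmalloc.c in your tree):

struct vm_struct * get_vm_area(unsigned long size)
{
        unsigned long addr;
        struct vm_struct **p, *tmp, *area;

        area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
        if (!area)
                return NULL;
        addr = VMALLOC_START;
        for (p = &vmlist; (tmp = *p) ; p = &tmp->next) {
                if (size + addr < (unsigned long) tmp->addr)
                        break;
                addr = tmp->size + (unsigned long) tmp->addr;
                if (addr > VMALLOC_END - size)
                        return NULL;
        }
        area->addr = (void *) addr;
        /* one extra page is kept as an unmapped guard hole */
        area->size = size + PAGE_SIZE;
        area->next = *p;
        *p = area;
        return area;
}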

Index: linux/mm/vmalloc.c
diff -u linux/mm/vmalloc.c:1.1.1.1 linux/mm/vmalloc.c:1.1.1.1.16.1
--- linux/mm/vmalloc.c:1.1.1.1 Fri Oct 2 19:22:39 1998
+++ linux/mm/vmalloc.c Tue Oct 20 01:00:08 1998
@@ -186,7 +186,8 @@
for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
if (tmp->addr == addr) {
*p = tmp->next;
- vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
+ vmfree_area_pages(VMALLOC_VMADDR(tmp->addr),
+ tmp->size - PAGE_SIZE);
kfree(tmp);
return;
}

Index: linux/mm/vmscan.c
diff -u linux/mm/vmscan.c:1.1.1.3 linux/mm/vmscan.c:1.1.1.2.4.7
--- linux/mm/vmscan.c:1.1.1.3 Sat Oct 24 15:42:52 1998
+++ linux/mm/vmscan.c Sat Oct 24 19:55:47 1998
@@ -447,39 +447,43 @@
static int do_try_to_free_page(int gfp_mask)
{
static int state = 0;
- int i=6;
- int stop;
+ int from_prio, to_prio;

/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);

/* We try harder if we are waiting .. */
- stop = 3;
if (gfp_mask & __GFP_WAIT)
- stop = 0;
+ {
+ from_prio = 3;
+ to_prio = 0;
+ } else {
+ from_prio = 6;
+ to_prio = 3;
+ }

if (buffer_over_borrow() || pgcache_over_borrow())
- shrink_mmap(i, gfp_mask);
+ state = 0;

switch (state) {
do {
case 0:
- if (shrink_mmap(i, gfp_mask))
+ if (shrink_mmap(from_prio, gfp_mask))
return 1;
state = 1;
case 1:
- if (shm_swap(i, gfp_mask))
+ if (shm_swap(from_prio, gfp_mask))
return 1;
state = 2;
case 2:
- if (swap_out(i, gfp_mask))
+ if (swap_out(from_prio, gfp_mask))
return 1;
state = 3;
case 3:
- shrink_dcache_memory(i, gfp_mask);
+ shrink_dcache_memory(from_prio, gfp_mask);
state = 0;
- i--;
- } while ((i - stop) >= 0);
+ from_prio--;
+ } while (from_prio >= to_prio);
}
return 0;
}
@@ -524,7 +528,6 @@
lock_kernel();

/* Give kswapd a realtime priority. */
- current->policy = SCHED_FIFO;
current->rt_priority = 32; /* Fixme --- we need to standardise our
namings for POSIX.4 realtime scheduling
priorities. */
@@ -546,12 +549,16 @@
init_swap_timer();
add_wait_queue(&kswapd_wait, &wait);
while (1) {
- int tries;
+ int tries, free_memory, count;

current->state = TASK_INTERRUPTIBLE;
flush_signals(current);
run_task_queue(&tq_disk);
+ timer_active |= 1<<SWAP_TIMER;
+ current->policy = SCHED_FIFO;
schedule();
+ current->policy = SCHED_OTHER;
+ timer_active &= ~(1<<SWAP_TIMER);
swapstats.wakeups++;

/*
@@ -570,12 +577,20 @@
* woken up more often and the rate will be even
* higher).
*/
- tries = pager_daemon.tries_base;
- tries >>= 4*free_memory_available();
+ free_memory = free_memory_available();

- do {
- do_try_to_free_page(0);
+ if (free_memory == 2)
+ continue;
+ tries = pager_daemon.tries_base >> (free_memory + 2);
+
+ for (count = 0; count < tries; count++)
+ {
/*
+ * Stop carefully if we could eat all CPU power. -arca
+ */
+ if (!do_try_to_free_page(0))
+ break;
+ /*
* Syncing large chunks is faster than swapping
* synchronously (less head movement). -- Rik.
*/
@@ -583,7 +598,7 @@
run_task_queue(&tq_disk);
if (free_memory_available() > 1)
break;
- } while (--tries > 0);
+ }
}
/* As if we could ever get here - maybe we want to make this killable */
remove_wait_queue(&kswapd_wait, &wait);
@@ -598,22 +613,22 @@
*
* The "PF_MEMALLOC" flag protects us against recursion:
* if we need more memory as part of a swap-out effort we
- * will just silently return "success" to tell the page
- * allocator to accept the allocation.
+ * will just silently return "fail" to tell the page
+ * allocator that we are OOM.
*/
int try_to_free_pages(unsigned int gfp_mask, int count)
{
- int retval = 1;
+ int retval = 0;

lock_kernel();
if (!(current->flags & PF_MEMALLOC)) {
current->flags |= PF_MEMALLOC;
- do {
+ while (count--)
+ {
retval = do_try_to_free_page(gfp_mask);
if (!retval)
break;
- count--;
- } while (count > 0);
+ }
current->flags &= ~PF_MEMALLOC;
}
unlock_kernel();
Index: linux/kernel/fork.c
diff -u linux/kernel/fork.c:1.1.1.2 linux/kernel/fork.c:1.1.1.2.4.1
--- linux/kernel/fork.c:1.1.1.2 Fri Oct 9 17:44:09 1998
+++ linux/kernel/fork.c Tue Oct 20 01:00:08 1998
@@ -296,6 +296,8 @@
exit_mmap(mm);
free_page_tables(mm);
kmem_cache_free(mm_cachep, mm);
+ if (!free_memory_available())
+ kswapd_wakeup();
}
}

This OOM tuning works perfectly here.

Andrea Arcangeli
