[PATCH 1/2] memcg: do not trigger OOM if PF_NO_MEMCG_OOM is set

From: Michal Hocko
Date: Wed Feb 06 2013 - 10:45:07 EST


memcg oom killer might deadlock if the process which falls down to
mem_cgroup_handle_oom holds a lock which prevents other task to
terminate because it is blocked on the very same lock.
This can happen when a write system call needs to allocate a page but
the allocation hits the memcg hard limit and there is nothing to reclaim
(e.g. there is no swap or swap limit is hit as well and all cache pages
have been reclaimed already) and the process selected by memcg OOM
killer is blocked on i_mutex on the same inode (e.g. truncate it).

Process A
[<ffffffff811109b8>] do_truncate+0x58/0xa0 # takes i_mutex
[<ffffffff81121c90>] do_last+0x250/0xa30
[<ffffffff81122547>] path_openat+0xd7/0x440
[<ffffffff811229c9>] do_filp_open+0x49/0xa0
[<ffffffff8110f7d6>] do_sys_open+0x106/0x240
[<ffffffff8110f950>] sys_open+0x20/0x30
[<ffffffff815b5926>] system_call_fastpath+0x18/0x1d
[<ffffffffffffffff>] 0xffffffffffffffff

Process B
[<ffffffff8110a9c1>] mem_cgroup_handle_oom+0x241/0x3b0
[<ffffffff8110b5ab>] T.1146+0x5ab/0x5c0
[<ffffffff8110c22e>] mem_cgroup_cache_charge+0xbe/0xe0
[<ffffffff810ca28c>] add_to_page_cache_locked+0x4c/0x140
[<ffffffff810ca3a2>] add_to_page_cache_lru+0x22/0x50
[<ffffffff810ca45b>] grab_cache_page_write_begin+0x8b/0xe0
[<ffffffff81193a18>] ext3_write_begin+0x88/0x270
[<ffffffff810c8fc6>] generic_file_buffered_write+0x116/0x290
[<ffffffff810cb3cc>] __generic_file_aio_write+0x27c/0x480
[<ffffffff810cb646>] generic_file_aio_write+0x76/0xf0 # takes ->i_mutex
[<ffffffff8111156a>] do_sync_write+0xea/0x130
[<ffffffff81112183>] vfs_write+0xf3/0x1f0
[<ffffffff81112381>] sys_write+0x51/0x90
[<ffffffff815b5926>] system_call_fastpath+0x18/0x1d
[<ffffffffffffffff>] 0xffffffffffffffff

This is not a hard deadlock though because administrator can still
intervene and increase the limit on the group which helps the writer to
finish the allocation and release the lock.

This patch heals the problem by forbidding OOM from dangerous context.
Memcg charging code has no way to find out whether it is called from a
locked context we have to help it via process flags. PF_OOM_ORIGIN flag
removed recently will be reused for PF_NO_MEMCG_OOM which signals that
the memcg OOM killer could lead to a deadlock.
Only locked callers of __generic_file_aio_write are currently marked. I
am pretty sure there are more places (I didn't check shmem and hugetlb
uses fancy instantion mutex during page fault and filesystems might
use some locks during the write) but I've ignored those as this will
probably be just a user specific patch without any way to get upstream
in the current form.

Reported-by: azurIt <azurit@xxxxxxxx>
Signed-off-by: Michal Hocko <mhocko@xxxxxxx>
---
drivers/staging/pohmelfs/inode.c | 2 ++
include/linux/sched.h | 1 +
mm/filemap.c | 2 ++
mm/memcontrol.c | 18 ++++++++++++++----
4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 7a19555..523de82e 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -921,7 +921,9 @@ ssize_t pohmelfs_write(struct file *file, const char __user *buf,
if (ret)
goto err_out_unlock;

+ current->flags |= PF_NO_MEMCG_OOM;
ret = __generic_file_aio_write(&kiocb, &iov, 1, &kiocb.ki_pos);
+ current->flags &= ~PF_NO_MEMCG_OOM;
*ppos = kiocb.ki_pos;

mutex_unlock(&inode->i_mutex);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e86bb4..f275c8f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1781,6 +1781,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
#define PF_FROZEN 0x00010000 /* frozen for system suspend */
#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
#define PF_KSWAPD 0x00040000 /* I am kswapd */
+#define PF_NO_MEMCG_OOM 0x00080000 /* Memcg OOM could lead to a deadlock */
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
diff --git a/mm/filemap.c b/mm/filemap.c
index 556858c..58a316b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2617,7 +2617,9 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,

mutex_lock(&inode->i_mutex);
blk_start_plug(&plug);
+ current->flags |= PF_NO_MEMCG_OOM;
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ current->flags &= ~PF_NO_MEMCG_OOM;
mutex_unlock(&inode->i_mutex);

if (ret > 0 || ret == -EIOCBQUEUED) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c8425b1..128b615 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2397,6 +2397,14 @@ done:
return 0;
nomem:
*ptr = NULL;
+ if (printk_ratelimit())
+ printk(KERN_WARNING"%s: task:%s pid:%d got ENOMEM without OOM for memcg:%p."
+ " If this message shows up very often for the"
+ " same task then there is a risk that the"
+ " process is not able to make any progress"
+ " because of the current limit. Try to enlarge"
+ " the hard limit.\n", __FUNCTION__,
+ current->comm, current->pid, memcg);
return -ENOMEM;
bypass:
*ptr = NULL;
@@ -2703,7 +2711,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
- bool oom = true;
+ bool oom = !(current->flags & PF_NO_MEMCG_OOM);
int ret;

if (PageTransHuge(page)) {
@@ -2770,6 +2778,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
+ bool oom = !(current->flags & PF_NO_MEMCG_OOM);
struct mem_cgroup *memcg = NULL;
int ret;

@@ -2782,7 +2791,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
mm = &init_mm;

if (page_is_file_cache(page)) {
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, oom);
if (ret || !memcg)
return ret;

@@ -2818,6 +2827,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask, struct mem_cgroup **ptr)
{
+ bool oom = !(current->flags & PF_NO_MEMCG_OOM);
struct mem_cgroup *memcg;
int ret;

@@ -2840,13 +2850,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!memcg)
goto charge_cur_mm;
*ptr = memcg;
- ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, oom);
css_put(&memcg->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, 1, ptr, oom);
}

static void
--
1.7.10.4

--
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/