[RFC PATCH 2/8] memcg, mm: Return ENOMEM or delay if memcg_over_limit

From: Waiman Long
Date: Mon Aug 17 2020 - 10:10:10 EST


The brk(), mmap(), mlock(), mlockall() and mprotect() syscalls are
modified to check the memcg_over_limit flag and return ENOMEM when it
is set and the memory control action is PR_MEMACT_ENOMEM.

If the action is PR_MEMACT_SLOWDOWN, an artificial delay of 20ms
is added instead to slow down the memory allocation syscalls.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
include/linux/sched.h | 16 ++++++++++++++++
kernel/fork.c | 1 +
mm/memcontrol.c | 25 +++++++++++++++++++++++--
mm/mlock.c | 6 ++++++
mm/mmap.c | 12 ++++++++++++
mm/mprotect.c | 3 +++
6 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c79d606d27ab..9ec1bd072334 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1477,6 +1477,22 @@ static inline char task_state_to_char(struct task_struct *tsk)
return task_index_to_char(task_state_index(tsk));
}

+#ifdef CONFIG_MEMCG
+extern bool mem_cgroup_check_over_limit(void);
+
+static inline bool mem_over_memcg_limit(void)
+{
+ if (READ_ONCE(current->memcg_over_limit))
+ return mem_cgroup_check_over_limit();
+ return false;
+}
+#else
+static inline bool mem_over_memcg_limit(void)
+{
+ return false;
+}
+#endif
+
/**
* is_global_init - check if a task structure is init. Since init
* is free to have sub-threads we need to check tgid.
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190861bd..61f9a9e5f857 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -940,6 +940,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)

#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->memcg_over_limit = false;
#endif
return tsk;

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1106dac024ac..5cad7bb26d13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2646,7 +2646,9 @@ static bool __mem_cgroup_over_high_action(struct mem_cgroup *memcg, u8 action)
if (!mm)
return true; /* No more check is needed */

- current->memcg_over_limit = false;
+ if (READ_ONCE(current->memcg_over_limit))
+ WRITE_ONCE(current->memcg_over_limit, false);
+
if ((action == PR_MEMACT_SIGNAL) && !signal)
goto out;

@@ -2660,7 +2662,11 @@ static bool __mem_cgroup_over_high_action(struct mem_cgroup *memcg, u8 action)
WRITE_ONCE(current->memcg_over_limit, true);
break;
case PR_MEMACT_SLOWDOWN:
- /* Slow down by yielding the cpu */
+ /*
+ * Slow down by yielding the cpu & adding delay to
+ * memory allocation syscalls.
+ */
+ WRITE_ONCE(current->memcg_over_limit, true);
set_tsk_need_resched(current);
set_preempt_need_resched();
break;
@@ -2694,6 +2700,21 @@ static inline bool mem_cgroup_over_high_action(struct mem_cgroup *memcg)
return __mem_cgroup_over_high_action(memcg, action);
}

+/*
+ * Called from memory allocation syscalls.
+ * Return true if ENOMEM should be returned, false otherwise.
+ */
+bool mem_cgroup_check_over_limit(void)
+{
+ u8 action = READ_ONCE(current->memcg_over_high_action);
+
+ if (action == PR_MEMACT_ENOMEM)
+ return true;
+ if (action == PR_MEMACT_SLOWDOWN)
+ msleep(20); /* Artificial delay of 20ms */
+ return false;
+}
+
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
diff --git a/mm/mlock.c b/mm/mlock.c
index 93ca2bf30b4f..130d4b3fa0f5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -678,6 +678,9 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (!can_do_mlock())
return -EPERM;

+ if (mem_over_memcg_limit())
+ return -ENOMEM;
+
len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;

@@ -807,6 +810,9 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (!can_do_mlock())
return -EPERM;

+ if (mem_over_memcg_limit())
+ return -ENOMEM;
+
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;

diff --git a/mm/mmap.c b/mm/mmap.c
index 40248d84ad5f..873ccf2560a6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -198,6 +198,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
bool downgraded = false;
LIST_HEAD(uf);

+ /* Too much memory used? */
+ if (mem_over_memcg_limit())
+ return -ENOMEM;
+
if (mmap_write_lock_killable(mm))
return -EINTR;

@@ -1407,6 +1411,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;

+ /* Too much memory used? */
+ if (mem_over_memcg_limit())
+ return -ENOMEM;
+
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
@@ -1557,6 +1565,10 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
struct file *file = NULL;
unsigned long retval;

+ /* Too much memory used? */
+ if (mem_over_memcg_limit())
+ return -ENOMEM;
+
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ce8b8a5eacbb..b2c0f50bb0a0 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -519,6 +519,9 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);

+ if (mem_over_memcg_limit())
+ return -ENOMEM;
+
start = untagged_addr(start);

prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
--
2.18.1