[RFC PATCH 1/7] mm: remove the per-task RSS counter cache

From: Kairui Song
Date: Thu Jul 28 2022 - 16:45:34 EST


From: Kairui Song <kasong@xxxxxxxxxxx>

The RSS counter cached as introduced in
commit 34e55232e59f ("mm: avoid false sharing of mm_counter") to ease
the contention of the RSS counters of a mm_struct.

There are several problems with this, and the 64 events threshold
might not be an optimal value. It makes the RSS value inaccurate,
in the worst case, RSS value is not accounted until 64 pages are
allocated. With common tools like `top`, there could be hundreds of
MBs of error of the RSS value being reported by kernel.

And since 4 counters share the same event threshold, in the worst case,
each counter will do a global sync every 16 events, which still raises
some contention.

Remove this cache for now, and prepare for a different approach.

Some helper macros are kept since they will come in handy later.

Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
---
Documentation/filesystems/proc.rst | 7 -----
fs/exec.c | 2 --
include/linux/mm.h | 20 +-----------
include/linux/mm_types_task.h | 9 ------
include/linux/sched.h | 3 --
kernel/exit.c | 5 ---
kernel/fork.c | 4 ---
kernel/kthread.c | 1 -
mm/madvise.c | 7 ++---
mm/memory.c | 49 ------------------------------
10 files changed, 3 insertions(+), 104 deletions(-)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 1bc91fb8c321..04a0a18da262 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -224,13 +224,6 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
contains detailed information about the process itself. Its fields are
explained in Table 1-4.

-(for SMP CONFIG users)
-
-For making accounting scalable, RSS related information are handled in an
-asynchronous manner and the value may not be very precise. To see a precise
-snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
-It's slow but very precise.
-
.. table:: Table 1-2: Contents of the status files (as of 4.19)

========================== ===================================================
diff --git a/fs/exec.c b/fs/exec.c
index 778123259e42..3c787ca8c68e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -988,8 +988,6 @@ static int exec_mmap(struct mm_struct *mm)
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
- if (old_mm)
- sync_mm_rss(old_mm);

ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..6346f7e77dc7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1998,17 +1998,7 @@ static inline bool get_user_page_fast_only(unsigned long addr,
*/
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
- long val = atomic_long_read(&mm->rss_stat.count[member]);
-
-#ifdef SPLIT_RSS_COUNTING
- /*
- * counter is updated in asynchronous manner and may go to minus.
- * But it's never be expected number for users.
- */
- if (val < 0)
- val = 0;
-#endif
- return (unsigned long)val;
+ return atomic_long_read(&mm->rss_stat.count[member]);
}

void mm_trace_rss_stat(struct mm_struct *mm, int member, long count);
@@ -2094,14 +2084,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss;
}

-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..a00327c663db 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -48,15 +48,6 @@ enum {
NR_MM_COUNTERS
};

-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
-/* per-thread cached information, */
-struct task_rss_stat {
- int events; /* for synchronization threshold */
- int count[NR_MM_COUNTERS];
-};
-#endif /* USE_SPLIT_PTE_PTLOCKS */
-
struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..11d3e1a95302 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -861,9 +861,6 @@ struct task_struct {
/* Per-thread vma caching: */
struct vmacache vmacache;

-#ifdef SPLIT_RSS_COUNTING
- struct task_rss_stat rss_stat;
-#endif
int exit_state;
int exit_code;
int exit_signal;
diff --git a/kernel/exit.c b/kernel/exit.c
index 64c938ce36fe..8c55cda5136f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -482,7 +482,6 @@ static void exit_mm(void)
exit_mm_release(current, mm);
if (!mm)
return;
- sync_mm_rss(mm);
mmap_read_lock(mm);
mmgrab(mm);
BUG_ON(mm != current->active_mm);
@@ -749,10 +748,6 @@ void __noreturn do_exit(long code)

io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
-
- /* sync mm's RSS info before statistics gathering */
- if (tsk->mm)
- sync_mm_rss(tsk->mm);
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d46c69..c090ebd55063 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2145,10 +2145,6 @@ static __latent_entropy struct task_struct *copy_process(
p->io_uring = NULL;
#endif

-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3c677918d8f2..6bfbab4e2103 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1463,7 +1463,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
* clearing tsk->mm.
*/
smp_mb__after_spinlock();
- sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
diff --git a/mm/madvise.c b/mm/madvise.c
index 0316bbc6441b..48cb9e5f92d2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -711,12 +711,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
mark_page_lazyfree(page);
}
out:
- if (nr_swap) {
- if (current->mm == mm)
- sync_mm_rss(mm);
-
+ if (nr_swap)
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
- }
+
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
cond_resched();
diff --git a/mm/memory.c b/mm/memory.c
index 4cf7d4b6c950..6bf7826e666b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -176,53 +176,9 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
trace_rss_stat(mm, member, count);
}

-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
- int i;
-
- for (i = 0; i < NR_MM_COUNTERS; i++) {
- if (current->rss_stat.count[i]) {
- add_mm_counter(mm, i, current->rss_stat.count[i]);
- current->rss_stat.count[i] = 0;
- }
- }
- current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
- struct task_struct *task = current;
-
- if (likely(task->mm == mm))
- task->rss_stat.count[member] += val;
- else
- add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH (64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
- if (unlikely(task != current))
- return;
- if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
- sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)

-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -502,8 +458,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;

- if (current->mm == mm)
- sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
@@ -5120,9 +5074,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);

- /* do counter updates before entering really critical section. */
- check_sync_rss_stat(current);
-
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
--
2.35.2