Re: [PATCH v2 2/7] mm: vmscan: make global slab shrink lockless

From: Qi Zheng
Date: Sat Feb 25 2023 - 10:58:30 EST




On 2023/2/25 23:30, Kirill Tkhai wrote:
On 25.02.2023 11:08, Qi Zheng wrote:


On 2023/2/25 05:14, Kirill Tkhai wrote:
On 25.02.2023 00:02, Kirill Tkhai wrote:
On 24.02.2023 07:00, Qi Zheng wrote:


On 2023/2/24 02:24, Sultan Alsawaf wrote:
On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
The shrinker_rwsem is a global lock in shrinkers subsystem,
it is easy to cause blocking in the following cases:

a. the write lock of shrinker_rwsem was held for too long.
     For example, there are many memcgs in the system, which
     causes some paths to hold locks and traverse it for too
     long. (e.g. expand_shrinker_info())
b. the read lock of shrinker_rwsem was held for too long,
     and a writer came at this time. Then this writer will be
     forced to wait and block all subsequent readers.
     For example:
     - be scheduled when the read lock of shrinker_rwsem is
       held in do_shrink_slab()
     - some shrinker are blocked for too long. Like the case
       mentioned in the patchset[1].

Therefore, many times in history ([2],[3],[4],[5]), some
people wanted to replace shrinker_rwsem reader with SRCU,
but they all gave up because SRCU was not unconditionally
enabled.

But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
the SRCU is unconditionally enabled. So it's time to use
SRCU to protect readers who previously held shrinker_rwsem.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@xxxxxxxxxxxxx/
[2]. https://lore.kernel.org/all/1437080113.3596.2.camel@xxxxxxxxxxxx/
[3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@xxxxxxxxxxxxxxxxxxx/
[4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
[5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@xxxxxxxxxxxxxxx/

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
   mm/vmscan.c | 27 +++++++++++----------------
   1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9f895ca6216c..02987a6f95d1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
     LIST_HEAD(shrinker_list);
   DECLARE_RWSEM(shrinker_rwsem);
+DEFINE_SRCU(shrinker_srcu);
     #ifdef CONFIG_MEMCG
   static int shrinker_nr_max;
@@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
   void register_shrinker_prepared(struct shrinker *shrinker)
   {
       down_write(&shrinker_rwsem);
-    list_add_tail(&shrinker->list, &shrinker_list);
+    list_add_tail_rcu(&shrinker->list, &shrinker_list);
       shrinker->flags |= SHRINKER_REGISTERED;
       shrinker_debugfs_add(shrinker);
       up_write(&shrinker_rwsem);
@@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
           return;
         down_write(&shrinker_rwsem);
-    list_del(&shrinker->list);
+    list_del_rcu(&shrinker->list);
       shrinker->flags &= ~SHRINKER_REGISTERED;
       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
           unregister_memcg_shrinker(shrinker);
       debugfs_entry = shrinker_debugfs_remove(shrinker);
       up_write(&shrinker_rwsem);
   +    synchronize_srcu(&shrinker_srcu);
+
       debugfs_remove_recursive(debugfs_entry);
         kfree(shrinker->nr_deferred);
@@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
   {
       down_write(&shrinker_rwsem);
       up_write(&shrinker_rwsem);
+    synchronize_srcu(&shrinker_srcu);
   }
   EXPORT_SYMBOL(synchronize_shrinkers);
   @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
   {
       unsigned long ret, freed = 0;
       struct shrinker *shrinker;
+    int srcu_idx;
         /*
        * The root memcg might be allocated even though memcg is disabled
@@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
       if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
   -    if (!down_read_trylock(&shrinker_rwsem))
-        goto out;
+    srcu_idx = srcu_read_lock(&shrinker_srcu);
   -    list_for_each_entry(shrinker, &shrinker_list, list) {
+    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
+                 srcu_read_lock_held(&shrinker_srcu)) {
           struct shrink_control sc = {
               .gfp_mask = gfp_mask,
               .nid = nid,
@@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
           if (ret == SHRINK_EMPTY)
               ret = 0;
           freed += ret;
-        /*
-         * Bail out if someone want to register a new shrinker to
-         * prevent the registration from being stalled for long periods
-         * by parallel ongoing shrinking.
-         */
-        if (rwsem_is_contended(&shrinker_rwsem)) {
-            freed = freed ? : 1;
-            break;
-        }
       }
   -    up_read(&shrinker_rwsem);
-out:
+    srcu_read_unlock(&shrinker_srcu, srcu_idx);
       cond_resched();
       return freed;
   }
--
2.20.1



Hi Qi,

A different problem I realized after my old attempt to use SRCU was that the
unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
call. Both register_shrinker() *and* unregister_shrinker() are called frequently
these days, and SRCU is too unfair to the unregister path IMO.

Hi Sultan,

IIUC, for unregister_shrinker(), the wait time is hardly longer with
SRCU than with shrinker_rwsem before.

And I just did a simple test. After using the script in cover letter to
increase the shrink_slab hotspot, I did umount 1k times at the same
time, and then I used bpftrace to measure the time consumption of
unregister_shrinker() as follows:

bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'

@ns[umount]:
[16K, 32K)             3 |      |
[32K, 64K)            66 |@@@@@@@@@@      |
[64K, 128K)           32 |@@@@@      |
[128K, 256K)          22 |@@@      |
[256K, 512K)          48 |@@@@@@@      |
[512K, 1M)            19 |@@@      |
[1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
[2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
[8M, 16M)             55 |@@@@@@@@@

I see that the highest time-consuming of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?

Hi Kirill,


The fundamental difference is that before the patchset this for_each_set_bit() iteration could be broken in the middle
of two do_shrink_slab() calls, while after the patchset we can leave for_each_set_bit() only after visiting all set bits.

After looking at the git log[1], I saw that we originally introduced
rwsem_is_contendent() here to aviod blocking register_shrinker(),
not unregister_shrinker().

So I am curious, do we really care about the speed of
unregister_shrinker()?

My opinion is that for general reasons we should avoid long unbreakable actions in kernel. Especially when they may be called
synchronous from userspace.

Got it.

And maybe you missed the previous comments below.


We even have this as generic rule. See check_hung_task().

Before, the longest sleep in unregister_shrinker() was a sleep waiting for single longest do_shrink_slab().

After the patch the longest sleep will be a sleep waiting for all do_shrink_slab() calls (all set bits in shrinker_info).

And after using SRCU, register_shrinker() will not be blocked by slab
shrink at all.

[1]. https://github.com/torvalds/linux/commit/e496612


Using only synchronize_srcu_expedited() won't help here.

My opinion is we should restore a check similar to the rwsem_is_contendent() check that we had before. Something like

If we really care about the speed of unregister_shrinker() like
register_shrinker(), I think this is a good idea. This guarantees
at least the speed of the unregister_shrinker() is not deteriorated. :)

the below on top of your patchset merged into appropriate patch:

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 27ef9946ae8a..50e7812468ec 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
  LIST_HEAD(shrinker_list);
  DEFINE_MUTEX(shrinker_mutex);
  DEFINE_SRCU(shrinker_srcu);
+static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
    #ifdef CONFIG_MEMCG
  static int shrinker_nr_max;
@@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
      debugfs_entry = shrinker_debugfs_remove(shrinker);
      mutex_unlock(&shrinker_mutex);
  +    atomic_inc(&shrinker_srcu_generation);
      synchronize_srcu(&shrinker_srcu);
        debugfs_remove_recursive(debugfs_entry);
@@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
   */
  void synchronize_shrinkers(void)
  {
+    atomic_inc(&shrinker_srcu_generation);
      synchronize_srcu(&shrinker_srcu);
  }
  EXPORT_SYMBOL(synchronize_shrinkers);
@@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  {
      struct shrinker_info *info;
      unsigned long ret, freed = 0;
-    int srcu_idx;
+    int srcu_idx, generation;
      int i;
        if (!mem_cgroup_online(memcg))
@@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
      if (unlikely(!info))
          goto unlock;
  +    generation = atomic_read(&shrinker_srcu_generation);
      for_each_set_bit(i, info->map, info->map_nr_max) {
          struct shrink_control sc = {
              .gfp_mask = gfp_mask,
@@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                  set_shrinker_bit(memcg, nid, i);
          }
          freed += ret;
+
+        if (atomic_read(&shrinker_srcu_generation) != generation) {
+            freed = freed ? : 1;
+            break;
+        }
      }
  unlock:
      srcu_read_unlock(&shrinker_srcu, srcu_idx);
@@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
  {
      unsigned long ret, freed = 0;
      struct shrinker *shrinker;
-    int srcu_idx;
+    int srcu_idx, generation;
        /*
       * The root memcg might be allocated even though memcg is disabled
@@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
          return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
        srcu_idx = srcu_read_lock(&shrinker_srcu);
+    generation = atomic_read(&shrinker_srcu_generation);
        list_for_each_entry_srcu(shrinker, &shrinker_list, list,
                   srcu_read_lock_held(&shrinker_srcu)) {
@@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
          if (ret == SHRINK_EMPTY)
              ret = 0;
          freed += ret;
+
+        if (atomic_read(&shrinker_srcu_generation) != generation) {
+            freed = freed ? : 1;
+            break;
+        }
      }
        srcu_read_unlock(&shrinker_srcu, srcu_idx);

Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:

Maybe we can also do this for global slab shrink? Like below:

How about this?


diff --git a/mm/vmscan.c b/mm/vmscan.c
index ffddbd204259..9d8c53075298 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                                 int priority)
 {
        unsigned long ret, freed = 0;
-       struct shrinker *shrinker;
+       struct shrinker *shrinker = NULL;
        int srcu_idx, generation;

        /*
@@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
                return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

+again:
        srcu_idx = srcu_read_lock(&shrinker_srcu);

        generation = atomic_read(&shrinker_srcu_generation);
-       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
-                                srcu_read_lock_held(&shrinker_srcu)) {
+       if (!shrinker)
+               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
+       else
+               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
+       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
@@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                freed += ret;

                if (atomic_read(&shrinker_srcu_generation) != generation) {
-                       freed = freed ? : 1;
-                       break;
+                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
+                       cond_resched();
+                       goto again;
                }
        }


diff --git a/mm/vmscan.c b/mm/vmscan.c
index 27ef9946ae8a..0b197bba1257 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
  LIST_HEAD(shrinker_list);
  DEFINE_MUTEX(shrinker_mutex);
  DEFINE_SRCU(shrinker_srcu);
+static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
    #ifdef CONFIG_MEMCG
  static int shrinker_nr_max;
@@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
      debugfs_entry = shrinker_debugfs_remove(shrinker);
      mutex_unlock(&shrinker_mutex);
  +    atomic_inc(&shrinker_srcu_generation);
      synchronize_srcu(&shrinker_srcu);
        debugfs_remove_recursive(debugfs_entry);
@@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
   */
  void synchronize_shrinkers(void)
  {
+    atomic_inc(&shrinker_srcu_generation);
      synchronize_srcu(&shrinker_srcu);
  }
  EXPORT_SYMBOL(synchronize_shrinkers);
@@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  {
      struct shrinker_info *info;
      unsigned long ret, freed = 0;
-    int srcu_idx;
-    int i;
+    int srcu_idx, generation;
+    int i = 0;
        if (!mem_cgroup_online(memcg))
          return 0;
-
+again:
      srcu_idx = srcu_read_lock(&shrinker_srcu);
      info = shrinker_info_srcu(memcg, nid);
      if (unlikely(!info))
          goto unlock;
  -    for_each_set_bit(i, info->map, info->map_nr_max) {
+    generation = atomic_read(&shrinker_srcu_generation);
+    for_each_set_bit_from(i, info->map, info->map_nr_max) {
          struct shrink_control sc = {
              .gfp_mask = gfp_mask,
              .nid = nid,
@@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                  set_shrinker_bit(memcg, nid, i);
          }
          freed += ret;
+
+        if (atomic_read(&shrinker_srcu_generation) != generation) {
+            srcu_read_unlock(&shrinker_srcu, srcu_idx);

Maybe we can add the following code here, so as to avoid repeating the
current id and avoid triggering softlockup:

            i++;
            cond_resched();

And this. :)

Thanks,
Qi


Thanks,
Qi

+            goto again;
+        }
      }
  unlock:
      srcu_read_unlock(&shrinker_srcu, srcu_idx);
@@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
  {
      unsigned long ret, freed = 0;
      struct shrinker *shrinker;
-    int srcu_idx;
+    int srcu_idx, generation;
        /*
       * The root memcg might be allocated even though memcg is disabled
@@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
          return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
        srcu_idx = srcu_read_lock(&shrinker_srcu);
+    generation = atomic_read(&shrinker_srcu_generation);
        list_for_each_entry_srcu(shrinker, &shrinker_list, list,
                   srcu_read_lock_held(&shrinker_srcu)) {
@@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
          if (ret == SHRINK_EMPTY)
              ret = 0;
          freed += ret;
+
+        if (atomic_read(&shrinker_srcu_generation) != generation) {
+            freed = freed ? : 1;
+            break;
+        }
      }
        srcu_read_unlock(&shrinker_srcu, srcu_idx);





--
Thanks,
Qi