Re: [PATCH v2 2/7] mm: vmscan: make global slab shrink lockless

From: Kirill Tkhai
Date: Sat Feb 25 2023 - 10:30:58 EST


On 25.02.2023 11:08, Qi Zheng wrote:
>
>
> On 2023/2/25 05:14, Kirill Tkhai wrote:
>> On 25.02.2023 00:02, Kirill Tkhai wrote:
>>> On 24.02.2023 07:00, Qi Zheng wrote:
>>>>
>>>>
>>>> On 2023/2/24 02:24, Sultan Alsawaf wrote:
>>>>> On Thu, Feb 23, 2023 at 09:27:20PM +0800, Qi Zheng wrote:
>>>>>> The shrinker_rwsem is a global lock in shrinkers subsystem,
>>>>>> it is easy to cause blocking in the following cases:
>>>>>>
>>>>>> a. the write lock of shrinker_rwsem was held for too long.
>>>>>>      For example, there are many memcgs in the system, which
>>>>>>      causes some paths to hold locks and traverse it for too
>>>>>>      long. (e.g. expand_shrinker_info())
>>>>>> b. the read lock of shrinker_rwsem was held for too long,
>>>>>>      and a writer came at this time. Then this writer will be
>>>>>>      forced to wait and block all subsequent readers.
>>>>>>      For example:
>>>>>>      - be scheduled when the read lock of shrinker_rwsem is
>>>>>>        held in do_shrink_slab()
>>>>>>      - some shrinker are blocked for too long. Like the case
>>>>>>        mentioned in the patchset[1].
>>>>>>
>>>>>> Therefore, many times in history ([2],[3],[4],[5]), some
>>>>>> people wanted to replace shrinker_rwsem reader with SRCU,
>>>>>> but they all gave up because SRCU was not unconditionally
>>>>>> enabled.
>>>>>>
>>>>>> But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"),
>>>>>> the SRCU is unconditionally enabled. So it's time to use
>>>>>> SRCU to protect readers who previously held shrinker_rwsem.
>>>>>>
>>>>>> [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@xxxxxxxxxxxxx/
>>>>>> [2]. https://lore.kernel.org/all/1437080113.3596.2.camel@xxxxxxxxxxxx/
>>>>>> [3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@xxxxxxxxxxxxxxxxxxx/
>>>>>> [4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
>>>>>> [5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@xxxxxxxxxxxxxxx/
>>>>>>
>>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
>>>>>> ---
>>>>>>    mm/vmscan.c | 27 +++++++++++----------------
>>>>>>    1 file changed, 11 insertions(+), 16 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>>>>> index 9f895ca6216c..02987a6f95d1 100644
>>>>>> --- a/mm/vmscan.c
>>>>>> +++ b/mm/vmscan.c
>>>>>> @@ -202,6 +202,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>>>>      LIST_HEAD(shrinker_list);
>>>>>>    DECLARE_RWSEM(shrinker_rwsem);
>>>>>> +DEFINE_SRCU(shrinker_srcu);
>>>>>>      #ifdef CONFIG_MEMCG
>>>>>>    static int shrinker_nr_max;
>>>>>> @@ -706,7 +707,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
>>>>>>    void register_shrinker_prepared(struct shrinker *shrinker)
>>>>>>    {
>>>>>>        down_write(&shrinker_rwsem);
>>>>>> -    list_add_tail(&shrinker->list, &shrinker_list);
>>>>>> +    list_add_tail_rcu(&shrinker->list, &shrinker_list);
>>>>>>        shrinker->flags |= SHRINKER_REGISTERED;
>>>>>>        shrinker_debugfs_add(shrinker);
>>>>>>        up_write(&shrinker_rwsem);
>>>>>> @@ -760,13 +761,15 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>>>>            return;
>>>>>>          down_write(&shrinker_rwsem);
>>>>>> -    list_del(&shrinker->list);
>>>>>> +    list_del_rcu(&shrinker->list);
>>>>>>        shrinker->flags &= ~SHRINKER_REGISTERED;
>>>>>>        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
>>>>>>            unregister_memcg_shrinker(shrinker);
>>>>>>        debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>>>>        up_write(&shrinker_rwsem);
>>>>>>    +    synchronize_srcu(&shrinker_srcu);
>>>>>> +
>>>>>>        debugfs_remove_recursive(debugfs_entry);
>>>>>>          kfree(shrinker->nr_deferred);
>>>>>> @@ -786,6 +789,7 @@ void synchronize_shrinkers(void)
>>>>>>    {
>>>>>>        down_write(&shrinker_rwsem);
>>>>>>        up_write(&shrinker_rwsem);
>>>>>> +    synchronize_srcu(&shrinker_srcu);
>>>>>>    }
>>>>>>    EXPORT_SYMBOL(synchronize_shrinkers);
>>>>>>    @@ -996,6 +1000,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>    {
>>>>>>        unsigned long ret, freed = 0;
>>>>>>        struct shrinker *shrinker;
>>>>>> +    int srcu_idx;
>>>>>>          /*
>>>>>>         * The root memcg might be allocated even though memcg is disabled
>>>>>> @@ -1007,10 +1012,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>>>>>>            return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>>>>    -    if (!down_read_trylock(&shrinker_rwsem))
>>>>>> -        goto out;
>>>>>> +    srcu_idx = srcu_read_lock(&shrinker_srcu);
>>>>>>    -    list_for_each_entry(shrinker, &shrinker_list, list) {
>>>>>> +    list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>>>> +                 srcu_read_lock_held(&shrinker_srcu)) {
>>>>>>            struct shrink_control sc = {
>>>>>>                .gfp_mask = gfp_mask,
>>>>>>                .nid = nid,
>>>>>> @@ -1021,19 +1026,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>>>>            if (ret == SHRINK_EMPTY)
>>>>>>                ret = 0;
>>>>>>            freed += ret;
>>>>>> -        /*
>>>>>> -         * Bail out if someone want to register a new shrinker to
>>>>>> -         * prevent the registration from being stalled for long periods
>>>>>> -         * by parallel ongoing shrinking.
>>>>>> -         */
>>>>>> -        if (rwsem_is_contended(&shrinker_rwsem)) {
>>>>>> -            freed = freed ? : 1;
>>>>>> -            break;
>>>>>> -        }
>>>>>>        }
>>>>>>    -    up_read(&shrinker_rwsem);
>>>>>> -out:
>>>>>> +    srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>>>>>        cond_resched();
>>>>>>        return freed;
>>>>>>    }
>>>>>> -- 
>>>>>> 2.20.1
>>>>>>
>>>>>>
>>>>>
>>>>> Hi Qi,
>>>>>
>>>>> A different problem I realized after my old attempt to use SRCU was that the
>>>>> unregister_shrinker() path became quite slow due to the heavy synchronize_srcu()
>>>>> call. Both register_shrinker() *and* unregister_shrinker() are called frequently
>>>>> these days, and SRCU is too unfair to the unregister path IMO.
>>>>
>>>> Hi Sultan,
>>>>
>>>> IIUC, for unregister_shrinker(), the wait time is hardly longer with
>>>> SRCU than with shrinker_rwsem before.
>>>>
>>>> And I just did a simple test. After using the script in cover letter to
>>>> increase the shrink_slab hotspot, I did umount 1k times at the same
>>>> time, and then I used bpftrace to measure the time consumption of
>>>> unregister_shrinker() as follows:
>>>>
>>>> bpftrace -e 'kprobe:unregister_shrinker { @start[tid] = nsecs; } kretprobe:unregister_shrinker /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
>>>>
>>>> @ns[umount]:
>>>> [16K, 32K)             3 |      |
>>>> [32K, 64K)            66 |@@@@@@@@@@      |
>>>> [64K, 128K)           32 |@@@@@      |
>>>> [128K, 256K)          22 |@@@      |
>>>> [256K, 512K)          48 |@@@@@@@      |
>>>> [512K, 1M)            19 |@@@      |
>>>> [1M, 2M)             131 |@@@@@@@@@@@@@@@@@@@@@      |
>>>> [2M, 4M)             313 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
>>>> [4M, 8M)             302 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  |
>>>> [8M, 16M)             55 |@@@@@@@@@
>>>>
>>>> I see that the highest time-consuming of unregister_shrinker() is between 8ms and 16ms, which feels tolerable?
>
> Hi Kirill,
>
>>>
>>> The fundamental difference is that before the patchset this for_each_set_bit() iteration could be broken in the middle
>>> of two do_shrink_slab() calls, while after the patchset we can leave for_each_set_bit() only after visiting all set bits.
>
> After looking at the git log[1], I saw that we originally introduced
> rwsem_is_contendent() here to aviod blocking register_shrinker(),
> not unregister_shrinker().
>
> So I am curious, do we really care about the speed of
> unregister_shrinker()?

My opinion is that for general reasons we should avoid long unbreakable actions in kernel. Especially when they may be called
synchronous from userspace.

We even have this as generic rule. See check_hung_task().

Before, the longest sleep in unregister_shrinker() was a sleep waiting for single longest do_shrink_slab().

After the patch the longest sleep will be a sleep waiting for all do_shrink_slab() calls (all set bits in shrinker_info).

> And after using SRCU, register_shrinker() will not be blocked by slab
> shrink at all.
>
> [1]. https://github.com/torvalds/linux/commit/e496612
>
>>>
>>> Using only synchronize_srcu_expedited() won't help here.
>>>
>>> My opinion is we should restore a check similar to the rwsem_is_contendent() check that we had before. Something like
>
> If we really care about the speed of unregister_shrinker() like
> register_shrinker(), I think this is a good idea. This guarantees
> at least the speed of the unregister_shrinker() is not deteriorated. :)
>
>>> the below on top of your patchset merged into appropriate patch:
>>>
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 27ef9946ae8a..50e7812468ec 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>>   LIST_HEAD(shrinker_list);
>>>   DEFINE_MUTEX(shrinker_mutex);
>>>   DEFINE_SRCU(shrinker_srcu);
>>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>>     #ifdef CONFIG_MEMCG
>>>   static int shrinker_nr_max;
>>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>>       mutex_unlock(&shrinker_mutex);
>>>   +    atomic_inc(&shrinker_srcu_generation);
>>>       synchronize_srcu(&shrinker_srcu);
>>>         debugfs_remove_recursive(debugfs_entry);
>>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>>    */
>>>   void synchronize_shrinkers(void)
>>>   {
>>> +    atomic_inc(&shrinker_srcu_generation);
>>>       synchronize_srcu(&shrinker_srcu);
>>>   }
>>>   EXPORT_SYMBOL(synchronize_shrinkers);
>>> @@ -908,7 +911,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>   {
>>>       struct shrinker_info *info;
>>>       unsigned long ret, freed = 0;
>>> -    int srcu_idx;
>>> +    int srcu_idx, generation;
>>>       int i;
>>>         if (!mem_cgroup_online(memcg))
>>> @@ -919,6 +922,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>       if (unlikely(!info))
>>>           goto unlock;
>>>   +    generation = atomic_read(&shrinker_srcu_generation);
>>>       for_each_set_bit(i, info->map, info->map_nr_max) {
>>>           struct shrink_control sc = {
>>>               .gfp_mask = gfp_mask,
>>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>>                   set_shrinker_bit(memcg, nid, i);
>>>           }
>>>           freed += ret;
>>> +
>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> +            freed = freed ? : 1;
>>> +            break;
>>> +        }
>>>       }
>>>   unlock:
>>>       srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>   {
>>>       unsigned long ret, freed = 0;
>>>       struct shrinker *shrinker;
>>> -    int srcu_idx;
>>> +    int srcu_idx, generation;
>>>         /*
>>>        * The root memcg might be allocated even though memcg is disabled
>>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>>> +    generation = atomic_read(&shrinker_srcu_generation);
>>>         list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>>                    srcu_read_lock_held(&shrinker_srcu)) {
>>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>>           if (ret == SHRINK_EMPTY)
>>>               ret = 0;
>>>           freed += ret;
>>> +
>>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>>> +            freed = freed ? : 1;
>>> +            break;
>>> +        }
>>>       }
>>>         srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>
>> Even more, for memcg shrinkers we may unlock SRCU and continue iterations from the same shrinker id:
>
> Maybe we can also do this for global slab shrink? Like below:
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ffddbd204259..9d8c53075298 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1012,7 +1012,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>                                  int priority)
>  {
>         unsigned long ret, freed = 0;
> -       struct shrinker *shrinker;
> +       struct shrinker *shrinker = NULL;
>         int srcu_idx, generation;
>
>         /*
> @@ -1025,11 +1025,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>         if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
>                 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>
> +again:
>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>
>         generation = atomic_read(&shrinker_srcu_generation);
> -       list_for_each_entry_srcu(shrinker, &shrinker_list, list,
> -                                srcu_read_lock_held(&shrinker_srcu)) {
> +       if (!shrinker)
> +               shrinker = list_entry_rcu(shrinker_list.next, struct shrinker, list);
> +       else
> +               shrinker = list_entry_rcu(shrinker->list.next, struct shrinker, list);
> +       list_for_each_entry_from_rcu(shrinker, &shrinker_list, list) {
>                 struct shrink_control sc = {
>                         .gfp_mask = gfp_mask,
>                         .nid = nid,
> @@ -1042,8 +1046,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>                 freed += ret;
>
>                 if (atomic_read(&shrinker_srcu_generation) != generation) {
> -                       freed = freed ? : 1;
> -                       break;
> +                       srcu_read_unlock(&shrinker_srcu, srcu_idx);
> +                       cond_resched();
> +                       goto again;
>                 }
>         }
>
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 27ef9946ae8a..0b197bba1257 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -204,6 +204,7 @@ static void set_task_reclaim_state(struct task_struct *task,
>>   LIST_HEAD(shrinker_list);
>>   DEFINE_MUTEX(shrinker_mutex);
>>   DEFINE_SRCU(shrinker_srcu);
>> +static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0);
>>     #ifdef CONFIG_MEMCG
>>   static int shrinker_nr_max;
>> @@ -782,6 +783,7 @@ void unregister_shrinker(struct shrinker *shrinker)
>>       debugfs_entry = shrinker_debugfs_remove(shrinker);
>>       mutex_unlock(&shrinker_mutex);
>>   +    atomic_inc(&shrinker_srcu_generation);
>>       synchronize_srcu(&shrinker_srcu);
>>         debugfs_remove_recursive(debugfs_entry);
>> @@ -799,6 +801,7 @@ EXPORT_SYMBOL(unregister_shrinker);
>>    */
>>   void synchronize_shrinkers(void)
>>   {
>> +    atomic_inc(&shrinker_srcu_generation);
>>       synchronize_srcu(&shrinker_srcu);
>>   }
>>   EXPORT_SYMBOL(synchronize_shrinkers);
>> @@ -908,18 +911,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>   {
>>       struct shrinker_info *info;
>>       unsigned long ret, freed = 0;
>> -    int srcu_idx;
>> -    int i;
>> +    int srcu_idx, generation;
>> +    int i = 0;
>>         if (!mem_cgroup_online(memcg))
>>           return 0;
>> -
>> +again:
>>       srcu_idx = srcu_read_lock(&shrinker_srcu);
>>       info = shrinker_info_srcu(memcg, nid);
>>       if (unlikely(!info))
>>           goto unlock;
>>   -    for_each_set_bit(i, info->map, info->map_nr_max) {
>> +    generation = atomic_read(&shrinker_srcu_generation);
>> +    for_each_set_bit_from(i, info->map, info->map_nr_max) {
>>           struct shrink_control sc = {
>>               .gfp_mask = gfp_mask,
>>               .nid = nid,
>> @@ -965,6 +969,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
>>                   set_shrinker_bit(memcg, nid, i);
>>           }
>>           freed += ret;
>> +
>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>> +            srcu_read_unlock(&shrinker_srcu, srcu_idx);
>
> Maybe we can add the following code here, so as to avoid repeating the
> current id and avoid triggering softlockup:
>
>             i++;
>             cond_resched();
>
> Thanks,
> Qi
>
>> +            goto again;
>> +        }
>>       }
>>   unlock:
>>       srcu_read_unlock(&shrinker_srcu, srcu_idx);
>> @@ -1004,7 +1013,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>   {
>>       unsigned long ret, freed = 0;
>>       struct shrinker *shrinker;
>> -    int srcu_idx;
>> +    int srcu_idx, generation;
>>         /*
>>        * The root memcg might be allocated even though memcg is disabled
>> @@ -1017,6 +1026,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>           return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
>>         srcu_idx = srcu_read_lock(&shrinker_srcu);
>> +    generation = atomic_read(&shrinker_srcu_generation);
>>         list_for_each_entry_srcu(shrinker, &shrinker_list, list,
>>                    srcu_read_lock_held(&shrinker_srcu)) {
>> @@ -1030,6 +1040,11 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
>>           if (ret == SHRINK_EMPTY)
>>               ret = 0;
>>           freed += ret;
>> +
>> +        if (atomic_read(&shrinker_srcu_generation) != generation) {
>> +            freed = freed ? : 1;
>> +            break;
>> +        }
>>       }
>>         srcu_read_unlock(&shrinker_srcu, srcu_idx);
>>
>>
>