Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode

From: Paul E. McKenney
Date: Thu May 07 2020 - 11:49:12 EST


On Thu, May 07, 2020 at 05:36:47PM +0800, Hillf Danton wrote:
>
> Hello Paul
>
> On Wed, 6 May 2020 17:42:40 Paul E. McKenney wrote:
> >
> > This commit adds a shrinker so as to inform RCU when memory is scarce.
>
> A simpler hook is added in the logic of kswapd for subscribing the info
> that memory pressure is high, and then on top of it make rcu a subscriber
> by copying your code for the shrinker, wishing it makes a sense to you.
>
> What's not yet included is to make the hook per node to help make every
> reviewer convinced that memory is becoming tight. Of course without the
> cost of making subscribers node aware.
>
> Hillf

I must defer to the MM folks on the MM portion of this patch, but early
warning of impending memory pressure would be extremely good. A few
RCU-related notes inline below, though.

Thanx, Paul

> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -49,6 +49,16 @@ static inline void set_max_mapnr(unsigne
> static inline void set_max_mapnr(unsigned long limit) { }
> #endif
>
> +/* subscriber of kswapd's memory_pressure_high signal */
> +struct mph_subscriber {
> + struct list_head node;
> + void (*info) (void *data);
> + void *data;
> +};
> +
> +int mph_subscribe(struct mph_subscriber *ms);
> +void mph_unsubscribe(struct mph_subscriber *ms);
> +
> extern atomic_long_t _totalram_pages;
> static inline unsigned long totalram_pages(void)
> {
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3536,6 +3536,40 @@ static bool kswapd_shrink_node(pg_data_t
> }
>
> /*
> + * subscribers of kswapd's signal that memory pressure is high
> + */
> +static LIST_HEAD(mph_subs);
> +static DEFINE_MUTEX(mph_lock);
> +
> +int mph_subscribe(struct mph_subscriber *ms)
> +{
> + if (!ms->info)
> + return -EAGAIN;
> +
> + mutex_lock(&mph_lock);
> + list_add_tail(&ms->node, &mph_subs);
> + mutex_unlock(&mph_lock);
> + return 0;
> +}
> +
> +void mph_unsubscribe(struct mph_subscriber *ms)
> +{
> + mutex_lock(&mph_lock);
> + list_del(&ms->node);
> + mutex_unlock(&mph_lock);
> +}
> +
> +static void kswapd_bbc_mph(void)
> +{
> + struct mph_subscriber *ms;
> +
> + mutex_lock(&mph_lock);
> + list_for_each_entry(ms, &mph_subs, node)
> + ms->info(ms->data);
> + mutex_unlock(&mph_lock);
> +}
> +
> +/*
> * For kswapd, balance_pgdat() will reclaim pages across a node from zones
> * that are eligible for use by the caller until at least one zone is
> * balanced.
> @@ -3663,8 +3697,11 @@ restart:
> * If we're getting trouble reclaiming, start doing writepage
> * even in laptop mode.
> */
> - if (sc.priority < DEF_PRIORITY - 2)
> + if (sc.priority < DEF_PRIORITY - 2) {
> sc.may_writepage = 1;
> + if (sc.priority == DEF_PRIORITY - 3)
> + kswapd_bbc_mph();
> + }
>
> /* Call soft limit reclaim before calling shrink_node. */
> sc.nr_scanned = 0;
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -325,6 +325,8 @@ struct rcu_state {
> int ncpus_snap; /* # CPUs seen last time. */
> u8 cbovld; /* Callback overload now? */
> u8 cbovldnext; /* ^ ^ next time? */
> + u8 mph; /* mm pressure high signal from kswapd */
> + unsigned long mph_end; /* time stamp in jiffies */
>
> unsigned long jiffies_force_qs; /* Time at which to invoke */
> /* force_quiescent_state(). */
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -52,6 +52,7 @@
> #include <linux/kprobes.h>
> #include <linux/gfp.h>
> #include <linux/oom.h>
> +#include <linux/mm.h>
> #include <linux/smpboot.h>
> #include <linux/jiffies.h>
> #include <linux/slab.h>
> @@ -2314,8 +2315,15 @@ static void force_qs_rnp(int (*f)(struct
> struct rcu_data *rdp;
> struct rcu_node *rnp;
>
> - rcu_state.cbovld = rcu_state.cbovldnext;
> + rcu_state.cbovld = smp_load_acquire(&rcu_state.mph) ||
> + rcu_state.cbovldnext;
> rcu_state.cbovldnext = false;
> +
> + if (READ_ONCE(rcu_state.mph) &&
> + time_after(jiffies, READ_ONCE(rcu_state.mph_end))) {
> + WRITE_ONCE(rcu_state.mph, false);
> + pr_info("%s: Ending OOM-mode grace periods.\n", __func__);
> + }
> rcu_for_each_leaf_node(rnp) {
> cond_resched_tasks_rcu_qs();
> mask = 0;
> @@ -2643,6 +2651,20 @@ static void check_cb_ovld(struct rcu_dat
> raw_spin_unlock_rcu_node(rnp);
> }
>
> +static void rcu_mph_info(void *data)

This pointer will always be &rcu_state, so why not ignore the pointer
and use "rcu_state" below?

RCU grace periods are inherently global, so I don't know of any way
for RCU to focus on a given NUMA node. All or nothing. But on the
other hand, speeding up RCU grace periods will also help specific
NUMA nodes, so I believe that it is all good.

> +{
> + struct rcu_state *state = data;
> +
> + WRITE_ONCE(state->mph_end, jiffies + HZ / 10);
> + smp_store_release(&state->mph, true);
> + rcu_force_quiescent_state();
> +}
> +
> +static struct mph_subscriber rcu_mph_subscriber = {
> + .info = rcu_mph_info,
> + .data = &rcu_state,

Then this ".data" entry can be omitted, correct?

> +};
> +
> /* Helper function for call_rcu() and friends. */
> static void
> __call_rcu(struct rcu_head *head, rcu_callback_t func)
> @@ -4036,6 +4058,8 @@ void __init rcu_init(void)
> qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
> else
> qovld_calc = qovld;
> +
> + mph_subscribe(&rcu_mph_subscriber);
> }
>
> #include "tree_stall.h"
>