Re: [PATCH rcu 04/12] rcu: Switch polled grace-period APIs to ->gp_seq_polled

From: Paul E. McKenney
Date: Wed Jul 20 2022 - 21:05:01 EST


On Wed, Jul 20, 2022 at 05:53:38PM -0700, Boqun Feng wrote:
> Hi Paul,
>
> On Mon, Jun 20, 2022 at 03:51:20PM -0700, Paul E. McKenney wrote:
> > This commit switches the existing polled grace-period APIs to use a
> > new ->gp_seq_polled counter in the rcu_state structure. An additional
> > ->gp_seq_polled_snap counter in that same structure allows the normal
> > grace period kthread to interact properly with the !SMP !PREEMPT fastpath
> > through synchronize_rcu(). The first of the two to note the end of a
> > given grace period will make knowledge of this transition available to
> > the polled API.
> >
> > This commit is in preparation for polled expedited grace periods.
> >
> > Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@xxxxxxxxxx/
> > Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing
> > Cc: Brian Foster <bfoster@xxxxxxxxxx>
> > Cc: Dave Chinner <david@xxxxxxxxxxxxx>
> > Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
> > Cc: Ian Kent <raven@xxxxxxxxxx>
> > Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxx>
> > ---
> > kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++--
> > kernel/rcu/tree.h | 2 ++
> > 2 files changed, 89 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index 46cfceea87847..637e8f9454573 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused)
> > invoke_rcu_core();
> > }
> >
> > +// Has rcu_init() been invoked? This is used (for example) to determine
> > +// whether spinlocks may be acquired safely.
> > +static bool rcu_init_invoked(void)
> > +{
> > + return !!rcu_state.n_online_cpus;
> > +}
> > +
> > +// Make the polled API aware of the beginning of a grace period.
> > +static void rcu_poll_gp_seq_start(unsigned long *snap)
> > +{
> > + struct rcu_node *rnp = rcu_get_root();
> > +
> > + if (rcu_init_invoked())
> > + raw_lockdep_assert_held_rcu_node(rnp);
> > +
> > + // If RCU was idle, note beginning of GP.
> > + if (!rcu_seq_state(rcu_state.gp_seq_polled))
> > + rcu_seq_start(&rcu_state.gp_seq_polled);
> > +
> > + // Either way, record current state.
> > + *snap = rcu_state.gp_seq_polled;
> > +}
> > +
> > +// Make the polled API aware of the end of a grace period.
> > +static void rcu_poll_gp_seq_end(unsigned long *snap)
> > +{
> > + struct rcu_node *rnp = rcu_get_root();
> > +
> > + if (rcu_init_invoked())
> > + raw_lockdep_assert_held_rcu_node(rnp);
> > +
> > + // If the previously noted GP is still in effect, record the
> > + // end of that GP. Either way, zero counter to avoid counter-wrap
> > + // problems.
> > + if (*snap && *snap == rcu_state.gp_seq_polled) {
> > + rcu_seq_end(&rcu_state.gp_seq_polled);
> > + rcu_state.gp_seq_polled_snap = 0;
> > + } else {
> > + *snap = 0;
> > + }
> > +}
> > +
> > +// Make the polled API aware of the beginning of a grace period, but
> > +// where caller does not hold the root rcu_node structure's lock.
> > +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
> > +{
> > + struct rcu_node *rnp = rcu_get_root();
> > +
> > + if (rcu_init_invoked()) {
> > + lockdep_assert_irqs_enabled();
> > + raw_spin_lock_irq_rcu_node(rnp);
> > + }
> > + rcu_poll_gp_seq_start(snap);
> > + if (rcu_init_invoked())
> > + raw_spin_unlock_irq_rcu_node(rnp);
> > +}
> > +
> > +// Make the polled API aware of the end of a grace period, but where
> > +// caller does not hold the root rcu_node structure's lock.
> > +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
> > +{
> > + struct rcu_node *rnp = rcu_get_root();
> > +
> > + if (rcu_init_invoked()) {
> > + lockdep_assert_irqs_enabled();
> > + raw_spin_lock_irq_rcu_node(rnp);
> > + }
> > + rcu_poll_gp_seq_end(snap);
> > + if (rcu_init_invoked())
> > + raw_spin_unlock_irq_rcu_node(rnp);
> > +}
> > +
> > /*
> > * Initialize a new grace period. Return false if no grace period required.
> > */
> > @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void)
> > rcu_seq_start(&rcu_state.gp_seq);
> > ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
> > trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
> > + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
> > raw_spin_unlock_irq_rcu_node(rnp);
> >
> > /*
> > @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void)
> > * safe for us to drop the lock in order to mark the grace
> > * period as completed in all of the rcu_node structures.
> > */
> > + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
> > raw_spin_unlock_irq_rcu_node(rnp);
> >
> > /*
> > @@ -3837,8 +3911,18 @@ void synchronize_rcu(void)
> > lock_is_held(&rcu_lock_map) ||
> > lock_is_held(&rcu_sched_lock_map),
> > "Illegal synchronize_rcu() in RCU read-side critical section");
> > - if (rcu_blocking_is_gp())
> > + if (rcu_blocking_is_gp()) {
> > + // Note well that this code runs with !PREEMPT && !SMP.
> > + // In addition, all code that advances grace periods runs
> > + // at process level. Therefore, this GP overlaps with other
> > + // GPs only by being fully nested within them, which allows
> > + // reuse of ->gp_seq_polled_snap.
> > + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
> > + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
> > + if (rcu_init_invoked())
> > + cond_resched_tasks_rcu_qs();
> > return; // Context allows vacuous grace periods.
> > + }
> > if (rcu_gp_is_expedited())
> > synchronize_rcu_expedited();
> > else
> > @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void)
> > * before the load from ->gp_seq.
> > */
> > smp_mb(); /* ^^^ */
> > - return rcu_seq_snap(&rcu_state.gp_seq);
> > + return rcu_seq_snap(&rcu_state.gp_seq_polled);
>
> I happened to run into this. There is one usage of
> get_state_synchronize_rcu() in start_poll_synchronize_rcu(), in which
> the return value of get_state_synchronize_rcu() ("gp_seq") will be used
> for rcu_start_this_gp(). I don't think this is quite right, because
> after this change, rcu_state.gp_seq and rcu_state.gp_seq_polled are
> different values; in fact, ->gp_seq_polled is greater than ->gp_seq
> by the number of times synchronize_rcu() is called in early boot.
>
> Am I missing something here?

It does not appear that you are missing anything, sad to say!

Does the following make it work better?

Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2122359f0c862..cf2fd58a93a41 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3571,7 +3571,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
unsigned long start_poll_synchronize_rcu(void)
{
unsigned long flags;
- unsigned long gp_seq = get_state_synchronize_rcu();
+ unsigned long gp_seq = rcu_seq_snap(&rcu_state.gp_seq);
bool needwake;
struct rcu_data *rdp;
struct rcu_node *rnp;