Re: [PATCH v2] EXP rcu: Move expedited grace period (GP) work to RT kthread_worker

From: Joel Fernandes
Date: Fri Apr 08 2022 - 06:43:00 EST


On Fri, Apr 8, 2022 at 12:57 AM Kalesh Singh <kaleshsingh@xxxxxxxxxx> wrote:
>
[...]
> @@ -334,15 +334,13 @@ static bool exp_funnel_lock(unsigned long s)
> * Select the CPUs within the specified rcu_node that the upcoming
> * expedited grace period needs to wait for.
> */
> -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
> +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
> {
> int cpu;
> unsigned long flags;
> unsigned long mask_ofl_test;
> unsigned long mask_ofl_ipi;
> int ret;
> - struct rcu_exp_work *rewp =
> - container_of(wp, struct rcu_exp_work, rew_work);
> struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
>
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> @@ -417,13 +415,119 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
> rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
> }
>
> +static void rcu_exp_sel_wait_wake(unsigned long s);
> +
> +#ifdef CONFIG_RCU_EXP_KTHREAD

Just my 2c:

Honestly, I am not sure if the benefits of duplicating the code to use
normal workqueues outweigh the drawbacks (namely code complexity and
code duplication, which can in turn cause more bugs and maintenance
headaches down the line). The code is harder to read, and adding more
30-character function names does not help.

For something as important as expedited GPs, I can't imagine a
scenario where an RT kthread worker would cause "issues". If it does
cause issues, that's what the -rc cycles and the stable releases are
for. I would rather trust the process than take a
one-foot-in-the-door approach.

So please, can we just keep it simple?

Thanks,

- Joel


> +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
> +{
> + struct rcu_exp_work *rewp =
> + container_of(wp, struct rcu_exp_work, rew_work);
> +
> + __sync_rcu_exp_select_node_cpus(rewp);
> +}
> +
> +static inline bool rcu_gp_par_worker_started(void)
> +{
> + return !!READ_ONCE(rcu_exp_par_gp_kworker);
> +}
> +
> +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
> +{
> + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
> + /*
> + * Use rcu_exp_par_gp_kworker, because flushing a work item from
> + * another work item on the same kthread worker can result in
> + * deadlock.
> + */
> + kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
> +}
> +
> +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
> +{
> + kthread_flush_work(&rnp->rew.rew_work);
> +}
> +
> +/*
> + * Work-queue handler to drive an expedited grace period forward.
> + */
> +static void wait_rcu_exp_gp(struct kthread_work *wp)
> +{
> + struct rcu_exp_work *rewp;
> +
> + rewp = container_of(wp, struct rcu_exp_work, rew_work);
> + rcu_exp_sel_wait_wake(rewp->rew_s);
> +}
> +
> +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
> +{
> + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
> + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
> +}
> +
> +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
> +{
> +}
> +#else /* !CONFIG_RCU_EXP_KTHREAD */
> +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
> +{
> + struct rcu_exp_work *rewp =
> + container_of(wp, struct rcu_exp_work, rew_work);
> +
> + __sync_rcu_exp_select_node_cpus(rewp);
> +}
> +
> +static inline bool rcu_gp_par_worker_started(void)
> +{
> + return !!READ_ONCE(rcu_par_gp_wq);
> +}
> +
> +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
> +{
> + int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
> +
> + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
> + /* If all offline, queue the work on an unbound CPU. */
> + if (unlikely(cpu > rnp->grphi - rnp->grplo))
> + cpu = WORK_CPU_UNBOUND;
> + else
> + cpu += rnp->grplo;
> + queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
> +}
> +
> +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
> +{
> + flush_work(&rnp->rew.rew_work);
> +}
> +
> +/*
> + * Work-queue handler to drive an expedited grace period forward.
> + */
> +static void wait_rcu_exp_gp(struct work_struct *wp)
> +{
> + struct rcu_exp_work *rewp;
> +
> + rewp = container_of(wp, struct rcu_exp_work, rew_work);
> + rcu_exp_sel_wait_wake(rewp->rew_s);
> +}
> +
> +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
> +{
> + INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
> + queue_work(rcu_gp_wq, &rew->rew_work);
> +}
> +
> +static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
> +{
> + destroy_work_on_stack(&rew->rew_work);
> +}
> +#endif /* CONFIG_RCU_EXP_KTHREAD */
> +
> /*
> * Select the nodes that the upcoming expedited grace period needs
> * to wait for.
> */
> static void sync_rcu_exp_select_cpus(void)
> {
> - int cpu;
> struct rcu_node *rnp;
>
> trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
> @@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void)
> rnp->exp_need_flush = false;
> if (!READ_ONCE(rnp->expmask))
> continue; /* Avoid early boot non-existent wq. */
> - if (!READ_ONCE(rcu_par_gp_wq) ||
> + if (!rcu_gp_par_worker_started() ||
> rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
> rcu_is_last_leaf_node(rnp)) {
> - /* No workqueues yet or last leaf, do direct call. */
> + /* No worker started yet or last leaf, do direct call. */
> sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
> continue;
> }
> - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
> - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
> - /* If all offline, queue the work on an unbound CPU. */
> - if (unlikely(cpu > rnp->grphi - rnp->grplo))
> - cpu = WORK_CPU_UNBOUND;
> - else
> - cpu += rnp->grplo;
> - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
> + sync_rcu_exp_select_cpus_queue_work(rnp);
> rnp->exp_need_flush = true;
> }
>
> - /* Wait for workqueue jobs (if any) to complete. */
> + /* Wait for jobs (if any) to complete. */
> rcu_for_each_leaf_node(rnp)
> if (rnp->exp_need_flush)
> - flush_work(&rnp->rew.rew_work);
> + sync_rcu_exp_select_cpus_flush_work(rnp);
> }
>
> /*
> @@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
> rcu_exp_wait_wake(s);
> }
>
> -/*
> - * Work-queue handler to drive an expedited grace period forward.
> - */
> -static void wait_rcu_exp_gp(struct work_struct *wp)
> -{
> - struct rcu_exp_work *rewp;
> -
> - rewp = container_of(wp, struct rcu_exp_work, rew_work);
> - rcu_exp_sel_wait_wake(rewp->rew_s);
> -}
> -
> #ifdef CONFIG_PREEMPT_RCU
>
> /*
> @@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void)
> } else {
> /* Marshall arguments & schedule the expedited grace period. */
> rew.rew_s = s;
> - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
> - queue_work(rcu_gp_wq, &rew.rew_work);
> + synchronize_rcu_expedited_queue_work(&rew);
> }
>
> /* Wait for expedited grace period to complete. */
> rnp = rcu_get_root();
> wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
> sync_exp_work_done(s));
> - smp_mb(); /* Workqueue actions happen before return. */
> + smp_mb(); /* Work actions happen before return. */
>
> /* Let the next expedited grace period start. */
> mutex_unlock(&rcu_state.exp_mutex);
>
> if (likely(!boottime))
> - destroy_work_on_stack(&rew.rew_work);
> + synchronize_rcu_expedited_destroy_work(&rew);
> }
> EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
>
> base-commit: 42e7a03d3badebd4e70aea5362d6914dfc7c220b
> --
> 2.35.1.1178.g4f1659d476-goog
>