RE: [PATCH] rcu: Dump all rcuc kthreads status for CPUs that not report quiescent state

From: Zhang, Qiang1
Date: Fri Apr 15 2022 - 23:08:57 EST


On Wed, Apr 13, 2022 at 03:44:11PM +0800, Zqiang wrote:
> If the rcutree.use_softirq is configured, when RCU Stall event
> happened, dump status of all rcuc kthreads who due to starvation
> prevented grace period ends on CPUs that not report quiescent state.
>
> Signed-off-by: Zqiang <qiang1.zhang@xxxxxxxxx>

>This could be a good improvement! But a few questions and comments below.
>
> Thanx, Paul

> ---
> kernel/rcu/tree_stall.h | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index
> d7956c03edbd..e6ecc32cfe23 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -348,6 +348,7 @@ static int rcu_print_task_stall(struct rcu_node
> *rnp, unsigned long flags) } #endif /* #else #ifdef
> CONFIG_PREEMPT_RCU */
>
> +static void rcuc_kthread_dump(struct rcu_data *rdp);
> /*
> * Dump stacks of all tasks running on stalled CPUs. First try using
> * NMIs, but fall back to manual remote stack tracing on
> architectures @@ -368,6 +369,7 @@ static void rcu_dump_cpu_stacks(void)
> pr_err("Offline CPU %d blocking current GP.\n", cpu);
> else if (!trigger_single_cpu_backtrace(cpu))
> dump_cpu_task(cpu);
> + rcuc_kthread_dump(per_cpu_ptr(&rcu_data, cpu));

>Was this addition inspired by the chasing of a bug? If so, please let me know exactly what information was (or would have been) the most helpful.
>
>For example, the starvation information might be more compactly represented in the information printed by print_cpu_stall_info().
>Unless the stack trace was quite valuable, it might be best to omit it.
>After all, RCU CPU stall warnings are already infamously heavy on the text output.

Is it better to modify it like this?


diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index d7956c03edbd..37444ff00787 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -465,27 +465,38 @@ static void print_cpu_stall_info(int cpu)
falsepositive ? " (false positive?)" : "");
}

-static void rcuc_kthread_dump(struct rcu_data *rdp)
+static void rcuc_kthread_dump(void)
{
int cpu;
unsigned long j;
+ unsigned long flags
+ struct rcu_node *rnp;
struct task_struct *rcuc;
+ struct rcu_data *rdp;

- rcuc = rdp->rcu_cpu_kthread_task;
- if (!rcuc)
- return;
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return;

- cpu = task_cpu(rcuc);
- if (cpu_is_offline(cpu) || idle_cpu(cpu))
- return;
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return;

- if (!rcu_is_rcuc_kthread_starving(rdp, &j))
- return;
+ if (!rcu_is_rcuc_kthread_starving(rdp, &j))
+ return;

- pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
- sched_show_task(rcuc);
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
+ sched_show_task(rcuc);
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
}

/* Complain about starvation of grace-period kthread. */
@@ -595,6 +606,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
smp_processor_id(), (long)(jiffies - gps),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
+ if (!use_softirq)
+ rcuc_kthread_dump();
+
rcu_dump_cpu_stacks();

/* Complain about tasks blocking the grace period. */
@@ -660,7 +674,7 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_starvation();

if (!use_softirq)
- rcuc_kthread_dump(rdp);
+ rcuc_kthread_dump();

rcu_dump_cpu_stacks();

Thanks
Zqiang


> }
> raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> }
> @@ -471,6 +473,9 @@ static void rcuc_kthread_dump(struct rcu_data *rdp)
> unsigned long j;
> struct task_struct *rcuc;
>
> + if (use_softirq)
> + return;
> +
> rcuc = rdp->rcu_cpu_kthread_task;
> if (!rcuc)

>The checks for use_softirq on the one hand and the checks for non-NULL
>rdp->rcu_cpu_kthread_task are a bit "interesting". Not your fault or
>problem, of course!

Yes the ' if (!use_softirq)' is redundant, the if rcu_cpu_kthread_task Is completely enough

>
> return;
> @@ -659,9 +664,6 @@ static void print_cpu_stall(unsigned long gps)
> rcu_check_gp_kthread_expired_fqs_timer();
> rcu_check_gp_kthread_starvation();
>
> - if (!use_softirq)
>
>In particular, I am wondering if this "if" is redundant.

Yes the ' if (!use_softirq)' is also redundant

>
> - rcuc_kthread_dump(rdp);
> -
> rcu_dump_cpu_stacks();
>
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> --
> 2.25.1
>