rcu: endless stalls

From: Mike Galbraith
Date: Mon Jun 11 2012 - 06:06:44 EST


Greetings,

I received a report of a 48 core UV box hitting a gripe, taking longer
than timeout to emit same, so box griped endlessly, forcing reboot.

The below might prevent that.. and bust other stuff for free :)

rcu: one gripe at a time please

Not-compiled-by:
Not-signed-off-by:
Not-etc-by:

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88..6462056d6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -818,10 +818,25 @@ static void print_cpu_stall(struct rcu_state *rsp)
set_need_resched(); /* kick ourselves to get things going. */
}

+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+ rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+ rcu_preempt_stall_reset();
+}
+
static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
- unsigned long j;
- unsigned long js;
+ unsigned long j, js, flags;
struct rcu_node *rnp;

if (rcu_cpu_stall_suppress)
@@ -832,13 +847,23 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {

/* We haven't checked in, so go dump stack. */
+ rcu_cpu_stall_suppress = 1;
print_cpu_stall(rsp);
+ local_irq_save(flags);
+ rcu_cpu_stall_reset();
+ local_irq_restore(flags);
+ rcu_cpu_stall_suppress = 0;

} else if (rcu_gp_in_progress(rsp) &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {

/* They had a few time units to dump stack, so complain. */
+ rcu_cpu_stall_suppress = 1;
print_other_cpu_stall(rsp);
+ local_irq_save(flags);
+ rcu_cpu_stall_reset();
+ local_irq_restore(flags);
+ rcu_cpu_stall_suppress = 0;
}
}

@@ -848,22 +873,6 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
return NOTIFY_DONE;
}

-/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
- *
- * The caller must disable hard irqs.
- */
-void rcu_cpu_stall_reset(void)
-{
- rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
- rcu_preempt_stall_reset();
-}
-
static struct notifier_block rcu_panic_block = {
.notifier_call = rcu_panic,
};


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/