Recoverable MCA interrupts from NMI handlers? IPMI and RCU?

From: Paul E. McKenney
Date: Fri Jun 06 2008 - 11:22:00 EST


Hello!

A couple of questions about the x86 architecture...

1. Can recoverable machine-check exceptions occur from within
NMI handlers? If so, there is a bug in preemptable RCU's
CONFIG_NO_HZ handling that could be fixed by a patch something
like the one shown below (untested, probably does not even
compile).

2. Does the IPMI subsystem make use of RCU read-side primitives
from within SMI handlers? If so, we need the SMI handlers to
invoke rcu_irq_enter() upon entry and rcu_irq_exit() upon exit
when they are invoked from dynticks idle state. Or something
similar, depending on restrictions on code within SMI handlers.

If I need to forward either of these questions to someone else, please
let me know!

Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
---

include/linux/rcupreempt.h | 18 -----
kernel/rcupreempt.c | 156 ++++++++++++++++++++-------------------------
2 files changed, 73 insertions(+), 101 deletions(-)

diff -urpNa -X dontdiff linux-2.6.26-rc4/include/linux/rcupreempt.h linux-2.6.26-rc4-nohznest/include/linux/rcupreempt.h
--- linux-2.6.26-rc4/include/linux/rcupreempt.h 2008-05-30 04:39:01.000000000 -0700
+++ linux-2.6.26-rc4-nohznest/include/linux/rcupreempt.h 2008-06-01 07:21:25.000000000 -0700
@@ -81,22 +81,8 @@ extern struct rcupreempt_trace *rcupreem
struct softirq_action;

#ifdef CONFIG_NO_HZ
-DECLARE_PER_CPU(long, dynticks_progress_counter);
-
-static inline void rcu_enter_nohz(void)
-{
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- __get_cpu_var(dynticks_progress_counter)++;
- WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
-}
-
-static inline void rcu_exit_nohz(void)
-{
- __get_cpu_var(dynticks_progress_counter)++;
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
- WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
-}
-
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
#else /* CONFIG_NO_HZ */
#define rcu_enter_nohz() do { } while (0)
#define rcu_exit_nohz() do { } while (0)
diff -urpNa -X dontdiff linux-2.6.26-rc4/kernel/rcupreempt.c linux-2.6.26-rc4-nohznest/kernel/rcupreempt.c
--- linux-2.6.26-rc4/kernel/rcupreempt.c 2008-05-30 04:39:01.000000000 -0700
+++ linux-2.6.26-rc4-nohznest/kernel/rcupreempt.c 2008-06-01 07:21:22.000000000 -0700
@@ -415,9 +415,57 @@ static void __rcu_advance_callbacks(stru

#ifdef CONFIG_NO_HZ

-DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
+/*
+ * The low-order DYNTICKS_PROGRESS_SHIFT bits are a nesting-level count,
+ * and the rest of the bits are a counter that, when even, indicates that
+ * the corresponding CPU is in dynticks-idle mode (and thus should be
+ * ignored by RCU). Otherwise, if the counter represented by the upper
+ * bits is odd, RCU must pay attention to this CPU. This counter is
+ * manipulated atomically only when entering or exiting dynticks-idle
+ * mode. All other cases use non-atomic instructions and avoid all
+ * memory barriers.
+ */
+
+DEFINE_PER_CPU(atomic_t, dynticks_progress_counter) = ATOMIC_INIT(1);
static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
-static DEFINE_PER_CPU(int, rcu_update_flag);
+
+#define DYNTICKS_PROGRESS_SHIFT 8
+#define DYNTICKS_PROGRESS_CTR_LOB (1 << DYNTICKS_PROGRESS_SHIFT)
+#define DYNTICKS_PROGRESS_NEST_MASK (DYNTICKS_PROGRESS_CTR_LOB - 1)
+
+/**
+ * rcu_enter_nohz - Called before shutting down scheduling-clock irq.
+ *
+ * This updates the dynticks_progress_counter to let the RCU
+ * grace-period detection machinery know that this CPU is no
+ * longer active.
+ */
+void rcu_enter_nohz(void)
+{
+ atomic_t *dpcp = &__get_cpu_var(dynticks_progress_counter);
+
+ smp_mb();
+ atomic_add(DYNTICKS_PROGRESS_CTR_LOB - 1, dpcp);
+ WARN_ON(atomic_read(dpcp) &
+ (DYNTICKS_PROGRESS_CTR_LOB + DYNTICKS_PROGRESS_NEST_MASK));
+}
+
+/**
+ * rcu_exit_nohz - Called before restarting scheduling-clock irq.
+ *
+ * This updates the dynticks_progress_counter to let the RCU
+ * grace-period detection machinery know that this CPU is once
+ * again active.
+ */
+void rcu_exit_nohz(void)
+{
+ atomic_t *dpcp = &__get_cpu_var(dynticks_progress_counter);
+
+ atomic_add(DYNTICKS_PROGRESS_CTR_LOB + 1, dpcp);
+ smp_mb();
+ WARN_ON(!(atomic_read(dpcp) & DYNTICKS_PROGRESS_CTR_LOB) ||
+ !(atomic_read(dpcp) & DYNTICKS_PROGRESS_NEST_MASK));
+}

/**
* rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
@@ -429,62 +477,16 @@ static DEFINE_PER_CPU(int, rcu_update_fl
void rcu_irq_enter(void)
{
int cpu = smp_processor_id();
+ atomic_t *dpcp = &per_cpu(dynticks_progress_counter, cpu);

- if (per_cpu(rcu_update_flag, cpu))
- per_cpu(rcu_update_flag, cpu)++;
+ if (atomic_read(dpcp) & DYNTICKS_PROGRESS_CTR_LOB) {

- /*
- * Only update if we are coming from a stopped ticks mode
- * (dynticks_progress_counter is even).
- */
- if (!in_interrupt() &&
- (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
- /*
- * The following might seem like we could have a race
- * with NMI/SMIs. But this really isn't a problem.
- * Here we do a read/modify/write, and the race happens
- * when an NMI/SMI comes in after the read and before
- * the write. But NMI/SMIs will increment this counter
- * twice before returning, so the zero bit will not
- * be corrupted by the NMI/SMI which is the most important
- * part.
- *
- * The only thing is that we would bring back the counter
- * to a postion that it was in during the NMI/SMI.
- * But the zero bit would be set, so the rest of the
- * counter would again be ignored.
- *
- * On return from the IRQ, the counter may have the zero
- * bit be 0 and the counter the same as the return from
- * the NMI/SMI. If the state machine was so unlucky to
- * see that, it still doesn't matter, since all
- * RCU read-side critical sections on this CPU would
- * have already completed.
- */
- per_cpu(dynticks_progress_counter, cpu)++;
- /*
- * The following memory barrier ensures that any
- * rcu_read_lock() primitives in the irq handler
- * are seen by other CPUs to follow the above
- * increment to dynticks_progress_counter. This is
- * required in order for other CPUs to correctly
- * determine when it is safe to advance the RCU
- * grace-period state machine.
- */
- smp_mb(); /* see above block comment. */
- /*
- * Since we can't determine the dynamic tick mode from
- * the dynticks_progress_counter after this routine,
- * we use a second flag to acknowledge that we came
- * from an idle state with ticks stopped.
- */
- per_cpu(rcu_update_flag, cpu)++;
- /*
- * If we take an NMI/SMI now, they will also increment
- * the rcu_update_flag, and will not update the
- * dynticks_progress_counter on exit. That is for
- * this IRQ to do.
- */
+ /* Nested interrupt, just adjust the nesting level. */
+
+ atomic_set(dpcp, atomic_read(dpcp) + 1);
+ WARN_ON(!(atomic_read(dpcp) & DYNTICKS_PROGRESS_CTR_LOB) ||
+ !(atomic_read(dpcp) & DYNTICKS_PROGRESS_NEST_MASK));
+ return;
 }
+
+ /* Coming from dynticks-idle: mark this CPU active with nesting level 1. */
+
+ atomic_add(DYNTICKS_PROGRESS_CTR_LOB + 1, dpcp);
+ smp_mb(); /* CPUs must see counter update before later read-side crit sects */
}

@@ -498,41 +500,25 @@ void rcu_irq_enter(void)
void rcu_irq_exit(void)
{
int cpu = smp_processor_id();
+ atomic_t *dpcp = &per_cpu(dynticks_progress_counter, cpu);

- /*
- * rcu_update_flag is set if we interrupted the CPU
- * when it was idle with ticks stopped.
- * Once this occurs, we keep track of interrupt nesting
- * because a NMI/SMI could also come in, and we still
- * only want the IRQ that started the increment of the
- * dynticks_progress_counter to be the one that modifies
- * it on exit.
- */
- if (per_cpu(rcu_update_flag, cpu)) {
- if (--per_cpu(rcu_update_flag, cpu))
- return;
-
- /* This must match the interrupt nesting */
- WARN_ON(in_interrupt());
+ if ((atomic_read(dpcp) &
+ (DYNTICKS_PROGRESS_CTR_LOB | DYNTICKS_PROGRESS_NEST_MASK)) !=
+ (DYNTICKS_PROGRESS_CTR_LOB | DYNTICKS_PROGRESS_NEST_MASK)) {

/*
- * If an NMI/SMI happens now we are still
- * protected by the dynticks_progress_counter being odd.
+ * This rcu_irq_exit() will not bring the nesting count
+ * to zero, so it will not put us back into dynticks-idle
+ * mode. So we need only decrement the nesting count.
*/

- /*
- * The following memory barrier ensures that any
- * rcu_read_unlock() primitives in the irq handler
- * are seen by other CPUs to preceed the following
- * increment to dynticks_progress_counter. This
- * is required in order for other CPUs to determine
- * when it is safe to advance the RCU grace-period
- * state machine.
- */
- smp_mb(); /* see above block comment. */
- per_cpu(dynticks_progress_counter, cpu)++;
- WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+ atomic_set(dpcp, atomic_read(dpcp) - 1);
+ WARN_ON(!(atomic_read(dpcp) & DYNTICKS_PROGRESS_CTR_LOB) ||
+ !(atomic_read(dpcp) & DYNTICKS_PROGRESS_NEST_MASK));
+ return;
}
+ smp_mb();
+ atomic_add(DYNTICKS_PROGRESS_CTR_LOB - 1, dpcp);
}

static void dyntick_save_progress_counter(int cpu)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/