Re: [PATCH] x86: mce: Avoid timer double-add during CMCI interrupt storms.

From: Borislav Petkov
Date: Wed Dec 10 2014 - 14:23:51 EST


On Tue, Dec 09, 2014 at 07:11:02PM -0800, Calvin Owens wrote:
> Just to make sure I understand what you're looking for:
>
> When MCE is initialized, spawn a kthread for each CPU instead of the
> current timers. If CMCI is supported, we just leave this thread parked,
> and only process errors from the CMCI interrupt handler.
>
> When a CMCI storm happens, we disable CMCI interrupts and kick the
> kthread, which polls every HZ/100 until the storm has subsided, at which
> point it re-enables CMCI interrupts and parks itself.
>
> If CMCI isn't supported though, how is the polling done? You said the
> dynamic interval is desirable, wouldn't that need to be in the kthread?
> Having both the kthread and the timer around seems ugly, even if only
> one is used on a given machine.

Ok, so in talking with the guys here internally it sounds to me like
a kernel thread or a workqueue or whatever other facility relying on
wake_up_process() we take, would have scheduling latency in there and
get delayed when polling the MCA banks. In a storm condition, this might
not be such a good idea.

So we maybe better off with the timer interrupt after all but try
to decouple the dynamic adjusting of polling frequency for non-CMCI
machines and do an on/off simpler polling on CMCI machines.

So what I'm thinking of is:

* once we've detected CMCI storm, we immediately start polling with
CMCI_STORM_INTERVAL, i.e. once per second.

* as long as we keep seeing errors during polling in storm mode, we keep
that polling frequency.

* the moment we don't see any errors anymore, we go to
CMCI_STORM_SUBSIDED and then eventually to CMCI_STORM_NONE.

The code remains unchanged for CMCI machines which are not in storm
mode and for non-CMCI machines.

Anyway, this below is completely untested but it seems simpler to me and
does away with the adaptive logic for CMCI storms (you might want to
apply it first as the diff is hardly readable and this code is nasty as
hell anyway).

Thoughts?

---
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..1b9733614e4c 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,6 +108,7 @@ struct mca_config {
bool disabled;
bool ser;
bool bios_cmci_threshold;
+ bool cmci;
u8 banks;
s8 bootlog;
int tolerant;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..6e4984fc37ce 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -30,18 +30,19 @@ extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;

#ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
-# define mce_intel_adjust_timer mce_adjust_timer_default
+# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif

void mce_timer_kick(unsigned long interval);
+int ce_error_seen(void);

#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..e3f426698164 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1324,7 +1324,7 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
mce_adjust_timer_default;

-static int cmc_error_seen(void)
+int ce_error_seen(void)
{
unsigned long *v = this_cpu_ptr(&mce_polled_error);

@@ -1334,36 +1334,36 @@ static int cmc_error_seen(void)
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int cpu = smp_processor_id();
unsigned long iv;
- int notify;

- WARN_ON(smp_processor_id() != data);
+ WARN_ON(cpu != data);

if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP,
- this_cpu_ptr(&mce_poll_banks));
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
mce_intel_cmci_poll();
}

+ iv = __this_cpu_read(mce_next_interval);
+
+ if (mca_cfg.cmci) {
+ iv = mce_adjust_timer(iv);
+ goto done;
+ }
+
/*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
*/
- iv = __this_cpu_read(mce_next_interval);
- notify = mce_notify_irq();
- notify |= cmc_error_seen();
- if (notify) {
+ if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100);
- } else {
+ else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- iv = mce_adjust_timer(iv);
- }
+
+done:
__this_cpu_write(mce_next_interval, iv);
- /* Might have become 0 after CMCI storm subsided */
- if (iv) {
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
- }
+ t->expires = jiffies + iv;
+ add_timer_on(t, cpu);
}

/*
@@ -1682,7 +1682,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = mce_intel_adjust_timer;
+ mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..6b8cbeaaca88 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -46,7 +46,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);

#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_INTERVAL (HZ)
#define CMCI_STORM_THRESHOLD 15

static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -68,6 +68,9 @@ static int cmci_supported(int *banks)
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
return 0;

+ if (mca_cfg.cmci)
+ return 1;
+
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
@@ -79,7 +82,9 @@ static int cmci_supported(int *banks)
return 0;
rdmsrl(MSR_IA32_MCG_CAP, cap);
*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
- return !!(cap & MCG_CMCI_P);
+ mca_cfg.cmci = !!(cap & MCG_CMCI_P);
+
+ return mca_cfg.cmci;
}

void mce_intel_cmci_poll(void)
@@ -97,12 +102,13 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}

-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
- int r;
-
- if (interval < CMCI_POLL_INTERVAL)
- return interval;
+ if (ce_error_seen() &&
+ (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+ mce_notify_irq();
+ return CMCI_STORM_INTERVAL;
+ }

switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
@@ -112,8 +118,7 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
* timer interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- r = atomic_sub_return(1, &cmci_storm_on_cpus);
- if (r == 0)
+ if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
/* FALLTHROUGH */

@@ -178,7 +183,7 @@ static bool cmci_storm_detect(void)
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_POLL_INTERVAL);
+ mce_timer_kick(CMCI_STORM_INTERVAL);

if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");


--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/