[PATCH v2 4/7] x86/mce: Fix all mce notifiers to update the mce->kflags bitmask

From: Tony Luck
Date: Fri Feb 14 2020 - 17:27:27 EST


If the handler took any action to log or deal with the error, set
a bit int mce->kflags so that the default handler on the end of
the machine check chain can see what has been done.

Get rid of NOTIFY_STOP (well almost ... mce_amd.c is currently using
it to filter out some GART TLB errors ... need to deal with that
later).

Make the EDAC and dev-mcelog handlers skip over errors already
processed by CEC.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
---
arch/x86/kernel/cpu/mce/core.c | 4 +++-
arch/x86/kernel/cpu/mce/dev-mcelog.c | 5 +++++
drivers/acpi/acpi_extlog.c | 5 +++--
drivers/acpi/nfit/mce.c | 1 +
drivers/edac/i7core_edac.c | 5 +++--
drivers/edac/mce_amd.c | 9 +++++++--
drivers/edac/pnd2_edac.c | 5 +++--
drivers/edac/sb_edac.c | 5 ++++-
drivers/edac/skx_common.c | 4 ++++
drivers/ras/cec.c | 9 ++++++---
10 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 06240cbe6f3e..d3d11d1e52b3 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -579,8 +579,10 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
return NOTIFY_DONE;

pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
+ if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn);
+ mce->kflags |= MCE_HANDLED_UC;
+ }

return NOTIFY_OK;
}
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 7c8958dee103..f1bf7535ead7 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -43,6 +43,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
struct mce *mce = (struct mce *)data;
unsigned int entry;

+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
mutex_lock(&mce_chrdev_read_mutex);

entry = mcelog.next;
@@ -60,6 +63,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,

memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
mcelog.entry[entry].finished = 1;
+ mcelog.entry[entry].kflags = 0;

/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
@@ -67,6 +71,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
unlock:
mutex_unlock(&mce_chrdev_read_mutex);

+ mce->kflags |= MCE_HANDLED_MCELOG;
return NOTIFY_OK;
}

diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index 8596a106a933..9cc3c1f92db5 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -146,7 +146,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
static u32 err_seq;

estatus = extlog_elog_entry_check(cpu, bank);
- if (estatus == NULL)
+ if (estatus == NULL || (mce->kflags & MCE_HANDLED_CEC))
return NOTIFY_DONE;

memcpy(elog_buf, (void *)estatus, ELOG_ENTRY_LEN);
@@ -176,7 +176,8 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
}

out:
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EXTLOG;
+ return NOTIFY_OK;
}

static bool __init extlog_get_l1addr(void)
diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c
index f0ae48515b48..ee8d9973f60b 100644
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -76,6 +76,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
*/
acpi_nfit_ars_rescan(acpi_desc, 0);
}
+ mce->kflags |= MCE_HANDLED_NFIT;
break;
}

diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index b3135b208f9a..5860ca41185c 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -1815,7 +1815,7 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val,
struct mem_ctl_info *mci;

i7_dev = get_i7core_dev(mce->socketid);
- if (!i7_dev)
+ if (!i7_dev || (mce->kflags & MCE_HANDLED_CEC))
return NOTIFY_DONE;

mci = i7_dev->mci;
@@ -1834,7 +1834,8 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val,
i7core_check_error(mci, mce);

/* Advise mcelog that the errors were handled */
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}

static struct notifier_block i7_mce_dec = {
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index ea980c556f2e..e31e4db64e1b 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1067,8 +1067,12 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
unsigned int fam = x86_family(m->cpuid);
int ecc;

- if (ignore_mce(m))
+ if (ignore_mce(m)) {
+ m->kflags |= MCE_HANDLED_EDAC;
return NOTIFY_STOP;
+ }
+ if (m->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;

pr_emerg(HW_ERR "%s\n", decode_error_status(m));

@@ -1170,7 +1174,8 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
err_code:
amd_decode_err_code(m->status & 0xffff);

- return NOTIFY_STOP;
+ m->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}

static struct notifier_block amd_mce_dec_nb = {
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index 933f7722b893..77ad315c7e8d 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1400,7 +1400,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo
return NOTIFY_DONE;

mci = pnd2_mci;
- if (!mci)
+ if (!mci || (mce->kflags & MCE_HANDLED_CEC))
return NOTIFY_DONE;

/*
@@ -1429,7 +1429,8 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo
pnd2_mce_output_error(mci, mce, &daddr);

/* Advice mcelog that the error were handled */
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}

static struct notifier_block pnd2_mce_dec = {
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 4957e8ee1879..6e17f601ea63 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -3136,6 +3136,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,

if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;

/*
* Just let mcelog handle it if the error is
@@ -3183,7 +3185,8 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
sbridge_mce_output_error(mci, mce);

/* Advice mcelog that the error were handled */
- return NOTIFY_STOP;
+ mce->kflags |= MCE_HANDLED_EDAC;
+ return NOTIFY_OK;
}

static struct notifier_block sbridge_mce_dec = {
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 99bbaf629b8d..6f08a12f6b11 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -577,6 +577,9 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;

+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
/* ignore unless this is memory related with an address */
if ((mce->status & 0xefff) >> 7 != 1 || !(mce->status & MCI_STATUS_ADDRV))
return NOTIFY_DONE;
@@ -616,6 +619,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,

skx_mce_output_error(mci, mce, &res);

+ mce->kflags |= MCE_HANDLED_EDAC;
return NOTIFY_DONE;
}

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index d7f6718cbf8d..e061962d3c58 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -538,9 +538,12 @@ static int cec_notifier(struct notifier_block *nb, unsigned long val,
/* We eat only correctable DRAM errors with usable addresses. */
if (mce_is_memory_error(m) &&
mce_is_correctable(m) &&
- mce_usable_address(m))
- if (!cec_add_elem(m->addr >> PAGE_SHIFT))
- return NOTIFY_STOP;
+ mce_usable_address(m)) {
+ if (!cec_add_elem(m->addr >> PAGE_SHIFT)) {
+ m->kflags |= MCE_HANDLED_CEC;
+ return NOTIFY_OK;
+ }
+ }

return NOTIFY_DONE;
}
--
2.21.1