[PATCH] x86/mce: Rework DFR handling flow
From: Yazen Ghannam
Date: Mon Sep 22 2025 - 16:26:06 EST
Add a flag, MCP_DFR, to poll for deferred errors, similar to MCP_UC for
uncorrectable errors. When the flag is set, do checks specific to deferred
errors; otherwise, fall back to the common UC/CE checks.
Also, clear the MCA_DESTAT register at the end of the handler rather
than at the beginning.
Signed-off-by: Yazen Ghannam <yazen.ghannam@xxxxxxx>
---
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mce/amd.c | 13 ++++++++----
arch/x86/kernel/cpu/mce/core.c | 36 ++++++++++++++++++++--------------
3 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1cfbfff0be3f..9652fc11860d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -299,6 +299,7 @@ enum mcp_flags {
MCP_TIMESTAMP = BIT(0), /* log time stamp */
MCP_UC = BIT(1), /* log uncorrected errors */
MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */
+ MCP_DFR = BIT(3), /* log deferred errors */
};
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9b746080351f..83fad4503b1c 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -839,7 +839,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
/* APIC interrupt handler for deferred errors */
static void amd_deferred_error_interrupt(void)
{
- machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
+ machine_check_poll(MCP_TIMESTAMP | MCP_DFR, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
}
void mce_amd_handle_storm(unsigned int bank, bool on)
@@ -865,10 +865,15 @@ void amd_clear_bank(struct mce *m)
{
amd_reset_thr_limit(m->bank);
- if (m->kflags & MCE_CHECK_DFR_REGS)
+ /* Clear MCA_DESTAT for all deferred errors, even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
- else
- mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
}
/*
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e2d51609d2cb..960efee4be3e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -731,27 +731,26 @@ static bool smca_should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *
struct mce *m = &err->m;
/*
- * If this is a deferred error found in MCA_STATUS, then clear
- * the redundant data from the MCA_DESTAT register.
+ * If the MCA_STATUS register has a deferred error, then continue using it as
+ * the status register.
+ *
+ * MCA_DESTAT will be cleared at the end of the handler.
*/
- if (m->status & MCI_STATUS_VAL) {
- if (m->status & MCI_STATUS_DEFERRED)
- mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
-
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED))
return true;
- }
/*
- * If the MCA_DESTAT register has valid data, then use
- * it as the status register.
+ * If the MCA_DESTAT register has a deferred error, then use it instead.
+ *
+ * MCA_STATUS will not be cleared at the end of the handler.
*/
m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
- if (!(m->status & MCI_STATUS_VAL))
- return false;
-
- m->kflags |= MCE_CHECK_DFR_REGS;
- return true;
+ return false;
}
/*
@@ -780,13 +779,17 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
{
struct mce *m = &err->m;
- if (mce_flags.smca)
+ if (flags & MCP_DFR)
return smca_should_log_poll_error(flags, err);
/* If this entry is not valid, ignore it. */
if (!(m->status & MCI_STATUS_VAL))
return false;
+ /* Ignore deferred errors if not looking for them (MCP_DFR not set). */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return false;
+
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
@@ -1924,6 +1927,9 @@ static void __mcheck_cpu_init_prepare_banks(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
+
+ if (mce_flags.smca)
+ machine_check_poll(MCP_DFR | MCP_QUEUE_LOG, &all_banks);
}
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
--
2.51.0