[PATCH 3/4] edac, mce: Prepare error decoded info

From: Borislav Petkov
Date: Sat Mar 27 2010 - 13:42:08 EST


Add a buffer in which the decoded error info (e.g. for correctable ECC
errors, CECC) is accumulated, and dump it into the MCE trace record once
decoding has finished.

Not-Signed-off-by: Borislav Petkov <petkovbb@xxxxxxxxx>
---
arch/x86/kernel/cpu/mcheck/mce.c | 2 +
drivers/edac/amd64_edac.c | 4 ++-
drivers/edac/edac_mc.c | 7 ++++
drivers/edac/edac_mce_amd.c | 60 +++++++++++++++++++++++++++++++------
drivers/edac/edac_mce_amd.h | 1 +
5 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3880f3c..0bcb488 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -160,8 +160,10 @@ void mce_log(struct mce *mce)
{
unsigned next, entry;

+#ifndef CONFIG_EDAC_DECODE_MCE
/* Emit the trace record: */
trace_mce_record(mce, "");
+#endif

mce->finished = 0;
wmb();
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 80600f1..3e036f3 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1993,7 +1993,9 @@ static void amd64_handle_ce(struct mem_ctl_info *mci,
sys_addr = pvt->ops->get_error_address(mci, info);

amd64_mc_printk(mci, KERN_ERR,
- "CE ERROR_ADDRESS= 0x%llx\n", sys_addr);
+ "CE err addr: 0x%llx\n", sys_addr);
+
+ edac_snprintf("CE err addr: 0x%llx\n", sys_addr);

pvt->ops->map_sysaddr_to_csrow(mci, info, sys_addr);
}
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 3630308..f4b7de7 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -33,6 +33,7 @@
#include <asm/edac.h>
#include "edac_core.h"
#include "edac_module.h"
+#include "edac_mce_amd.h"

/* lock to memory controller's control array */
static DEFINE_MUTEX(mem_ctls_mutex);
@@ -702,6 +703,12 @@ void edac_mc_handle_ce(struct mem_ctl_info *mci,
mci->csrows[row].grain, syndrome, row, channel,
mci->csrows[row].channels[channel].label, msg);

+ edac_snprintf("CE page 0x%lx, offset 0x%lx, grain %d, syndrome "
+ "0x%lx, row %d, channel %d\n",
+ page_frame_number, offset_in_page,
+ mci->csrows[row].grain, syndrome, row, channel);
+
+
mci->ce_count++;
mci->csrows[row].ce_count++;
mci->csrows[row].channels[channel].ce_count++;
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 97e64bc..86b374e 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -1,4 +1,6 @@
#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/mce.h>
#include "edac_mce_amd.h"

static bool report_gart_errors;
@@ -128,6 +130,33 @@ const char *ext_msgs[] = {
};
EXPORT_SYMBOL_GPL(ext_msgs);

+static char *decoded_err;
+static unsigned dec_len;
+
+/* Append printf-style text to decoded_err; no-op if it is missing or full. */
+void edac_snprintf(const char *fmt, ...)
+{
+	unsigned size = DECODED_ERR_SZ - dec_len - 1;
+	va_list args;
+	int i;
+
+	if (!decoded_err || dec_len >= DECODED_ERR_SZ-1)
+		return;
+
+	va_start(args, fmt);
+	i = vsnprintf(decoded_err + dec_len, size, fmt, args);
+	va_end(args);
+
+	if (i >= 0 && (unsigned)i < size) {
+		dec_len += i;
+	} else {
+		/* output cut short; vsnprintf() has already NUL-terminated it */
+		printk(KERN_ERR "MCE decode buffer truncated.\n");
+		dec_len = DECODED_ERR_SZ-1;
+	}
+}
+EXPORT_SYMBOL_GPL(edac_snprintf);
+
static void amd_decode_dc_mce(u64 mc0_status)
{
u32 ec = mc0_status & 0xffff;
@@ -304,7 +333,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
if (TLB_ERROR(ec) && !report_gart_errors)
return;

- pr_emerg(" Northbridge Error, node %d", node_id);
+ edac_snprintf(" Northbridge Error, node %d", node_id);

/*
* F10h, revD can disable ErrCpu[3:0] so check that first and also the
@@ -313,17 +342,17 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
if ((boot_cpu_data.x86 == 0x10) &&
(boot_cpu_data.x86_model > 7)) {
if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
- pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
+ edac_snprintf(", core: %u\n", (u8)(regs->nbsh & 0xf));
} else {
u8 assoc_cpus = regs->nbsh & 0xf;

if (assoc_cpus > 0)
- pr_cont(", core: %d", fls(assoc_cpus) - 1);
+ edac_snprintf(", core: %d", fls(assoc_cpus) - 1);

- pr_cont("\n");
+ edac_snprintf("\n");
}

- pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl));
+ edac_snprintf("%s.\n", EXT_ERR_MSG(regs->nbsl));

if (BUS_ERROR(ec) && nb_bus_decoder)
nb_bus_decoder(node_id, regs);
@@ -342,13 +371,13 @@ static void amd_decode_fr_mce(u64 mc5_status)
static inline void amd_decode_err_code(unsigned int ec)
{
if (TLB_ERROR(ec)) {
- pr_emerg(" Transaction: %s, Cache Level %s\n",
+ edac_snprintf(" Transaction: %s, Cache Level %s\n",
TT_MSG(ec), LL_MSG(ec));
} else if (MEM_ERROR(ec)) {
- pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
+ edac_snprintf(" Transaction: %s, Type: %s, Cache Level: %s",
RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
} else if (BUS_ERROR(ec)) {
- pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, "
+ edac_snprintf(" Transaction type: %s(%s), %s, Cache Level: %s, "
"Participating Processor: %s\n",
RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
PP_MSG(ec));
@@ -363,9 +392,9 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
struct err_regs regs;
int node, ecc;

- pr_emerg("MC%d_STATUS: ", m->bank);
+/* already in the MCE record: pr_emerg("MC%d_STATUS: ", m->bank); */

- pr_cont("%sorrected error, report: %s, MiscV: %svalid, "
+ pr_emerg("%sorrected error, report: %s, MiscV: %svalid, "
"CPU context corrupt: %s",
((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
((m->status & MCI_STATUS_EN) ? "yes" : "no"),
@@ -416,6 +445,12 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val,

amd_decode_err_code(m->status & 0xffff);

+ /* this has to be at the end */
+ pr_emerg("%s\n", decoded_err);
+
+ trace_mce_record(m, decoded_err);
+ dec_len = 0;
+
return NOTIFY_STOP;
}

@@ -432,6 +467,10 @@ static int __init mce_amd_init(void)
(boot_cpu_data.x86 >= 0xf))
atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);

+ decoded_err = kzalloc(DECODED_ERR_SZ, GFP_KERNEL);
+ if (!decoded_err)
+ return -ENOMEM;
+
return 0;
}
early_initcall(mce_amd_init);
@@ -439,6 +478,7 @@ early_initcall(mce_amd_init);
#ifdef MODULE
static void __exit mce_amd_exit(void)
{
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+	kfree(decoded_err);	/* free only once the decoder can no longer run */
}

diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h
index df23ee0..3ff1802 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/edac_mce_amd.h
@@ -66,4 +66,5 @@ void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
void amd_decode_nb_mce(int, struct err_regs *, int);

+void edac_snprintf(const char *fmt, ...);
#endif /* _EDAC_MCE_AMD_H */
--
1.6.4.4


--k1lZvvs/B4yU6o8G--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/