>From 980c6fdf00cf23ef76481a4c94ba682c0ff80d61 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Mar 2010 18:42:08 +0100 Subject: [PATCH 3/4] edac, mce: Prepare error decoded info Add a buffer where CECC error info is stored and dump it later into the trace record. Not-Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 2 + drivers/edac/amd64_edac.c | 4 ++- drivers/edac/edac_mc.c | 7 ++++ drivers/edac/edac_mce_amd.c | 60 +++++++++++++++++++++++++++++++------ drivers/edac/edac_mce_amd.h | 1 + 5 files changed, 63 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3880f3c..0bcb488 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -160,8 +160,10 @@ void mce_log(struct mce *mce) { unsigned next, entry; +#ifndef CONFIG_EDAC_DECODE_MCE /* Emit the trace record: */ trace_mce_record(mce, ""); +#endif mce->finished = 0; wmb(); diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 80600f1..3e036f3 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1993,7 +1993,9 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, sys_addr = pvt->ops->get_error_address(mci, info); amd64_mc_printk(mci, KERN_ERR, - "CE ERROR_ADDRESS= 0x%llx\n", sys_addr); + "CE err addr: 0x%llx\n", sys_addr); + + edac_snprintf("CE err addr: 0x%llx\n", sys_addr); pvt->ops->map_sysaddr_to_csrow(mci, info, sys_addr); } diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 3630308..f4b7de7 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -33,6 +33,7 @@ #include #include "edac_core.h" #include "edac_module.h" +#include "edac_mce_amd.h" /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); @@ -702,6 +703,12 @@ void edac_mc_handle_ce(struct mem_ctl_info *mci, mci->csrows[row].grain, syndrome, row, channel, mci->csrows[row].channels[channel].label, msg); + edac_snprintf("CE page 0x%lx, offset 0x%lx, grain %d, syndrome " + "0x%lx, row %d, channel %d\n", + page_frame_number, offset_in_page, + mci->csrows[row].grain, syndrome, row, channel); + + mci->ce_count++; mci->csrows[row].ce_count++; mci->csrows[row].channels[channel].ce_count++; diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 97e64bc..86b374e 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -1,4 +1,6 @@ #include +#include +#include #include "edac_mce_amd.h" static bool report_gart_errors; @@ -128,6 +130,33 @@ const char *ext_msgs[] = { }; EXPORT_SYMBOL_GPL(ext_msgs); +static char *decoded_err; +static unsigned dec_len; + +void edac_snprintf(const char *fmt, ...) +{ + va_list args; + char *buf = decoded_err + dec_len; + unsigned size = DECODED_ERR_SZ - dec_len - 1; + int i; + + if (dec_len >= DECODED_ERR_SZ-1) + return; + + va_start(args, fmt); + i = vsnprintf(buf, size, fmt, args); + va_end(args); + + if (i >= size) { + printk(KERN_ERR "MCE decode buffer truncated.\n"); + dec_len = DECODED_ERR_SZ-1; + decoded_err[dec_len] = '\n'; + } else { + dec_len += i; + } +} +EXPORT_SYMBOL_GPL(edac_snprintf); + static void amd_decode_dc_mce(u64 mc0_status) { u32 ec = mc0_status & 0xffff; @@ -304,7 +333,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) if (TLB_ERROR(ec) && !report_gart_errors) return; - pr_emerg(" Northbridge Error, node %d", node_id); + edac_snprintf(" Northbridge Error, node %d", node_id); /* * F10h, revD can disable ErrCpu[3:0] so check that first and also the @@ -313,17 +342,17 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model > 7)) { if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + edac_snprintf(", core: %u\n", (u8)(regs->nbsh & 0xf)); } else { u8 assoc_cpus = regs->nbsh & 0xf; if (assoc_cpus > 0) - pr_cont(", core: %d", fls(assoc_cpus) - 1); + edac_snprintf(", core: %d", fls(assoc_cpus) - 1); - pr_cont("\n"); + edac_snprintf("\n"); } - pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl)); + edac_snprintf("%s.\n", EXT_ERR_MSG(regs->nbsl)); if (BUS_ERROR(ec) && nb_bus_decoder) nb_bus_decoder(node_id, regs); @@ -342,13 +371,13 @@ static void amd_decode_fr_mce(u64 mc5_status) static inline void amd_decode_err_code(unsigned int ec) { if (TLB_ERROR(ec)) { - pr_emerg(" Transaction: %s, Cache Level %s\n", + edac_snprintf(" Transaction: %s, Cache Level %s\n", TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s", + edac_snprintf(" Transaction: %s, Type: %s, Cache Level: %s", RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, " + edac_snprintf(" Transaction type: %s(%s), %s, Cache Level: %s, " "Participating Processor: %s\n", RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); @@ -363,9 +392,9 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, struct err_regs regs; int node, ecc; - pr_emerg("MC%d_STATUS: ", m->bank); +/* already in the MCE record: pr_emerg("MC%d_STATUS: ", m->bank); */ - pr_cont("%sorrected error, report: %s, MiscV: %svalid, " + pr_emerg("%sorrected error, report: %s, MiscV: %svalid, " "CPU context corrupt: %s", ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), ((m->status & MCI_STATUS_EN) ? "yes" : "no"), @@ -416,6 +445,12 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, amd_decode_err_code(m->status & 0xffff); + /* this has to be at the end */ + pr_emerg("%s\n", decoded_err); + + trace_mce_record(m, decoded_err); + dec_len = 0; + return NOTIFY_STOP; } @@ -432,6 +467,10 @@ static int __init mce_amd_init(void) (boot_cpu_data.x86 >= 0xf)) atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); + decoded_err = kzalloc(DECODED_ERR_SZ, GFP_KERNEL); + if (!decoded_err) + return -ENOMEM; + return 0; } early_initcall(mce_amd_init); @@ -439,6 +478,7 @@ early_initcall(mce_amd_init); #ifdef MODULE static void __exit mce_amd_exit(void) { + kfree(decoded_err); atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); } diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index df23ee0..3ff1802 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -66,4 +66,5 @@ void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); void amd_decode_nb_mce(int, struct err_regs *, int); +void edac_snprintf(const char *fmt, ...); #endif /* _EDAC_MCE_AMD_H */ -- 1.6.4.4