[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <173046126509.3137.9972747278395091217.tip-bot2@tip-bot2>
Date: Fri, 01 Nov 2024 11:41:05 -0000
From: "tip-bot2 for Avadhut Naik" <tip-bot2@...utronix.de>
To: linux-tip-commits@...r.kernel.org
Cc: Yazen Ghannam <yazen.ghannam@....com>, Avadhut Naik <avadhut.naik@....com>,
"Borislav Petkov (AMD)" <bp@...en8.de>, Qiuxu Zhuo <qiuxu.zhuo@...el.com>,
x86@...nel.org, linux-kernel@...r.kernel.org
Subject:
[tip: ras/core] x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers
The following commit has been merged into the ras/core branch of tip:
Commit-ID: d4fca1358ea9096f2f6ed942e2cb3a820073dfc1
Gitweb: https://git.kernel.org/tip/d4fca1358ea9096f2f6ed942e2cb3a820073dfc1
Author: Avadhut Naik <avadhut.naik@....com>
AuthorDate: Tue, 22 Oct 2024 19:36:29
Committer: Borislav Petkov (AMD) <bp@...en8.de>
CommitterDate: Thu, 31 Oct 2024 10:36:07 +01:00
x86/MCE/AMD: Add support for new MCA_SYND{1,2} registers
Starting with Zen4, AMD's Scalable MCA systems incorporate two new registers:
MCA_SYND1 and MCA_SYND2.
These registers will include supplemental error information in addition to the
existing MCA_SYND register. The data within these registers is considered
valid if MCA_STATUS[SyndV] is set.
Userspace error decoding tools like rasdaemon gather related hardware error
information through the tracepoints.
Therefore, export these two registers through the mce_record tracepoint so
that tools like rasdaemon can parse them and output the supplemental error
information like FRU text contained in them.
[ bp: Massage. ]
Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
Signed-off-by: Avadhut Naik <avadhut.naik@....com>
Signed-off-by: Borislav Petkov (AMD) <bp@...en8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@...el.com>
Link: https://lore.kernel.org/r/20241022194158.110073-4-avadhut.naik@amd.com
---
arch/x86/include/asm/mce.h | 21 +++++++++++++++++++++
arch/x86/include/uapi/asm/mce.h | 3 ++-
arch/x86/kernel/cpu/mce/amd.c | 5 ++++-
arch/x86/kernel/cpu/mce/core.c | 9 ++++++++-
drivers/edac/mce_amd.c | 8 ++++++--
include/trace/events/mce.h | 7 +++++--
6 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 4e45f45..4d936ee 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -122,6 +122,9 @@
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+/* Registers MISC2 to MISC4 are at offsets B to D. */
+#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
+#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
@@ -132,6 +135,8 @@
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))
#define XEC(x, mask) (((x) >> 16) & mask)
@@ -190,9 +195,25 @@ enum mce_notifier_prios {
/**
* struct mce_hw_err - Hardware Error Record.
* @m: Machine Check record.
+ * @vendor: Vendor-specific error information.
+ *
+ * Vendor-specific fields should not be added to struct mce. Instead, vendors
+ * should export their vendor-specific data through their structure in the
+ * vendor union below.
+ *
+ * AMD's vendor data is parsed by error decoding tools for supplemental error
+ * information. Thus, current offsets of existing fields must be maintained.
+ * Only add new fields at the end of AMD's vendor structure.
*/
struct mce_hw_err {
struct mce m;
+
+ union vendor_info {
+ struct {
+ u64 synd1; /* MCA_SYND1 MSR */
+ u64 synd2; /* MCA_SYND2 MSR */
+ } amd;
+ } vendor;
};
#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m)
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index db9adc0..cb6b48a 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -8,7 +8,8 @@
/*
* Fields are zero when not available. Also, this struct is shared with
* userspace mcelog and thus must keep existing fields at current offsets.
- * Only add new fields to the end of the structure
+ * Only add new, shared fields to the end of the structure.
+ * Do not add vendor-specific fields.
*/
struct mce {
__u64 status; /* Bank's MCi_STATUS MSR */
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 5b4d266..6ca80ff 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -797,8 +797,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
if (mce_flags.smca) {
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
- if (m->status & MCI_STATUS_SYNDV)
+ if (m->status & MCI_STATUS_SYNDV) {
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
+ }
}
mce_log(&err);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 28e28b6..7fb5556 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -202,6 +202,10 @@ static void __print_mce(struct mce_hw_err *err)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@@ -678,8 +682,11 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
- if (m->status & MCI_STATUS_SYNDV)
+ if (m->status & MCI_STATUS_SYNDV) {
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
}
}
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 8130c3d..194d9fd 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -793,6 +793,7 @@ static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(m);
unsigned int fam = x86_family(m->cpuid);
int ecc;
@@ -850,8 +851,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (boot_cpu_has(X86_FEATURE_SMCA)) {
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
- if (m->status & MCI_STATUS_SYNDV)
- pr_cont(", Syndrome: 0x%016llx", m->synd);
+ if (m->status & MCI_STATUS_SYNDV) {
+ pr_cont(", Syndrome: 0x%016llx\n", m->synd);
+ pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
+ err->vendor.amd.synd1, err->vendor.amd.synd2);
+ }
pr_cont("\n");
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 65aba1a..c1c50df 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -43,6 +43,7 @@ TRACE_EVENT(mce_record,
__field( u8, bank )
__field( u8, cpuvendor )
__field( u32, microcode )
+ __dynamic_array(u8, v_data, sizeof(err->vendor))
),
TP_fast_assign(
@@ -65,9 +66,10 @@ TRACE_EVENT(mce_record,
__entry->bank = err->m.bank;
__entry->cpuvendor = err->m.cpuvendor;
__entry->microcode = err->m.microcode;
+ memcpy(__get_dynamic_array(v_data), &err->vendor, sizeof(err->vendor));
),
- TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR: %016Lx, MISC: %016Lx, SYND: %016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x",
+ TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016llx, IPID: %016llx, ADDR: %016llx, MISC: %016llx, SYND: %016llx, RIP: %02x:<%016llx>, TSC: %llx, PPIN: %llx, vendor: %u, CPUID: %x, time: %llu, socket: %u, APIC: %x, microcode: %x, vendor data: %s",
__entry->cpu,
__entry->mcgcap, __entry->mcgstatus,
__entry->bank, __entry->status,
@@ -83,7 +85,8 @@ TRACE_EVENT(mce_record,
__entry->walltime,
__entry->socketid,
__entry->apicid,
- __entry->microcode)
+ __entry->microcode,
+ __print_dynamic_array(v_data, sizeof(u8)))
);
#endif /* _TRACE_MCE_H */
Powered by blists - more mailing lists