[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250814154809.165916-1-yazen.ghannam@amd.com>
Date: Thu, 14 Aug 2025 11:48:09 -0400
From: Yazen Ghannam <yazen.ghannam@....com>
To: <linux-edac@...r.kernel.org>
CC: <linux-kernel@...r.kernel.org>, <tony.luck@...el.com>, <x86@...nel.org>,
<avadhut.naik@....com>, <john.allen@....com>, Yazen Ghannam
<yazen.ghannam@....com>
Subject: [PATCH v2] x86/mce: Do away with unnecessary context quirks
Both Intel and AMD have quirks related to recovery in the Instruction
Fetch Units. The common issue is that MCG_STATUS[RIPV] and
MCG_STATUS[EIPV] are set to '0', so Linux will not save the CS and IP
registers. The severity grading functions will later see that CS=0, so it
is assumed that the #MC was taken in kernel context. This leads to a
kernel panic even if the #MC was recoverable and in user context.
RIPV is "restart IP valid" which means program execution can restart at
the IP on the stack. This is a general indicator of whether system
software should try to return to the executing process or not. The exact
value is not needed by MCE handling.
EIPV is "error IP valid" which means the IP on the stack is directly
associated with the error. This is a specific indicator that the saved
IP is exactly where the #MC was taken. System software can share this
for debugging and/or try to take further recovery actions based on the
nature of the code represented by the IP.
Neither of these refers to the CS register, which is used to determine
the execution context privilege level.
It is not clear why CS and IP are tied together in the Linux handling.
This could be a carryover from 32-bit execution where "IP" is the
combination of "CS:IP". But it is not apparent whether this "IP=CS:IP"
association, as it applies to MCE handling, is a Linux assumption or is
explicitly noted in x86 documentation when describing RIPV/EIPV.
It is clear that in the affected use cases, the processor context is
valid in general. And the only variable is the IP validity which is
explicitly based on RIPV/EIPV. An invalid CPU context is represented by
the MCA_STATUS[PCC] "Processor Context Corrupt" bit.
Avoid the need for these context quirks by refactoring the Linux MCE
handling code to treat the CS and IP registers independently.
Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
---
Link:
https://lore.kernel.org/r/20250813154455.162489-1-yazen.ghannam@amd.com
v1->v2:
* Minimize changes to only code related to context quirks.
arch/x86/kernel/cpu/mce/core.c | 83 +++++-------------------------
arch/x86/kernel/cpu/mce/internal.h | 8 +--
2 files changed, 13 insertions(+), 78 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4da4eab56c81..a26534a914ec 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -470,22 +470,23 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs
m = &err->m;
m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
if (regs) {
+ m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
+
/*
* Get the address of the instruction at the time of
* the machine check error.
*/
- if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ if (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))
m->ip = regs->ip;
- m->cs = regs->cs;
-
- /*
- * When in VM86 mode make the cs look like ring 3
- * always. This is a lie, but it's better than passing
- * the additional vm86 bit around everywhere.
- */
- if (v8086_mode(regs))
- m->cs |= 3;
- }
+
/* Use accurate RIP reporting if available. */
if (mca_cfg.rip_msr)
m->ip = mce_rdmsrq(mca_cfg.rip_msr);
@@ -841,35 +842,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
}
EXPORT_SYMBOL_GPL(machine_check_poll);
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static __always_inline void
-quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
- if (bank != 0)
- return;
- if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
- return;
- if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
- MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
- MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
- MCACOD)) !=
- (MCI_STATUS_UC|MCI_STATUS_EN|
- MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
- MCI_STATUS_AR|MCACOD_INSTR))
- return;
-
- m->mcgstatus |= MCG_STATUS_EIPV;
- m->ip = regs->ip;
- m->cs = regs->cs;
-}
-
/*
* Disable fast string copy and return from the MCE handler upon the first SRAR
* MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
@@ -923,26 +895,6 @@ static noinstr bool quirk_skylake_repmov(void)
return false;
}
-/*
- * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
- * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
- *
- * However, the context is still valid, so save the "cs" register for later use.
- *
- * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
- *
- * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
- */
-static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
- if (bank != 1)
- return;
- if (!(m->status & MCI_STATUS_POISON))
- return;
-
- m->cs = regs->cs;
-}
-
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
@@ -960,11 +912,6 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un
continue;
arch___set_bit(i, validp);
- if (mce_flags.snb_ifu_quirk)
- quirk_sandybridge_ifu(i, m, regs);
-
- if (mce_flags.zen_ifu_quirk)
- quirk_zen_ifu(i, m, regs);
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
@@ -1950,9 +1897,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
*/
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
-
- if (c->x86 >= 0x17 && c->x86 <= 0x1A)
- mce_flags.zen_ifu_quirk = 1;
}
static void apply_quirks_intel(struct cpuinfo_x86 *c)
@@ -1988,9 +1932,6 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c)
if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
mca_cfg.bootlog = 0;
- if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
- mce_flags.snb_ifu_quirk = 1;
-
/*
* Skylake, Cascacde Lake and Cooper Lake require a quirk on
* rep movs.
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b5ba598e54cb..59a94daa31ad 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -211,9 +211,6 @@ struct mce_vendor_flags {
*/
smca : 1,
- /* Zen IFU quirk */
- zen_ifu_quirk : 1,
-
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
@@ -223,13 +220,10 @@ struct mce_vendor_flags {
/* Centaur Winchip C6-style MCA */
winchip : 1,
- /* SandyBridge IFU quirk */
- snb_ifu_quirk : 1,
-
/* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
skx_repmov_quirk : 1,
- __reserved_0 : 55;
+ __reserved_0 : 57;
};
extern struct mce_vendor_flags mce_flags;
base-commit: 0cc53520e68bea7fb80fdc6bdf8d226d1b6a98d9
--
2.50.1
Powered by blists - more mailing lists