lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250813154455.162489-1-yazen.ghannam@amd.com>
Date: Wed, 13 Aug 2025 15:44:55 +0000
From: Yazen Ghannam <yazen.ghannam@....com>
To: <linux-edac@...r.kernel.org>
CC: <linux-kernel@...r.kernel.org>, <tony.luck@...el.com>, <x86@...nel.org>,
	<avadhut.naik@....com>, <john.allen@....com>, Yazen Ghannam
	<yazen.ghannam@....com>
Subject: [PATCH] x86/mce: Do away with unnecessary context quirks

Both Intel and AMD have quirks related to recovery in the Instruction
Fetch Units. The common issue is that MCG_STATUS[RIPV] and
MCG_STATUS[EIPV] are set to '0', so Linux will not save the CS and IP
registers. The severity grading functions will later see that CS=0, so it
is assumed that the #MC was taken in kernel context. This leads to a
kernel panic even if the #MC was recoverable and in user context.

RIPV is "restart IP valid" which means program execution can restart at
the IP on the stack. This is a general indicator of whether system
software should try to return to the executing process or not. The exact
value is not needed by MCE handling.

EIPV is "error IP valid" which means the IP on the stack is directly
associated with the error. This is a specific indicator that the saved
IP is exactly where the #MC was taken. System software can share this
for debugging and/or try to take further recovery actions based on the
nature of the code represented by the IP.

Neither of these refer to the CS register which is used to determine
the execution context privilege level.

It is not clear why CS and IP are tied together in the Linux handling.
This could be a carryover from 32-bit execution where "IP" is the
combination of "CS:IP". But it is not apparent if this "IP=CS:IP"
association, as applies to MCE handling, is a Linux assumption or
explicitly noted in x86 documentation when describing RIPV/EIPV.

It is clear that in the affected use cases, the processor context is
valid in general. And the only variable is the IP validity which is
explicitly based on RIPV/EIPV. An invalid CPU context is represented by
the MCA_STATUS[PCC] "Processor Context Corrupt" bit.

Avoid the need for these context quirks by refactoring the Linux MCE
handling code to treat the CS and IP registers independently.

Additionally, this removes the need to "fake" the CS value for VM86
mode.

Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
---
 arch/x86/kernel/cpu/mce/core.c     | 101 +++--------------------------
 arch/x86/kernel/cpu/mce/internal.h |   8 +--
 arch/x86/kernel/cpu/mce/severity.c |  24 +++----
 3 files changed, 22 insertions(+), 111 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4da4eab56c81..1aed411cbdb1 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -456,9 +456,8 @@ static noinstr void mce_wrmsrq(u32 msr, u64 v)
  * check into our "mce" struct so that we can use it later to assess
  * the severity of the problem as we read per-bank specific details.
  */
-static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce_hw_err *err)
 {
-	struct mce *m;
 	/*
 	 * Enable instrumentation around mce_prep_record() which calls external
 	 * facilities.
@@ -467,29 +466,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs
 	mce_prep_record(err);
 	instrumentation_end();
 
-	m = &err->m;
-	m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
-	if (regs) {
-		/*
-		 * Get the address of the instruction at the time of
-		 * the machine check error.
-		 */
-		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
-			m->ip = regs->ip;
-			m->cs = regs->cs;
-
-			/*
-			 * When in VM86 mode make the cs look like ring 3
-			 * always. This is a lie, but it's better than passing
-			 * the additional vm86 bit around everywhere.
-			 */
-			if (v8086_mode(regs))
-				m->cs |= 3;
-		}
-		/* Use accurate RIP reporting if available. */
-		if (mca_cfg.rip_msr)
-			m->ip = mce_rdmsrq(mca_cfg.rip_msr);
-	}
+	err->m.mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
 }
 
 bool mce_available(struct cpuinfo_x86 *c)
@@ -738,7 +715,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	this_cpu_inc(mce_poll_count);
 
-	mce_gather_info(&err, NULL);
+	mce_gather_info(&err);
 	m = &err.m;
 
 	if (flags & MCP_TIMESTAMP)
@@ -841,35 +818,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 }
 EXPORT_SYMBOL_GPL(machine_check_poll);
 
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static __always_inline void
-quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
-	if (bank != 0)
-		return;
-	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
-		return;
-	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
-		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
-			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
-			  MCACOD)) !=
-			 (MCI_STATUS_UC|MCI_STATUS_EN|
-			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
-			  MCI_STATUS_AR|MCACOD_INSTR))
-		return;
-
-	m->mcgstatus |= MCG_STATUS_EIPV;
-	m->ip = regs->ip;
-	m->cs = regs->cs;
-}
-
 /*
  * Disable fast string copy and return from the MCE handler upon the first SRAR
  * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
@@ -923,26 +871,6 @@ static noinstr bool quirk_skylake_repmov(void)
 	return false;
 }
 
-/*
- * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
- * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
- *
- * However, the context is still valid, so save the "cs" register for later use.
- *
- * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
- *
- * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
- */
-static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
-	if (bank != 1)
-		return;
-	if (!(m->status & MCI_STATUS_POISON))
-		return;
-
-	m->cs = regs->cs;
-}
-
 /*
  * Do a quick check if any of the events requires a panic.
  * This decides if we keep the events around or clear them.
@@ -960,11 +888,6 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un
 			continue;
 
 		arch___set_bit(i, validp);
-		if (mce_flags.snb_ifu_quirk)
-			quirk_sandybridge_ifu(i, m, regs);
-
-		if (mce_flags.zen_ifu_quirk)
-			quirk_zen_ifu(i, m, regs);
 
 		m->bank = i;
 		if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
@@ -1057,7 +980,7 @@ static noinstr int mce_timed_out(u64 *t, const char *msg)
  * All the spin loops have timeouts; when a timeout happens a CPU
  * typically elects itself to be Monarch.
  */
-static void mce_reign(void)
+static void mce_reign(struct pt_regs *regs)
 {
 	struct mce_hw_err *err = NULL;
 	struct mce *m = NULL;
@@ -1088,7 +1011,7 @@ static void mce_reign(void)
 	 */
 	if (m && global_worst >= MCE_PANIC_SEVERITY) {
 		/* call mce_severity() to get "msg" for panic */
-		mce_severity(m, NULL, &msg, true);
+		mce_severity(m, regs, &msg, true);
 		mce_panic("Fatal machine check", err, msg);
 	}
 
@@ -1197,7 +1120,7 @@ static noinstr int mce_start(int *no_way_out)
  * Synchronize between CPUs after main scanning loop.
  * This invokes the bulk of the Monarch processing.
  */
-static noinstr int mce_end(int order)
+static noinstr int mce_end(int order, struct pt_regs *regs)
 {
 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 	int ret = -1;
@@ -1227,7 +1150,7 @@ static noinstr int mce_end(int order)
 			ndelay(SPINUNIT);
 		}
 
-		mce_reign();
+		mce_reign(regs);
 		barrier();
 		ret = 0;
 	} else {
@@ -1557,7 +1480,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 
 	this_cpu_inc(mce_exception_count);
 
-	mce_gather_info(&err, regs);
+	mce_gather_info(&err);
 	m = &err.m;
 	m->tsc = rdtsc();
 
@@ -1607,7 +1530,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
 	 * When there's any problem use only local no_way_out state.
 	 */
 	if (!lmce) {
-		if (mce_end(order) < 0) {
+		if (mce_end(order, regs) < 0) {
 			if (!no_way_out)
 				no_way_out = worst >= MCE_PANIC_SEVERITY;
 
@@ -1950,9 +1873,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
 	 */
 	if (c->x86 == 0x15 && c->x86_model <= 0xf)
 		mce_flags.overflow_recov = 1;
-
-	if (c->x86 >= 0x17 && c->x86 <= 0x1A)
-		mce_flags.zen_ifu_quirk = 1;
 }
 
 static void apply_quirks_intel(struct cpuinfo_x86 *c)
@@ -1988,9 +1908,6 @@ static void apply_quirks_intel(struct cpuinfo_x86 *c)
 	if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
 		mca_cfg.bootlog = 0;
 
-	if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
-		mce_flags.snb_ifu_quirk = 1;
-
 	/*
 	 * Skylake, Cascacde Lake and Cooper Lake require a quirk on
 	 * rep movs.
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b5ba598e54cb..59a94daa31ad 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -211,9 +211,6 @@ struct mce_vendor_flags {
 	 */
 	smca			: 1,
 
-	/* Zen IFU quirk */
-	zen_ifu_quirk		: 1,
-
 	/* AMD-style error thresholding banks present. */
 	amd_threshold		: 1,
 
@@ -223,13 +220,10 @@ struct mce_vendor_flags {
 	/* Centaur Winchip C6-style MCA */
 	winchip			: 1,
 
-	/* SandyBridge IFU quirk */
-	snb_ifu_quirk		: 1,
-
 	/* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
 	skx_repmov_quirk	: 1,
 
-	__reserved_0		: 55;
+	__reserved_0		: 57;
 };
 
 extern struct mce_vendor_flags mce_flags;
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 2235a7477436..6bf7081b19c0 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -272,23 +272,23 @@ static bool is_copy_from_user(struct pt_regs *regs)
 	return true;
 }
 
-/*
- * If mcgstatus indicated that ip/cs on the stack were
- * no good, then "m->cs" will be zero and we will have
- * to assume the worst case (IN_KERNEL) as we actually
- * have no idea what we were executing when the machine
- * check hit.
- * If we do have a good "m->cs" (or a faked one in the
- * case we were executing in VM86 mode) we can use it to
- * distinguish an exception taken in user from from one
- * taken in the kernel.
- */
 static noinstr int error_context(struct mce *m, struct pt_regs *regs)
 {
 	int fixup_type;
 	bool copy_user;
 
-	if ((m->cs & 3) == 3)
+	/* Without register info, assume the worst. */
+	if (!regs)
+		return IN_KERNEL;
+
+	m->ip = regs->ip;
+	m->cs = regs->cs;
+
+	/* Use accurate RIP reporting if available. */
+	if (mca_cfg.rip_msr)
+		m->ip = mce_rdmsrq(mca_cfg.rip_msr);
+
+	if (user_mode(regs))
 		return IN_USER;
 
 	if (!mc_recoverable(m->mcgstatus))
-- 
2.50.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ