Message-ID: <20251020150133.GK3245006@noisy.programming.kicks-ass.net>
Date: Mon, 20 Oct 2025 17:01:33 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: "Kaplan, David" <David.Kaplan@....com>
Cc: Thomas Gleixner <tglx@...utronix.de>, Borislav Petkov <bp@...en8.de>,
Josh Poimboeuf <jpoimboe@...nel.org>,
Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
Ingo Molnar <mingo@...hat.com>,
Dave Hansen <dave.hansen@...ux.intel.com>,
"x86@...nel.org" <x86@...nel.org>,
"H . Peter Anvin" <hpa@...or.com>, Alexander Graf <graf@...zon.com>,
Boris Ostrovsky <boris.ostrovsky@...cle.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [RFC PATCH 40/56] x86/alternative: Use sync_core_nmi_safe()

On Mon, Oct 20, 2025 at 02:49:56PM +0000, Kaplan, David wrote:
> Coming back to this, are you thinking we should just create something
> like 'text_poke_sync_core()' inside alternative.c and that can use:
> 1. SERIALIZE (if available)
> 2. MOV-CR2 (if re-patching)
> 3. Else, IRET
>
> And maybe someday we put MFENCE into there too for AMD parts.
>
> Right now, of course this is the only logic that would care about an
> NMI-safe sync_core(). So maybe this makes sense vs creating a generic
> version that nobody else is using?
I was thinking something fairly straightforward like the below. Yes,
there are a few more sync_core() callers out there, git tells me:
arch/x86/kernel/alternative.c: sync_core();
arch/x86/kernel/alternative.c:noinstr void sync_core(void)
arch/x86/kernel/alternative.c: sync_core();
arch/x86/kernel/cpu/mce/core.c: sync_core();
arch/x86/kernel/cpu/mce/core.c: sync_core();
arch/x86/kernel/static_call.c: sync_core();
drivers/misc/sgi-gru/grufault.c: sync_core();
drivers/misc/sgi-gru/grufault.c: sync_core(); /* make sure we are have current data */
drivers/misc/sgi-gru/gruhandles.c: sync_core();
drivers/misc/sgi-gru/gruhandles.c: sync_core();
drivers/misc/sgi-gru/grukservices.c: sync_core();
But none of that seems like it cares about an extra few cycles, and why
complicate matters with another sync_core variant and all that.

diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
index 96bda43538ee..ef4508a03800 100644
--- a/arch/x86/include/asm/sync_core.h
+++ b/arch/x86/include/asm/sync_core.h
@@ -7,86 +7,7 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
-#ifdef CONFIG_X86_32
-static __always_inline void iret_to_self(void)
-{
- asm volatile (
- "pushfl\n\t"
- "pushl %%cs\n\t"
- "pushl $1f\n\t"
- "iret\n\t"
- "1:"
- : ASM_CALL_CONSTRAINT : : "memory");
-}
-#else
-static __always_inline void iret_to_self(void)
-{
- unsigned int tmp;
-
- asm volatile (
- "mov %%ss, %0\n\t"
- "pushq %q0\n\t"
- "pushq %%rsp\n\t"
- "addq $8, (%%rsp)\n\t"
- "pushfq\n\t"
- "mov %%cs, %0\n\t"
- "pushq %q0\n\t"
- "pushq $1f\n\t"
- "iretq\n\t"
- "1:"
- : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
-}
-#endif /* CONFIG_X86_32 */
-
-/*
- * This function forces the icache and prefetched instruction stream to
- * catch up with reality in two very specific cases:
- *
- * a) Text was modified using one virtual address and is about to be executed
- * from the same physical page at a different virtual address.
- *
- * b) Text was modified on a different CPU, may subsequently be
- * executed on this CPU, and you want to make sure the new version
- * gets executed. This generally means you're calling this in an IPI.
- *
- * If you're calling this for a different reason, you're probably doing
- * it wrong.
- *
- * Like all of Linux's memory ordering operations, this is a
- * compiler barrier as well.
- */
-static __always_inline void sync_core(void)
-{
- /*
- * The SERIALIZE instruction is the most straightforward way to
- * do this, but it is not universally available.
- */
- if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
- serialize();
- return;
- }
-
- /*
- * For all other processors, there are quite a few ways to do this.
- * IRET-to-self is nice because it works on every CPU, at any CPL
- * (so it's compatible with paravirtualization), and it never exits
- * to a hypervisor. The only downsides are that it's a bit slow
- * (it seems to be a bit more than 2x slower than the fastest
- * options) and that it unmasks NMIs. The "push %cs" is needed,
- * because in paravirtual environments __KERNEL_CS may not be a
- * valid CS value when we do IRET directly.
- *
- * In case NMI unmasking or performance ever becomes a problem,
- * the next best option appears to be MOV-to-CR2 and an
- * unconditional jump. That sequence also works on all CPUs,
- * but it will fault at CPL3 (i.e. Xen PV).
- *
- * CPUID is the conventional way, but it's nasty: it doesn't
- * exist on some 486-like CPUs, and it usually exits to a
- * hypervisor.
- */
- iret_to_self();
-}
+extern void sync_core(void);
/*
* Ensure that a core serializing instruction is issued before returning
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e377b06e70e3..2a5daae3626b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2687,6 +2687,87 @@ void *text_poke_set(void *addr, int c, size_t len)
return addr;
}
+#ifdef CONFIG_X86_32
+static __always_inline void iret_to_self(void)
+{
+ asm volatile (
+ "pushfl\n\t"
+ "pushl %%cs\n\t"
+ "pushl $1f\n\t"
+ "iret\n\t"
+ "1:"
+ : ASM_CALL_CONSTRAINT : : "memory");
+}
+#else
+static __always_inline void iret_to_self(void)
+{
+ unsigned int tmp;
+
+ asm volatile (
+ "mov %%ss, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq %%rsp\n\t"
+ "addq $8, (%%rsp)\n\t"
+ "pushfq\n\t"
+ "mov %%cs, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq $1f\n\t"
+ "iretq\n\t"
+ "1:"
+ : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
+}
+#endif /* CONFIG_X86_32 */
+
+/*
+ * This function forces the icache and prefetched instruction stream to
+ * catch up with reality in two very specific cases:
+ *
+ * a) Text was modified using one virtual address and is about to be executed
+ * from the same physical page at a different virtual address.
+ *
+ * b) Text was modified on a different CPU, may subsequently be
+ * executed on this CPU, and you want to make sure the new version
+ * gets executed. This generally means you're calling this in an IPI.
+ *
+ * If you're calling this for a different reason, you're probably doing
+ * it wrong.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
+ */
+noinstr void sync_core(void)
+{
+ /*
+ * The SERIALIZE instruction is the most straightforward way to
+ * do this, but it is not universally available.
+ */
+ if (static_cpu_has(X86_FEATURE_SERIALIZE)) {
+ serialize();
+ return;
+ }
+
+ /*
+ * For all other processors, there are quite a few ways to do this.
+ * IRET-to-self is nice because it works on every CPU, at any CPL
+ * (so it's compatible with paravirtualization), and it never exits
+ * to a hypervisor. The only downsides are that it's a bit slow
+ * (it seems to be a bit more than 2x slower than the fastest
+ * options) and that it unmasks NMIs. The "push %cs" is needed,
+ * because in paravirtual environments __KERNEL_CS may not be a
+ * valid CS value when we do IRET directly.
+ *
+ * In case NMI unmasking or performance ever becomes a problem,
+ * the next best option appears to be MOV-to-CR2 and an
+ * unconditional jump. That sequence also works on all CPUs,
+ * but it will fault at CPL3 (i.e. Xen PV).
+ *
+ * CPUID is the conventional way, but it's nasty: it doesn't
+ * exist on some 486-like CPUs, and it usually exits to a
+ * hypervisor.
+ */
+ iret_to_self();
+}
+
static void do_sync_core(void *info)
{
sync_core();
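
For context, do_sync_core() above is the IPI callback used when patched
text must be resynced on every CPU; a minimal usage sketch, assuming the
existing on_each_cpu() helper (the wrapper name below is illustrative
only):

static void sync_all_cpus_after_patch(void)
{
	/* Run sync_core() on every online CPU and wait for completion. */
	on_each_cpu(do_sync_core, NULL, 1);
}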