[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <172656198906.2471820.7237893424649453751.tglx@xen13>
Date: Tue, 17 Sep 2024 10:54:12 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: linux-kernel@...r.kernel.org, x86@...nel.org
Subject: [GIT pull] x86/fred for v6.12-rc1
Linus,
please pull the latest x86/fred branch from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-fred-2024-09-17
up to: fe85ee391966: x86/entry: Set FRED RSP0 on return to userspace instead of context switch
Updates for x86 FRED:
- Enable FRED right after init_mem_mapping() because at that point the
early IDT fault handler is replaced by the real fault handler. The real
fault handler retrieves the faulting address from the stack frame and
not from CR2 when the FRED feature is set. But that obviously only
works when FRED is enabled in the CPU as well.
- Set SS to __KERNEL_DS when enabling FRED to prevent a corner case where
ERETS can observe an SS mismatch and raise a #GP.
Thanks,
tglx
------------------>
Andrew Cooper (1):
x86/msr: Switch between WRMSRNS and WRMSR with the alternatives mechanism
Xin Li (Intel) (6):
x86/fred: Parse cmdline param "fred=" in cpu_parse_early_param()
x86/fred: Move FRED RSP initialization into a separate function
x86/fred: Enable FRED right after init_mem_mapping()
x86/fred: Set SS to __KERNEL_DS when enabling FRED
x86/entry: Test ti_work for zero before processing individual bits
x86/entry: Set FRED RSP0 on return to userspace instead of context switch
arch/x86/include/asm/entry-common.h | 13 +++++++++--
arch/x86/include/asm/fred.h | 23 ++++++++++++++++++-
arch/x86/include/asm/msr.h | 25 +++++++++------------
arch/x86/include/asm/processor.h | 3 ++-
arch/x86/include/asm/switch_to.h | 6 +----
arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++--
arch/x86/kernel/cpu/cpuid-deps.c | 1 -
arch/x86/kernel/fred.c | 45 +++++++++++++++++++++++++++++--------
arch/x86/kernel/setup.c | 7 +++++-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/kernel/traps.c | 28 +----------------------
11 files changed, 111 insertions(+), 64 deletions(-)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index fb2809b20b0a..77d20555e04d 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -8,6 +8,7 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/fpu/api.h>
+#include <asm/fred.h>
/* Check that the stack and regs on entry from user mode are sane. */
static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
@@ -44,8 +45,7 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
}
#define arch_enter_from_user_mode arch_enter_from_user_mode
-static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
- unsigned long ti_work)
+static inline void arch_exit_work(unsigned long ti_work)
{
if (ti_work & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
@@ -56,6 +56,15 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
fpregs_assert_state_consistent();
if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
switch_fpu_return();
+}
+
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+ unsigned long ti_work)
+{
+ if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work))
+ arch_exit_work(ti_work);
+
+ fred_update_rsp0();
#ifdef CONFIG_COMPAT
/*
diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index e86c7ba32435..25ca00bd70e8 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -36,6 +36,7 @@
#ifdef CONFIG_X86_FRED
#include <linux/kernel.h>
+#include <linux/sched/task_stack.h>
#include <asm/ptrace.h>
@@ -84,13 +85,33 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int
}
void cpu_init_fred_exceptions(void);
+void cpu_init_fred_rsps(void);
void fred_complete_exception_setup(void);
+DECLARE_PER_CPU(unsigned long, fred_rsp0);
+
+static __always_inline void fred_sync_rsp0(unsigned long rsp0)
+{
+ __this_cpu_write(fred_rsp0, rsp0);
+}
+
+static __always_inline void fred_update_rsp0(void)
+{
+ unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE;
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) {
+ wrmsrns(MSR_IA32_FRED_RSP0, rsp0);
+ __this_cpu_write(fred_rsp0, rsp0);
+ }
+}
#else /* CONFIG_X86_FRED */
static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; }
static inline void cpu_init_fred_exceptions(void) { }
+static inline void cpu_init_fred_rsps(void) { }
static inline void fred_complete_exception_setup(void) { }
-static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
+static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { }
+static inline void fred_sync_rsp0(unsigned long rsp0) { }
+static inline void fred_update_rsp0(void) { }
#endif /* CONFIG_X86_FRED */
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index d642037f9ed5..001853541f1e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -99,19 +99,6 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
: : "c" (msr), "a"(low), "d" (high) : "memory");
}
-/*
- * WRMSRNS behaves exactly like WRMSR with the only difference being
- * that it is not a serializing instruction by default.
- */
-static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high)
-{
- /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */
- asm volatile("1: .byte 0x0f,0x01,0xc6\n"
- "2:\n"
- _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
- : : "c" (msr), "a"(low), "d" (high));
-}
-
#define native_rdmsr(msr, val1, val2) \
do { \
u64 __val = __rdmsr((msr)); \
@@ -312,9 +299,19 @@ do { \
#endif /* !CONFIG_PARAVIRT_XXL */
+/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
+#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
+
+/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */
static __always_inline void wrmsrns(u32 msr, u64 val)
{
- __wrmsrns(msr, val, val >> 32);
+ /*
+ * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant
+ * DS prefix to avoid a trailing NOP.
+ */
+ asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS)
+ "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
+ : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
}
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a75a07f4931f..399f7d1c4c61 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -582,7 +582,8 @@ extern void switch_gdt_and_percpu_base(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void cpu_init(void);
-extern void cpu_init_exception_handling(void);
+extern void cpu_init_exception_handling(bool boot_cpu);
+extern void cpu_init_replace_early_idt(void);
extern void cr4_init(void);
extern void set_task_blockstep(struct task_struct *task, bool on);
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index c3bd0c0758c9..75248546403d 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -70,13 +70,9 @@ static inline void update_task_stack(struct task_struct *task)
#ifdef CONFIG_X86_32
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- if (cpu_feature_enabled(X86_FEATURE_FRED)) {
- /* WRMSRNS is a baseline feature for FRED. */
- wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE);
- } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV))
/* Xen PV enters the kernel on the thread stack. */
load_sp0(task_top_of_stack(task));
- }
#endif
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d4e539d4e158..a4735d9b5a1d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1510,6 +1510,11 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+ /* Minimize the window in which FRED is marked available although it is going to be disabled. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
@@ -2171,7 +2176,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
* Setup everything needed to handle exceptions from the IDT, including the IST
* exceptions which use paranoid_entry().
*/
-void cpu_init_exception_handling(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
int cpu = raw_smp_processor_id();
@@ -2190,10 +2195,23 @@ void cpu_init_exception_handling(void)
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
+ }
+}
+
+void __init cpu_init_replace_early_idt(void)
+{
if (cpu_feature_enabled(X86_FEATURE_FRED))
cpu_init_fred_exceptions();
else
- load_current_idt();
+ idt_setup_early_pf();
}
/*
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b7d9f530ae16..8bd84114c2d9 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -83,7 +83,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{ X86_FEATURE_FRED, X86_FEATURE_LKGS },
- { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS },
{}
};
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
index 4bcd8791ad96..8d32c3f48abc 100644
--- a/arch/x86/kernel/fred.c
+++ b/arch/x86/kernel/fred.c
@@ -21,17 +21,53 @@
#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector)))
+DEFINE_PER_CPU(unsigned long, fred_rsp0);
+EXPORT_PER_CPU_SYMBOL(fred_rsp0);
+
void cpu_init_fred_exceptions(void)
{
/* When FRED is enabled by default, remove this log message */
pr_info("Initialize FRED on CPU%d\n", smp_processor_id());
+ /*
+ * If a kernel event is delivered before a CPU goes to user level for
+ * the first time, its SS is NULL thus NULL is pushed into the SS field
+ * of the FRED stack frame. But before ERETS is executed, the CPU may
+ * context switch to another task and go to user level. Then when the
+ * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later
+ * when ERETS is executed to return from the kernel event handler, a #GP
+ * fault is generated because SS doesn't match the SS saved in the FRED
+ * stack frame.
+ *
+ * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs.
+ */
+ loadsegment(ss, __KERNEL_DS);
+
wrmsrl(MSR_IA32_FRED_CONFIG,
/* Reserve for CALL emulation */
FRED_CONFIG_REDZONE |
FRED_CONFIG_INT_STKLVL(0) |
FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+ wrmsrl(MSR_IA32_FRED_STKLVLS, 0);
+ wrmsrl(MSR_IA32_FRED_RSP0, 0);
+ wrmsrl(MSR_IA32_FRED_RSP1, 0);
+ wrmsrl(MSR_IA32_FRED_RSP2, 0);
+ wrmsrl(MSR_IA32_FRED_RSP3, 0);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ /* Any further IDT use is a bug */
+ idt_invalidate();
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/* Must be called after setup_cpu_entry_areas() */
+void cpu_init_fred_rsps(void)
+{
/*
* The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
* (remember that user space faults are always taken on stack level 0)
@@ -47,13 +83,4 @@ void cpu_init_fred_exceptions(void)
wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
-
- /* Enable FRED */
- cr4_set_bits(X86_CR4_FRED);
- /* Any further IDT use is a bug */
- idt_invalidate();
-
- /* Use int $0x80 for 32-bit system calls in FRED mode */
- setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6129dc2ba784..f1fea506e20f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1039,7 +1039,12 @@ void __init setup_arch(char **cmdline_p)
init_mem_mapping();
- idt_setup_early_pf();
+ /*
+ * init_mem_mapping() relies on the early IDT page fault handling.
+ * Now either enable FRED or install the real page fault handler
+ * for 64-bit in the IDT.
+ */
+ cpu_init_replace_early_idt();
/*
* Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0c35207320cb..dc4fff8fccce 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -246,7 +246,7 @@ static void notrace start_secondary(void *unused)
__flush_tlb_all();
}
- cpu_init_exception_handling();
+ cpu_init_exception_handling(false);
/*
* Load the microcode before reaching the AP alive synchronization
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4fa0b17e5043..197d5888b0e2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1402,34 +1402,8 @@ DEFINE_IDTENTRY_SW(iret_error)
}
#endif
-/* Do not enable FRED by default yet. */
-static bool enable_fred __ro_after_init = false;
-
-#ifdef CONFIG_X86_FRED
-static int __init fred_setup(char *str)
-{
- if (!str)
- return -EINVAL;
-
- if (!cpu_feature_enabled(X86_FEATURE_FRED))
- return 0;
-
- if (!strcmp(str, "on"))
- enable_fred = true;
- else if (!strcmp(str, "off"))
- enable_fred = false;
- else
- pr_warn("invalid FRED option: 'fred=%s'\n", str);
- return 0;
-}
-early_param("fred", fred_setup);
-#endif
-
void __init trap_init(void)
{
- if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred)
- setup_clear_cpu_cap(X86_FEATURE_FRED);
-
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
@@ -1437,7 +1411,7 @@ void __init trap_init(void)
sev_es_init_vc_handling();
/* Initialize TSS before setting up traps so ISTs work */
- cpu_init_exception_handling();
+ cpu_init_exception_handling(true);
/* Setup traps as cpu_init() might #GP */
if (!cpu_feature_enabled(X86_FEATURE_FRED))
Powered by blists - more mailing lists