[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20201109112319.264511-10-alexandre.chartre@oracle.com>
Date: Mon, 9 Nov 2020 12:23:04 +0100
From: Alexandre Chartre <alexandre.chartre@...cle.com>
To: "tglx@...utronix.de"@userv0121.oracle.com,
"mingo@...hat.com"@userv0121.oracle.com,
"bp@...en8.de"@userv0121.oracle.com,
"hpa@...or.com"@userv0121.oracle.com,
"x86@...nel.org"@userv0121.oracle.com,
"dave.hansen@...ux.intel.com"@userv0121.oracle.com,
"luto@...nel.org"@userv0121.oracle.com,
"peterz@...radead.org"@userv0121.oracle.com,
"linux-kernel@...r.kernel.org"@userv0121.oracle.com,
"thomas.lendacky@....com"@userv0121.oracle.com,
"jroedel@...e.de"@userv0121.oracle.com
Cc: "konrad.wilk@...cle.com"@userv0121.oracle.com,
"jan.setjeeilers@...cle.com"@userv0121.oracle.com,
"junaids@...gle.com"@userv0121.oracle.com,
"oweisse@...gle.com"@userv0121.oracle.com,
"rppt@...ux.vnet.ibm.com"@userv0121.oracle.com,
"graf@...zon.de"@userv0121.oracle.com,
"mgross@...ux.intel.com"@userv0121.oracle.com,
"kuzuno@...il.com"@userv0121.oracle.com,
"alexandre.chartre@...cle.com"@userv0121.oracle.com
Subject: [RFC][PATCH 09/24] x86/entry: Add C version of paranoid_entry/exit
paranoid_entry/exit are assembly macros. Provide C versions of
these macros (kernel_paranoid_entry() and kernel_paranoid_exit()).
The C functions are functionally equivalent to the assembly macros,
except that kernel_paranoid_entry() doesn't save registers in
pt_regs like paranoid_entry does.
Signed-off-by: Alexandre Chartre <alexandre.chartre@...cle.com>
---
arch/x86/entry/common.c | 157 ++++++++++++++++++++++++++++
arch/x86/include/asm/entry-common.h | 10 ++
2 files changed, 167 insertions(+)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d09b1ded5287..54d0931801e1 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -387,3 +387,160 @@ static __always_inline unsigned long save_and_switch_to_kernel_cr3(void)
static __always_inline void restore_cr3(unsigned long cr3) {}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * "Paranoid" entry path from exception stack. Ensure that the CR3 and
+ * GS registers are correctly set for the kernel. Return GSBASE related
+ * information in kernel_entry_state depending on the availability of
+ * the FSGSBASE instructions:
+ *
+ * FSGSBASE	kernel_entry_state
+ *     N        swapgs=true -> SWAPGS on exit
+ *              swapgs=false -> no SWAPGS on exit
+ *
+ *     Y        gsbase=GSBASE value at entry, must be restored in
+ *              kernel_paranoid_exit()
+ *
+ * Note that per-cpu variables are accessed using the GS register,
+ * so paranoid entry code cannot access per-cpu variables before
+ * kernel_paranoid_entry() has been called.
+ */
+noinstr void kernel_paranoid_entry(struct kernel_entry_state *state)
+{
+	unsigned long gsbase;
+	unsigned int cpu;
+
+	/*
+	 * Save CR3 in the kernel entry state. This value will be
+	 * restored, verbatim, at exit. Needed if the paranoid entry
+	 * interrupted another entry that already switched to the user
+	 * CR3 value but has not yet returned to userspace.
+	 *
+	 * This is also why CS (stashed in the "iret frame" by the
+	 * hardware at entry) can not be used: this may be a return
+	 * to kernel code, but with a user CR3 value.
+	 *
+	 * Switching CR3 does not depend on kernel GSBASE so it can
+	 * be done before switching to the kernel GSBASE. This is
+	 * required for FSGSBASE because the kernel GSBASE has to
+	 * be retrieved from a kernel internal table.
+	 */
+	state->cr3 = save_and_switch_to_kernel_cr3();
+
+	/*
+	 * Handling GSBASE depends on the availability of FSGSBASE.
+	 *
+	 * Without FSGSBASE the kernel enforces that negative GSBASE
+	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
+	 * can be made about the GSBASE value when entering from user
+	 * space.
+	 */
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		/*
+		 * Read the current GSBASE and store it in the kernel
+		 * entry state unconditionally, retrieve and set the
+		 * current CPUs kernel GSBASE. The stored value has to
+		 * be restored at exit unconditionally.
+		 *
+		 * The unconditional write to GS base below ensures that
+		 * no subsequent loads based on a mispredicted GS base
+		 * can happen, therefore no LFENCE is needed here.
+		 */
+		state->gsbase = rdgsbase();
+
+		/*
+		 * Fetch the per-CPU GSBASE value for this processor. We
+		 * normally use %gs for accessing per-CPU data, but we
+		 * are setting up %gs here and obviously can not use %gs
+		 * itself to access per-CPU data.
+		 */
+		if (IS_ENABLED(CONFIG_SMP)) {
+			/*
+			 * Load CPU from the GDT. Do not use RDPID,
+			 * because KVM loads guest's TSC_AUX on vm-entry
+			 * and may not restore the host's value until
+			 * the CPU returns to userspace. Thus the kernel
+			 * would consume a guest's TSC_AUX if an NMI
+			 * arrives while running KVM's run loop.
+			 *
+			 * LSL reads the segment limit of __CPUNODE_SEG,
+			 * which encodes the CPU (and node) number;
+			 * VDSO_CPUNODE_MASK extracts the CPU part.
+			 */
+			asm_inline volatile ("lsl %[seg],%[p]"
+					     : [p] "=r" (cpu)
+					     : [seg] "r" (__CPUNODE_SEG));
+
+			cpu &= VDSO_CPUNODE_MASK;
+			gsbase = __per_cpu_offset[cpu];
+		} else {
+			/* UP: a single per-cpu unit, use its offset directly */
+			gsbase = *pcpu_unit_offsets;
+		}
+
+		wrgsbase(gsbase);
+
+	} else {
+		/*
+		 * The kernel-enforced convention is a negative GSBASE
+		 * indicates a kernel value. No SWAPGS needed on entry
+		 * and exit.
+		 */
+		rdmsrl(MSR_GS_BASE, gsbase);
+		if (((long)gsbase) >= 0) {
+			/* Non-negative GSBASE: we entered with the user
+			 * value, so switch to the kernel one and record
+			 * that exit must SWAPGS back. */
+			swapgs();
+			/*
+			 * Do an lfence to prevent GS speculation.
+			 */
+			alternative("", "lfence",
+				    X86_FEATURE_FENCE_SWAPGS_KERNEL);
+			state->swapgs = true;
+		} else {
+			state->swapgs = false;
+		}
+	}
+}
+
+/*
+ * "Paranoid" exit path from exception stack. Restore the CR3 and
+ * GS registers as they were on entry. This is invoked only
+ * on return from IST interrupts that came from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * The kernel_entry_state contains the GSBASE related information
+ * depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE	kernel_entry_state
+ *     N        swapgs=true -> SWAPGS on exit
+ *              swapgs=false -> no SWAPGS on exit
+ *
+ *     Y        gsbase=GSBASE value at entry, must be restored
+ *              unconditionally
+ *
+ * Note that per-cpu variables are accessed using the GS register,
+ * so paranoid entry code cannot access per-cpu variables after
+ * kernel_paranoid_exit() has been called.
+ */
+noinstr void kernel_paranoid_exit(struct kernel_entry_state *state)
+{
+	/*
+	 * The order of operations is important. RESTORE_CR3 requires
+	 * kernel GSBASE.
+	 *
+	 * NB to anyone who tries to optimize this code: this code does
+	 * not execute at all for exceptions from user mode. Those
+	 * exceptions go through error_exit instead.
+	 */
+	restore_cr3(state->cr3);
+
+	/* With FSGSBASE enabled, unconditionally restore GSBASE */
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		wrgsbase(state->gsbase);
+		return;
+	}
+
+	/* On non-FSGSBASE systems, conditionally do SWAPGS */
+	if (state->swapgs) {
+		/* We are returning to a context with user GSBASE */
+		swapgs_unsafe_stack();
+	}
+}
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index b05b212f5ebc..b75e9230c990 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -163,6 +163,16 @@ static inline void switch_to_kernel_cr3(void) {}
static inline void switch_to_user_cr3(void) {}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+struct kernel_entry_state {
+	/* CR3 value at entry; restored verbatim by kernel_paranoid_exit() */
+	unsigned long cr3;
+	/* GSBASE at entry (FSGSBASE systems only); restored on exit */
+	unsigned long gsbase;
+	/* non-FSGSBASE systems only: true if exit must SWAPGS back */
+	bool swapgs;
+};
+
+void kernel_paranoid_entry(struct kernel_entry_state *state);
+void kernel_paranoid_exit(struct kernel_entry_state *state);
+
+
#endif /* MODULE */
#endif
--
2.18.4
Powered by blists - more mailing lists