lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20201109144425.270789-10-alexandre.chartre@oracle.com>
Date:   Mon,  9 Nov 2020 15:44:10 +0100
From:   Alexandre Chartre <alexandre.chartre@...cle.com>
To:     tglx@...utronix.de, mingo@...hat.com, bp@...en8.de, hpa@...or.com,
        x86@...nel.org, dave.hansen@...ux.intel.com, luto@...nel.org,
        peterz@...radead.org, linux-kernel@...r.kernel.org,
        thomas.lendacky@....com, jroedel@...e.de
Cc:     konrad.wilk@...cle.com, jan.setjeeilers@...cle.com,
        junaids@...gle.com, oweisse@...gle.com, rppt@...ux.vnet.ibm.com,
        graf@...zon.de, mgross@...ux.intel.com, kuzuno@...il.com,
        alexandre.chartre@...cle.com
Subject: [RFC][PATCH 09/24] x86/entry: Add C version of paranoid_entry/exit

paranoid_entry/exit are assembly macros. Provide C versions of
these macros (kernel_paranoid_entry() and kernel_paranoid_exit()).
The C functions are functionally equivalent to the assembly macros,
except that kernel_paranoid_entry() doesn't save registers in
pt_regs like paranoid_entry does.

Signed-off-by: Alexandre Chartre <alexandre.chartre@...cle.com>
---
 arch/x86/entry/common.c             | 157 ++++++++++++++++++++++++++++
 arch/x86/include/asm/entry-common.h |  10 ++
 2 files changed, 167 insertions(+)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d09b1ded5287..54d0931801e1 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -387,3 +387,160 @@ static __always_inline unsigned long save_and_switch_to_kernel_cr3(void)
 static __always_inline void restore_cr3(unsigned long cr3) {}
 
 #endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * "Paranoid" entry path from exception stack. Ensure that the CR3 and
+ * GS registers are correctly set for the kernel. Return GSBASE related
+ * information in kernel_entry_state depending on the availability of
+ * the FSGSBASE instructions:
+ *
+ * FSGSBASE	kernel_entry_state
+ *     N        swapgs=true  -> SWAPGS on exit
+ *              swapgs=false -> no SWAPGS on exit
+ *
+ *     Y        gsbase=GSBASE value at entry, must be restored in
+ *              kernel_paranoid_exit()
+ *
+ * Note that per-cpu variables are accessed using the GS register,
+ * so paranoid entry code cannot access per-cpu variables before
+ * kernel_paranoid_entry() has been called.
+ */
+noinstr void kernel_paranoid_entry(struct kernel_entry_state *state)
+{
+	unsigned long gsbase;
+	unsigned int cpu;
+
+	/*
+	 * Save CR3 in the kernel entry state.  This value will be
+	 * restored, verbatim, at exit.  Needed if the paranoid entry
+	 * interrupted another entry that already switched to the user
+	 * CR3 value but has not yet returned to userspace.
+	 *
+	 * This is also why CS (stashed in the "iret frame" by the
+	 * hardware at entry) can not be used: this may be a return
+	 * to kernel code, but with a user CR3 value.
+	 *
+	 * Switching CR3 does not depend on kernel GSBASE so it can
+	 * be done before switching to the kernel GSBASE. This is
+	 * required for FSGSBASE because the kernel GSBASE has to
+	 * be retrieved from a kernel internal table.
+	 */
+	state->cr3 = save_and_switch_to_kernel_cr3();
+
+	/*
+	 * Handling GSBASE depends on the availability of FSGSBASE.
+	 *
+	 * Without FSGSBASE the kernel enforces that negative GSBASE
+	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
+	 * can be made about the GSBASE value when entering from user
+	 * space.
+	 */
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		/*
+		 * Read the current GSBASE and store it in the kernel
+		 * entry state unconditionally, then retrieve and set the
+		 * current CPU's kernel GSBASE. The stored value has to
+		 * be restored at exit unconditionally.
+		 *
+		 * The unconditional write to GS base below ensures that
+		 * no subsequent loads based on a mispredicted GS base
+		 * can happen, therefore no LFENCE is needed here.
+		 */
+		state->gsbase = rdgsbase();
+
+		/*
+		 * Fetch the per-CPU GSBASE value for this processor. We
+		 * normally use %gs for accessing per-CPU data, but we
+		 * are setting up %gs here and obviously can not use %gs
+		 * itself to access per-CPU data.
+		 */
+		if (IS_ENABLED(CONFIG_SMP)) {
+			/*
+			 * Load CPU from the GDT. Do not use RDPID,
+			 * because KVM loads guest's TSC_AUX on vm-entry
+			 * and may not restore the host's value until
+			 * the CPU returns to userspace. Thus the kernel
+			 * would consume a guest's TSC_AUX if an NMI
+			 * arrives while running KVM's run loop.
+			 */
+			asm_inline volatile ("lsl %[seg],%[p]"
+					     : [p] "=r" (cpu)
+					     : [seg] "r" (__CPUNODE_SEG));
+
+			cpu &= VDSO_CPUNODE_MASK;
+			gsbase = __per_cpu_offset[cpu];
+		} else {
+			gsbase = *pcpu_unit_offsets;
+		}
+
+		wrgsbase(gsbase);
+
+	} else {
+		/*
+		 * The kernel-enforced convention is that a negative
+		 * GSBASE indicates a kernel value, in which case no
+		 * SWAPGS is needed on entry and exit.
+		 */
+		rdmsrl(MSR_GS_BASE, gsbase);
+		if (((long)gsbase) >= 0) {
+			swapgs();
+			/*
+			 * Do an lfence to prevent GS speculation.
+			 */
+			alternative("", "lfence",
+				    X86_FEATURE_FENCE_SWAPGS_KERNEL);
+			state->swapgs = true;
+		} else {
+			state->swapgs = false;
+		}
+	}
+}
+
+/*
+ * "Paranoid" exit path from exception stack. Restore the CR3 and
+ * GS registers as they were on entry. This is invoked only
+ * on return from IST interrupts that came from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated.  Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * The kernel_entry_state contains the GSBASE related information
+ * depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE	kernel_entry_state
+ *     N        swapgs=true  -> SWAPGS on exit
+ *              swapgs=false -> no SWAPGS on exit
+ *
+ *     Y        gsbase=GSBASE value at entry, must be restored
+ *              unconditionally
+ *
+ * Note that per-cpu variables are accessed using the GS register,
+ * so paranoid entry code cannot access per-cpu variables after
+ * kernel_paranoid_exit() has been called.
+ */
+noinstr void kernel_paranoid_exit(struct kernel_entry_state *state)
+{
+	/*
+	 * The order of operations is important. RESTORE_CR3 requires
+	 * kernel GSBASE, so CR3 is restored before GSBASE below.
+	 *
+	 * NB to anyone trying to optimize this code: this code does
+	 * not execute at all for exceptions from user mode. Those
+	 * exceptions go through error_exit instead.
+	 */
+	restore_cr3(state->cr3);
+
+	/* With FSGSBASE enabled, unconditionally restore GSBASE */
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		wrgsbase(state->gsbase);
+		return;
+	}
+
+	/* On non-FSGSBASE systems, conditionally do SWAPGS */
+	if (state->swapgs) {
+		/* We are returning to a context with user GSBASE */
+		swapgs_unsafe_stack();
+	}
+}
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index b05b212f5ebc..b75e9230c990 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -163,6 +163,16 @@ static inline void switch_to_kernel_cr3(void) {}
 static inline void switch_to_user_cr3(void) {}
 
 #endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+struct kernel_entry_state {	/* filled by kernel_paranoid_entry(), consumed by kernel_paranoid_exit() */
+	unsigned long cr3;	/* CR3 value saved at entry, restored at exit */
+	unsigned long gsbase;	/* FSGSBASE only: GSBASE read at entry, written back at exit */
+	bool swapgs;	/* non-FSGSBASE only: true if entry did SWAPGS, so exit must SWAPGS too */
+};
+
+void kernel_paranoid_entry(struct kernel_entry_state *state);
+void kernel_paranoid_exit(struct kernel_entry_state *state);
+
 #endif /* MODULE */
 
 #endif
-- 
2.18.4

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ