lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Mon, 26 Feb 2024 22:35:21 +0800
From: Lai Jiangshan <jiangshanlai@...il.com>
To: linux-kernel@...r.kernel.org
Cc: Lai Jiangshan <jiangshan.ljs@...group.com>,
	Hou Wenlong <houwenlong.hwl@...group.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Peter Zijlstra <peterz@...radead.org>,
	Sean Christopherson <seanjc@...gle.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Borislav Petkov <bp@...en8.de>,
	Ingo Molnar <mingo@...hat.com>,
	kvm@...r.kernel.org,
	Paolo Bonzini <pbonzini@...hat.com>,
	x86@...nel.org,
	Kees Cook <keescook@...omium.org>,
	Juergen Gross <jgross@...e.com>,
	Andy Lutomirski <luto@...nel.org>,
	Dave Hansen <dave.hansen@...ux.intel.com>,
	"H. Peter Anvin" <hpa@...or.com>,
	Oleg Nesterov <oleg@...hat.com>,
	Brian Gerst <brgerst@...il.com>
Subject: [RFC PATCH 04/73] x86/entry: Implement direct switching for the switcher

From: Lai Jiangshan <jiangshan.ljs@...group.com>

While a VM is running, all VM exits in the switcher are forwarded to the
hypervisor, which handles the VM exit and then returns to the switcher to
re-enter the VM. In some situations, the switcher can handle the VM exit
directly without involving the hypervisor. This is referred to as direct
switching, and it can reduce the overhead of guest/host state switching.
Currently, for simplicity, only the syscall event from user mode and the
ERETU synthetic instruction are allowed for direct switching.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@...group.com>
Signed-off-by: Hou Wenlong <houwenlong.hwl@...group.com>
---
 arch/x86/entry/entry_64_switcher.S | 145 ++++++++++++++++++++++++++++-
 arch/x86/include/asm/ptrace.h      |   2 +
 arch/x86/include/asm/switcher.h    |  60 ++++++++++++
 arch/x86/kernel/asm-offsets_64.c   |  23 +++++
 4 files changed, 229 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64_switcher.S b/arch/x86/entry/entry_64_switcher.S
index 2b99a46421cc..6f166d15635c 100644
--- a/arch/x86/entry/entry_64_switcher.S
+++ b/arch/x86/entry/entry_64_switcher.S
@@ -75,7 +75,7 @@ SYM_FUNC_START(switcher_enter_guest)
 
 	/* Switch to guest GSBASE and return to guest */
 	swapgs
-	jmp	native_irq_return_iret
+	jmp	.L_switcher_return_to_guest
 
 SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
 	/* switch back to host cr3 when still on sp0/ist stack */
@@ -99,6 +99,23 @@ SYM_INNER_LABEL(switcher_return_from_guest, SYM_L_GLOBAL)
 SYM_FUNC_END(switcher_enter_guest)
 EXPORT_SYMBOL_GPL(switcher_enter_guest)
 
+.macro canonical_rcx
+	/*
+	 * Sign-extend %rcx from the top virtual-address bit (bit 47 or bit 56,
+	 * depending on paging mode) so that the upper bits all match that bit,
+	 * i.e. %rcx holds a canonical address.  If width of "canonical tail"
+	 * ever becomes variable, this will need to be updated to remain
+	 * correct on both old and new CPUs.
+	 */
+#ifdef CONFIG_X86_5LEVEL
+	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+		    "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
+	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
+.endm
+
 SYM_CODE_START(entry_SYSCALL_64_switcher)
 	UNWIND_HINT_ENTRY
 	ENDBR
@@ -117,7 +134,133 @@ SYM_INNER_LABEL(entry_SYSCALL_64_switcher_safe_stack, SYM_L_GLOBAL)
 	pushq	%r11					/* pt_regs->flags */
 	pushq	$__USER_CS				/* pt_regs->cs */
 	pushq	%rcx					/* pt_regs->ip */
+	pushq	%rdi					/* put rdi on ORIG_RAX */
+
+	/* check if it can do direct switch from umod to smod */
+	testq	$SWITCH_FLAGS_NO_DS_TO_SMOD, TSS_extra(switch_flags)
+	jnz	.L_switcher_check_return_umod_instruction
+
+	/* Now it must be umod, start to do direct switch from umod to smod */
+	movq	TSS_extra(pvcs), %rdi
+	movl	%r11d, PVCS_eflags(%rdi)
+	movq	%rcx, PVCS_rip(%rdi)
+	movq	%rcx, PVCS_rcx(%rdi)
+	movq	%r11, PVCS_r11(%rdi)
+	movq	RSP-ORIG_RAX(%rsp), %rcx
+	movq	%rcx, PVCS_rsp(%rdi)
+
+	/* switch umod to smod (switch_flags & cr3) */
+	xorb	$SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+	movq	TSS_extra(smod_cr3), %rcx
+	movq	%rcx, %cr3
+
+	/* load smod registers from TSS_extra to sp0 stack or %r11 */
+	movq	TSS_extra(smod_rsp), %rcx
+	movq	%rcx, RSP-ORIG_RAX(%rsp)
+	movq	TSS_extra(smod_entry), %rcx
+	movq	%rcx, RIP-ORIG_RAX(%rsp)
+	movq	TSS_extra(smod_gsbase), %r11
+
+	/* switch host gsbase to guest gsbase, TSS_extra can't be used afterward */
+	swapgs
+
+	/* save guest gsbase as user_gsbase and switch to smod_gsbase */
+	rdgsbase %rcx
+	movq	%rcx, PVCS_user_gsbase(%rdi)
+	wrgsbase %r11
+
+	/* restore umod rdi and smod rflags/r11, rip/rcx and rsp for sysretq */
+	popq	%rdi
+	movq	$SWITCH_ENTER_EFLAGS_FIXED, %r11
+	movq	RIP-RIP(%rsp), %rcx
+
+.L_switcher_sysretq:
+	UNWIND_HINT_IRET_REGS
+	/* now everything is ready for sysretq except for %rsp */
+	movq	RSP-RIP(%rsp), %rsp
+	/* No instruction can be added between setting the guest %rsp and doing sysretq */
+SYM_INNER_LABEL(entry_SYSRETQ_switcher_unsafe_stack, SYM_L_GLOBAL)
+	sysretq
+
+.L_switcher_check_return_umod_instruction:
+	UNWIND_HINT_IRET_REGS offset=8
+
+	/* check if it can do direct switch from smod to umod */
+	testq	$SWITCH_FLAGS_NO_DS_TO_UMOD, TSS_extra(switch_flags)
+	jnz	.L_switcher_return_to_hypervisor
+
+	/*
+	 * Now it must be smod, check if it is the return-umod instruction.
+	 * Switcher and the PVM specification define a SYSCALL instruction
+	 * at TSS_extra(retu_rip) - 2 in smod as the return-umod instruction.
+	 */
+	cmpq	%rcx, TSS_extra(retu_rip)
+	jne	.L_switcher_return_to_hypervisor
+
+	/* only handle for the most common cs/ss */
+	movq	TSS_extra(pvcs), %rdi
+	cmpl	$((__USER_DS << 16) | __USER_CS), PVCS_user_cs(%rdi)
+	jne	.L_switcher_return_to_hypervisor
+
+	/* Switcher and the PVM specification require the smod RSP to be saved */
+	movq	RSP-ORIG_RAX(%rsp), %rcx
+	movq	%rcx, TSS_extra(smod_rsp)
+
+	/* switch smod to umod (switch_flags & cr3) */
+	xorb	$SWITCH_FLAGS_MOD_TOGGLE, TSS_extra(switch_flags)
+	movq	TSS_extra(umod_cr3), %rcx
+	movq	%rcx, %cr3
+
+	/* switch host gsbase to guest gsbase, TSS_extra can't be used afterward */
+	swapgs
+
+	/* write umod gsbase */
+	movq	PVCS_user_gsbase(%rdi), %rcx
+	canonical_rcx
+	wrgsbase %rcx
+
+	/* load sp, flags, ip to sp0 stack and cx, r11, rdi to registers */
+	movq	PVCS_rsp(%rdi), %rcx
+	movq	%rcx, RSP-ORIG_RAX(%rsp)
+	movl	PVCS_eflags(%rdi), %r11d
+	movq	%r11, EFLAGS-ORIG_RAX(%rsp)
+	movq	PVCS_rip(%rdi), %rcx
+	movq	%rcx, RIP-ORIG_RAX(%rsp)
+	movq	PVCS_rcx(%rdi), %rcx
+	movq	PVCS_r11(%rdi), %r11
+	popq	%rdi		// saved rdi (on ORIG_RAX)
+
+.L_switcher_return_to_guest:
+	/*
+	 * Now the RSP points to an IRET frame with guest state on the
+	 * top of the sp0 stack.  Check if it can do sysretq.
+	 */
+	UNWIND_HINT_IRET_REGS
+
+	andq	$SWITCH_ENTER_EFLAGS_ALLOWED, EFLAGS-RIP(%rsp)
+	orq	$SWITCH_ENTER_EFLAGS_FIXED, EFLAGS-RIP(%rsp)
+	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), EFLAGS-RIP(%rsp)
+	jnz	native_irq_return_iret
+	cmpq	%r11, EFLAGS-RIP(%rsp)
+	jne	native_irq_return_iret
+
+	cmpq	%rcx, RIP-RIP(%rsp)
+	jne	native_irq_return_iret
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the guest take over
+	 * the host, since guest controls RSP.
+	 */
+	canonical_rcx
+	cmpq	%rcx, RIP-RIP(%rsp)
+	je	.L_switcher_sysretq
+
+	/* RCX matches for RIP only before RCX is canonicalized, restore RCX and do IRET. */
+	movq	RIP-RIP(%rsp), %rcx
+	jmp	native_irq_return_iret
 
+.L_switcher_return_to_hypervisor:
+	popq	%rdi					/* saved rdi */
 	pushq	$0					/* pt_regs->orig_ax */
 	movl	$SWITCH_EXIT_REASONS_SYSCALL, 4(%rsp)
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 9eeeb5fdd387..322697877a2d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -198,6 +198,8 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
 	ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_64_switcher &&
 		      regs->ip <  (unsigned long)entry_SYSCALL_64_switcher_safe_stack);
 
+	ret = ret || (regs->ip == (unsigned long)entry_SYSRETQ_switcher_unsafe_stack);
+
 	return ret;
 }
 #endif
diff --git a/arch/x86/include/asm/switcher.h b/arch/x86/include/asm/switcher.h
index dbf1970ca62f..35a60f4044c4 100644
--- a/arch/x86/include/asm/switcher.h
+++ b/arch/x86/include/asm/switcher.h
@@ -8,6 +8,40 @@
 #define SWITCH_EXIT_REASONS_SYSCALL		1024
 #define SWITCH_EXIT_REASONS_FAILED_VMETNRY	1025
 
+/*
+ * SWITCH_FLAGS control how the switcher code works;
+ *	they mostly dictate whether it should directly do the guest ring
+ *	switch or just go back to the hypervisor.
+ *
+ * SMOD and UMOD
+ *	Current vcpu mode. Use two parity bits to simplify direct-switch
+ *	flags checking.
+ *
+ * NO_DS_CR3
+ *	Not to direct switch due to smod_cr3 or umod_cr3 not having been
+ *	prepared.
+ */
+#define SWITCH_FLAGS_SMOD			_BITULL(0)
+#define SWITCH_FLAGS_UMOD			_BITULL(1)
+#define SWITCH_FLAGS_NO_DS_CR3			_BITULL(2)
+
+#define SWITCH_FLAGS_MOD_TOGGLE			(SWITCH_FLAGS_SMOD | SWITCH_FLAGS_UMOD)
+
+/*
+ * Direct switching disabling bits are all the bits other than
+ * SWITCH_FLAGS_SMOD or SWITCH_FLAGS_UMOD. Bits 8-63 are defined by the driver
+ * using the switcher. Direct switching is enabled if all the disabling bits
+ * are cleared.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_SMOD: not to direct switch to smod due to any
+ * disabling bit or smod bit being set.
+ *
+ * SWITCH_FLAGS_NO_DS_TO_UMOD: not to direct switch to umod due to any
+ * disabling bit or umod bit being set.
+ */
+#define SWITCH_FLAGS_NO_DS_TO_SMOD		(~SWITCH_FLAGS_UMOD)
+#define SWITCH_FLAGS_NO_DS_TO_UMOD		(~SWITCH_FLAGS_SMOD)
+
 /* Bits allowed to be set in the underlying eflags */
 #define SWITCH_ENTER_EFLAGS_ALLOWED	(X86_EFLAGS_FIXED | X86_EFLAGS_IF |\
 					 X86_EFLAGS_TF | X86_EFLAGS_RF |\
@@ -24,6 +58,7 @@
 #include <linux/cache.h>
 
 struct pt_regs;
+struct pvm_vcpu_struct;
 
 /*
  * Extra per CPU control structure lives in the struct tss_struct.
@@ -46,6 +81,31 @@ struct tss_extra {
 	unsigned long host_rsp;
 	/* Prepared guest CR3 to be loaded before VM enter. */
 	unsigned long enter_cr3;
+
+	/*
+	 * Direct switching flag indicates whether direct switching
+	 * is allowed.
+	 */
+	unsigned long switch_flags ____cacheline_aligned;
+	/*
+	 * Guest supervisor mode hardware CR3 for direct switching of guest
+	 * user mode syscall.
+	 */
+	unsigned long smod_cr3;
+	/*
+	 * Guest user mode hardware CR3 for direct switching of guest ERETU
+	 * synthetic instruction.
+	 */
+	unsigned long umod_cr3;
+	/*
+	 * The current PVCS for saving and restoring guest user mode context
+	 * in direct switching.
+	 */
+	struct pvm_vcpu_struct *pvcs;
+	unsigned long retu_rip;
+	unsigned long smod_entry;
+	unsigned long smod_gsbase;
+	unsigned long smod_rsp;
 } ____cacheline_aligned;
 
 extern struct pt_regs *switcher_enter_guest(void);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1485cbda6dc4..8230bd27f0b3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -4,6 +4,7 @@
 #endif
 
 #include <asm/ia32.h>
+#include <asm/pvm_para.h>
 
 #if defined(CONFIG_KVM_GUEST)
 #include <asm/kvm_para.h>
@@ -65,6 +66,28 @@ int main(void)
 	ENTRY(host_cr3);
 	ENTRY(host_rsp);
 	ENTRY(enter_cr3);
+	ENTRY(switch_flags);
+	ENTRY(smod_cr3);
+	ENTRY(umod_cr3);
+	ENTRY(pvcs);
+	ENTRY(retu_rip);
+	ENTRY(smod_entry);
+	ENTRY(smod_gsbase);
+	ENTRY(smod_rsp);
+	BLANK();
+#undef ENTRY
+
+#define ENTRY(entry) OFFSET(PVCS_ ## entry, pvm_vcpu_struct, entry)
+	ENTRY(event_flags);
+	ENTRY(event_errcode);
+	ENTRY(user_cs);
+	ENTRY(user_ss);
+	ENTRY(user_gsbase);
+	ENTRY(rsp);
+	ENTRY(eflags);
+	ENTRY(rip);
+	ENTRY(rcx);
+	ENTRY(r11);
 	BLANK();
 #undef ENTRY
 
-- 
2.19.1.6.gb485710b


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ