lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250904021017.1628993-5-mrathor@linux.microsoft.com>
Date: Wed,  3 Sep 2025 19:10:15 -0700
From: Mukesh Rathor <mrathor@...ux.microsoft.com>
To: linux-hyperv@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	linux-arch@...r.kernel.org
Cc: kys@...rosoft.com,
	haiyangz@...rosoft.com,
	wei.liu@...nel.org,
	decui@...rosoft.com,
	tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	x86@...nel.org,
	hpa@...or.com,
	arnd@...db.de
Subject: [PATCH v0 4/6] x86/hyperv: Add trampoline asm code to transition from hypervisor

This commit introduces a small asm stub to transition from the hypervisor
to Linux upon devirtualization. At a high level, during a panic of either
the hypervisor or dom0 (aka root), the NMI handler asks the hypervisor
to devirtualize. The arguments to that request include the entry point
at which to return to Linux. This asm stub implements that entry point.

The stub is entered in protected mode. It uses a temporary GDT and page
table to enable long mode and reach the kernel entry point, where the full
kernel context is restored so that execution can continue on to kexec.

Signed-off-by: Mukesh Rathor <mrathor@...ux.microsoft.com>
---
 arch/x86/hyperv/hv_trampoline.S | 99 +++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 arch/x86/hyperv/hv_trampoline.S

diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S
new file mode 100644
index 000000000000..307663c8891e
--- /dev/null
+++ b/arch/x86/hyperv/hv_trampoline.S
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * X86 specific Hyper-V kdump/crash related code.
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * void noreturn hv_crash_asm32(arg1)
+ *    arg1 == edi == 32bit PA of struct hv_crash_trdata
+ *
+ * The hypervisor jumps here upon devirtualization in protected mode. This
+ * code gets copied to a page in the low 4G, i.e. 32-bit space, so it can run
+ * in protected mode. Hence we cannot use any compile/link time offsets or
+ * addresses. It restores long mode via a temporary gdt and page tables and
+ * eventually jumps to the kernel code entry at HV_CRASHDATA_OFFS_C_entry.
+ *
+ * PreCondition (ie, Hypervisor call back ABI):
+ *  o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled
+ *  o CR4 is set to 0x0
+ *  o IA32_EFER is set to 0x901 (SCE and NXE are set)
+ *  o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX.
+ *  o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF
+ *  o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF
+ *  o LDTR is initialized as invalid (limit of 0)
+ *  o MSR PAT is power on default.
+ *  o Other state/registers are cleared. All TLBs flushed.
+ *
+ * See Intel SDM 10.8.5
+ */
+
+#define HV_CRASHDATA_OFFS_TRAMPCR3    0x0    /*  0: temporary (trampoline) cr3, loaded as 32 bits */
+#define HV_CRASHDATA_OFFS_KERNCR3     0x8    /*  8: kernel cr3, loaded as 64 bits */
+#define HV_CRASHDATA_OFFS_GDTRLIMIT  0x12    /* 18: memory operand of lgdtl (limit first) */
+#define HV_CRASHDATA_OFFS_CS_JMPTGT  0x28    /* 40: far pointer consumed by the ljmp */
+#define HV_CRASHDATA_OFFS_C_entry    0x30    /* 48: 64-bit kernel entry address */
+#define HV_CRASHDATA_TRAMPOLINE_CS    0x8    /* NOTE(review): not referenced in this file; presumably the temp gdt code selector - confirm */
+
+	.text
+	.code32
+
+SYM_CODE_START(hv_crash_asm32)	/* entry: 32-bit protected mode, paging off, edi = PA of crash data */
+	movl	$X86_CR4_PAE, %ecx	/* PAE has to be on before long-mode paging is enabled */
+	movl	%ecx, %cr4
+
+	movl %edi, %ebx			/* ebx = edi + TRAMPCR3 offset (0x0) */
+	add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx
+	movl %cs:(%ebx), %eax		/* 32-bit load of the trampoline cr3 */
+	movl %eax, %cr3
+
+	# Setup EFER for long mode now.
+	movl	$MSR_EFER, %ecx
+	rdmsr				/* edx:eax = current EFER */
+	btsl	$_EFER_LME, %eax	/* set LME; the other EFER bits are written back as read */
+	wrmsr
+
+	# Turn paging on using the temp 32bit trampoline page table.
+	movl %cr0, %eax
+	orl $(X86_CR0_PG), %eax
+	movl %eax, %cr0			/* PG with EFER.LME set: long mode active, CS still 32-bit */
+
+	/* since kernel cr3 could be above 4G, we need to be in the long mode
+	 * before we can load 64bits of the kernel cr3. We use a temp gdt for
+	 * that with CS.L=1 and CS.D=0 */
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax	/* eax = PA of the gdtr image (16-bit limit, 32-bit base) */
+	lgdtl %cs:(%eax)
+
+	/* not done yet, restore CS now to switch to CS.L=1 */
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax	/* eax = PA of the stored far pointer */
+	ljmp %cs:*(%eax)		/* indirect far jump: reloads CS from the temp gdt */
+
+	/* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */
+	.code64
+	.balign 8
+SYM_INNER_LABEL(hv_crash_asm64_lbl, SYM_L_GLOBAL)	/* presumably the far-jump target set up by the C code - confirm */
+	/* restore kernel page tables so we can jump to kernel code */
+	mov %edi, %eax			/* 32-bit write zero-extends into rax */
+	add $HV_CRASHDATA_OFFS_KERNCR3, %eax
+	movq %cs:(%eax), %rbx		/* rbx = 64-bit kernel cr3 */
+	movq %rbx, %cr3
+
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_C_entry, %eax
+	movq %cs:(%eax), %rbx		/* rbx = 64-bit kernel entry address */
+	ANNOTATE_RETPOLINE_SAFE		/* annotates the indirect jmp below */
+	jmp *%rbx
+
+	int $3				/* not reached by fall-through: the jmp above is unconditional */
+
+SYM_INNER_LABEL(hv_crash_asm32_end, SYM_L_GLOBAL)	/* end marker; presumably used to size the copied stub - confirm */
+SYM_CODE_END(hv_crash_asm32)
-- 
2.36.1.vfs.0.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ