Message-Id: <20211214150304.62613-5-kirill.shutemov@linux.intel.com>
Date:   Tue, 14 Dec 2021 18:02:42 +0300
From:   "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
To:     tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
        dave.hansen@...el.com, luto@...nel.org, peterz@...radead.org
Cc:     sathyanarayanan.kuppuswamy@...ux.intel.com, aarcange@...hat.com,
        ak@...ux.intel.com, dan.j.williams@...el.com, david@...hat.com,
        hpa@...or.com, jgross@...e.com, jmattson@...gle.com,
        joro@...tes.org, jpoimboe@...hat.com, knsathya@...nel.org,
        pbonzini@...hat.com, sdeep@...are.com, seanjc@...gle.com,
        tony.luck@...el.com, vkuznets@...hat.com, wanpengli@...cent.com,
        x86@...nel.org, linux-kernel@...r.kernel.org,
        "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
        Sean Christopherson <sean.j.christopherson@...el.com>
Subject: [PATCH 04/26] x86/traps: Add #VE support for TDX guest

Virtualization Exceptions (#VE) are delivered to TDX guests due to
specific guest actions which may happen in either user space or the
kernel:

 * Specific instructions (WBINVD, for example)
 * Specific MSR accesses
 * Specific CPUID leaf accesses
 * Access to unmapped pages (EPT violation)

In the settings that Linux will run in, virtualization exceptions are
never generated on accesses to normal, TD-private memory that has
been accepted.
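
Of the triggers above, CPUID is the easiest to picture from user
space. As a minimal sketch (illustrative only, not part of this
patch): CPUID leaves that the TDX module does not virtualize
natively, for example the hypervisor range starting at 0x40000000,
raise a #VE that the kernel must handle on the process's behalf:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

            /* In a TDX guest, this CPUID exits via #VE */
            __cpuid(0x40000000, eax, ebx, ecx, edx);
            printf("hypervisor leaf eax: 0x%x\n", eax);
            return 0;
    }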

The #VE handler implementation is simplified by the fact that entry
paths do not trigger #VE and that the handler may not be interrupted.
Specifically, the implementation assumes that the entry paths do not
access TD-shared memory or MMIO regions, and do not use #VE-triggering
MSRs, instructions, or CPUID leaves. Interrupts, including NMIs, are
blocked by the hardware from #VE delivery until TDGETVEINFO is called.
All of this combined eliminates the chance of a #VE during the syscall
gap or in paranoid entry paths.

After TDGETVEINFO, a #VE could happen in theory (e.g. through an NMI),
but it is not expected to, because the TDX design expects NMI handlers
not to trigger #VEs. The other case where a #VE could happen is if the
#VE handler itself panics; since the platform is already in a panic
state at that point, a nested #VE is not a concern.

If a guest kernel action that would normally cause a #VE occurs in
the interrupt-disabled region before TDGETVEINFO, a #DF (double
fault) is delivered to the guest, which will result in an oops (and
should eventually become a panic, as panic_on_oops is expected to be
set to 1 for TDX guests).
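
For reference, panic_on_oops is the standard sysctl knob here;
assuming a typical TDX guest image (a configuration sketch, not part
of this patch), it could be enabled with:

    # at runtime:
    sysctl -w kernel.panic_on_oops=1
    # or persistently, on the kernel command line:
    oops=panic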

Add basic infrastructure to handle any #VE which occurs in the kernel
or userspace. Later patches will add handling for specific #VE
scenarios.

For now, convert unhandled #VEs (everything, until later in this
series) so that they appear just like a #GP, by calling
ve_raise_fault() directly. ve_raise_fault() is similar to the #GP
handler: it is responsible for sending SIGSEGV to userspace, calling
die() for kernel-mode faults, and notifying debuggers and other
die-chain users.
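
As an illustration of the kernel-side recovery path (a hypothetical
use, not taken from this patch): an exception-table-protected MSR
access such as rdmsrl_safe() lets ve_raise_fault() recover through
fixup_exception() instead of oopsing:

    #include <linux/printk.h>
    #include <asm/msr.h>

    static void example_msr_probe(void)
    {
            u64 val;

            /*
             * If this MSR read ends in an unhandled #VE, the
             * fixup_exception() call in ve_raise_fault() makes
             * rdmsrl_safe() return an error instead of oopsing.
             */
            if (rdmsrl_safe(MSR_IA32_MCG_CAP, &val))
                    pr_warn("MSR probe failed, recovered via fixup\n");
    }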

Co-developed-by: Sean Christopherson <sean.j.christopherson@...el.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@...el.com>
Co-developed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@...ux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@...ux.intel.com>
Reviewed-by: Andi Kleen <ak@...ux.intel.com>
Reviewed-by: Tony Luck <tony.luck@...el.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
---
 arch/x86/include/asm/idtentry.h |  4 ++
 arch/x86/include/asm/tdx.h      | 21 +++++++++
 arch/x86/kernel/idt.c           |  3 ++
 arch/x86/kernel/tdx.c           | 66 +++++++++++++++++++++++++++
 arch/x86/kernel/traps.c         | 79 +++++++++++++++++++++++++++++++++
 5 files changed, 173 insertions(+)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 1345088e9902..8ccc81d653b3 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -625,6 +625,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER,	exc_xen_hypervisor_callback);
 DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER,	exc_xen_unknown_trap);
 #endif
 
+#ifdef CONFIG_INTEL_TDX_GUEST
+DECLARE_IDTENTRY(X86_TRAP_VE,		exc_virtualization_exception);
+#endif
+
 /* Device interrupts common/spurious */
 DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER,	common_interrupt);
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 78bfd7dc9b2f..8c33d7439c08 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -4,6 +4,7 @@
 #define _ASM_X86_TDX_H
 
 #include <linux/init.h>
+#include <asm/ptrace.h>
 
 #define TDX_CPUID_LEAF_ID	0x21
 #define TDX_HYPERCALL_STANDARD  0
@@ -38,6 +39,22 @@ struct tdx_hypercall_output {
 	u64 r15;
 };
 
+/*
+ * Used by the #VE exception handler to gather the #VE exception
+ * info from the TDX module. This is a software-only structure
+ * and is not part of the TDX module/VMM ABI.
+ */
+struct ve_info {
+	u64 exit_reason;
+	u64 exit_qual;
+	/* Guest Linear (virtual) Address */
+	u64 gla;
+	/* Guest Physical Address */
+	u64 gpa;
+	u32 instr_len;
+	u32 instr_info;
+};
+
 #ifdef CONFIG_INTEL_TDX_GUEST
 
 void __init tdx_early_init(void);
@@ -51,6 +68,10 @@ u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
 u64 __tdx_hypercall(u64 type, u64 fn, u64 r12, u64 r13, u64 r14,
 		    u64 r15, struct tdx_hypercall_output *out);
 
+bool tdx_get_ve_info(struct ve_info *ve);
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
 #else
 
 static inline void tdx_early_init(void) { };
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index df0fa695bb09..1da074123c16 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -68,6 +68,9 @@ static const __initconst struct idt_data early_idts[] = {
 	 */
 	INTG(X86_TRAP_PF,		asm_exc_page_fault),
 #endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+	INTG(X86_TRAP_VE,		asm_exc_virtualization_exception),
+#endif
 };
 
 /*
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index 1cc850fd03ff..b6d0e45e6589 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -7,6 +7,9 @@
 #include <linux/cpufeature.h>
 #include <asm/tdx.h>
 
+/* TDX Module Call Leaf IDs */
+#define TDX_GET_VEINFO			3
+
 static bool tdx_guest_detected __ro_after_init;
 
 /*
@@ -33,6 +36,69 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14,
 	return out->r10;
 }
 
+bool tdx_get_ve_info(struct ve_info *ve)
+{
+	struct tdx_module_output out;
+
+	/*
+	 * NMIs and machine checks are suppressed. Before this point any
+	 * #VE is fatal. After this point (TDGETVEINFO call), NMIs and
+	 * additional #VEs are permitted (but they are not expected to
+	 * happen unless the kernel panics).
+	 */
+	if (__tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out))
+		return false;
+
+	ve->exit_reason = out.rcx;
+	ve->exit_qual   = out.rdx;
+	ve->gla         = out.r8;
+	ve->gpa         = out.r9;
+	ve->instr_len   = lower_32_bits(out.r10);
+	ve->instr_info  = upper_32_bits(out.r10);
+
+	return true;
+}
+
+/*
+ * Handle the user-initiated #VE.
+ *
+ * For example, executing the CPUID instruction from user
+ * space is a valid case, so the resulting #VE has to
+ * be handled.
+ *
+ * For disallowed or invalid #VE just return failure.
+ *
+ * Return true on success and false on failure.
+ */
+static bool tdx_virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+{
+	pr_warn("Unexpected #VE: %llu\n", ve->exit_reason);
+	return false;
+}
+
+/* Handle the kernel #VE */
+static bool tdx_virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+{
+	pr_warn("Unexpected #VE: %llu\n", ve->exit_reason);
+	return false;
+}
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
+{
+	bool ret;
+
+	if (user_mode(regs))
+		ret = tdx_virt_exception_user(regs, ve);
+	else
+		ret = tdx_virt_exception_kernel(regs, ve);
+
+	/* After successful #VE handling, move the IP */
+	if (ret)
+		regs->ip += ve->instr_len;
+
+	return ret;
+}
+
 bool is_tdx_guest(void)
 {
 	return tdx_guest_detected;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c9d566dcf89a..24791a8bcd63 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -61,6 +61,7 @@
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
 #include <asm/vdso.h>
+#include <asm/tdx.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -1212,6 +1213,84 @@ DEFINE_IDTENTRY(exc_device_not_available)
 	}
 }
 
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code)
+{
+	struct task_struct *tsk = current;
+
+	if (user_mode(regs)) {
+		tsk->thread.error_code = error_code;
+		tsk->thread.trap_nr = X86_TRAP_VE;
+		show_signal(tsk, SIGSEGV, "", VE_FAULT_STR, regs, error_code);
+		force_sig(SIGSEGV);
+		return;
+	}
+
+	/*
+	 * Attempt to recover from the #VE exception failure without
+	 * triggering an OOPS (useful for MSR read/write failures).
+	 */
+	if (fixup_exception(regs, X86_TRAP_VE, error_code, 0))
+		return;
+
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_nr = X86_TRAP_VE;
+
+	/*
+	 * To be able to process a kprobe fault and to trust the result
+	 * of kprobe_running(), this context must be non-preemptible.
+	 */
+	if (!preemptible() && kprobe_running() &&
+	    kprobe_fault_handler(regs, X86_TRAP_VE))
+		return;
+
+	/* Notify about #VE handling failure, useful for debugger hooks */
+	if (notify_die(DIE_GPF, VE_FAULT_STR, regs, error_code,
+		       X86_TRAP_VE, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	/* Trigger OOPS and panic */
+	die_addr(VE_FAULT_STR, regs, error_code, 0);
+}
+
+/*
+ * In TDX guests, specific MSR accesses, certain instructions,
+ * CPUID leaves and shared-memory accesses trigger #VE.
+ * tdx_handle_virt_exception() tries to handle the #VE using the
+ * appropriate hypercalls. For unhandled or failed #VEs, attempt
+ * recovery via exception fixups, or raise a fault on failure.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+	struct ve_info ve;
+	bool ret;
+
+	/*
+	 * NMIs/machine checks/interrupts are in a disabled state
+	 * until the TDGETVEINFO TDCALL is executed. This ensures
+	 * that the VE info cannot be overwritten by a nested #VE.
+	 */
+	ret = tdx_get_ve_info(&ve);
+
+	cond_local_irq_enable(regs);
+
+	if (ret)
+		ret = tdx_handle_virt_exception(regs, &ve);
+	/*
+	 * If tdx_handle_virt_exception() could not process
+	 * it successfully, treat it as #GP(0) and handle it.
+	 */
+	if (!ret)
+		ve_raise_fault(regs, 0);
+
+	cond_local_irq_disable(regs);
+}
+
+#endif
+
 #ifdef CONFIG_X86_32
 DEFINE_IDTENTRY_SW(iret_error)
 {
-- 
2.32.0
