linux-kernel - Re: [RFC v2 08/32] x86/traps: Add #VE support for TDX guest

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <afd85e8f-ab26-aa3b-e4e9-a0b3bfd472c8@intel.com>
Date:   Fri, 7 May 2021 14:36:31 -0700
From:   Dave Hansen <dave.hansen@...el.com>
To:     Kuppuswamy Sathyanarayanan 
        <sathyanarayanan.kuppuswamy@...ux.intel.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Andy Lutomirski <luto@...nel.org>,
        Dan Williams <dan.j.williams@...el.com>,
        Tony Luck <tony.luck@...el.com>
Cc:     Andi Kleen <ak@...ux.intel.com>,
        Kirill Shutemov <kirill.shutemov@...ux.intel.com>,
        Kuppuswamy Sathyanarayanan <knsathya@...nel.org>,
        Raj Ashok <ashok.raj@...el.com>,
        Sean Christopherson <seanjc@...gle.com>,
        linux-kernel@...r.kernel.org,
        Sean Christopherson <sean.j.christopherson@...el.com>
Subject: Re: [RFC v2 08/32] x86/traps: Add #VE support for TDX guest

On 4/26/21 11:01 AM, Kuppuswamy Sathyanarayanan wrote:
...
> The #VE cannot be nested before TDGETVEINFO is called, if there is any
> reason for it to nest the TD would shut down. The TDX module guarantees
> that no NMIs (or #MC or similar) can happen in this window. After
> TDGETVEINFO the #VE handler can nest if needed, although we don’t expect
> it to happen normally.

I think this description really needs some work.  Does "The #VE cannot
be nested" mean that "hardware guarantees that a nested #VE will not be
generated", or that "software must not cause a nested #VE"?

What does "the TD would shut down" mean?  I think you mean that instead
of delivering a nested #VE the hardware would actually exit to the host
and TDX would prevent the guest from being reentered.  Right?

> diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
> index 5eb3bdf36a41..41a0732d5f68 100644
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -619,6 +619,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER,	exc_xen_hypervisor_callback);
>  DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER,	exc_xen_unknown_trap);
>  #endif
>  
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +DECLARE_IDTENTRY(X86_TRAP_VE,		exc_virtualization_exception);
> +#endif
> +
>  /* Device interrupts common/spurious */
>  DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER,	common_interrupt);
>  #ifdef CONFIG_X86_LOCAL_APIC
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index c5a870cef0ae..1ca55d8e9963 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -11,6 +11,7 @@
>  #include <linux/types.h>
>  
>  #define TDINFO			1
> +#define TDGETVEINFO		3
>  
>  struct tdcall_output {
>  	u64 rcx;
> @@ -29,6 +30,20 @@ struct tdvmcall_output {
>  	u64 r15;
>  };
>  
> +struct ve_info {
> +	u64 exit_reason;
> +	u64 exit_qual;
> +	u64 gla;
> +	u64 gpa;
> +	u32 instr_len;
> +	u32 instr_info;
> +};

Is this an architectural structure or some software construct?

> +unsigned long tdg_get_ve_info(struct ve_info *ve);
> +
> +int tdg_handle_virtualization_exception(struct pt_regs *regs,
> +		struct ve_info *ve);
> +
>  /* Common API to check TDX support in decompression and common kernel code. */
>  bool is_tdx_guest(void);
>  
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index ee1a283f8e96..546b6b636c7d 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -64,6 +64,9 @@ static const __initconst struct idt_data early_idts[] = {
>  	 */
>  	INTG(X86_TRAP_PF,		asm_exc_page_fault),
>  #endif
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +	INTG(X86_TRAP_VE,		asm_exc_virtualization_exception),
> +#endif
>  };
>  
>  /*
> @@ -87,6 +90,9 @@ static const __initconst struct idt_data def_idts[] = {
>  	INTG(X86_TRAP_MF,		asm_exc_coprocessor_error),
>  	INTG(X86_TRAP_AC,		asm_exc_alignment_check),
>  	INTG(X86_TRAP_XF,		asm_exc_simd_coprocessor_error),
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +	INTG(X86_TRAP_VE,		asm_exc_virtualization_exception),
> +#endif
>  
>  #ifdef CONFIG_X86_32
>  	TSKG(X86_TRAP_DF,		GDT_ENTRY_DOUBLEFAULT_TSS),
> diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
> index b63275db1db9..ccfcb07bfb2c 100644
> --- a/arch/x86/kernel/tdx.c
> +++ b/arch/x86/kernel/tdx.c
> @@ -82,6 +82,44 @@ static void tdg_get_info(void)
>  	td_info.attributes = out.rdx;
>  }
>  
> +unsigned long tdg_get_ve_info(struct ve_info *ve)
> +{
> +	u64 ret;
> +	struct tdcall_output out = {0};
> +
> +	/*
> +	 * The #VE cannot be nested before TDGETVEINFO is called,
> +	 * if there is any reason for it to nest the TD would shut
> +	 * down. The TDX module guarantees that no NMIs (or #MC or
> +	 * similar) can happen in this window. After TDGETVEINFO
> +	 * the #VE handler can nest if needed, although we don’t
> +	 * expect it to happen normally.
> +	 */

I find that description a bit unsatisfying.  Could we make this more
concrete?  By the way, what about *normal* interrupts?

Maybe we should talk about this in terms of *rules* that folks need to
follow.  Something like:

	NMIs and machine checks are suppressed.  Before this point any
	#VE is fatal.  After this point, NMIs and additional #VEs are
	permitted.

> +	ret = __tdcall(TDGETVEINFO, 0, 0, 0, 0, &out);
> +
> +	ve->exit_reason = out.rcx;
> +	ve->exit_qual   = out.rdx;
> +	ve->gla         = out.r8;
> +	ve->gpa         = out.r9;
> +	ve->instr_len   = out.r10 & UINT_MAX;
> +	ve->instr_info  = out.r10 >> 32;
> +
> +	return ret;
> +}
> +
> +int tdg_handle_virtualization_exception(struct pt_regs *regs,
> +		struct ve_info *ve)
> +{
> +	/*
> +	 * TODO: Add handler support for various #VE exit
> +	 * reasons. It will be added by other patches in
> +	 * the series.
> +	 */
> +	pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> +	return -EFAULT;
> +}
> +
>  void __init tdx_early_init(void)
>  {
>  	if (!cpuid_has_tdx_guest())
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 213d4aa8e337..64869aa88a5a 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -61,6 +61,7 @@
>  #include <asm/insn.h>
>  #include <asm/insn-eval.h>
>  #include <asm/vdso.h>
> +#include <asm/tdx.h>
>  
>  #ifdef CONFIG_X86_64
>  #include <asm/x86_init.h>
> @@ -1140,6 +1141,35 @@ DEFINE_IDTENTRY(exc_device_not_available)
>  	}
>  }
>  
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +DEFINE_IDTENTRY(exc_virtualization_exception)
> +{
> +	struct ve_info ve;
> +	int ret;
> +
> +	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> +
> +	/*
> +	 * Consume #VE info before re-enabling interrupts. It will be
> +	 * re-enabled after executing the TDGETVEINFO TDCALL.
> +	 */

"It" is nebulous here.  Is this talking about NMIs, or the
cond_local_irq_enable() that is "after" TDGETVEINFO?

> +	ret = tdg_get_ve_info(&ve);
> +
> +	cond_local_irq_enable(regs);
> +
> +	if (!ret)
> +		ret = tdg_handle_virtualization_exception(regs, &ve);
> +	/*
> +	 * If tdg_handle_virtualization_exception() could not process
> +	 * it successfully, treat it as #GP(0) and handle it.
> +	 */
> +	if (ret)
> +		do_general_protection(regs, 0);
> +
> +	cond_local_irq_disable(regs);
> +}
> +#endif
> +
>  #ifdef CONFIG_X86_32
>  DEFINE_IDTENTRY_SW(iret_error)
>  {
>