Message-ID: <Z4Aff2QTJeOyrEUY@google.com>
Date: Thu, 9 Jan 2025 11:11:59 -0800
From: Sean Christopherson <seanjc@...gle.com>
To: Adrian Hunter <adrian.hunter@...el.com>
Cc: Chao Gao <chao.gao@...el.com>, pbonzini@...hat.com, kvm@...r.kernel.org,
dave.hansen@...ux.intel.com, rick.p.edgecombe@...el.com, kai.huang@...el.com,
reinette.chatre@...el.com, xiaoyao.li@...el.com,
tony.lindgren@...ux.intel.com, binbin.wu@...ux.intel.com, dmatlack@...gle.com,
isaku.yamahata@...el.com, nik.borisov@...e.com, linux-kernel@...r.kernel.org,
x86@...nel.org, yan.y.zhao@...el.com, weijiang.yang@...el.com
Subject: Re: [PATCH 4/7] KVM: TDX: restore host xsave state when exit from the
guest TD

On Fri, Jan 03, 2025, Adrian Hunter wrote:
> On 20/12/24 18:22, Sean Christopherson wrote:
> +/* Set a maximal guest CR0 value */
> +static u64 tdx_guest_cr0(struct kvm_vcpu *vcpu, u64 cr4)
> +{
> +        u64 cr0;
> +
> +        rdmsrl(MSR_IA32_VMX_CR0_FIXED1, cr0);
> +
> +        if (cr4 & X86_CR4_CET)
> +                cr0 |= X86_CR0_WP;
> +
> +        cr0 |= X86_CR0_PE | X86_CR0_NE;
> +        cr0 &= ~(X86_CR0_NW | X86_CR0_CD);
> +
> +        return cr0;
> +}
> +
> +/*
> + * Set a maximal guest CR4 value. Clear bits forbidden by XFAM or
> + * TD Attributes.
> + */
> +static u64 tdx_guest_cr4(struct kvm_vcpu *vcpu)
> +{
> +        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> +        u64 cr4;
> +
> +        rdmsrl(MSR_IA32_VMX_CR4_FIXED1, cr4);

This won't be accurate long-term. E.g. run KVM on hardware with CR4 bits that
neither KVM nor TDX knows about, and vcpu->arch.cr4 will end up with bits set that
KVM thinks are illegal, which will cause its own problems.

For CR0 and CR4, we should be able to start with KVM's set of allowed bits, not
the CPU's. That will mean there will likely be missing bits in vcpu->arch.cr{0,4},
but if KVM doesn't know about a bit, the fact that it's missing should be a complete
non-issue.

That also avoids weirdness for things like user-mode interrupts, LASS, PKS, etc.,
where KVM is open-coding the bits. The downside is that we'll need to remember
to update TDX when enabling those features to account for kvm_tdx->attributes,
but that's not unreasonable.
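
Very roughly, something like this (completely untested, and
kvm_host_allowed_cr4_bits() doesn't exist; it'd be a small wrapper around
x86.c's cr4_reserved_bits or similar, this is only meant to illustrate the
idea):

        static u64 tdx_guest_cr4(struct kvm_vcpu *vcpu)
        {
                struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
                /* Start from the CR4 bits KVM itself knows about and allows. */
                u64 cr4 = kvm_host_allowed_cr4_bits();

                /* Clear only KVM-known bits that XFAM / TD attributes forbid. */
                if (!(kvm_tdx->xfam & XFEATURE_MASK_PKRU))
                        cr4 &= ~X86_CR4_PKE;
                if (!(kvm_tdx->attributes & TDX_TD_ATTR_PKS))
                        cr4 &= ~X86_CR4_PKS;

                cr4 &= ~X86_CR4_SMXE;

                return cr4;
        }

Bits KVM doesn't know about never show up in the starting set, so there's
nothing to open code for them.
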
> +
> +        if (!(kvm_tdx->xfam & XFEATURE_PKRU))
> +                cr4 &= ~X86_CR4_PKE;
> +
> +        if (!(kvm_tdx->xfam & XFEATURE_CET_USER) || !(kvm_tdx->xfam & BIT_ULL(12)))
> +                cr4 &= ~X86_CR4_CET;
> +
> +        /* User Interrupts */
> +        if (!(kvm_tdx->xfam & BIT_ULL(14)))
> +                cr4 &= ~BIT_ULL(25);
> +
> +        if (!(kvm_tdx->attributes & TDX_TD_ATTR_LASS))
> +                cr4 &= ~BIT_ULL(27);
> +
> +        if (!(kvm_tdx->attributes & TDX_TD_ATTR_PKS))
> +                cr4 &= ~BIT_ULL(24);
> +
> +        if (!(kvm_tdx->attributes & TDX_TD_ATTR_KL))
> +                cr4 &= ~BIT_ULL(19);
> +
> +        cr4 &= ~X86_CR4_SMXE;
> +
> +        return cr4;
> +}
> +
> int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> {
> struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> @@ -732,8 +783,8 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> vcpu->arch.cr0_guest_owned_bits = -1ul;
> vcpu->arch.cr4_guest_owned_bits = -1ul;
>
> - vcpu->arch.cr4 = <maximal value>;
> - vcpu->arch.cr0 = <maximal value, give or take>;
> + vcpu->arch.cr4 = tdx_guest_cr4(vcpu);
> + vcpu->arch.cr0 = tdx_guest_cr0(vcpu, vcpu->arch.cr4);
>
> vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
> vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
> @@ -767,6 +818,12 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> return 0;
> }
>
> +void tdx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
> +{
> +        if (cpu_feature_enabled(X86_FEATURE_XSAVES))

This should use kvm_cpu_cap_has(), because strictly speaking it's KVM support
that matters. In practice, I don't think it matters for XSAVES, but it can
matter for other features (though probably not for TDX guests).
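
I.e. (untested):

        if (kvm_cpu_cap_has(X86_FEATURE_XSAVES))
                kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
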
> +                kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
> +}
> +
> void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> {
> struct vcpu_tdx *tdx = to_tdx(vcpu);
> @@ -933,6 +990,24 @@ static void tdx_user_return_msr_update_cache(void)
> tdx_uret_msrs[i].defval);
> }
>
> +static void tdx_reinforce_guest_state(struct kvm_vcpu *vcpu)
> +{
> +        struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> +
> +        if (WARN_ON_ONCE(vcpu->arch.xcr0 != (kvm_tdx->xfam & TDX_XFAM_XCR0_MASK)))
> +                vcpu->arch.xcr0 = kvm_tdx->xfam & TDX_XFAM_XCR0_MASK;
> +        if (WARN_ON_ONCE(vcpu->arch.ia32_xss != (kvm_tdx->xfam & TDX_XFAM_XSS_MASK)))
> +                vcpu->arch.ia32_xss = kvm_tdx->xfam & TDX_XFAM_XSS_MASK;
> +        if (WARN_ON_ONCE(vcpu->arch.pkru))
> +                vcpu->arch.pkru = 0;
> +        if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_XSAVE) &&
> +                         !kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)))
> +                vcpu->arch.cr4 |= X86_CR4_OSXSAVE;
> +        if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_XSAVES) &&
> +                         !guest_can_use(vcpu, X86_FEATURE_XSAVES)))
> +                kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
> +}
> +
> static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
> {
> struct vcpu_tdx *tdx = to_tdx(vcpu);
> @@ -1028,9 +1103,11 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
> update_debugctlmsr(tdx->host_debugctlmsr);
>
> tdx_user_return_msr_update_cache();
> +
> + tdx_reinforce_guest_state(vcpu);

Hmm, I don't think fixing up guest state is a good idea. It probably works?
But continuing on when we know there's a KVM bug *and* a chance for host data
corruption seems unnecessarily risky.

My vote would be to KVM_BUG_ON() before entering the guest. I think I'd also be ok
omitting the checks; it's not like the potential for KVM bugs that clobber KVM's
view of state is unique to TDX (though I do agree that the behavior of the TDX
module in this case does make them more likely).
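
E.g. something like this (untested, the helper name is made up and the exact
checks/placement would need more thought, it's just to show the shape):

        static bool tdx_guest_state_is_valid(struct kvm_vcpu *vcpu)
        {
                struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);

                return vcpu->arch.xcr0 == (kvm_tdx->xfam & TDX_XFAM_XCR0_MASK) &&
                       vcpu->arch.ia32_xss == (kvm_tdx->xfam & TDX_XFAM_XSS_MASK) &&
                       !vcpu->arch.pkru &&
                       kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE);
        }

with something like this in tdx_vcpu_run(), before entering the guest:

        if (KVM_BUG_ON(!tdx_guest_state_is_valid(vcpu), vcpu->kvm))
                return EXIT_FASTPATH_NONE;
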
> kvm_load_host_xsave_state(vcpu);
>
> - vcpu->arch.regs_avail = TDX_REGS_UNSUPPORTED_SET;
> + vcpu->arch.regs_avail = ~0;
>
> if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
> return EXIT_FASTPATH_NONE;
> diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
> index 861c0f649b69..2e0e300a1f5e 100644
> --- a/arch/x86/kvm/vmx/tdx_arch.h
> +++ b/arch/x86/kvm/vmx/tdx_arch.h
> @@ -110,6 +110,7 @@ struct tdx_cpuid_value {
> } __packed;
>
> #define TDX_TD_ATTR_DEBUG BIT_ULL(0)
> +#define TDX_TD_ATTR_LASS BIT_ULL(27)
> #define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28)
> #define TDX_TD_ATTR_PKS BIT_ULL(30)
> #define TDX_TD_ATTR_KL BIT_ULL(31)
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 7fb1bbf12b39..7f03a6a24abc 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -126,6 +126,7 @@ void tdx_vm_free(struct kvm *kvm);
> int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
>
> int tdx_vcpu_create(struct kvm_vcpu *vcpu);
> +void tdx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
> void tdx_vcpu_free(struct kvm_vcpu *vcpu);
> void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
> int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
> @@ -170,6 +171,7 @@ static inline void tdx_vm_free(struct kvm *kvm) {}
> static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
>
> static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
> +static inline void tdx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) {}
> static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
> static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
> static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d2ea7db896ba..f2b1980f830d 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1240,6 +1240,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
> u64 old_xcr0 = vcpu->arch.xcr0;
> u64 valid_bits;
>
> +        if (vcpu->arch.guest_state_protected) {
This should be a WARN_ON_ONCE() + return 1, no?
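
I.e.:

        if (WARN_ON_ONCE(vcpu->arch.guest_state_protected))
                return 1;
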
> +                kvm_update_cpuid_runtime(vcpu);
> +                return 0;
> +        }
> +
> /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
> if (index != XCR_XFEATURE_ENABLED_MASK)
> return 1;
> @@ -12388,7 +12393,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
> * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
> * to detect improper or missing initialization.
> */
> - WARN_ON_ONCE(!init_event &&
> + WARN_ON_ONCE(!init_event && !vcpu->arch.guest_state_protected &&
> (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));

Maybe stuff state in tdx_vcpu_init() to avoid this waiver? KVM is already
deferring APIC base and RCX initialization to that point, so waiting to stuff all
TDX-specific vCPU state there seems natural.
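
I.e. something like this in tdx_vcpu_init(), next to the APIC base / RCX setup
(untested, and assumes the maximal CR0/CR4 helpers from above are kept):

        /* Stuff the "maximal" CR0/CR4 values here instead of at vCPU creation. */
        vcpu->arch.cr4 = tdx_guest_cr4(vcpu);
        vcpu->arch.cr0 = tdx_guest_cr0(vcpu, vcpu->arch.cr4);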