Message-ID: <96f7204b-6eb4-4fac-b5bb-1cd5c1fc6def@intel.com>
Date: Fri, 3 Jan 2025 20:16:56 +0200
From: Adrian Hunter <adrian.hunter@...el.com>
To: Sean Christopherson <seanjc@...gle.com>
Cc: Chao Gao <chao.gao@...el.com>, pbonzini@...hat.com, kvm@...r.kernel.org,
dave.hansen@...ux.intel.com, rick.p.edgecombe@...el.com,
kai.huang@...el.com, reinette.chatre@...el.com, xiaoyao.li@...el.com,
tony.lindgren@...ux.intel.com, binbin.wu@...ux.intel.com,
dmatlack@...gle.com, isaku.yamahata@...el.com, nik.borisov@...e.com,
linux-kernel@...r.kernel.org, x86@...nel.org, yan.y.zhao@...el.com,
weijiang.yang@...el.com
Subject: Re: [PATCH 4/7] KVM: TDX: restore host xsave state when exiting
 from the guest TD
On 20/12/24 18:22, Sean Christopherson wrote:
> On Fri, Dec 20, 2024, Adrian Hunter wrote:
>> On 17/12/24 18:09, Sean Christopherson wrote:
>>> On Mon, Nov 25, 2024, Adrian Hunter wrote:
>>> I would rather just use kvm_load_host_xsave_state(), by forcing vcpu->arch.{xcr0,xss}
>>> to XFAM, with a comment explaining that the TDX module sets XCR0 and XSS prior to
>>> returning from VP.ENTER.  I don't see any justification for maintaining a special
>>> flow for TDX, it's just more work.
>>> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
>>> index 7eff717c9d0d..b49dcf32206b 100644
>>> --- a/arch/x86/kvm/vmx/tdx.c
>>> +++ b/arch/x86/kvm/vmx/tdx.c
>>> @@ -636,6 +636,9 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
>>> vcpu->arch.cr0_guest_owned_bits = -1ul;
>>> vcpu->arch.cr4_guest_owned_bits = -1ul;
>>>
>>> + vcpu->arch.cr4 = <maximal value>;
>> Sorry for slow reply. Seems fine except maybe CR4 usage.
>>
>> TDX Module validates CR4 based on XFAM and scrubs host state
>> based on XFAM. It seems like we would need to use XFAM to
>> manufacture a CR4 that we then effectively use as a proxy
>> instead of just checking XFAM.
> Yep.
>
>> Since only some vcpu->arch.cr4 bits will be meaningful, it also
>> still leaves the possibility for confusion.
> IMO, it's less confusing than having '0' for CR0 and CR4, while having accurate
> values for other state.  And I'm far more worried about KVM wandering into a bad path
> because CR0 and/or CR4 are completely wrong. E.g. kvm_mmu.cpu_role would be
> completely wrong at steady state, the CR4-based runtime CPUID updates would do
> the wrong thing, and any helper that wraps kvm_is_cr{0,4}_bit_set() would likely
> do the worst possible thing.
>
>> Are you sure you want this?
> Yeah, pretty sure. It would be nice if the TDX Module exposed guest CR0/CR4 to
> KVM, a la the traps SEV-ES+ uses, but I think the next best thing is to assume
> the guest is using all features.
>
>>> + vcpu->arch.cr0 = <maximal value, give or take>;
>> AFAICT we don't need to care about CR0
> Probably not, but having e.g. CR4.PAE/LA57=1 with CR0.PG/PE=0 will be quite
> weird.
Below is what I have so far. It seems to work. Note:
- use of MSR_IA32_VMX_CR0_FIXED1 and MSR_IA32_VMX_CR4_FIXED1
  to provide the base values for CR0 and CR4
- tdx_reinforce_guest_state() to make sure host state doesn't get
  corrupted if the shadow values go wrong
- __kvm_set_xcr() to handle guest_state_protected case
- kvm_vcpu_reset() to handle guest_state_protected case
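
For reference, here is roughly the kvm_load_host_xsave_state() logic
that the forced values feed (paraphrased, PKRU handling elided, so not
authoritative):

	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
		/* Restore host XCR0 only if the guest may have changed it */
		if (vcpu->arch.xcr0 != kvm_host.xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);

		/* Likewise for IA32_XSS, gated on XSAVES being usable */
		if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
		    vcpu->arch.ia32_xss != kvm_host.xss)
			wrmsrl(MSR_IA32_XSS, kvm_host.xss);
	}

which is why tdx_reinforce_guest_state() pins vcpu->arch.{xcr0,ia32_xss},
CR4.OSXSAVE and the governed XSAVES bit before it is called.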
Please let me know your feedback.
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 10aebae5af18..2a5f756b05e2 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -351,8 +351,10 @@ static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
- if (is_td_vcpu(vcpu))
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_after_set_cpuid(vcpu);
return;
+ }
vmx_vcpu_after_set_cpuid(vcpu);
}
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 191ee209caa0..0ae427340494 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -710,6 +710,57 @@ int tdx_vm_init(struct kvm *kvm)
return 0;
}
+/* Compute a maximal guest CR0 value */
+static u64 tdx_guest_cr0(struct kvm_vcpu *vcpu, u64 cr4)
+{
+ u64 cr0;
+
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, cr0);
+
+ if (cr4 & X86_CR4_CET)
+ cr0 |= X86_CR0_WP;
+
+ cr0 |= X86_CR0_PE | X86_CR0_NE;
+ cr0 &= ~(X86_CR0_NW | X86_CR0_CD);
+
+ return cr0;
+}
+
+/*
+ * Compute a maximal guest CR4 value, clearing bits forbidden by XFAM or
+ * the TD attributes.
+ */
+static u64 tdx_guest_cr4(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ u64 cr4;
+
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, cr4);
+
+ if (!(kvm_tdx->xfam & XFEATURE_MASK_PKRU))
+ cr4 &= ~X86_CR4_PKE;
+
+ if (!(kvm_tdx->xfam & XFEATURE_MASK_CET_USER) || !(kvm_tdx->xfam & XFEATURE_MASK_CET_KERNEL))
+ cr4 &= ~X86_CR4_CET;
+
+ /* User Interrupts: XFAM[14] is UINTR state, CR4 bit 25 is CR4.UINTR */
+ if (!(kvm_tdx->xfam & BIT_ULL(14)))
+ cr4 &= ~BIT_ULL(25);
+
+ if (!(kvm_tdx->attributes & TDX_TD_ATTR_LASS))
+ cr4 &= ~BIT_ULL(27); /* CR4.LASS */
+
+ if (!(kvm_tdx->attributes & TDX_TD_ATTR_PKS))
+ cr4 &= ~BIT_ULL(24); /* CR4.PKS */
+
+ if (!(kvm_tdx->attributes & TDX_TD_ATTR_KL))
+ cr4 &= ~BIT_ULL(19); /* CR4.KL (Key Locker) */
+
+ cr4 &= ~X86_CR4_SMXE; /* SMX is never allowed in a TD */
+
+ return cr4;
+}
+
int tdx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
@@ -732,8 +783,8 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.cr0_guest_owned_bits = -1ul;
vcpu->arch.cr4_guest_owned_bits = -1ul;
- vcpu->arch.cr4 = <maximal value>;
- vcpu->arch.cr0 = <maximal value, give or take>;
+ vcpu->arch.cr4 = tdx_guest_cr4(vcpu);
+ vcpu->arch.cr0 = tdx_guest_cr0(vcpu, vcpu->arch.cr4);
vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
@@ -767,6 +818,17 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
}
+void tdx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
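+ /*
+ * XSS is context-switched by the TDX module according to XFAM, so mark
+ * XSAVES as usable for the guest so that kvm_load_host_xsave_state()
+ * restores the host IA32_XSS value.
+ */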
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
+}
+
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
@@ -933,6 +990,30 @@ static void tdx_user_return_msr_update_cache(void)
tdx_uret_msrs[i].defval);
}
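+/*
+ * On exit from the guest TD, the TDX module leaves XCR0 and XSS set to
+ * XFAM-derived values and PKRU cleared.  Sanity-check KVM's shadow state
+ * against XFAM so that the common kvm_load_host_xsave_state() path cannot
+ * be fooled into skipping the restoration of host state.
+ */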
+static void tdx_reinforce_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+
+ if (WARN_ON_ONCE(vcpu->arch.xcr0 != (kvm_tdx->xfam & TDX_XFAM_XCR0_MASK)))
+ vcpu->arch.xcr0 = kvm_tdx->xfam & TDX_XFAM_XCR0_MASK;
+ if (WARN_ON_ONCE(vcpu->arch.ia32_xss != (kvm_tdx->xfam & TDX_XFAM_XSS_MASK)))
+ vcpu->arch.ia32_xss = kvm_tdx->xfam & TDX_XFAM_XSS_MASK;
+ if (WARN_ON_ONCE(vcpu->arch.pkru))
+ vcpu->arch.pkru = 0;
+ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_XSAVE) &&
+ !kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)))
+ vcpu->arch.cr4 |= X86_CR4_OSXSAVE;
+ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_XSAVES) &&
+ !guest_can_use(vcpu, X86_FEATURE_XSAVES)))
+ kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
+}
+
static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_tdx *tdx = to_tdx(vcpu);
@@ -1028,9 +1103,11 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
update_debugctlmsr(tdx->host_debugctlmsr);
tdx_user_return_msr_update_cache();
+
+ tdx_reinforce_guest_state(vcpu);
kvm_load_host_xsave_state(vcpu);
- vcpu->arch.regs_avail = TDX_REGS_UNSUPPORTED_SET;
+ vcpu->arch.regs_avail = ~0;
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
return EXIT_FASTPATH_NONE;
diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
index 861c0f649b69..2e0e300a1f5e 100644
--- a/arch/x86/kvm/vmx/tdx_arch.h
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -110,6 +110,7 @@ struct tdx_cpuid_value {
} __packed;
#define TDX_TD_ATTR_DEBUG BIT_ULL(0)
+#define TDX_TD_ATTR_LASS BIT_ULL(27)
#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28)
#define TDX_TD_ATTR_PKS BIT_ULL(30)
#define TDX_TD_ATTR_KL BIT_ULL(31)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 7fb1bbf12b39..7f03a6a24abc 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -126,6 +126,7 @@ void tdx_vm_free(struct kvm *kvm);
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
int tdx_vcpu_create(struct kvm_vcpu *vcpu);
+void tdx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
@@ -170,6 +171,7 @@ static inline void tdx_vm_free(struct kvm *kvm) {}
static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
+static inline void tdx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) {}
static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d2ea7db896ba..f2b1980f830d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1240,6 +1240,16 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
u64 old_xcr0 = vcpu->arch.xcr0;
u64 valid_bits;
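+ /*
+ * For a protected guest the TDX module, not KVM, owns XCR0.  Skip
+ * validation, but still refresh the runtime CPUID state that is
+ * derived from XCR0/XSS (e.g. the CPUID.0xD enumeration).
+ */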
+ if (vcpu->arch.guest_state_protected) {
+ kvm_update_cpuid_runtime(vcpu);
+ return 0;
+ }
+
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
@@ -12388,7 +12393,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
* into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
* to detect improper or missing initialization.
*/
- WARN_ON_ONCE(!init_event &&
+ WARN_ON_ONCE(!init_event && !vcpu->arch.guest_state_protected &&
(old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
/*
diff --git a/tools/testing/selftests/kvm/x86_64/tdx_vm_tests.c b/tools/testing/selftests/kvm/x86_64/tdx_vm_tests.c
index 075419ef3ac7..2cc9bf40a788 100644
--- a/tools/testing/selftests/kvm/x86_64/tdx_vm_tests.c
+++ b/tools/testing/selftests/kvm/x86_64/tdx_vm_tests.c
@@ -1054,7 +1054,7 @@ void verify_td_cpuid_tdcall(void)
TDX_TEST_ASSERT_SUCCESS(vcpu);
/* Get KVM CPUIDs for reference */
- tmp = get_cpuid_entry(kvm_get_supported_cpuid(), 1, 0);
+ tmp = get_cpuid_entry(vcpu->cpuid, 1, 0);
TEST_ASSERT(tmp, "CPUID entry missing\n");
cpuid_entry = *tmp;