[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20250816144436.83718-3-adrian.hunter@intel.com>
Date: Sat, 16 Aug 2025 17:44:35 +0300
From: Adrian Hunter <adrian.hunter@...el.com>
To: pbonzini@...hat.com,
seanjc@...gle.com
Cc: kvm@...r.kernel.org,
rick.p.edgecombe@...el.com,
kirill.shutemov@...ux.intel.com,
kai.huang@...el.com,
reinette.chatre@...el.com,
xiaoyao.li@...el.com,
tony.lindgren@...ux.intel.com,
binbin.wu@...ux.intel.com,
isaku.yamahata@...el.com,
linux-kernel@...r.kernel.org,
yan.y.zhao@...el.com,
chao.gao@...el.com,
ira.weiny@...el.com
Subject: [PATCH RFC 2/2] KVM: TDX: Add flag to support MWAIT instruction only
Add a TDX-specific flag that allows the MWAIT instruction to be used in a
guest. It is intended for users who understand the limitations that TDX
has compared with VMX in this regard.
The limitations are:
1. TDX Module versions prior to 1.5.09 and 2.0.04 do not expose the
Always-Running-APIC-Timer (ARAT) feature (CPUID leaf 6: EAX bit 2),
which a TDX guest may need for correct handling of deep C-states.
For example, with a Linux guest, that results in cpuidle disabling the
timer interrupt and invoking the Tick Broadcast framework to provide a
wake-up. Currently, that falls back to the PIT timer which does not
work for TDX, resulting in the guest becoming stuck in the idle loop.
2. TDX Module versions 1.5.09 and 2.0.04 or later support #VE reduction,
which, if the guest opts to enable it, results in the TDX Module
injecting #GP for accesses to MSRs that the guest could reasonably
assume to exist if the MWAIT feature is available.
A Linux guest could possibly be used with TDX support for MWAIT, for
example by:
  a) - Using TDX Module versions 1.5.09 and 2.0.04 or later, and
     - Using the acpi_idle driver with suitable ACPI tables such as _CST
  b) - Using TDX Module versions 1.5.09 and 2.0.04 or later, and
     - Ignoring unchecked MSR access errors from intel_idle
Signed-off-by: Adrian Hunter <adrian.hunter@...el.com>
---
Documentation/virt/kvm/x86/intel-tdx.rst | 28 ++++++++++-
arch/x86/include/uapi/asm/kvm.h | 3 ++
arch/x86/kvm/vmx/tdx.c | 62 ++++++++++++++++--------
3 files changed, 72 insertions(+), 21 deletions(-)
diff --git a/Documentation/virt/kvm/x86/intel-tdx.rst b/Documentation/virt/kvm/x86/intel-tdx.rst
index bcfa97e0c9e7..b534a092b4c1 100644
--- a/Documentation/virt/kvm/x86/intel-tdx.rst
+++ b/Documentation/virt/kvm/x86/intel-tdx.rst
@@ -70,8 +70,12 @@ Return the TDX capabilities that current KVM supports with the specific TDX
module loaded in the system. It reports what features/capabilities are allowed
to be configured to the TDX guest.
+The KVM_TDX_FLAGS_ALLOW_MWAIT flag allows the MWAIT instruction to be used
+in a guest (CPUID leaf 1 ECX bit 3), but beware of the limitations; see
+"MWAIT Limitations" below.
+
- id: KVM_TDX_CAPABILITIES
-- flags: must be 0
+- flags: must be 0, or KVM_TDX_FLAGS_ALLOW_MWAIT (if KVM_TDX_CAP_ALLOW_MWAIT)
- data: pointer to struct kvm_tdx_capabilities
- hw_error: must be 0
@@ -111,8 +115,12 @@ KVM_TDX_INIT_VM
Perform TDX specific VM initialization. This needs to be called after
KVM_CREATE_VM and before creating any VCPUs.
+The KVM_TDX_FLAGS_ALLOW_MWAIT flag allows the MWAIT instruction to be used
+in a guest (CPUID leaf 1 ECX bit 3), but beware of the limitations; see
+"MWAIT Limitations" below.
+
- id: KVM_TDX_INIT_VM
-- flags: must be 0
+- flags: must be 0, or KVM_TDX_FLAGS_ALLOW_MWAIT (if KVM_TDX_CAP_ALLOW_MWAIT)
- data: pointer to struct kvm_tdx_init_vm
- hw_error: must be 0
@@ -282,6 +290,22 @@ control flow is as follows:
#. Run VCPU
+MWAIT Limitations
+=================
+
+- TDX Module versions 1.5.09 and 2.0.04 or later support #VE reduction,
+ which, if the guest opts to enable it, results in the TDX Module
+ injecting #GP for accesses to MSRs that the guest could reasonably
+ assume to exist if the MWAIT feature is available.
+
+- TDX Module versions prior to 1.5.09 and 2.0.04 do not expose the
+ Always-Running-APIC-Timer (ARAT) feature (CPUID leaf 6: EAX bit 2),
+ which a TDX guest may need for correct handling of deep C-states.
+ For example, with a Linux guest, that results in cpuidle disabling the
+ timer interrupt and invoking the Tick Broadcast framework to provide a
+ wake-up. Currently, that falls back to the PIT timer which does not
+ work for TDX, resulting in the guest becoming stuck in the idle loop.
+
References
==========
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index e019111e2150..8175e05c9e50 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -945,6 +945,8 @@ enum kvm_tdx_cmd_id {
KVM_TDX_CMD_NR_MAX,
};
+#define KVM_TDX_FLAGS_ALLOW_MWAIT _BITUL(0)
+
struct kvm_tdx_cmd {
/* enum kvm_tdx_cmd_id */
__u32 id;
@@ -964,6 +966,7 @@ struct kvm_tdx_cmd {
};
#define KVM_TDX_CAP_TERMINATE_VM _BITULL(0)
+#define KVM_TDX_CAP_ALLOW_MWAIT _BITULL(1)
struct kvm_tdx_capabilities {
__u64 supported_attrs;
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index cdf0dc6cf068..db85624e0e78 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -143,7 +143,7 @@ static void clear_mwait(struct kvm_cpuid_entry2 *entry)
entry->ecx &= ~__feature_bit(X86_FEATURE_MWAIT);
}
-static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
+static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry, bool disallow_mwait)
{
if (has_tsx(entry))
clear_tsx(entry);
@@ -152,18 +152,20 @@ static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
clear_waitpkg(entry);
/* Also KVM_X86_DISABLE_EXITS_MWAIT is disallowed in tdx_vm_init() */
- if (has_mwait(entry))
+ if (disallow_mwait && has_mwait(entry))
clear_mwait(entry);
}
-static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
+static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry, bool disallow_mwait)
{
- return has_tsx(entry) || has_waitpkg(entry) || has_mwait(entry);
+ return has_tsx(entry) || has_waitpkg(entry) ||
+ (disallow_mwait && has_mwait(entry));
}
#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
-static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
+static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx,
+ bool disallow_mwait)
{
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
@@ -185,14 +187,15 @@ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char i
if (entry->function == 0x80000008)
entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
- tdx_clear_unsupported_cpuid(entry);
+ tdx_clear_unsupported_cpuid(entry, disallow_mwait);
}
#define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
-static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
+static int init_kvm_tdx_caps(struct kvm *kvm, const struct tdx_sys_info_td_conf *td_conf,
struct kvm_tdx_capabilities *caps)
{
+ bool disallow_mwait = kvm->arch.unsupported_disable_exits & KVM_X86_DISABLE_EXITS_MWAIT;
int i;
caps->supported_attrs = tdx_get_supported_attrs(td_conf);
@@ -203,7 +206,7 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
if (!caps->supported_xfam)
return -EIO;
- caps->supported_caps = KVM_TDX_CAP_TERMINATE_VM;
+ caps->supported_caps = KVM_TDX_CAP_TERMINATE_VM | KVM_TDX_CAP_ALLOW_MWAIT;
caps->cpuid.nent = td_conf->num_cpuid_config;
@@ -211,7 +214,7 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
for (i = 0; i < td_conf->num_cpuid_config; i++)
- td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
+ td_init_cpuid_entry2(&caps->cpuid.entries[i], i, disallow_mwait);
return 0;
}
@@ -2268,7 +2271,9 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
}
}
-static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
+#define KVM_TDX_CAPABILITIES_FLAGS KVM_TDX_FLAGS_ALLOW_MWAIT
+
+static int tdx_get_capabilities(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
struct kvm_tdx_capabilities __user *user_caps;
@@ -2276,10 +2281,12 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
u32 nr_user_entries;
int ret = 0;
- /* flags is reserved for future use */
- if (cmd->flags)
+ if (cmd->flags & ~KVM_TDX_CAPABILITIES_FLAGS)
return -EINVAL;
+ if (cmd->flags & KVM_TDX_FLAGS_ALLOW_MWAIT)
+ kvm->arch.unsupported_disable_exits &= ~KVM_X86_DISABLE_EXITS_MWAIT;
+
caps = kzalloc(sizeof(*caps) +
sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
GFP_KERNEL);
@@ -2297,7 +2304,7 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
goto out;
}
- ret = init_kvm_tdx_caps(td_conf, caps);
+ ret = init_kvm_tdx_caps(kvm, td_conf, caps);
if (ret)
goto out;
@@ -2356,9 +2363,19 @@ static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
return 0;
}
-static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
+static void tdx_update_mwait_in_guest(struct kvm *kvm, struct kvm_cpuid2 *cpuid)
+{
+ const struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 1, 0);
+
+ kvm->arch.mwait_in_guest = entry && has_mwait(entry);
+}
+
+static int setup_tdparams_cpuids(struct kvm *kvm, struct kvm_cpuid2 *cpuid,
struct td_params *td_params)
{
+ bool disallow_mwait = kvm->arch.unsupported_disable_exits & KVM_X86_DISABLE_EXITS_MWAIT;
const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
const struct kvm_cpuid_entry2 *entry;
struct tdx_cpuid_value *value;
@@ -2372,14 +2389,14 @@ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
for (i = 0; i < td_conf->num_cpuid_config; i++) {
struct kvm_cpuid_entry2 tmp;
- td_init_cpuid_entry2(&tmp, i);
+ td_init_cpuid_entry2(&tmp, i, disallow_mwait);
entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
tmp.function, tmp.index);
if (!entry)
continue;
- if (tdx_unsupported_cpuid(entry))
+ if (tdx_unsupported_cpuid(entry, disallow_mwait))
return -EINVAL;
copy_cnt++;
@@ -2437,10 +2454,12 @@ static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
if (ret)
return ret;
- ret = setup_tdparams_cpuids(cpuid, td_params);
+ ret = setup_tdparams_cpuids(kvm, cpuid, td_params);
if (ret)
return ret;
+ tdx_update_mwait_in_guest(kvm, cpuid);
+
#define MEMCPY_SAME_SIZE(dst, src) \
do { \
BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
@@ -2745,6 +2764,8 @@ static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
return -EIO;
}
+#define KVM_TDX_INIT_VM_FLAGS KVM_TDX_FLAGS_ALLOW_MWAIT
+
static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -2758,9 +2779,12 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
return -EINVAL;
- if (cmd->flags)
+ if (cmd->flags & ~KVM_TDX_INIT_VM_FLAGS)
return -EINVAL;
+ if (cmd->flags & KVM_TDX_FLAGS_ALLOW_MWAIT)
+ kvm->arch.unsupported_disable_exits &= ~KVM_X86_DISABLE_EXITS_MWAIT;
+
init_vm = kmalloc(sizeof(*init_vm) +
sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
GFP_KERNEL);
@@ -2925,7 +2949,7 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
switch (tdx_cmd.id) {
case KVM_TDX_CAPABILITIES:
- r = tdx_get_capabilities(&tdx_cmd);
+ r = tdx_get_capabilities(kvm, &tdx_cmd);
break;
case KVM_TDX_INIT_VM:
r = tdx_td_init(kvm, &tdx_cmd);
--
2.48.1
Powered by blists - more mailing lists