[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250919005955.1366256-9-seanjc@google.com>
Date: Thu, 18 Sep 2025 17:59:54 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: Sean Christopherson <seanjc@...gle.com>, Paolo Bonzini <pbonzini@...hat.com>
Cc: kvm@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH 8/9] KVM: nVMX: Remove support for "early" consistency checks
via hardware
Remove nested_early_check and all associated code, as it's quite obviously
not being used or tested (it's been broken for 4+ years without a single
bug report). More importantly, KVM's software-based consistency checks
have matured since the option to do hardware-based checks was added; KVM
appears to be missing only _one_ consistency check, on vTPR. And even
*more* importantly, that consistency check can't be prevented by an early
hardware check due to L1 being able to modify the virtual APIC at any
time, i.e. there's an inherent TOCTOU flaw that could cause KVM to "miss"
a consistency check VM-Fail, regardless of whether the check is performed
by software or by hardware.
In other words, KVM _must_ be able to unwind from a late VM-Fail (which
was a big motivation for doing early checks). I.e. now that KVM provides
(almost) all necessary consistency checks, what's really needed is a way
to detect missing checks in KVM, not a way to avoid having to unwind from
a late VM-Fail. And that can be done much more simply, e.g. by an simple
module param to guard a WARN (which, sadly, must be off-by-default to
avoid splats due to the aforementioned TOCTOU issue).
For all intents and purposes, this reverts commit 52017608da33 ("KVM:
nVMX: add option to perform early consistency checks via H/W").
Signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
arch/x86/kvm/vmx/nested.c | 130 ++++----------------------------------
1 file changed, 12 insertions(+), 118 deletions(-)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e3a94bf6d269..a1ffaccf317d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -23,9 +23,6 @@
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
-static bool __read_mostly nested_early_check = 0;
-module_param(nested_early_check, bool, S_IRUGO);
-
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
/*
@@ -2280,13 +2277,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
return;
vmx->nested.vmcs02_initialized = true;
- /*
- * If early consistency checks are enabled, stuff the EPT Pointer with
- * a dummy *legal* value to avoid false positives on bad control state.
- */
- if (enable_ept && nested_early_check)
- vmcs_write64(EPT_POINTER, VMX_EPTP_MT_WB | VMX_EPTP_PWL_4);
-
if (vmx->ve_info)
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
@@ -2352,13 +2342,6 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
else
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
}
-
- if (kvm_caps.has_tsc_control && nested_early_check) {
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
- vmcs_write64(TSC_MULTIPLIER, vmcs12->tsc_multiplier);
- else
- vmcs_write64(TSC_MULTIPLIER, 1);
- }
}
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
@@ -3229,84 +3212,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return 0;
}
-static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
- bool vm_fail;
-
- if (!nested_early_check)
- return 0;
-
- if (vmx->msr_autoload.host.nr)
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
- if (vmx->msr_autoload.guest.nr)
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-
- preempt_disable();
-
- vmx_prepare_switch_to_guest(vcpu);
-
- /*
- * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
- * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
- * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
- * there is no need to preserve other bits or save/restore the field.
- */
- vmcs_writel(GUEST_RFLAGS, 0);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
- cr4 = cr4_read_shadow();
- if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
- vmcs_writel(HOST_CR4, cr4);
- vmx->loaded_vmcs->host_state.cr4 = cr4;
- }
-
- vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- __vmx_vcpu_run_flags(vmx));
-
- if (vmx->msr_autoload.host.nr)
- vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
- if (vmx->msr_autoload.guest.nr)
- vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
-
- if (vm_fail) {
- u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
-
- preempt_enable();
-
- trace_kvm_nested_vmenter_failed(
- "early hardware check VM-instruction error: ", error);
- WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
- }
-
- /*
- * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
- */
- if (hw_breakpoint_active())
- set_debugreg(__this_cpu_read(cpu_dr7), 7);
- local_irq_enable();
- preempt_enable();
-
- /*
- * A non-failing VMEntry means we somehow entered guest mode with
- * an illegal RIP, and that's just the tip of the iceberg. There
- * is no telling what memory has been modified or what state has
- * been exposed to unknown code. Hitting this all but guarantees
- * a (very critical) hardware issue.
- */
- WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
- VMX_EXIT_REASONS_FAILED_VMENTRY));
-
- return 0;
-}
-
#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
@@ -3557,22 +3462,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
/*
- * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
- * nested early checks are disabled. In the event of a "late" VM-Fail,
- * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
- * software model to the pre-VMEntry host state. When EPT is disabled,
- * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
- * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
- * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
- * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
- * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
- * guaranteed to be overwritten with a shadow CR3 prior to re-entering
- * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
- * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
- * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
- * path would need to manually save/restore vmcs01.GUEST_CR3.
+ * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the
+ * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
+ * not KVM, KVM must unwind its software model to the pre-VM-Entry host
+ * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
+ * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
+ * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the
+ * unwind naturally setting arch.cr3 to the correct value. Smashing
+ * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
+ * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
+ * overwritten with a shadow CR3 prior to re-entering L1.
*/
- if (!enable_ept && !nested_early_check)
+ if (!enable_ept)
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
@@ -3585,11 +3486,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
}
- if (nested_vmx_check_vmentry_hw(vcpu)) {
- vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return NVMX_VMENTRY_VMFAIL;
- }
-
if (nested_vmx_check_guest_state(vcpu, vmcs12,
&entry_failure_code)) {
exit_reason.basic = EXIT_REASON_INVALID_STATE;
@@ -5038,12 +4934,10 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/*
* The only expected VM-instruction error is "VM entry with
* invalid control field(s)." Anything else indicates a
- * problem with L0. And we should never get here with a
- * VMFail of any type if early consistency checks are enabled.
+ * problem with L0.
*/
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- WARN_ON_ONCE(nested_early_check);
}
/*
--
2.51.0.470.ga7dc726c21-goog
Powered by blists - more mailing lists