From: Paolo Bonzini
Subject: [PATCH] KVM: SMM: Do not set nested_run_pending if RSM causes triple fault

Syzkaller found a warning triggered in nested_vmx_vmexit():
vmx->nested.nested_run_pending is non-zero, even though we're in
nested_vmx_vmexit(). Generally, trying to cancel a pending entry is
considered a bug. However, in this particular scenario the kernel
behavior seems correct.

Syzkaller scenario:
1) Set up VCPUs
2) Run some code with KVM_RUN in L2 as a nested guest
3) Return from KVM_RUN
4) Inject KVM_SMI into the VCPU
5) Change the EFER register with KVM_SET_SREGS to value 0x2501
6) Run some code on the VCPU using KVM_RUN
7) Observe the following behavior:

kvm_smm_transition: vcpu 0: entering SMM, smbase 0x30000
kvm_entry: vcpu 0, rip 0x8000
kvm_entry: vcpu 0, rip 0x8000
kvm_entry: vcpu 0, rip 0x8002
kvm_smm_transition: vcpu 0: leaving SMM, smbase 0x30000
kvm_nested_vmenter: rip: 0x0000000000008002 vmcs: 0x0000000000007000
                    nested_rip: 0x0000000000000000 int_ctl: 0x00000000
                    event_inj: 0x00000000 nested_ept=n guest cr3: 0x0000000000002000
kvm_nested_vmexit_inject: reason: TRIPLE_FAULT ext_inf1: 0x0000000000000000
                    ext_inf2: 0x0000000000000000 ext_int: 0x00000000
                    ext_int_err: 0x00000000

What happened here is that an SMI was injected immediately and the
handler was called at address 0x8000; all is good. Later, an RSM
instruction is executed in the emulator to return to the nested VM.
em_rsm() is called, which leads to emulator_leave_smm(). That function
invokes the VMX/SVM callback, in this case vmx_leave_smm(). The
callback attempts to set up a pending re-entry to the guest VM by
calling nested_vmx_enter_non_root_mode() and sets
vmx->nested.nested_run_pending to one. Unfortunately, later in
emulator_leave_smm(), rsm_load_state_64() fails to write the invalid
EFER to the MSR. This results in em_rsm() calling the triple_fault
callback. At this point it is clear that KVM should perform a vmexit,
but vmx->nested.nested_run_pending is left set to 1. The same flow
applies to SVM, as the bug also reproduces on AMD platforms.

To address this issue, only set the nested_run_pending flag if RSM does
not cause a triple fault. This is superior to resetting
nested_run_pending in the triple_fault handler, because in the past
there were instances where KVM prematurely synthesized a triple fault
on nested VM-Enter. In those cases it is not appropriate to zero
nested_run_pending, otherwise the order of event injection is not
preserved.
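For reference, the user-space side of steps 4-6 boils down to roughly
the sequence sketched below. This is only an illustrative sketch, not
the actual syzkaller reproducer: the helper name is made up, it assumes
a vcpu_fd whose guest has already launched the nested (L2) guest, and
it omits all error handling.

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /*
   * Hypothetical helper (not from the reproducer); assumes vcpu_fd
   * refers to a vCPU that is already running nested (L2) guest code.
   */
  static void provoke_rsm_triple_fault(int vcpu_fd)
  {
          struct kvm_sregs sregs;

          /* 4) Request an SMI; it is delivered when the vCPU next runs. */
          ioctl(vcpu_fd, KVM_SMI, 0);

          /* 5) Load the invalid EFER value from the report (0x2501). */
          ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
          sregs.efer = 0x2501;
          ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);

          /* 6) Resume: SMM is entered, and RSM then triple-faults. */
          ioctl(vcpu_fd, KVM_RUN, 0);
  }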
[Commit message by Michal Wilczynski]

Fixes: 759cbd59674a ("KVM: x86: nSVM/nVMX: set nested_run_pending on VM entry which is a result of RSM")
Reported-by: Zheyu Ma
Closes: https://lore.kernel.org/all/CAMhUBjmXMYsEoVYw_M8hSZjBMHh24i88QYm-RY6HDta5YZ7Wgw@mail.gmail.com
Analyzed-by: Michal Wilczynski
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm-x86-ops.h |  3 ++-
 arch/x86/include/asm/kvm_host.h    |  3 ++-
 arch/x86/kvm/smm.c                 | 13 +++++++++----
 arch/x86/kvm/svm/svm.c             | 16 ++++++++++++----
 arch/x86/kvm/vmx/vmx.c             | 15 ++++++++++++---
 5 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index ac8b7614e79d..3d18fa7db353 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -119,7 +119,8 @@ KVM_X86_OP(setup_mce)
 #ifdef CONFIG_KVM_SMM
 KVM_X86_OP(smi_allowed)
 KVM_X86_OP(enter_smm)
-KVM_X86_OP(leave_smm)
+KVM_X86_OP(leave_smm_prepare)
+KVM_X86_OP(leave_smm_commit)
 KVM_X86_OP(enable_smi_window)
 #endif
 KVM_X86_OP_OPTIONAL(dev_get_attr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0bcd9ae16097..8821cfe4ced2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1765,7 +1765,8 @@ struct kvm_x86_ops {
 #ifdef CONFIG_KVM_SMM
 	int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
-	int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+	int (*leave_smm_prepare)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+	void (*leave_smm_commit)(struct kvm_vcpu *vcpu);
 	void (*enable_smi_window)(struct kvm_vcpu *vcpu);
 #endif

diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index dc3d95fdca7d..ac951894611e 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -631,17 +631,22 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
 #endif

	/*
-	 * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+	 * Give vendor code a chance to make ISA-specific changes to the vCPU
	 * state (e.g. enter guest mode) before loading state from the SMM
	 * state-save area.
	 */
-	if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+	if (static_call(kvm_x86_leave_smm_prepare)(vcpu, &smram))
		return X86EMUL_UNHANDLEABLE;

 #ifdef CONFIG_X86_64
	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-		return rsm_load_state_64(ctxt, &smram.smram64);
+		ret = rsm_load_state_64(ctxt, &smram.smram64);
	else
 #endif
-		return rsm_load_state_32(ctxt, &smram.smram32);
+		ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+	if (ret == X86EMUL_CONTINUE)
+		static_call(kvm_x86_leave_smm_commit)(vcpu);
+
+	return ret;
 }
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 392b9c2e2ce1..7433ba32c5ff 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4642,7 +4642,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
	return 0;
 }

-static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+static int svm_leave_smm_prepare(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 {
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map, map_save;
@@ -4695,8 +4695,6 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
	if (ret)
		goto unmap_save;

-	svm->nested.nested_run_pending = 1;
-
 unmap_save:
	kvm_vcpu_unmap(vcpu, &map_save, true);
 unmap_map:
@@ -4704,6 +4702,15 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
	return ret;
 }

+static void svm_leave_smm_commit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/* Force any pending events to cause vmexits. */
+	if (is_guest_mode(vcpu))
+		svm->nested.nested_run_pending = 1;
+}
+
 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
 {
	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5010,7 +5014,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 #ifdef CONFIG_KVM_SMM
	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
-	.leave_smm = svm_leave_smm,
+	.leave_smm_prepare = svm_leave_smm_prepare,
+	.leave_smm_commit = svm_leave_smm_commit,
	.enable_smi_window = svm_enable_smi_window,
 #endif

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e262bc2ba4e5..ee2b85058539 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8138,7 +8138,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
	return 0;
 }

-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+static int vmx_leave_smm_prepare(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 {
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;
@@ -8153,12 +8153,20 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
		if (ret)
			return ret;

-		vmx->nested.nested_run_pending = 1;
		vmx->nested.smm.guest_mode = false;
	}
	return 0;
 }

+static void vmx_leave_smm_commit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* Force any pending events to cause vmexits. */
+	if (is_guest_mode(vcpu))
+		vmx->nested.nested_run_pending = 1;
+}
+
 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
 {
	/* RSM will cause a vmexit anyway. */
@@ -8380,7 +8385,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 #ifdef CONFIG_KVM_SMM
	.smi_allowed = vmx_smi_allowed,
	.enter_smm = vmx_enter_smm,
-	.leave_smm = vmx_leave_smm,
+	.leave_smm_prepare = vmx_leave_smm_prepare,
+	.leave_smm_commit = vmx_leave_smm_commit,
	.enable_smi_window = vmx_enable_smi_window,
 #endif