From: Paolo Bonzini
Subject: [PATCH] KVM: SMM: Do not set nested_run_pending if RSM causes triple fault

Syzkaller found a warning triggered in nested_vmx_vmexit():
vmx->nested.nested_run_pending is non-zero, even though we're in
nested_vmx_vmexit(). Generally, trying to cancel a pending entry is
considered a bug. However, in this particular scenario the kernel
behavior seems correct.

Syzkaller scenario:
1) Set up VCPUs
2) Run some code with KVM_RUN in L2 as a nested guest
3) Return from KVM_RUN
4) Inject KVM_SMI into the VCPU
5) Change the EFER register with KVM_SET_SREGS to value 0x2501
6) Run some code on the VCPU using KVM_RUN
7) Observe the following behavior:

kvm_smm_transition: vcpu 0: entering SMM, smbase 0x30000
kvm_entry: vcpu 0, rip 0x8000
kvm_entry: vcpu 0, rip 0x8000
kvm_entry: vcpu 0, rip 0x8002
kvm_smm_transition: vcpu 0: leaving SMM, smbase 0x30000
kvm_nested_vmenter: rip: 0x0000000000008002 vmcs: 0x0000000000007000
                    nested_rip: 0x0000000000000000 int_ctl: 0x00000000
                    event_inj: 0x00000000 nested_ept=n guest cr3: 0x0000000000002000
kvm_nested_vmexit_inject: reason: TRIPLE_FAULT ext_inf1: 0x0000000000000000
                    ext_inf2: 0x0000000000000000 ext_int: 0x00000000
                    ext_int_err: 0x00000000

What happened here is that an SMI was injected immediately and the
handler was called at address 0x8000; all is good. Later, an RSM
instruction is executed in the emulator to return to the nested VM.
em_rsm() is called, which leads to emulator_leave_smm(). That function
invokes the VMX/SVM callback, in this case vmx_leave_smm(). The
callback attempts to set up a pending re-entry to the guest VM by
calling nested_vmx_enter_non_root_mode() and sets
vmx->nested.nested_run_pending to one. Unfortunately, later in
emulator_leave_smm(), rsm_load_state_64() fails to write the invalid
EFER to the MSR. This results in em_rsm() calling the triple_fault
callback. At this point it is clear that KVM should perform a vmexit,
but vmx->nested.nested_run_pending is left set to 1. The same flow
applies to SVM, as the bug also reproduces on AMD platforms.

To address this issue, only set the nested_run_pending flag if RSM does
not cause a triple fault. This is superior to resetting
nested_run_pending in the triple_fault handler, because in the past
there were instances where KVM prematurely synthesized a triple fault
on nested VM-Enter. In those cases it is not appropriate to zero
nested_run_pending, otherwise the order of event injection is not
preserved.
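For reference, the user-space side of steps 4-6 boils down to roughly
the sequence sketched below. This is only an illustrative sketch, not
the actual syzkaller reproducer: the helper name is made up, it assumes
a vcpu_fd whose guest has already launched the nested (L2) guest, and
it omits all error handling.

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /*
   * Hypothetical helper (not from the reproducer); assumes vcpu_fd
   * refers to a vCPU that is already running nested (L2) guest code.
   */
  static void provoke_rsm_triple_fault(int vcpu_fd)
  {
          struct kvm_sregs sregs;

          /* 4) Request an SMI; it is delivered when the vCPU next runs. */
          ioctl(vcpu_fd, KVM_SMI, 0);

          /* 5) Load the invalid EFER value from the report (0x2501). */
          ioctl(vcpu_fd, KVM_GET_SREGS, &sregs);
          sregs.efer = 0x2501;
          ioctl(vcpu_fd, KVM_SET_SREGS, &sregs);

          /* 6) Resume: SMM is entered, and RSM then triple-faults. */
          ioctl(vcpu_fd, KVM_RUN, 0);
  }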
[Commit message by Michal Wilczynski]

Fixes: 759cbd59674a ("KVM: x86: nSVM/nVMX: set nested_run_pending on VM entry which is a result of RSM")
Reported-by: Zheyu Ma
Closes: https://lore.kernel.org/all/CAMhUBjmXMYsEoVYw_M8hSZjBMHh24i88QYm-RY6HDta5YZ7Wgw@mail.gmail.com
Analyzed-by: Michal Wilczynski
Signed-off-by: Paolo Bonzini
---
 arch/x86/include/asm/kvm-x86-ops.h |  3 ++-
 arch/x86/include/asm/kvm_host.h    |  3 ++-
 arch/x86/kvm/smm.c                 | 13 +++++++++----
 arch/x86/kvm/svm/svm.c             | 16 ++++++++++++----
 arch/x86/kvm/vmx/vmx.c             | 15 ++++++++++++---
 5 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index ac8b7614e79d..3d18fa7db353 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -119,7 +119,8 @@ KVM_X86_OP(setup_mce)
 #ifdef CONFIG_KVM_SMM
 KVM_X86_OP(smi_allowed)
 KVM_X86_OP(enter_smm)
-KVM_X86_OP(leave_smm)
+KVM_X86_OP(leave_smm_prepare)
+KVM_X86_OP(leave_smm_commit)
 KVM_X86_OP(enable_smi_window)
 #endif
 KVM_X86_OP_OPTIONAL(dev_get_attr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0bcd9ae16097..8821cfe4ced2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1765,7 +1765,8 @@ struct kvm_x86_ops {
 #ifdef CONFIG_KVM_SMM
 	int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
 	int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
-	int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+	int (*leave_smm_prepare)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+	void (*leave_smm_commit)(struct kvm_vcpu *vcpu);
 	void (*enable_smi_window)(struct kvm_vcpu *vcpu);
 #endif

diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index dc3d95fdca7d..ac951894611e 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -631,17 +631,22 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
 #endif

	/*
-	 * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+	 * Give vendor code a chance to make ISA-specific changes to the vCPU
	 * state (e.g. enter guest mode) before loading state from the SMM
	 * state-save area.
	 */
-	if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+	if (static_call(kvm_x86_leave_smm_prepare)(vcpu, &smram))
		return X86EMUL_UNHANDLEABLE;

 #ifdef CONFIG_X86_64
	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-		return rsm_load_state_64(ctxt, &smram.smram64);
+		ret = rsm_load_state_64(ctxt, &smram.smram64);
	else
 #endif
-		return rsm_load_state_32(ctxt, &smram.smram32);
+		ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+	if (ret == X86EMUL_CONTINUE)
+		static_call(kvm_x86_leave_smm_commit)(vcpu);
+
+	return ret;
 }
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 392b9c2e2ce1..7433ba32c5ff 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4642,7 +4642,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
	return 0;
 }

-static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+static int svm_leave_smm_prepare(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 {
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map, map_save;
@@ -4695,8 +4695,6 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
	if (ret)
		goto unmap_save;

-	svm->nested.nested_run_pending = 1;
-
 unmap_save:
	kvm_vcpu_unmap(vcpu, &map_save, true);
 unmap_map:
@@ -4704,6 +4702,15 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
	return ret;
 }

+static void svm_leave_smm_commit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/* Force any pending events to cause vmexits. */
+	if (is_guest_mode(vcpu))
+		svm->nested.nested_run_pending = 1;
+}
+
 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
 {
	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5010,7 +5014,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 #ifdef CONFIG_KVM_SMM
	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
-	.leave_smm = svm_leave_smm,
+	.leave_smm_prepare = svm_leave_smm_prepare,
+	.leave_smm_commit = svm_leave_smm_commit,
	.enable_smi_window = svm_enable_smi_window,
 #endif

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e262bc2ba4e5..ee2b85058539 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8138,7 +8138,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
	return 0;
 }

-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+static int vmx_leave_smm_prepare(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
 {
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;
@@ -8153,12 +8153,20 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
		if (ret)
			return ret;

-		vmx->nested.nested_run_pending = 1;
		vmx->nested.smm.guest_mode = false;
	}
	return 0;
 }

+static void vmx_leave_smm_commit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* Force any pending events to cause vmexits. */
+	if (is_guest_mode(vcpu))
+		vmx->nested.nested_run_pending = 1;
+}
+
 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
 {
	/* RSM will cause a vmexit anyway. */
@@ -8380,7 +8385,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 #ifdef CONFIG_KVM_SMM
	.smi_allowed = vmx_smi_allowed,
	.enter_smm = vmx_enter_smm,
-	.leave_smm = vmx_leave_smm,
+	.leave_smm_prepare = vmx_leave_smm_prepare,
+	.leave_smm_commit = vmx_leave_smm_commit,
	.enable_smi_window = vmx_enable_smi_window,
 #endif