linux-kernel - [PATCH v3] KVM: x86/vmx: Suppress posted interrupt notification when CPU is in host

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220928111604.180284-1-chao.gao@intel.com>
Date:   Wed, 28 Sep 2022 19:16:03 +0800
From:   Chao Gao <chao.gao@...el.com>
To:     kvm@...r.kernel.org, linux-kernel@...r.kernel.org, jon@...anix.com,
        kevin.tian@...el.com, seanjc@...gle.com
Cc:     Chao Gao <chao.gao@...el.com>, Paolo Bonzini <pbonzini@...hat.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
        "H. Peter Anvin" <hpa@...or.com>
Subject: [PATCH v3] KVM: x86/vmx: Suppress posted interrupt notification when CPU is in host

PIN (Posted interrupt notification) is useless to host as KVM always syncs
pending guest interrupts in PID to guest's vAPIC before each VM entry. In
fact, Sending PINs to a CPU running in host will lead to additional
overhead due to interrupt handling.

Currently, software path, vmx_deliver_posted_interrupt(), is optimized to
issue PINs only if target vCPU is in IN_GUEST_MODE. But hardware paths
(VT-d and Intel IPI virtualization) aren't optimized.

Suppress notification unless notification is necessary (when vCPU is
blocked or running in guest). Specifically, set PID.SN when kvm sets
OUTSIDE_GUEST_MODE and vCPUs wake up from blocking, and clear PID.SN when
kvm sets IN_GUEST_MODE and vCPUs starts to block. Also, don't toggle PID.SN
at any other places, e.g., vmx_vcpu_pi_load().

To allow x86 common code to toggle SN bit when kvm sets vcpu->mode,
a new kvm_x86_ops is added.

When IPI virtualization is enabled, this patch increases "perf bench" [*]
by 8.10%, and PIN count in 1 second drops from tens of thousands to
tens. But cpuid loop test shows this patch causes 0.94% overhead in
VM-exit round-trip latency.

[*] test cmd: perf bench sched pipe -T. Note that we change the source
code to pin two threads to two different vCPUs so that it can reproduce
stable results.

Signed-off-by: Chao Gao <chao.gao@...el.com>
---
Changes in v3:
 - add a dedicated hook to toggle SN
 - clear/set SN when kvm sets IN/OUTSIDE_GUEST_MODE
 - use kvm_arch_vcpu_{un}blocking instead of vcpu_put/load to set/clear SN
   for blocked vCPUs
 - recollect perf data and update the figures in commit message

 arch/x86/include/asm/kvm-x86-ops.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/vmx/posted_intr.c     | 63 +++++++++++++++++-------------
 arch/x86/kvm/vmx/posted_intr.h     |  1 +
 arch/x86/kvm/vmx/vmx.c             | 22 ++++++++++-
 arch/x86/kvm/x86.c                 | 20 ++++++++++
 6 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 51f777071584..a2d77302d7a8 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -86,6 +86,7 @@ KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
 KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
 KVM_X86_OP(deliver_interrupt)
 KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
+KVM_X86_OP_OPTIONAL(pi_suppress_notification)
 KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
 KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
 KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2c96c43c313a..66d67b198021 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1543,6 +1543,7 @@ struct kvm_x86_ops {
 	void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
 				  int trig_mode, int vector);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+	void (*pi_suppress_notification)(struct kvm_vcpu *vcpu, bool suppress);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
 	u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..ae1794a213d7 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -48,6 +48,38 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
 	return 0;
 }
 
+/*
+ * Toggle Suppress Notification (SN) bit in PID.
+ *
+ * Note that after clearing SN, the check of PIR and setting ON bit
+ * ensure pending IRQs in PIR are not ignored.
+ */
+void vmx_pi_suppress_notification(struct kvm_vcpu *vcpu, bool suppress)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+	if (suppress) {
+		pi_set_sn(pi_desc);
+	} else {
+		pi_clear_sn(pi_desc);
+		/*
+		 * Clear SN before reading the bitmap.  The VT-d firmware
+		 * writes the bitmap and reads SN atomically (5.2.3 in the
+		 * spec), so it doesn't really have a memory barrier that
+		 * pairs with this, but we cannot do that and we need one.
+		 */
+		smp_mb__after_atomic();
+
+		/*
+		 * ON may be out of sync with PIR. Set ON if there are
+		 * pending IRQs in PIR to ensure subsequent
+		 * ->sync_pir_to_irr() won't leave the pending IRQs behind.
+		 */
+		if (!pi_test_on(pi_desc) && !pi_is_pir_empty(pi_desc))
+			pi_set_on(pi_desc);
+	}
+}
+
 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -69,15 +101,8 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	 * full update can be skipped as neither the vector nor the destination
 	 * needs to be changed.
 	 */
-	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
-		/*
-		 * Clear SN if it was set due to being preempted.  Again, do
-		 * this even if there is no assigned device for simplicity.
-		 */
-		if (pi_test_and_clear_sn(pi_desc))
-			goto after_clear_sn;
+	if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu)
 		return;
-	}
 
 	local_irq_save(flags);
 
@@ -101,11 +126,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 		new.control = old.control;
 
 		/*
-		 * Clear SN (as above) and refresh the destination APIC ID to
-		 * handle task migration (@cpu != vcpu->cpu).
+		 * Refresh the destination APIC ID to handle task migration
+		 * (@cpu != vcpu->cpu).
 		 */
 		new.ndst = dest;
-		new.sn = 0;
 
 		/*
 		 * Restore the notification vector; in the blocking case, the
@@ -115,19 +139,6 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	} while (pi_try_set_control(pi_desc, &old.control, new.control));
 
 	local_irq_restore(flags);
-
-after_clear_sn:
-
-	/*
-	 * Clear SN before reading the bitmap.  The VT-d firmware
-	 * writes the bitmap and reads SN atomically (5.2.3 in the
-	 * spec), so it doesn't really have a memory barrier that
-	 * pairs with this, but we cannot do that and we need one.
-	 */
-	smp_mb__after_atomic();
-
-	if (!pi_is_pir_empty(pi_desc))
-		pi_set_on(pi_desc);
 }
 
 static bool vmx_can_use_vtd_pi(struct kvm *kvm)
@@ -193,8 +204,6 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
 
 void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 {
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
 	if (!vmx_needs_pi_wakeup(vcpu))
 		return;
 
@@ -207,7 +216,7 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
 	 * its wait state and manually scheduling out.
 	 */
 	if (vcpu->preempted)
-		pi_set_sn(pi_desc);
+		vmx_pi_suppress_notification(vcpu, true);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 26992076552e..9232b87bf63e 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -102,5 +102,6 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
 int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		       uint32_t guest_irq, bool set);
 void vmx_pi_start_assignment(struct kvm *kvm);
+void vmx_pi_suppress_notification(struct kvm_vcpu *vcpu, bool suppress);
 
 #endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c9b49a09e6b5..084894c50b64 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6697,6 +6697,16 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 		vmx_set_rvi(max_irr);
 }
 
+static void vmx_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+	vmx_pi_suppress_notification(vcpu, false);
+}
+
+static void vmx_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+	vmx_pi_suppress_notification(vcpu, true);
+}
+
 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6706,7 +6716,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
 		return -EIO;
 
-	if (pi_test_on(&vmx->pi_desc)) {
+	/*
+	 * Check PIR if ON=1 || SN=1. This is required to avoid breaking
+	 * halt-polling (and other call sites with SN=1). Notification
+	 * during halt-polling is undesired. But VT-d engine and IPI
+	 * virtualization don't set ON if SN=1. To ensure KVM can detect
+	 * pending IRQs in time, check PIR as well if SN=1.
+	 */
+	if (pi_test_on(&vmx->pi_desc) || pi_test_sn(&vmx->pi_desc)) {
 		pi_clear_on(&vmx->pi_desc);
 		/*
 		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
@@ -8030,6 +8047,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
+	.vcpu_blocking = vmx_vcpu_blocking,
+	.vcpu_unblocking = vmx_vcpu_unblocking,
 
 	.update_exception_bitmap = vmx_update_exception_bitmap,
 	.get_msr_feature = vmx_get_msr_feature,
@@ -8089,6 +8108,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
 	.hwapic_isr_update = vmx_hwapic_isr_update,
 	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
 	.sync_pir_to_irr = vmx_sync_pir_to_irr,
+	.pi_suppress_notification = vmx_pi_suppress_notification,
 	.deliver_interrupt = vmx_deliver_interrupt,
 	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d7374d768296..ce744e840423 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10426,6 +10426,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	/* Store vcpu->apicv_active before vcpu->mode.  */
 	smp_store_release(&vcpu->mode, IN_GUEST_MODE);
+	/*
+	 * Notification are necessary to trigger the delivery of posted
+	 * IRQs when vCPU is running in guest. Enable notification before
+	 * guest entry.
+	 *
+	 * Do this even if apicv is disabled for simplicity.
+	 */
+	if (kvm_lapic_enabled(vcpu))
+		static_call_cond(kvm_x86_pi_suppress_notification)(vcpu, false);
 
 	kvm_vcpu_srcu_read_unlock(vcpu);
 
@@ -10538,6 +10547,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
+	/*
+	 * Suppress notification when CPU is OUTSIDE_GUEST_MODE to avoid
+	 * wasting time on handling interrupts. A notification to host/kvm
+	 * just indicates some interrupts are posted for a vCPU. Since KVM
+	 * always syncs pending interrupts in PIR to vAPIC IRR before guest
+	 * entry (in ->sync_pir_to_irr()), notification isn't needed.
+	 *
+	 * Do this even if apicv is disabled for simplicity.
+	 */
+	if (kvm_lapic_enabled(vcpu))
+		static_call_cond(kvm_x86_pi_suppress_notification)(vcpu, true);
 
 	/*
 	 * Sync xfd before calling handle_exit_irqoff() which may

base-commit: 372d07084593dc7a399bf9bee815711b1fb1bcf2
-- 
2.25.1