linux-kernel - [PATCH v2 6/9] KVM: x86: Add IPI tracking infrastructure

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251219035334.39790-7-kernellwp@gmail.com>
Date: Fri, 19 Dec 2025 11:53:30 +0800
From: Wanpeng Li <kernellwp@...il.com>
To: Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Paolo Bonzini <pbonzini@...hat.com>,
	Sean Christopherson <seanjc@...gle.com>
Cc: K Prateek Nayak <kprateek.nayak@....com>,
	Christian Borntraeger <borntraeger@...ux.ibm.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Juri Lelli <juri.lelli@...hat.com>,
	linux-kernel@...r.kernel.org,
	kvm@...r.kernel.org,
	Wanpeng Li <wanpengli@...cent.com>
Subject: [PATCH v2 6/9] KVM: x86: Add IPI tracking infrastructure

From: Wanpeng Li <wanpengli@...cent.com>

Add foundational infrastructure for tracking IPI sender/receiver
relationships to improve directed yield candidate selection.

Introduce per-vCPU ipi_context structure containing:
- last_ipi_receiver: vCPU index that received the last IPI from this vCPU
- last_ipi_time_ns: timestamp of the last IPI send
- ipi_pending: flag indicating an unacknowledged IPI
- last_ipi_sender: vCPU index that sent an IPI to this vCPU
- last_ipi_recv_time_ns: timestamp when IPI was received

Add module parameters for runtime control:
- ipi_tracking_enabled (default: true): master switch for IPI tracking
- ipi_window_ns (default: 50ms): recency window for IPI validity

Implement helper functions:
- kvm_ipi_tracking_enabled(): check if tracking is active
- kvm_vcpu_is_ipi_receiver(): determine if a vCPU is a recent IPI target

The infrastructure is inert until integrated with interrupt delivery
in subsequent patches.

v1 -> v2:
- Improve documentation for module parameters explaining the 50ms
  window rationale
- Add kvm_vcpu_is_ipi_receiver() declaration to x86.h header
- Add weak function annotation comment in kvm_host.h

Signed-off-by: Wanpeng Li <wanpengli@...cent.com>
---
 arch/x86/include/asm/kvm_host.h | 12 ++++++
 arch/x86/kvm/lapic.c            | 76 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |  3 ++
 arch/x86/kvm/x86.h              |  8 ++++
 include/linux/kvm_host.h        |  3 ++
 virt/kvm/kvm_main.c             |  6 +++
 6 files changed, 108 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..2464c310f0a2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1052,6 +1052,18 @@ struct kvm_vcpu_arch {
 	int pending_external_vector;
 	int highest_stale_pending_ioapic_eoi;
 
+	/*
+	 * IPI tracking for directed yield optimization.
+	 * Records sender/receiver relationships when IPIs are delivered
+	 * to enable IPI-aware vCPU scheduling decisions.
+	 */
+	struct {
+		int last_ipi_sender;	/* vCPU index of last IPI sender */
+		int last_ipi_receiver;	/* vCPU index of last IPI receiver */
+		bool pending_ipi;	/* Awaiting IPI response */
+		u64 ipi_time_ns;	/* Timestamp when IPI was sent */
+	} ipi_context;
+
 	/* be preempted when it's in kernel-mode(cpl=0) */
 	bool preempted_in_kernel;
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1597dd0b0cc6..23f247a3b127 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -75,6 +75,19 @@ module_param(lapic_timer_advance, bool, 0444);
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
 
+/*
+ * IPI tracking for directed yield optimization.
+ * - ipi_tracking_enabled: global toggle (default on)
+ * - ipi_window_ns: recency window for IPI validity (default 50ms)
+ *   The 50ms window is chosen to be long enough to capture IPI response
+ *   patterns while short enough to avoid stale information affecting
+ *   scheduling decisions in throughput-sensitive workloads.
+ */
+static bool ipi_tracking_enabled = true;
+static unsigned long ipi_window_ns = 50 * NSEC_PER_MSEC;
+module_param(ipi_tracking_enabled, bool, 0644);
+module_param(ipi_window_ns, ulong, 0644);
+
 static bool __read_mostly vector_hashing_enabled = true;
 module_param_named(vector_hashing, vector_hashing_enabled, bool, 0444);
 
@@ -1113,6 +1126,69 @@ static int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 
+/*
+ * Track IPI communication for directed yield optimization.
+ * Records sender/receiver relationship when a unicast IPI is delivered.
+ * Only tracks when a unique receiver exists; ignores self-IPI.
+ */
+void kvm_track_ipi_communication(struct kvm_vcpu *sender, struct kvm_vcpu *receiver)
+{
+	if (!sender || !receiver || sender == receiver)
+		return;
+	if (unlikely(!READ_ONCE(ipi_tracking_enabled)))
+		return;
+
+	WRITE_ONCE(sender->arch.ipi_context.last_ipi_receiver, receiver->vcpu_idx);
+	WRITE_ONCE(sender->arch.ipi_context.pending_ipi, true);
+	WRITE_ONCE(sender->arch.ipi_context.ipi_time_ns, ktime_get_mono_fast_ns());
+
+	WRITE_ONCE(receiver->arch.ipi_context.last_ipi_sender, sender->vcpu_idx);
+}
+
+/*
+ * Check if 'receiver' is the recent IPI target of 'sender'.
+ *
+ * Rationale:
+ * - Use a short window to avoid stale IPI inflating boost priority
+ *   on throughput-sensitive workloads.
+ */
+bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, struct kvm_vcpu *receiver)
+{
+	u64 then, now;
+
+	if (unlikely(!READ_ONCE(ipi_tracking_enabled)))
+		return false;
+
+	then = READ_ONCE(sender->arch.ipi_context.ipi_time_ns);
+	now = ktime_get_mono_fast_ns();
+	if (READ_ONCE(sender->arch.ipi_context.pending_ipi) &&
+	    READ_ONCE(sender->arch.ipi_context.last_ipi_receiver) ==
+	    receiver->vcpu_idx &&
+	    now - then <= ipi_window_ns)
+		return true;
+
+	return false;
+}
+
+/*
+ * Clear IPI context for a vCPU (e.g., on EOI or reset).
+ */
+void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu)
+{
+	WRITE_ONCE(vcpu->arch.ipi_context.pending_ipi, false);
+	WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_sender, -1);
+	WRITE_ONCE(vcpu->arch.ipi_context.last_ipi_receiver, -1);
+}
+
+/*
+ * Reset IPI context completely (e.g., on vCPU creation/destruction).
+ */
+void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_clear_ipi_context(vcpu);
+	WRITE_ONCE(vcpu->arch.ipi_context.ipi_time_ns, 0);
+}
+
 /* Return true if the interrupt can be handled by using *bitmap as index mask
  * for valid destinations in *dst array.
  * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c6d899d53dd..d4c401ef04ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12728,6 +12728,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 		goto free_guest_fpu;
 
 	kvm_xen_init_vcpu(vcpu);
+	kvm_vcpu_reset_ipi_context(vcpu);
 	vcpu_load(vcpu);
 	kvm_vcpu_after_set_cpuid(vcpu);
 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
@@ -12795,6 +12796,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
+	kvm_vcpu_reset_ipi_context(vcpu);
 	kvfree(vcpu->arch.cpuid_entries);
 }
 
@@ -12871,6 +12873,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		kvm_leave_nested(vcpu);
 
 	kvm_lapic_reset(vcpu, init_event);
+	kvm_vcpu_clear_ipi_context(vcpu);
 
 	WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
 	vcpu->arch.hflags = 0;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index fdab0ad49098..cfc24fb207e0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -466,6 +466,14 @@ fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg);
 fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu);
 fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu);
 
+/* IPI tracking helpers for directed yield */
+void kvm_track_ipi_communication(struct kvm_vcpu *sender,
+				 struct kvm_vcpu *receiver);
+bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender,
+			      struct kvm_vcpu *receiver);
+void kvm_vcpu_clear_ipi_context(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset_ipi_context(struct kvm_vcpu *vcpu);
+
 extern struct kvm_caps kvm_caps;
 extern struct kvm_host_values kvm_host;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d93f75b05ae2..f42315d341b3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1535,6 +1535,9 @@ static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);
 
+/* Weak function, overridden by arch/x86/kvm for IPI-aware directed yield */
+bool kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender, struct kvm_vcpu *receiver);
+
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
 void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5fcd401a5897..ff771a872c6d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3964,6 +3964,12 @@ bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
 	return false;
 }
 
+bool __weak kvm_vcpu_is_ipi_receiver(struct kvm_vcpu *sender,
+				     struct kvm_vcpu *receiver)
+{
+	return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
 	int nr_vcpus, start, i, idx, yielded;
-- 
2.43.0