[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250218202618.567363-4-sieberf@amazon.com>
Date: Tue, 18 Feb 2025 22:26:03 +0200
From: Fernand Sieber <sieberf@...zon.com>
To: <sieberf@...zon.com>, Ingo Molnar <mingo@...hat.com>, Peter Zijlstra
	<peterz@...radead.org>, Vincent Guittot <vincent.guittot@...aro.org>, "Paolo
 Bonzini" <pbonzini@...hat.com>, <x86@...nel.org>, <kvm@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, <nh-open-source@...zon.com>
Subject: [RFC PATCH 3/3] sched,x86: Make the scheduler guest unhalted aware
With guest hlt/mwait/pause pass through, the scheduler has no visibility into
real vCPU activity as it sees them all 100% active. As such, load balancing
cannot make informed decisions on where it is preferrable to collocate
tasks when necessary. I.e as far as the load balancer is concerned, a
halted vCPU and an idle polling vCPU look exactly the same so it may decide
that either should be preempted when in reality it would be preferrable to
preempt the idle one.
This commits enlightens the scheduler to real guest activity in this
situation. Leveraging gtime unhalted, it adds a hook for kvm to communicate
to the scheduler the duration that a vCPU spends halted. This is then used in
PELT accounting to discount it from real activity. This results in better
placement and overall steal time reduction.
This initial implementation assumes that non-idle CPUs are ticking as it
hooks the unhalted time the PELT decaying load accounting. As such it
doesn't work well if PELT is updated infrequenly with large chunks of
halted time. This is not a fundamental limitation but more complex
accounting is needed to generalize the use case to nohz full.
---
 arch/x86/kvm/x86.c    |  8 ++++++--
 include/linux/sched.h |  4 ++++
 kernel/sched/core.c   |  1 +
 kernel/sched/fair.c   | 25 +++++++++++++++++++++++++
 kernel/sched/pelt.c   | 42 +++++++++++++++++++++++++++++++++++-------
 kernel/sched/sched.h  |  2 ++
 6 files changed, 73 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 46975b0a63a5..156cf05b325f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10712,6 +10712,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	unsigned long long cycles, cycles_start = 0;
 	unsigned long long unhalted_cycles, unhalted_cycles_start = 0;
+	unsigned long long halted_cycles_ns = 0;
 	bool req_int_win =
 		dm_request_for_irq_injection(vcpu) &&
 		kvm_cpu_accept_dm_intr(vcpu);
@@ -11083,8 +11084,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		cycles = get_cycles() - cycles_start;
 		unhalted_cycles = get_unhalted_cycles() -
 			unhalted_cycles_start;
-		if (likely(cycles > unhalted_cycles))
-			current->gtime_halted += cycles2ns(cycles - unhalted_cycles);
+		if (likely(cycles > unhalted_cycles)) {
+			halted_cycles_ns = cycles2ns(cycles - unhalted_cycles);
+			current->gtime_halted += halted_cycles_ns;
+			sched_account_gtime_halted(current, halted_cycles_ns);
+		}
 	}
 	local_irq_enable();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f6745357e20..5409fac152c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -367,6 +367,8 @@ struct vtime {
 	u64			gtime;
 };
+extern void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted);
+
 /*
  * Utilization clamp constraints.
  * @UCLAMP_MIN:	Minimum utilization
@@ -588,6 +590,8 @@ struct sched_entity {
 	 */
 	struct sched_avg		avg;
 #endif
+
+	u64				gtime_halted;
 };
 struct sched_rt_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9aecd914ac69..1f3ced2b2636 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4487,6 +4487,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
 	p->se.vlag			= 0;
+	p->se.gtime_halted              = 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 	/* A delayed task cannot be in clone(). */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c0ef435a7aa..5ff52711d459 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13705,4 +13705,29 @@ __init void init_sched_fair_class(void)
 #endif
 #endif /* SMP */
+
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
+{
 }
+#else
+/*
+ * The implementation hooking into PELT requires regular updates of
+ * gtime_halted. This is guaranteed unless we run on CONFIG_NO_HZ_FULL.
+ */
+void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
+{
+	struct sched_entity *se = &p->se;
+
+	if (unlikely(!gtime_halted))
+		return;
+
+	for_each_sched_entity(se) {
+		se->gtime_halted += gtime_halted;
+		se->cfs_rq->gtime_halted += gtime_halted;
+	}
+}
+#endif
+EXPORT_SYMBOL(sched_account_gtime_halted);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 7a8534a2deff..9f96b7c46c00 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -305,10 +305,23 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
-				cfs_rq->curr == se)) {
+	int ret = 0;
+	u64 delta = now - se->avg.last_update_time;
+	u64 gtime_halted = min(delta, se->gtime_halted);
-		___update_load_avg(&se->avg, se_weight(se));
+	ret = ___update_load_sum(now - gtime_halted, &se->avg, !!se->on_rq, se_runnable(se),
+			cfs_rq->curr == se);
+
+	if (gtime_halted) {
+		ret |= ___update_load_sum(now, &se->avg, 0, 0, 0);
+		se->gtime_halted -= gtime_halted;
+
+		/* decay residual halted time */
+		if (ret && se->gtime_halted)
+			se->gtime_halted = decay_load(se->gtime_halted, delta / 1024);
+	}
+
+	if (ret) {
 		cfs_se_util_change(&se->avg);
 		trace_pelt_se_tp(se);
 		return 1;
@@ -319,10 +332,25 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
 int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 {
-	if (___update_load_sum(now, &cfs_rq->avg,
-				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->h_nr_runnable,
-				cfs_rq->curr != NULL)) {
+	int ret = 0;
+	u64 delta = now - cfs_rq->avg.last_update_time;
+	u64 gtime_halted = min(delta, cfs_rq->gtime_halted);
+
+	ret =  ___update_load_sum(now - gtime_halted, &cfs_rq->avg,
+			scale_load_down(cfs_rq->load.weight),
+			cfs_rq->h_nr_runnable,
+			cfs_rq->curr != NULL);
+
+	if (gtime_halted) {
+		ret |= ___update_load_sum(now, &cfs_rq->avg, 0, 0, 0);
+		cfs_rq->gtime_halted -= gtime_halted;
+
+		/* decay any residual halted time */
+		if (ret && cfs_rq->gtime_halted)
+			cfs_rq->gtime_halted = decay_load(cfs_rq->gtime_halted, delta / 1024);
+	}
+
+	if (ret) {
 		___update_load_avg(&cfs_rq->avg, 1);
 		trace_pelt_cfs_tp(cfs_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b93c8c3dc05a..79b1166265bf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -744,6 +744,8 @@ struct cfs_rq {
 	struct list_head	throttled_csd_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+	u64			gtime_halted;
 };
 #ifdef CONFIG_SCHED_CLASS_EXT
--
2.43.0
Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07
Powered by blists - more mailing lists
 
