Message-ID: <20251127202719.963766-1-sieberf@amazon.com>
Date: Thu, 27 Nov 2025 22:27:17 +0200
From: Fernand Sieber <sieberf@...zon.com>
To: <mingo@...hat.com>, <peterz@...radead.org>
CC: <linux-kernel@...r.kernel.org>, <juri.lelli@...hat.com>,
	<vincent.guittot@...aro.org>, <dietmar.eggemann@....com>,
	<rostedt@...dmis.org>, <bsegall@...gle.com>, <mgorman@...e.de>,
	<vschneid@...hat.com>, <kprateek.nayak@....com>, <dwmw@...zon.co.uk>,
	<jschoenh@...zon.de>, <liuyuxua@...zon.com>, <abusse@...zon.com>,
	<gmazz@...zon.com>, <rkagan@...zon.com>, <sieberf@...zon.com>
Subject: [PATCH] sched/fair: Force idle aware load balancing

Account for the capacity wasted by force idle when computing whether a
group has spare capacity or is overloaded. We use a rather crude
mechanism based on the current force idle state of the rq; it may be
preferable to use a decaying average, similar to other load metrics, to
avoid jitter.

If the busiest group has force-idled CPUs, use task migration. This way
we will try to move one task regardless of the load. Subsequent checks
still verify that this doesn't cause more force idle on the
destination.
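
As an illustration of the adjusted checks, here is a small standalone
sketch (not part of the patch; the struct, helper and numbers below are
made up for the example, with 117 used as a representative
imbalance_pct) of how group_has_capacity() behaves once
forceidle_weight is subtracted from the usable group weight:

/*
 * Illustrative only: mirrors the scaled group_has_capacity() check
 * with made-up numbers. The real code operates on struct sg_lb_stats
 * inside the load balancer.
 */
#include <stdbool.h>
#include <stdio.h>

struct example_stats {
        unsigned int group_weight;      /* CPUs in the group */
        unsigned int forceidle_weight;  /* CPUs currently force idled */
        unsigned int sum_nr_running;
        unsigned long group_capacity;
        unsigned long group_util;
        unsigned long group_runnable;
};

static bool example_has_capacity(unsigned int imbalance_pct,
                                 const struct example_stats *sgs)
{
        unsigned int usable = sgs->group_weight - sgs->forceidle_weight;

        if (sgs->sum_nr_running < usable)
                return true;

        if (sgs->group_capacity * imbalance_pct * usable <
            sgs->group_runnable * 100 * sgs->group_weight)
                return false;

        if (sgs->group_capacity * 100 * usable >
            sgs->group_util * imbalance_pct * sgs->group_weight)
                return true;

        return false;
}

int main(void)
{
        /* 4-CPU group, 3 tasks, ~75% utilized and runnable */
        struct example_stats sgs = {
                .group_weight   = 4,
                .sum_nr_running = 3,
                .group_capacity = 4096,
                .group_util     = 3072,
                .group_runnable = 3072,
        };

        sgs.forceidle_weight = 0;
        printf("no force idle:  has capacity = %d\n",
               example_has_capacity(117, &sgs));

        sgs.forceidle_weight = 1;
        printf("one force idle: has capacity = %d\n",
               example_has_capacity(117, &sgs));

        return 0;
}

With no force idle, the 4-CPU group reports spare capacity for its 3
runnable tasks; with one CPU force idled it no longer does, matching
the intent of treating force-idled CPUs as wasted capacity.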

===

Testing

Testing aims at measuring perceived guest noise on a hypervisor system
running time-shared workloads.

The setup is a system with load nearing 100%, which should leave
essentially no steal time. The system has 64 CPUs hosting 8 VMs, each
VM using core scheduling with 8 time-shared vCPUs.

Seven VMs run stressors (`stress-ng --cpu 0`) while the last VM runs
the hwlat tracer with a width of 100ms, a period of 300ms, and a
threshold of 100us. Each VM also runs a cookied non-vCPU VMM process
that adds a light level of noise, which forces some load balancing.
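
For reference, a minimal sketch of the hwlat tracer settings described
above (width 100ms, window/period 300ms, threshold 100us), kept in C
for consistency with the rest of this mail. It assumes tracefs is
mounted at /sys/kernel/tracing and is not necessarily the exact tooling
used for these measurements; the same can be done with a few writes
from a shell.

/*
 * Illustrative only: configure the hwlat tracer as described above.
 * hwlat width/window and tracing_thresh are all in microseconds.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_knob(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%s\n", val);
        fclose(f);
}

int main(void)
{
        write_knob("/sys/kernel/tracing/hwlat_detector/width", "100000");
        write_knob("/sys/kernel/tracing/hwlat_detector/window", "300000");
        write_knob("/sys/kernel/tracing/tracing_thresh", "100");
        write_knob("/sys/kernel/tracing/current_tracer", "hwlat");
        return 0;
}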

The test scenario is run 10 times for 60s each and the average noise is
measured.

At baseline, we measure about 1.20% noise (computed from hwlat
breaches). With the proposed patch, the noise drops to 0.63%.

Signed-off-by: Fernand Sieber <sieberf@...zon.com>
---
 kernel/sched/fair.c  | 40 +++++++++++++++++++++++++++-------------
 kernel/sched/sched.h | 12 ++++++++++++
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b752324270b..ab8c9aa09107 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9932,6 +9932,7 @@ struct sg_lb_stats {
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
 #endif
+	unsigned int forceidle_weight;
 };
 
 /*
@@ -10135,15 +10136,15 @@ static inline int sg_imbalanced(struct sched_group *group)
 static inline bool
 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running < sgs->group_weight)
+	if (sgs->sum_nr_running < (sgs->group_weight - sgs->forceidle_weight))
 		return true;
 
-	if ((sgs->group_capacity * imbalance_pct) <
-			(sgs->group_runnable * 100))
+	if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
+			(sgs->group_runnable * 100 * sgs->group_weight))
 		return false;
 
-	if ((sgs->group_capacity * 100) >
-			(sgs->group_util * imbalance_pct))
+	if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) >
+			(sgs->group_util * imbalance_pct * sgs->group_weight))
 		return true;
 
 	return false;
@@ -10160,15 +10161,15 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running <= sgs->group_weight)
+	if (sgs->sum_nr_running <= (sgs->group_weight - sgs->forceidle_weight))
 		return false;
 
-	if ((sgs->group_capacity * 100) <
-			(sgs->group_util * imbalance_pct))
+	if ((sgs->group_capacity * 100 * (sgs->group_weight - sgs->forceidle_weight)) <
+			(sgs->group_util * imbalance_pct * sgs->group_weight))
 		return true;
 
-	if ((sgs->group_capacity * imbalance_pct) <
-			(sgs->group_runnable * 100))
+	if ((sgs->group_capacity * imbalance_pct * (sgs->group_weight - sgs->forceidle_weight)) <
+			(sgs->group_runnable * 100 * sgs->group_weight))
 		return true;
 
 	return false;
@@ -10371,13 +10372,19 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
+		/*
+		 * Ignore force idle if we are balancing within the SMT mask
+		 */
+		if (rq_in_forceidle(rq) && !(env->sd->flags & SD_SHARE_CPUCAPACITY))
+			sgs->forceidle_weight++;
+
 		if (cpu_overutilized(i))
 			*sg_overutilized = 1;
 
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
-		if (!nr_running && idle_cpu(i)) {
+		if (!rq_in_forceidle(rq) && !nr_running && idle_cpu(i)) {
 			sgs->idle_cpus++;
 			/* Idle cpu can't have misfit task */
 			continue;
@@ -10691,10 +10698,16 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 		nr_running = rq->nr_running - local;
 		sgs->sum_nr_running += nr_running;
 
+		/*
+		 * Ignore force idle if we are balancing within the SMT mask
+		 */
+		if (rq_in_forceidle(rq) && !(sd->flags & SD_SHARE_CPUCAPACITY))
+			sgs->forceidle_weight++;
+
 		/*
 		 * No need to call idle_cpu_without() if nr_running is not 0
 		 */
-		if (!nr_running && idle_cpu_without(i, p))
+		if (!rq_in_forceidle(rq) && !nr_running && idle_cpu_without(i, p))
 			sgs->idle_cpus++;
 
 		/* Check if task fits in the CPU */
@@ -11123,7 +11136,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		return;
 	}
 
-	if (busiest->group_type == group_smt_balance) {
+	if (busiest->group_type == group_smt_balance ||
+	    busiest->forceidle_weight) {
 		/* Reduce number of tasks sharing CPU capacity */
 		env->migration_type = migrate_task;
 		env->imbalance = 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..fdee101b1a66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
 	return !RB_EMPTY_NODE(&p->core_node);
 }
 
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+	return rq->core->core_forceidle_count > 0 &&
+		rq->nr_running &&
+		rq->curr == rq->idle;
+}
+
 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
 extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 
@@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
 	return true;
 }
 
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+	return false;
+}
+
 #endif /* !CONFIG_SCHED_CORE */
 
 #ifdef CONFIG_RT_GROUP_SCHED
-- 
2.43.0

Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07

