Message-ID: <20251201124223.247107-1-sieberf@amazon.com>
Date: Mon, 1 Dec 2025 14:42:22 +0200
From: Fernand Sieber <sieberf@...zon.com>
To: <peterz@...radead.org>, <vincent.guittot@...aro.org>
CC: <abusse@...zon.com>, <bsegall@...gle.com>, <dietmar.eggemann@....com>,
<dwmw@...zon.co.uk>, <gmazz@...zon.com>, <jschoenh@...zon.de>,
<juri.lelli@...hat.com>, <kprateek.nayak@....com>,
<linux-kernel@...r.kernel.org>, <liuyuxua@...zon.com>, <mgorman@...e.de>,
<mingo@...hat.com>, <rkagan@...zon.com>, <rostedt@...dmis.org>,
<sieberf@...zon.com>, <vschneid@...hat.com>, <nh-open-source@...zon.com>
Subject: [PATCH v2] sched/fair: Force idle aware load balancing
Consider the capacity wasted by force idle when computing whether a group
has spare capacity or is overloaded. We use a rather crude mechanism based
on the current force idle state of the rq; it may be preferable to use a
decaying average, similar to other load metrics, to avoid jitter.
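
For illustration, here is a minimal userspace sketch of the patched
group_is_overloaded() logic with the available weight/capacity helpers
applied. The sg_stats struct, the numbers and the imbalance_pct of 117
below are made up for the example and are not part of this patch:

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified stand-ins for the sg_lb_stats fields used by the group
 * classification helpers. All values below are illustrative only.
 */
struct sg_stats {
	unsigned int	sum_nr_running;
	unsigned int	group_weight;
	unsigned int	forceidle_weight;
	unsigned long	group_capacity;
	unsigned long	forceidle_capacity;
	unsigned long	group_util;
	unsigned long	group_runnable;
};

/* Weight and capacity left once force idled CPUs are discounted. */
static unsigned int available_weight(const struct sg_stats *s)
{
	return s->group_weight - s->forceidle_weight;
}

static unsigned long available_capacity(const struct sg_stats *s)
{
	return s->group_capacity - s->forceidle_capacity;
}

/* Same shape as group_is_overloaded() with the helpers applied. */
static bool group_is_overloaded(unsigned int imbalance_pct,
				const struct sg_stats *s)
{
	if (s->sum_nr_running <= available_weight(s))
		return false;

	if ((available_capacity(s) * 100) < (s->group_util * imbalance_pct))
		return true;

	if ((available_capacity(s) * imbalance_pct) < (s->group_runnable * 100))
		return true;

	return false;
}

int main(void)
{
	/*
	 * 8-CPU group, 2 CPUs force idled, 8 runnable tasks packed on the
	 * remaining 6 CPUs. Capacities use 1024 per CPU.
	 */
	struct sg_stats s = {
		.sum_nr_running		= 8,
		.group_weight		= 8,
		.forceidle_weight	= 2,
		.group_capacity		= 8 * 1024,
		.forceidle_capacity	= 2 * 1024,
		.group_util		= 6 * 1024,
		.group_runnable		= 8 * 1024,
	};
	struct sg_stats no_fi = s;

	no_fi.forceidle_weight = 0;
	no_fi.forceidle_capacity = 0;

	printf("overloaded, force idle ignored:   %d\n",
	       group_is_overloaded(117, &no_fi));	/* prints 0 */
	printf("overloaded, force idle accounted: %d\n",
	       group_is_overloaded(117, &s));		/* prints 1 */
	return 0;
}

With two of the eight CPUs force idled, the eight runnable tasks exceed the
available weight and the discounted capacity no longer covers the
utilization, so the group is classified as overloaded rather than balanced.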
===
rev1->rev2:
* addressed feedback about asym scheduling
* removed redundant force idle check for idle cpus
* removed migrate_task override for LB with force idle (no perf gains)
===
Testing

Testing aims at measuring perceived guest noise on a hypervisor system in
time shared scenarios.

The setup is a system where load is nearing 100%, which should in principle
allow for no steal time. The system has 64 CPUs and runs 8 VMs, each using
core scheduling with 8 vCPUs, time shared.

7 VMs run stressors (`stress-ng --cpu 0`) while the last VM runs the hwlat
tracer with a width of 100ms, a period of 300ms, and a threshold of 100us.
Each VM also runs a cookied non-vCPU VMM process that adds a light level of
noise, which forces some level of load balancing.

The test scenario is run 10 times for 60s and the average noise is measured.
At baseline, we measure about 1.20% noise (computed from hwlat breaches).
With the proposed patch, the noise drops to 0.63%.
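
For reference, one plausible way to apply the hwlat configuration above from
inside the measurement VM, assuming tracefs is mounted at
/sys/kernel/tracing (VM and stressor orchestration not shown):

  cd /sys/kernel/tracing
  echo 100000 > hwlat_detector/width    # sampling width: 100ms, in usecs
  echo 300000 > hwlat_detector/window   # sampling period: 300ms, in usecs
  echo 100 > tracing_thresh             # report latencies above 100us
  echo hwlat > current_tracer
  echo 1 > tracing_on
  sleep 60
  cat trace                             # each breach shows up as an hwlat entry
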
Signed-off-by: Fernand Sieber <sieberf@...zon.com>
---
kernel/sched/fair.c | 67 ++++++++++++++++++++++++++++++++++++++++----
kernel/sched/sched.h | 12 ++++++++
2 files changed, 73 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b752324270b..c4ef8aaf1142 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9932,6 +9932,10 @@ struct sg_lb_stats {
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
+#ifdef CONFIG_SCHED_CORE
+ unsigned int forceidle_weight;
+ unsigned long forceidle_capacity;
+#endif
};
/*
@@ -10120,6 +10124,29 @@ static inline int sg_imbalanced(struct sched_group *group)
return group->sgc->imbalance;
}
+
+#ifdef CONFIG_SCHED_CORE
+static inline unsigned int sgs_available_weight(struct sg_lb_stats *sgs)
+{
+ return sgs->group_weight - sgs->forceidle_weight;
+}
+
+static inline unsigned long sgs_available_capacity(struct sg_lb_stats *sgs)
+{
+ return sgs->group_capacity - sgs->forceidle_capacity;
+}
+#else
+static inline unsigned int sgs_available_weight(struct sg_lb_stats *sgs)
+{
+ return sgs->group_weight;
+}
+
+static inline unsigned long sgs_available_capacity(struct sg_lb_stats *sgs)
+{
+ return sgs->group_capacity;
+}
+#endif /* CONFIG_SCHED_CORE */
+
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
@@ -10135,14 +10162,14 @@ static inline int sg_imbalanced(struct sched_group *group)
static inline bool
group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running < sgs->group_weight)
+ if (sgs->sum_nr_running < sgs_available_weight(sgs))
return true;
- if ((sgs->group_capacity * imbalance_pct) <
+ if ((sgs_available_capacity(sgs) * imbalance_pct) <
(sgs->group_runnable * 100))
return false;
- if ((sgs->group_capacity * 100) >
+ if ((sgs_available_capacity(sgs) * 100) >
(sgs->group_util * imbalance_pct))
return true;
@@ -10160,14 +10187,14 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
static inline bool
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running <= sgs->group_weight)
+ if (sgs->sum_nr_running <= sgs_available_weight(sgs))
return false;
- if ((sgs->group_capacity * 100) <
+ if ((sgs_available_capacity(sgs) * 100) <
(sgs->group_util * imbalance_pct))
return true;
- if ((sgs->group_capacity * imbalance_pct) <
+ if ((sgs_available_capacity(sgs) * imbalance_pct) <
(sgs->group_runnable * 100))
return true;
@@ -10336,6 +10363,30 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
return check_cpu_capacity(rq, sd);
}
+#ifdef CONFIG_SCHED_CORE
+static inline void
+update_forceidle_capacity(struct sched_domain *sd,
+ struct sg_lb_stats *sgs,
+ struct rq *rq)
+{
+ /*
+ * Ignore force idle if we are balancing within the SMT mask
+ */
+ if (sd->flags & SD_SHARE_CPUCAPACITY)
+ return;
+
+ if (rq_in_forceidle(rq)) {
+ sgs->forceidle_weight++;
+ sgs->forceidle_capacity += rq->cpu_capacity;
+ }
+}
+#else
+static inline void
+update_forceidle_capacity(struct sched_domain *sd,
+ struct sg_lb_stats *sgs,
+ struct rq *rq) {}
+#endif /* !CONFIG_SCHED_CORE */
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -10371,6 +10422,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
+ update_forceidle_capacity(env->sd, sgs, rq);
+
if (cpu_overutilized(i))
*sg_overutilized = 1;
@@ -10691,6 +10744,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
+ update_forceidle_capacity(sd, sgs, rq);
+
/*
* No need to call idle_cpu_without() if nr_running is not 0
*/
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index adfb6e3409d7..fdee101b1a66 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1468,6 +1468,13 @@ static inline bool sched_core_enqueued(struct task_struct *p)
return !RB_EMPTY_NODE(&p->core_node);
}
+/*
+ * An rq is force idled when its core has a nonzero force idle count and
+ * the rq itself has runnable tasks but is currently running the idle task.
+ */
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+ return rq->core->core_forceidle_count > 0 &&
+ rq->nr_running &&
+ rq->curr == rq->idle;
+}
+
extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
@@ -1513,6 +1520,11 @@ static inline bool sched_group_cookie_match(struct rq *rq,
return true;
}
+static inline bool rq_in_forceidle(struct rq *rq)
+{
+ return false;
+}
+
#endif /* !CONFIG_SCHED_CORE */
#ifdef CONFIG_RT_GROUP_SCHED
--
2.43.0