Message-ID: <20251208092744.32737-29-kprateek.nayak@amd.com>
Date: Mon, 8 Dec 2025 09:27:15 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
<tglx@...utronix.de>
CC: <linux-kernel@...r.kernel.org>, Dietmar Eggemann
<dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, Valentin Schneider
<vschneid@...hat.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>, Chen Yu <yu.c.chen@...el.com>
Subject: [RESEND RFC PATCH v2 29/29] [EXPERIMENTAL] sched/fair: Faster alternate for intra-NUMA newidle balance

Kernels that enable CONFIG_PREEMPTION only pull a single task during
newidle balance to keep latency low. In the standard newidle balance
path, computing the busiest group, then the busiest rq, and finally
pulling a single task from that busy rq adds a lot of overhead.
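
For context, below is a minimal stand-alone model of that standard
pull. The NR_CPUS array of runnable counts and the
standard_newidle_pull() helper are made up for illustration; the real
path goes through sched_balance_rq() and its group statistics:

/*
 * Toy model of the standard newidle pull: a full pass over the domain
 * picks the busiest runqueue before a single task is moved.
 * NR_CPUS, nr_runnable[] and standard_newidle_pull() are illustrative
 * placeholders, not kernel code.
 */
#include <stdio.h>

#define NR_CPUS 8

static int nr_runnable[NR_CPUS] = { 0, 3, 1, 2, 1, 4, 1, 1 };

static int standard_newidle_pull(int this_cpu)
{
	int busiest = -1, max_runnable = 1, cpu;

	/* Scan the whole domain to find the most loaded runqueue. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == this_cpu)
			continue;
		if (nr_runnable[cpu] > max_runnable) {
			max_runnable = nr_runnable[cpu];
			busiest = cpu;
		}
	}

	if (busiest < 0)
		return 0;

	/* Preemptible kernels stop after moving one task. */
	nr_runnable[busiest]--;
	nr_runnable[this_cpu]++;
	return 1;
}

int main(void)
{
	printf("pulled: %d\n", standard_newidle_pull(0));
	return 0;
}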

During the discussions at OSPM around the overheads of load balancing,
Peter suggested trying out a different strategy for intra-NUMA newidle
balance with the goal of pulling a task as quickly as possible.

Try out an alternative newidle balance strategy that directly traverses
the CPUs in the sched domain to pull a runnable task.
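
The sketch below is a rough stand-alone model of that traversal,
assuming a plain array of per-CPU runnable counts and a wrap-around
scan starting next to this_cpu. direct_newidle_pull() is made up for
illustration; in the kernel the actual move goes through
detach_one_task()/attach_one_task():

/*
 * Toy model of the direct traversal: visit the CPUs of the domain
 * starting just after this_cpu and pull from the first runqueue that
 * has more than one runnable task. NR_CPUS, nr_runnable[] and
 * direct_newidle_pull() are illustrative placeholders, not kernel code.
 */
#include <stdio.h>

#define NR_CPUS 8

static int nr_runnable[NR_CPUS] = { 0, 1, 1, 2, 1, 4, 1, 1 };

static int direct_newidle_pull(int this_cpu)
{
	int i;

	for (i = 1; i < NR_CPUS; i++) {
		int cpu = (this_cpu + i) % NR_CPUS;

		/* Skip runqueues that are not overloaded. */
		if (nr_runnable[cpu] <= 1)
			continue;

		/* Move a single task; done as soon as one is found. */
		nr_runnable[cpu]--;
		nr_runnable[this_cpu]++;
		return 1;
	}

	return 0;
}

int main(void)
{
	printf("pulled: %d\n", direct_newidle_pull(0));
	return 0;
}

The first overloaded CPU encountered ends the search, so no statistics
pass over the whole domain is needed.
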
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
kernel/sched/fair.c | 119 ++++++++++++++++++++++++++++++++++----------
1 file changed, 93 insertions(+), 26 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d33ab63336..aa2821a9b800 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11701,6 +11701,11 @@ static int need_active_balance(struct lb_env *env)
static int active_load_balance_cpu_stop(void *data);
+static inline bool sched_newidle_stop_balance(struct rq *rq)
+{
+ return (rq->nr_running > 0 || rq->ttwu_pending);
+}
+
static int should_we_balance(struct lb_env *env)
{
struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
@@ -11722,7 +11727,7 @@ static int should_we_balance(struct lb_env *env)
* to optimize wakeup latency.
*/
if (env->idle == CPU_NEWLY_IDLE) {
- if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ if (sched_newidle_stop_balance(env->dst_rq))
return 0;
return 1;
}
@@ -13256,6 +13261,7 @@ static inline void fair_queue_pushable_tasks(struct rq *rq) { }
*/
static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
int continue_balancing = 1;
@@ -13315,8 +13321,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = sched_clock_cpu(this_cpu);
sched_balance_update_blocked_averages(this_cpu);
+ cpumask_clear(cpus);
+
rcu_read_lock();
for_each_domain(this_cpu, sd) {
+ unsigned int weight = 1;
u64 domain_cost;
update_next_balance(sd, &next_balance);
@@ -13324,40 +13333,98 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
break;
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- unsigned int weight = 1;
+ if (!(sd->flags & SD_BALANCE_NEWIDLE))
+ continue;
- if (sched_feat(NI_RANDOM)) {
- /*
- * Throw a 1k sided dice; and only run
- * newidle_balance according to the success
- * rate.
- */
- u32 d1k = sched_rng() % 1024;
- weight = 1 + sd->newidle_ratio;
- if (d1k > weight) {
- update_newidle_stats(sd, 0);
- continue;
- }
- weight = (1024 + weight/2) / weight;
+ if (sched_feat(NI_RANDOM)) {
+ /*
+ * Throw a 1k sided dice; and only run
+ * newidle_balance according to the success
+ * rate.
+ */
+ u32 d1k = sched_rng() % 1024;
+ weight = 1 + sd->newidle_ratio;
+ if (d1k > weight) {
+ update_newidle_stats(sd, 0);
+ continue;
}
+ weight = (1024 + weight/2) / weight;
+ }
- pulled_task = sched_balance_rq(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
- t1 = sched_clock_cpu(this_cpu);
- domain_cost = t1 - t0;
- curr_cost += domain_cost;
- t0 = t1;
+ /*
+ * Non-preemptible kernels can pull more than one task during
+ * newidle balance and NUMA domains may need special
+ * consideration to preserve tasks on preferred NUMA node.
+ *
+ * Only take fast-path on preemptible kernels for intra NUMA
+ * domains.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPTION) || (sd->flags & SD_NUMA)) {
+ pulled_task = sched_balance_rq(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
+ } else {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = this_cpu,
+ .dst_rq = this_rq,
+ .idle = CPU_NEWLY_IDLE,
+ };
+ int cpu;
/*
- * Track max cost of a domain to make sure to not delay the
- * next wakeup on the CPU.
+ * Clear the CPUs of child domain. They have already
+ * been visited during last balance. !NUMA domains do
+ * not overlap so simply excluding the previous
+ * domain's span should be enough.
*/
- update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
+ cpumask_andnot(cpus, sched_domain_span(sd), cpus);
+
+ /* Commit to searching the sd if we are idle at start. */
+ continue_balancing = sched_newidle_stop_balance(this_rq);
+ if (!continue_balancing)
+ break;
+
+ for_each_cpu_wrap(cpu, cpus, this_cpu + 1) {
+ struct task_struct *p = NULL;
+ struct rq *rq = cpu_rq(cpu);
+
+ /* Not overloaded with runnable tasks. */
+ if (rq->cfs.h_nr_runnable <= 1)
+ continue;
+
+ scoped_guard(rq_lock, rq) {
+ /* Check again with rq lock held. */
+ if (rq->cfs.h_nr_runnable <= 1)
+ break;
+
+ env.src_cpu = cpu;
+ env.src_rq = rq;
+
+ update_rq_clock(rq);
+ p = detach_one_task(&env);
+ }
+
+ if (p) {
+ attach_one_task(this_rq, p);
+ pulled_task = 1;
+ break;
+ }
+ }
}
+ t1 = sched_clock_cpu(this_cpu);
+ domain_cost = t1 - t0;
+ curr_cost += domain_cost;
+ t0 = t1;
+
+ /*
+ * Track max cost of a domain to make sure to not delay the
+ * next wakeup on the CPU.
+ */
+ update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
+
/*
* Stop searching for tasks to pull if there are
* now runnable tasks on this rq.
--
2.43.0