Message-ID: <20251208092744.32737-27-kprateek.nayak@amd.com>
Date: Mon, 8 Dec 2025 09:27:13 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
<tglx@...utronix.de>
CC: <linux-kernel@...r.kernel.org>, Dietmar Eggemann
<dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, Valentin Schneider
<vschneid@...hat.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>,
Shrikanth Hegde <sshegde@...ux.ibm.com>, Chen Yu <yu.c.chen@...el.com>
Subject: [RESEND RFC PATCH v2 27/29] [EXPERIMENTAL] sched/fair: Proactive idle balance using push mechanism
Proactively try to push tasks to one of the CPUs in the sd_nohz domain
when the "nr_idle_cpus" indicator shows that idle CPUs are present.
pick_next_pushable_fair_task() is taken from Vincent's series [1] as is,
but the locking rules in push_fair_task() have been replaced with an IPI
based __ttwu_queue_wakelist(). A few additional checks have been added
to catch corner cases with proxy execution where neither the current
task nor the donor is pushable.
For the sake of this PoC, the __ttwu_queue_wakelist() based mechanism is
used as is. If folks are in agreement with a wider use of wakelist based
migration, this activation path can be made more generic so that it can
be used for migrations too.
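For illustration, the open-coded sequence in push_fair_task() below
could be wrapped into something along the lines of (hypothetical helper,
name and exact semantics up for discussion):

	/* Queue a remote activation for a migration decided on the source CPU. */
	static void ttwu_queue_migrate(struct rq *src_rq, struct task_struct *p,
				       int dst_cpu)
	{
		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, dst_cpu);
		__ttwu_queue_wakelist(p, dst_cpu, 0);
	}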
Although it is logical to traverse only "sd_nohz->shared->nohz_idle_cpus",
traversing the entire sched domain span was found to be more beneficial
in testing.
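The narrower variant that was tried looks roughly as follows (sketch,
assuming the nohz_idle_cpus mask maintained earlier in the series can be
used directly as a cpumask):

	for_each_cpu_and_wrap(cpu, p->cpus_ptr, sds->nohz_idle_cpus, this_cpu + 1) {
		if (!idle_cpu(cpu))
			continue;

		/* Same push as in push_fair_task() below. */
		deactivate_task(rq, p, 0);
		set_task_cpu(p, cpu);
		__ttwu_queue_wakelist(p, cpu, 0);
		return true;
	}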
Link: https://lore.kernel.org/all/20250302210539.1563190-6-vincent.guittot@linaro.org/ [1]
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 93 +++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 1 +
3 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 35cb640b7266..388805c4436c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3769,7 +3769,7 @@ bool call_function_single_prep_ipi(int cpu)
* via sched_ttwu_wakeup() for activation so the wakee incurs the cost
* of the wakeup instead of the waker.
*/
-static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e6ba7bb09a61..34aeb8e58e0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13079,12 +13079,103 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->cfs.pushable_tasks);
}
+static struct task_struct *pick_next_pushable_fair_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->cfs.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ WARN_ON_ONCE(rq->cpu != task_cpu(p));
+ WARN_ON_ONCE(task_current(rq, p));
+ WARN_ON_ONCE(task_current_donor(rq, p));
+ WARN_ON_ONCE(p->nr_cpus_allowed <= 1);
+ WARN_ON_ONCE(!task_on_rq_queued(p));
+
+	/*
+	 * Remove the task from the pushable list: we only try to push it
+	 * once; it is put back on the list the next time it is enqueued.
+	 */
+ plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+
+ return p;
+}
+
+static inline bool should_push_tasks(struct rq *rq)
+{
+ struct sched_domain_shared *sds;
+ struct sched_domain *sd;
+ int cpu = cpu_of(rq);
+
+ /* TODO: Add a CPU local failure counter. */
+
+ /* CPU doesn't have any fair task to push. */
+ if (!has_pushable_tasks(rq))
+ return false;
+
+ /* CPU is overloaded! Do not waste cycles pushing tasks. */
+ if (!fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)))
+ return false;
+
+ guard(rcu)();
+
+ sd = rcu_dereference(per_cpu(sd_nohz, cpu));
+ if (!sd)
+ return false;
+
+ /*
+ * We may not be able to find a push target.
+ * Skip for this tick and depend on the periodic
+ * balance to pull the queued tasks.
+ */
+ sds = sd->shared;
+ if (!sds || !atomic_read(&sds->nr_idle_cpus))
+ return false;
+
+ return true;
+}
+
/*
* See if the non running fair tasks on this rq can be sent on other CPUs
* that fits better with their profile.
*/
static bool push_fair_task(struct rq *rq)
{
+ struct task_struct *p = pick_next_pushable_fair_task(rq);
+ struct sched_domain_shared *sds;
+ int cpu, this_cpu = cpu_of(rq);
+ struct sched_domain *sd;
+
+ if (!p)
+ return false;
+
+ guard(rcu)();
+
+	sd = rcu_dereference(per_cpu(sd_nohz, this_cpu));
+ if (!sd)
+ return false;
+
+ /*
+	 * It is possible to have idle CPUs with ticks enabled. To maximize the chance
+ * of pulling a task, traverse the entire sched_domain_span() instead of just
+ * the sd->shared->nohz_idle_cpus.
+ */
+ for_each_cpu_and_wrap(cpu, p->cpus_ptr, sched_domain_span(sd), this_cpu + 1) {
+ struct rq *target_rq;
+
+ if (!idle_cpu(cpu))
+ continue;
+
+ target_rq = cpu_rq(cpu);
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, cpu);
+ __ttwu_queue_wakelist(p, cpu, 0);
+ return true;
+ }
+
return false;
}
@@ -13099,7 +13190,7 @@ static DEFINE_PER_CPU(struct balance_callback, fair_push_head);
static inline void fair_queue_pushable_tasks(struct rq *rq)
{
- if (!has_pushable_tasks(rq))
+	if (!should_push_tasks(rq))
return;
queue_balance_callback(rq, &per_cpu(fair_push_head, rq->cpu), push_fair_tasks);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91928a371588..451666753c2a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2939,6 +2939,7 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
+extern void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
--
2.43.0