lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251208092744.32737-26-kprateek.nayak@amd.com>
Date: Mon, 8 Dec 2025 09:27:12 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
	Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
	<tglx@...utronix.de>
CC: <linux-kernel@...r.kernel.org>, Dietmar Eggemann
	<dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
	<bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, Valentin Schneider
	<vschneid@...hat.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
 Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>,
	Shrikanth Hegde <sshegde@...ux.ibm.com>, Chen Yu <yu.c.chen@...el.com>
Subject: [RESEND RFC PATCH v2 26/29] [EXPERIMENTAL] sched/fair: Add push task framework

From: Vincent Guittot <vincent.guittot@...aro.org>

Add the skeleton for push task infrastructure. The empty
push_fair_task() prototype will be used to implement proactive idle
balancing in subsequent commits.

  [ prateek: Broke off relevant bits from [1] ]

Link: https://lore.kernel.org/all/20250302210539.1563190-6-vincent.guittot@linaro.org/ [1]
Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
Peter, the plist is still being used since a plist node already exists
in the task_struct which can be reused. Depending on the collective push
effort, we can either settle on the reuse of the plist_node or add a new
list_head for fair tasks.
---
 kernel/sched/fair.c  | 102 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |   2 +
 2 files changed, 104 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f622104d54d7..e6ba7bb09a61 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6865,6 +6865,9 @@ requeue_delayed_entity(struct sched_entity *se)
 	clear_delayed(se);
 }
 
+static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p);
+static void fair_add_pushable_task(struct rq *rq, struct task_struct *p);
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -7015,6 +7018,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		h_nr_idle = task_has_idle_policy(p);
 		if (task_sleep || task_delayed || !se->sched_delayed)
 			h_nr_runnable = 1;
+		fair_remove_pushable_task(rq, p);
 	}
 
 	for_each_sched_entity(se) {
@@ -8954,6 +8958,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
 		put_prev_entity(cfs_rq, pse);
 		set_next_entity(cfs_rq, se);
 
+		/*
+		 * The previous task might be eligible for being pushed to
+		 * another CPU if it is still active.
+		 */
+		fair_add_pushable_task(rq, prev);
+
 		__set_next_task_fair(rq, p, true);
 	}
 
@@ -9017,6 +9027,13 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
 		cfs_rq = cfs_rq_of(se);
 		put_prev_entity(cfs_rq, se);
 	}
+
+	/*
+	 * The previous task might be eligible for being pushed to another CPU
+	 * if it is still active.
+	 */
+	fair_add_pushable_task(rq, prev);
+
 }
 
 /*
@@ -13028,6 +13045,79 @@ static void nohz_newidle_balance(struct rq *this_rq)
 	atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
 }
 
+/*
+ * Push based load balancing: It may take several ticks before a nohz idle CPU
+ * is selected for load balancing which is less than ideal for latency
+ * sensitive tasks stuck on overloaded CPUs.
+ *
+ * If a fair task is preempted, opportunistically try pushing to an idle CPU if
+ * the indicators say it is favourable. Since a busy CPU is handling the push,
+ * this is a time-sensitive operation.
+ */
+static inline bool fair_push_task(struct rq *rq, struct task_struct *p)
+{
+	if (!task_on_rq_queued(p))
+		return false;
+
+	if (p->se.sched_delayed)
+		return false;
+
+	if (p->nr_cpus_allowed == 1)
+		return false;
+
+	if (task_current_donor(rq, p))
+		return false;
+
+	if (task_current(rq, p))
+		return false;
+
+	return true;
+}
+
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->cfs.pushable_tasks);
+}
+
+/*
+ * See if the non-running fair tasks on this rq can be sent to other CPUs
+ * that fit better with their profile.
+ */
+static bool push_fair_task(struct rq *rq)
+{
+	return false;
+}
+
+static void push_fair_tasks(struct rq *rq)
+{
+	/* push_fair_task() will return true if it moved a fair task */
+	while (push_fair_task(rq))
+		;
+}
+
+static DEFINE_PER_CPU(struct balance_callback, fair_push_head);
+
+static inline void fair_queue_pushable_tasks(struct rq *rq)
+{
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(fair_push_head, rq->cpu), push_fair_tasks);
+}
+static void fair_remove_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+}
+
+static void fair_add_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	if (fair_push_task(rq, p)) {
+		plist_del(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+		/* Place the task with the greatest chance to be pushed first. */
+		plist_node_init(&p->pushable_tasks, p->prio);
+		plist_add(&p->pushable_tasks, &rq->cfs.pushable_tasks);
+	}
+}
 #else /* !CONFIG_NO_HZ_COMMON: */
 static inline void cpu_sd_exit_nohz_balance(struct rq *rq) { }
 static inline void cpu_sd_reenter_nohz_balance(struct rq *rq) { }
@@ -13039,6 +13129,10 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
 }
 
 static inline void nohz_newidle_balance(struct rq *this_rq) { }
+
+static inline void fair_remove_pushable_task(struct rq *rq, struct task_struct *p) { }
+static inline void fair_add_pushable_task(struct rq *rq, struct task_struct *p) { }
+static inline void fair_queue_pushable_tasks(struct rq *rq) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 /*
@@ -13738,6 +13832,8 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
 {
 	struct sched_entity *se = &p->se;
 
+	fair_remove_pushable_task(rq, p);
+
 	if (task_on_rq_queued(p)) {
 		/*
 		 * Move the next running task to the front of the list, so our
@@ -13753,6 +13849,11 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
 	if (hrtick_enabled_fair(rq))
 		hrtick_start_fair(rq, p);
 
+	/*
+	 * Try to push the prev task before checking misfit for the next task,
+	 * as the migration of prev can make the next task fit the CPU.
+	 */
+	fair_queue_pushable_tasks(rq);
 	update_misfit_status(p, rq);
 	sched_fair_update_stop_tick(rq, p);
 }
@@ -13782,6 +13883,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
 	cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
+	plist_head_init(&cfs_rq->pushable_tasks);
 	raw_spin_lock_init(&cfs_rq->removed.lock);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3433de20a249..91928a371588 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -743,6 +743,8 @@ struct cfs_rq {
 	struct list_head	leaf_cfs_rq_list;
 	struct task_group	*tg;	/* group that "owns" this runqueue */
 
+	struct plist_head	pushable_tasks;
+
 	/* Locally cached copy of our task_group's idle value */
 	int			idle;
 
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ