Message-Id: <6e785c48ed266694748e0e71e264b94b27d9fa7b.1748594841.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:46 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	x86@...nel.org,
	luto@...nel.org,
	kees@...nel.org,
	akpm@...ux-foundation.org,
	david@...hat.com,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	peterz@...radead.org
Cc: dietmar.eggemann@....com,
	hpa@...or.com,
	acme@...nel.org,
	namhyung@...nel.org,
	mark.rutland@....com,
	alexander.shishkin@...ux.intel.com,
	jolsa@...nel.org,
	irogers@...gle.com,
	adrian.hunter@...el.com,
	kan.liang@...ux.intel.com,
	viro@...iv.linux.org.uk,
	brauner@...nel.org,
	jack@...e.cz,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	vbabka@...e.cz,
	rppt@...nel.org,
	surenb@...gle.com,
	mhocko@...e.com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	vschneid@...hat.com,
	jannh@...gle.com,
	pfalcato@...e.de,
	riel@...riel.com,
	harry.yoo@...cle.com,
	linux-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	linux-mm@...ck.org,
	duanxiongchun@...edance.com,
	yinhongbo@...edance.com,
	dengliang.1214@...edance.com,
	xieyongji@...edance.com,
	chaiwen.cc@...edance.com,
	songmuchun@...edance.com,
	yuanzhu@...edance.com,
	chengguozhu@...edance.com,
	sunjiadong.lff@...edance.com,
	Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 18/35] sched: pick a specified task

When a lazy switch occurs, the kernel already has the task_struct of the
next task to switch to. However, CFS does not provide an interface to
explicitly specify the next task, so RPAL must implement its own
mechanism to pick a specified task.

This patch introduces two interfaces, rpal_pick_next_task_fair() and
rpal_pick_task_fair(), to provide this capability. Both interfaces
operate on the sched_entity of the target task and update the CFS data
structures directly. The patch also adapts to the SCHED_CORE feature by
temporarily treating the specified task as the highest-priority
candidate, ensuring that core-wide selection picks it preferentially.
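
For reference, a minimal sketch of the intended call pattern is shown
below. The caller name rpal_lazy_switch_pick() and the "receiver"
argument are hypothetical and only illustrate how a lazy-switch path
that already holds the target task_struct would use the new entry
point; it assumes the rq lock is held and that both tasks belong to the
fair class, which is what rpal_pick_next_task() and
rpal_pick_next_task_fair() require:

	/* Hypothetical caller, for illustration only. */
	static struct task_struct *
	rpal_lazy_switch_pick(struct rq *rq, struct task_struct *receiver,
			      struct rq_flags *rf)
	{
		struct task_struct *prev = rq->curr;

		/*
		 * Instead of letting pick_next_task() walk the scheduling
		 * classes, hand the already-known receiver to the RPAL
		 * picker, which updates the CFS (and, with SCHED_CORE,
		 * the core-wide) state as if it had been picked normally.
		 */
		return rpal_pick_next_task(rq, prev, receiver, rf);
	}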

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
 kernel/sched/core.c  | 212 +++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 109 ++++++++++++++++++++++
 kernel/sched/sched.h |   8 ++
 3 files changed, 329 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a862bf4a0161..2e76376c5172 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11003,3 +11003,215 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
 		set_next_task(rq, ctx->p);
 }
 #endif	/* CONFIG_SCHED_CLASS_EXT */
+
+#ifdef CONFIG_RPAL
+#ifdef CONFIG_SCHED_CORE
+static inline struct task_struct *
+__rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
+		      struct task_struct *next, struct rq_flags *rf)
+{
+	struct task_struct *p;
+
+	if (likely(prev->sched_class == &fair_sched_class &&
+		   next->sched_class == &fair_sched_class)) {
+		p = rpal_pick_next_task_fair(prev, next, rq, rf);
+		return p;
+	}
+
+	BUG();
+}
+
+static struct task_struct *rpal_pick_next_task(struct rq *rq,
+					       struct task_struct *prev,
+					       struct task_struct *next,
+					       struct rq_flags *rf)
+{
+	struct task_struct *p;
+	const struct cpumask *smt_mask;
+	bool fi_before = false;
+	bool core_clock_updated = (rq == rq->core);
+	unsigned long cookie;
+	int i, cpu, occ = 0;
+	struct rq *rq_i;
+	bool need_sync;
+
+	if (!sched_core_enabled(rq))
+		return __rpal_pick_next_task(rq, prev, next, rf);
+
+	cpu = cpu_of(rq);
+
+	/* Stopper task is switching into idle, no need core-wide selection. */
+	if (cpu_is_offline(cpu)) {
+		rq->core_pick = NULL;
+		return __rpal_pick_next_task(rq, prev, next, rf);
+	}
+
+	if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+	    rq->core->core_pick_seq != rq->core_sched_seq &&
+	    rq->core_pick) {
+		WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+		/* ignore rq->core_pick, always pick next */
+		if (rq->core_pick == next) {
+			put_prev_task(rq, prev);
+			set_next_task(rq, next);
+
+			rq->core_pick = NULL;
+			goto out;
+		}
+	}
+
+	put_prev_task_balance(rq, prev, rf);
+
+	smt_mask = cpu_smt_mask(cpu);
+	need_sync = !!rq->core->core_cookie;
+
+	/* reset state */
+	rq->core->core_cookie = 0UL;
+	if (rq->core->core_forceidle_count) {
+		if (!core_clock_updated) {
+			update_rq_clock(rq->core);
+			core_clock_updated = true;
+		}
+		sched_core_account_forceidle(rq);
+		/* reset after accounting force idle */
+		rq->core->core_forceidle_start = 0;
+		rq->core->core_forceidle_count = 0;
+		rq->core->core_forceidle_occupation = 0;
+		need_sync = true;
+		fi_before = true;
+	}
+
+	rq->core->core_task_seq++;
+
+	if (!need_sync) {
+		next = rpal_pick_task_fair(rq, next);
+		if (!next->core_cookie) {
+			rq->core_pick = NULL;
+			/*
+			 * For robustness, update the min_vruntime_fi for
+			 * unconstrained picks as well.
+			 */
+			WARN_ON_ONCE(fi_before);
+			task_vruntime_update(rq, next, false);
+			goto out_set_next;
+		}
+	}
+
+	for_each_cpu_wrap(i, smt_mask, cpu) {
+		rq_i = cpu_rq(i);
+
+		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
+			update_rq_clock(rq_i);
+
+		/* ignore prio, always pick next */
+		if (i == cpu)
+			rq_i->core_pick = rpal_pick_task_fair(rq, next);
+		else
+			rq_i->core_pick = pick_task(rq_i);
+	}
+
+	cookie = rq->core->core_cookie = next->core_cookie;
+
+	for_each_cpu(i, smt_mask) {
+		rq_i = cpu_rq(i);
+		p = rq_i->core_pick;
+
+		if (!cookie_equals(p, cookie)) {
+			p = NULL;
+			if (cookie)
+				p = sched_core_find(rq_i, cookie);
+			if (!p)
+				p = idle_sched_class.pick_task(rq_i);
+		}
+
+		rq_i->core_pick = p;
+
+		if (p == rq_i->idle) {
+			if (rq_i->nr_running) {
+				rq->core->core_forceidle_count++;
+				if (!fi_before)
+					rq->core->core_forceidle_seq++;
+			}
+		} else {
+			occ++;
+		}
+	}
+
+	if (schedstat_enabled() && rq->core->core_forceidle_count) {
+		rq->core->core_forceidle_start = rq_clock(rq->core);
+		rq->core->core_forceidle_occupation = occ;
+	}
+
+	rq->core->core_pick_seq = rq->core->core_task_seq;
+	WARN_ON_ONCE(next != rq->core_pick);
+	rq->core_sched_seq = rq->core->core_pick_seq;
+
+	for_each_cpu(i, smt_mask) {
+		rq_i = cpu_rq(i);
+
+		/*
+		 * An online sibling might have gone offline before a task
+		 * could be picked for it, or it might be offline but later
+		 * happen to come online, but it's too late and nothing was
+		 * picked for it.  That's Ok - it will pick tasks for itself,
+		 * so ignore it.
+		 */
+		if (!rq_i->core_pick)
+			continue;
+
+		/*
+		 * Update for new !FI->FI transitions, or if continuing to be in !FI:
+		 * fi_before     fi      update?
+		 *  0            0       1
+		 *  0            1       1
+		 *  1            0       1
+		 *  1            1       0
+		 */
+		if (!(fi_before && rq->core->core_forceidle_count))
+			task_vruntime_update(rq_i, rq_i->core_pick,
+					     !!rq->core->core_forceidle_count);
+
+		rq_i->core_pick->core_occupation = occ;
+
+		if (i == cpu) {
+			rq_i->core_pick = NULL;
+			continue;
+		}
+
+		/* Did we break L1TF mitigation requirements? */
+		WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+		if (rq_i->curr == rq_i->core_pick) {
+			rq_i->core_pick = NULL;
+			continue;
+		}
+
+		resched_curr(rq_i);
+	}
+
+out_set_next:
+	set_next_task(rq, next);
+out:
+	if (rq->core->core_forceidle_count && next == rq->idle)
+		queue_core_balance(rq);
+
+	return next;
+}
+#else
+static inline struct task_struct *
+rpal_pick_next_task(struct rq *rq, struct task_struct *prev,
+		    struct task_struct *next, struct rq_flags *rf)
+{
+	struct task_struct *p;
+
+	if (likely(prev->sched_class == &fair_sched_class &&
+		   next->sched_class == &fair_sched_class)) {
+		p = rpal_pick_next_task_fair(prev, next, rq, rf);
+		return p;
+	}
+
+	BUG();
+}
+#endif
+#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 125912c0e9dd..d9c16d974a47 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8983,6 +8983,115 @@ void fair_server_init(struct rq *rq)
 	dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
 }
 
+#ifdef CONFIG_RPAL
+/* if the next is throttled, unthrottle it */
+static void rpal_unthrottle(struct rq *rq, struct task_struct *next)
+{
+	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+
+	se = &next->se;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+
+		if (cfs_rq == &rq->cfs)
+			break;
+	}
+}
+
+struct task_struct *rpal_pick_task_fair(struct rq *rq, struct task_struct *next)
+{
+	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+
+	rpal_unthrottle(rq, next);
+
+	se = &next->se;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq->curr && cfs_rq->curr->on_rq)
+			update_curr(cfs_rq);
+
+		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+			continue;
+
+		clear_buddies(cfs_rq, se);
+	}
+
+	return next;
+}
+
+struct task_struct *rpal_pick_next_task_fair(struct task_struct *prev,
+					     struct task_struct *next,
+					     struct rq *rq, struct rq_flags *rf)
+{
+	struct sched_entity *se = &next->se;
+	struct task_struct *p;
+
+	rpal_unthrottle(rq, next);
+
+	p = rpal_pick_task_fair(rq, next);
+
+	if (!sched_fair_runnable(rq))
+		panic("rpal error: !sched_fair_runnable\n");
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	__put_prev_set_next_dl_server(rq, prev, next);
+
+	se = &next->se;
+	p = task_of(se);
+
+	/*
+	 * Since we haven't yet done put_prev_entity and if the selected task
+	 * is a different task than we started out with, try and touch the
+	 * least amount of cfs_rqs.
+	 */
+	if (prev != p) {
+		struct sched_entity *pse = &prev->se;
+		struct cfs_rq *cfs_rq;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+#endif
+#ifdef CONFIG_SMP
+	/*
+	 * Move the next running task to the front of
+	 * the list, so our cfs_tasks list becomes MRU
+	 * one.
+	 */
+	list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
+	WARN_ON_ONCE(se->sched_delayed);
+
+	if (hrtick_enabled_fair(rq))
+		hrtick_start_fair(rq, p);
+
+	update_misfit_status(p, rq);
+	sched_fair_update_stop_tick(rq, p);
+
+	return p;
+}
+#endif
+
 /*
  * Account for a descheduled task:
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5a6a503eb6d..f8fd26b584c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2575,6 +2575,14 @@ static inline bool sched_fair_runnable(struct rq *rq)
 
 extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
 extern struct task_struct *pick_task_idle(struct rq *rq);
+#ifdef CONFIG_RPAL
+extern struct task_struct *rpal_pick_task_fair(struct rq *rq,
+					       struct task_struct *next);
+extern struct task_struct *rpal_pick_next_task_fair(struct task_struct *prev,
+						    struct task_struct *next,
+						    struct rq *rq,
+						    struct rq_flags *rf);
+#endif
 
 #define SCA_CHECK		0x01
 #define SCA_MIGRATE_DISABLE	0x02
-- 
2.20.1

