[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251006105453.769281377@infradead.org>
Date: Mon, 06 Oct 2025 12:46:55 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: tj@...nel.org
Cc: linux-kernel@...r.kernel.org,
peterz@...radead.org,
mingo@...nel.org,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
longman@...hat.com,
hannes@...xchg.org,
mkoutny@...e.com,
void@...ifault.com,
arighi@...dia.com,
changwoo@...lia.com,
cgroups@...r.kernel.org,
sched-ext@...ts.linux.dev,
liuwenfang@...or.com,
tglx@...utronix.de
Subject: [RFC][PATCH 3/3] sched/ext: Fold balance_scx() into pick_task_scx()
With pick_task() having an rf argument, it is possible to do the
lock-break there, get rid of the weird balance/pick_task hack.
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
kernel/sched/core.c | 13 --------
kernel/sched/ext.c | 78 +++++++--------------------------------------------
kernel/sched/sched.h | 1
3 files changed, 12 insertions(+), 80 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5845,19 +5845,6 @@ static void prev_balance(struct rq *rq,
const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
-#ifdef CONFIG_SCHED_CLASS_EXT
- /*
- * SCX requires a balance() call before every pick_task() including when
- * waking up from SCHED_IDLE. If @start_class is below SCX, start from
- * SCX instead. Also, set a flag to detect missing balance() call.
- */
- if (scx_enabled()) {
- rq->scx.flags |= SCX_RQ_BAL_PENDING;
- if (sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
- }
-#endif
-
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2013,7 +2013,7 @@ static int balance_one(struct rq *rq, st
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+ rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
@@ -2119,40 +2119,6 @@ static int balance_one(struct rq *rq, st
return true;
}
-static int balance_scx(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
-{
- int ret;
-
- rq_unpin_lock(rq, rf);
-
- ret = balance_one(rq, prev);
-
-#ifdef CONFIG_SCHED_SMT
- /*
- * When core-sched is enabled, this ops.balance() call will be followed
- * by pick_task_scx() on this CPU and the SMT siblings. Balance the
- * siblings too.
- */
- if (sched_core_enabled(rq)) {
- const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
- int scpu;
-
- for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
- struct rq *srq = cpu_rq(scpu);
- struct task_struct *sprev = srq->curr;
-
- WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
- update_rq_clock(srq);
- balance_one(srq, sprev);
- }
- }
-#endif
- rq_repin_lock(rq, rf);
-
- return ret;
-}
-
static void process_ddsp_deferred_locals(struct rq *rq)
{
struct task_struct *p;
@@ -2335,38 +2301,19 @@ static struct task_struct *first_local_t
static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
{
struct task_struct *prev = rq->curr;
+ bool keep_prev, kick_idle = false;
struct task_struct *p;
- bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
- bool kick_idle = false;
- /*
- * WORKAROUND:
- *
- * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
- * have gone through balance_scx(). Unfortunately, there currently is a
- * bug where fair could say yes on balance() but no on pick_task(),
- * which then ends up calling pick_task_scx() without preceding
- * balance_scx().
- *
- * Keep running @prev if possible and avoid stalling from entering idle
- * without balancing.
- *
- * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
- * if pick_task_scx() is called without preceding balance_scx().
- */
- if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
- if (prev->scx.flags & SCX_TASK_QUEUED) {
- keep_prev = true;
- } else {
- keep_prev = false;
- kick_idle = true;
- }
- } else if (unlikely(keep_prev &&
- prev->sched_class != &ext_sched_class)) {
- /*
- * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
- * conditional on scx_enabled() and may have been skipped.
- */
+ rq->queue_mask = 0;
+ rq_unpin_lock(rq, rf);
+ balance_one(rq, prev);
+ rq_repin_lock(rq, rf);
+ if (rq->queue_mask & ~((ext_sched_class.queue_mask << 1)-1))
+ return RETRY_TASK;
+
+ keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3243,7 +3190,6 @@ DEFINE_SCHED_CLASS(ext) = {
.wakeup_preempt = wakeup_preempt_scx,
- .balance = balance_scx,
.pick_task = pick_task_scx,
.put_prev_task = put_prev_task_scx,
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -779,7 +779,6 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
Powered by blists - more mailing lists