[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251217093923.1556187-5-arighi@nvidia.com>
Date: Wed, 17 Dec 2025 10:35:42 +0100
From: Andrea Righi <arighi@...dia.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Tejun Heo <tj@...nel.org>,
David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>,
Shuah Khan <shuah@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Christian Loehle <christian.loehle@....com>,
Emil Tsalapatis <emil@...alapatis.com>,
sched-ext@...ts.linux.dev,
bpf@...r.kernel.org,
linux-kselftest@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: [PATCH 4/7] sched_ext: Add a DL server for sched_ext tasks
sched_ext tasks can currently be starved by RT tasks. The same workload,
once converted to EXT, can get zero runtime if RT tasks keep the CPU
100% busy, causing the EXT tasks to stall. Fix this by adding a DL
server for EXT.
A kselftest is also included later in this series to confirm that both
the fair and ext DL servers are functioning correctly:
# ./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
TAP version 13
1..1
# Runtime of FAIR task (PID 1511) is 0.250000 seconds
# Runtime of RT task (PID 1512) is 4.750000 seconds
# FAIR task got 5.00% of total runtime
ok 1 PASS: FAIR task got more than 4.00% of runtime
TAP version 13
1..1
# Runtime of EXT task (PID 1514) is 0.250000 seconds
# Runtime of RT task (PID 1515) is 4.750000 seconds
# EXT task got 5.00% of total runtime
ok 2 PASS: EXT task got more than 4.00% of runtime
TAP version 13
1..1
# Runtime of FAIR task (PID 1517) is 0.250000 seconds
# Runtime of RT task (PID 1518) is 4.750000 seconds
# FAIR task got 5.00% of total runtime
ok 3 PASS: FAIR task got more than 4.00% of runtime
TAP version 13
1..1
# Runtime of EXT task (PID 1521) is 0.250000 seconds
# Runtime of RT task (PID 1522) is 4.750000 seconds
# EXT task got 5.00% of total runtime
ok 4 PASS: EXT task got more than 4.00% of runtime
ok 1 rt_stall #
===== END =====
v4: - initialize EXT server bandwidth reservation at init time and
always keep it active (Andrea Righi)
- check for rq->nr_running == 1 to determine when to account idle
time (Juri Lelli)
v3: - clarify that fair is not the only dl_server (Juri Lelli)
- remove explicit stop to reduce timer reprogramming overhead
(Juri Lelli)
- do not restart pick_task() when it's invoked by the dl_server
(Tejun Heo)
- depend on CONFIG_SCHED_CLASS_EXT (Andrea Righi)
v2: - drop ->balance() now that pick_task() has an rf argument
(Andrea Righi)
Tested-by: Christian Loehle <christian.loehle@....com>
Co-developed-by: Joel Fernandes <joelagnelf@...dia.com>
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
Signed-off-by: Andrea Righi <arighi@...dia.com>
---
kernel/sched/core.c | 6 +++
kernel/sched/deadline.c | 84 ++++++++++++++++++++++++++++++-----------
kernel/sched/ext.c | 42 +++++++++++++++++++++
kernel/sched/idle.c | 3 ++
kernel/sched/sched.h | 2 +
kernel/sched/topology.c | 5 +++
6 files changed, 119 insertions(+), 23 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41ba0be169117..a2400ee33a356 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8475,6 +8475,9 @@ int sched_cpu_dying(unsigned int cpu)
dump_rq_tasks(rq, KERN_WARNING);
}
dl_server_stop(&rq->fair_server);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_stop(&rq->ext_server);
+#endif
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
@@ -8678,6 +8681,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2789db5217cd4..88f2b5ed5678a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1445,8 +1445,8 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
dl_se->dl_defer_idle = 0;
/*
- * The fair server can consume its runtime while throttled (not queued/
- * running as regular CFS).
+ * The DL server can consume its runtime while throttled (not
+ * queued / running as regular CFS).
*
* If the server consumes its entire runtime in this state. The server
* is not required for the current period. Thus, reset the server by
@@ -1531,10 +1531,10 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
}
/*
- * The fair server (sole dl_server) does not account for real-time
- * workload because it is running fair work.
+ * A dl_server does not account for real-time workload because it
+ * is running fair or sched_ext work.
*/
- if (dl_se == &rq->fair_server)
+ if (dl_se->dl_server)
return;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -1569,9 +1569,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
* In the non-defer mode, the idle time is not accounted, as the
* server provides a guarantee.
*
- * If the dl_server is in defer mode, the idle time is also considered
- * as time available for the fair server, avoiding a penalty for the
- * rt scheduler that did not consumed that time.
+ * If the dl_server is in defer mode, the idle time is also considered as
+ * time available for the dl_server, avoiding a penalty for the rt
+ * scheduler that did not consume that time.
*/
void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
{
@@ -1810,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
hrtimer_try_to_cancel(&dl_se->dl_timer);
dl_se->dl_defer_armed = 0;
dl_se->dl_throttled = 0;
+ dl_se->dl_defer_running = 0;
dl_se->dl_defer_idle = 0;
dl_se->dl_server_active = 0;
}
@@ -1844,6 +1845,18 @@ void sched_init_dl_servers(void)
dl_se->dl_server = 1;
dl_se->dl_defer = 1;
setup_new_dl_entity(dl_se);
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &rq->ext_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+#endif
}
}
@@ -3183,6 +3196,36 @@ void dl_add_task_root_domain(struct task_struct *p)
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
}
+static void dl_server_add_bw(struct root_domain *rd, int cpu)
+{
+ struct sched_dl_entity *dl_se;
+
+ dl_se = &cpu_rq(cpu)->fair_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_se = &cpu_rq(cpu)->ext_server;
+ if (dl_server(dl_se) && cpu_active(cpu))
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu));
+#endif
+}
+
+static u64 dl_server_read_bw(int cpu)
+{
+ u64 dl_bw = 0;
+
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ dl_bw += cpu_rq(cpu)->fair_server.dl_bw;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (cpu_rq(cpu)->ext_server.dl_server)
+ dl_bw += cpu_rq(cpu)->ext_server.dl_bw;
+#endif
+
+ return dl_bw;
+}
+
void dl_clear_root_domain(struct root_domain *rd)
{
int i;
@@ -3201,12 +3244,8 @@ void dl_clear_root_domain(struct root_domain *rd)
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
* them, we need to account for them here explicitly.
*/
- for_each_cpu(i, rd->span) {
- struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
-
- if (dl_server(dl_se) && cpu_active(i))
- __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
- }
+ for_each_cpu(i, rd->span)
+ dl_server_add_bw(rd, i);
}
void dl_clear_root_domain_cpu(int cpu)
@@ -3702,7 +3741,7 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
- u64 fair_server_bw = 0;
+ u64 dl_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -3735,27 +3774,26 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
cap -= arch_scale_cpu_capacity(cpu);
/*
- * cpu is going offline and NORMAL tasks will be moved away
- * from it. We can thus discount dl_server bandwidth
- * contribution as it won't need to be servicing tasks after
- * the cpu is off.
+ * cpu is going offline and NORMAL and EXT tasks will be
+ * moved away from it. We can thus discount dl_server
+ * bandwidth contribution as it won't need to be servicing
+ * tasks after the cpu is off.
*/
- if (cpu_rq(cpu)->fair_server.dl_server)
- fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+ dl_server_bw = dl_server_read_bw(cpu);
/*
* Not much to check if no DEADLINE bandwidth is present.
* dl_servers we can discount, as tasks will be moved out the
* offlined CPUs anyway.
*/
- if (dl_b->total_bw - fair_server_bw > 0) {
+ if (dl_b->total_bw - dl_server_bw > 0) {
/*
* Leaving at least one CPU for DEADLINE tasks seems a
* wise thing to do. As said above, cpu is not offline
* yet, so account for that.
*/
if (dl_bw_cpus(cpu) - 1)
- overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ overflow = __dl_overflow(dl_b, cap, dl_server_bw, 0);
else
overflow = 1;
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94164f2dec6dc..04daaac74f514 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -957,6 +957,8 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -1500,6 +1502,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ /* Start dl_server if this is the first task being enqueued */
+ if (rq->scx.nr_running == 1)
+ dl_server_start(&rq->ext_server);
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2511,6 +2517,33 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return do_pick_task_scx(rq, rf, false);
}
+/*
+ * Select the next task to run from the ext scheduling class.
+ *
+ * Use do_pick_task_scx() directly with @force_scx enabled, since the
+ * dl_server must always select a sched_ext task.
+ */
+static struct task_struct *
+ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+{
+ if (!scx_enabled())
+ return NULL;
+
+ return do_pick_task_scx(dl_se->rq, rf, true);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_pick_task);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -3090,6 +3123,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
scx_disable_task(p);
+
+ /*
+ * After class switch, if the DL server is still active, restart it so
+ * that DL timers will be queued, in case SCX switched to higher class.
+ */
+ if (dl_server_active(&rq->ext_server)) {
+ dl_server_stop(&rq->ext_server);
+ dl_server_start(&rq->ext_server);
+ }
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c174afe1dd177..53793b9a04185 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -530,6 +530,9 @@ static void update_curr_idle(struct rq *rq)
se->exec_start = now;
dl_server_update_idle(&rq->fair_server, delta_exec);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ dl_server_update_idle(&rq->ext_server, delta_exec);
+#endif
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d30cca6870f5f..28c24cda1c3ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -414,6 +414,7 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
extern void sched_init_dl_servers(void);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -1151,6 +1152,7 @@ struct rq {
struct dl_rq dl;
#ifdef CONFIG_SCHED_CLASS_EXT
struct scx_rq scx;
+ struct sched_dl_entity ext_server;
#endif
struct sched_dl_entity fair_server;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd2..ac268da917781 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -508,6 +508,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (rq->fair_server.dl_server)
__dl_server_attach_root(&rq->fair_server, rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (rq->ext_server.dl_server)
+ __dl_server_attach_root(&rq->ext_server, rq);
+#endif
+
rq_unlock_irqrestore(rq, &rf);
if (old_rd)
--
2.52.0
Powered by blists - more mailing lists