[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251029191111.167537-7-arighi@nvidia.com>
Date: Wed, 29 Oct 2025 20:08:43 +0100
From: Andrea Righi <arighi@...dia.com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Tejun Heo <tj@...nel.org>,
David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>,
Shuah Khan <shuah@...nel.org>,
Joel Fernandes <joelagnelf@...dia.com>,
Christian Loehle <christian.loehle@....com>,
Emil Tsalapatis <emil@...alapatis.com>,
Luigi De Matteis <ldematteis123@...il.com>,
sched-ext@...ts.linux.dev,
bpf@...r.kernel.org,
linux-kselftest@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: [PATCH 06/11] sched_ext: Add a DL server for sched_ext tasks
sched_ext tasks can currently be starved by RT tasks: a workload that is
converted to EXT can get zero runtime if RT tasks keep the CPU 100% busy,
causing the EXT processes to stall. Fix this by adding a DL server for EXT.
A kselftest, added later in this series, verifies the fix:
# ./runner -t rt_stall
===== START =====
TEST: rt_stall
DESCRIPTION: Verify that RT tasks cannot stall SCHED_EXT tasks
OUTPUT:
# Runtime of EXT task (PID 23338) is 0.250000 seconds
# Runtime of RT task (PID 23339) is 4.750000 seconds
# EXT task got 5.00% of total runtime
ok 1 PASS: EXT task got more than 4.00% of runtime
===== END =====
v3: - clarify that fair is not the only dl_server (Juri Lelli)
- remove explicit stop to reduce timer reprogramming overhead
(Juri Lelli)
- do not restart pick_task() when it's invoked by the dl_server
(Tejun Heo)
- depend on CONFIG_SCHED_CLASS_EXT (Andrea Righi)
v2: - drop ->balance() now that pick_task() has an rf argument
(Andrea Righi)
Cc: Luigi De Matteis <ldematteis123@...il.com>
Co-developed-by: Joel Fernandes <joelagnelf@...dia.com>
Signed-off-by: Joel Fernandes <joelagnelf@...dia.com>
Signed-off-by: Andrea Righi <arighi@...dia.com>
---
kernel/sched/core.c | 3 +++
kernel/sched/ext.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 2 ++
3 files changed, 50 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 096e8d03d85e7..31a9c9381c63f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8679,6 +8679,9 @@ void __init sched_init(void)
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ ext_server_init(rq);
+#endif
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d1ef5bda95aec..2a25749c54ba1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -902,6 +902,9 @@ static void update_curr_scx(struct rq *rq)
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
+
+ if (dl_server_active(&rq->ext_server))
+ dl_server_update(&rq->ext_server, delta_exec);
}
static bool scx_dsq_priq_less(struct rb_node *node_a,
@@ -1409,6 +1412,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
+ if (rq->scx.nr_running == 1) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr, &rq->ext_server);
+
+ /* Start dl_server if this is the first task being enqueued */
+ dl_server_start(&rq->ext_server);
+ }
+
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
@@ -2444,6 +2456,30 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return do_pick_task_scx(rq, rf, false);
}
+/*
+ * Select the next task to run from the ext scheduling class.
+ *
+ * Use do_pick_task_scx() directly with @force_scx enabled, since the
+ * dl_server must always select a sched_ext task.
+ */
+static struct task_struct *
+ext_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
+{
+ return do_pick_task_scx(dl_se->rq, rf, true);
+}
+
+/*
+ * Initialize the ext server deadline entity.
+ */
+void ext_server_init(struct rq *rq)
+{
+ struct sched_dl_entity *dl_se = &rq->ext_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, ext_server_pick_task);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -3023,6 +3059,15 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
scx_disable_task(p);
+
+ /*
+ * After class switch, if the DL server is still active, restart it so
+ * that DL timers will be queued, in case SCX switched to higher class.
+ */
+ if (dl_server_active(&rq->ext_server)) {
+ dl_server_stop(&rq->ext_server);
+ dl_server_start(&rq->ext_server);
+ }
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eaae470841dea..002e5c1808014 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -415,6 +415,7 @@ extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p,
struct sched_dl_entity *rq_dl_server);
extern void fair_server_init(struct rq *rq);
+extern void ext_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
u64 runtime, u64 period, bool init);
@@ -1154,6 +1155,7 @@ struct rq {
struct dl_rq dl;
#ifdef CONFIG_SCHED_CLASS_EXT
struct scx_rq scx;
+ struct sched_dl_entity ext_server;
#endif
struct sched_dl_entity fair_server;
--
2.51.2
Powered by blists - more mailing lists