Message-ID: <20251020141130.GJ3245006@noisy.programming.kicks-ass.net>
Date: Mon, 20 Oct 2025 16:11:30 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Juri Lelli <juri.lelli@...hat.com>
Cc: Gabriele Monaco <gmonaco@...hat.com>, linux-kernel@...r.kernel.org,
Ingo Molnar <mingo@...hat.com>,
Clark Williams <williams@...hat.com>, arighi@...dia.com
Subject: Re: [RFC PATCH] sched/deadline: Avoid dl_server boosting with
expired deadline
On Wed, Oct 15, 2025 at 07:40:18AM +0200, Juri Lelli wrote:
> On 14/10/25 21:33, Peter Zijlstra wrote:
> > On Tue, Oct 14, 2025 at 06:01:10PM +0200, Juri Lelli wrote:
> >
> > > Shouldn't idle time be accounted (subtracted from runtime) as well, though?
> >
> > Argh, indeed. Then I suppose we should look at bringing some of that
> > 'idle-for-whole-period' logic to try and actually stop the timer at some
> > point if nothing happens.
>
> That was my initial thought. If we get to that replenish after a whole
> idle period elapsed, stop the timer (resetting state), so that we can go
> back to defer mode with the next enqueue from fair.
Finally staring at this again; and I'm, as expected, confused again.
So put_prev_task_idle() calls dl_server_update_idle_time(). But this is
only called when we context switch away from idle. The dl_server_timer()
interrupt won't see this, because the interrupt doesn't schedule.
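IOW, roughly:

  /* accounting only happens on an actual context switch: */
  __schedule()
    -> put_prev_task_idle()
      -> dl_server_update_idle_time(rq, prev)

  /* while the timer is a hard-irq on top of rq->curr and returns
   * without scheduling, so it never goes through the above: */
  dl_server_timer()
    -> replenish/defer logic; idle time since the last switch unseen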
Worse, dl_server_update_idle_time() only updates p->se.exec_start when
it actually did the update. This means that while !dl_defer it won't
advance the timestamp, and once dl_defer is set again it will still see
the old timestamp and wrongly include the !dl_defer time.
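A userspace toy of just that pattern (hypothetical names mirroring the
kernel ones; a sketch of the accounting bug, not kernel code):

#include <stdio.h>

struct toy_server {
	long long runtime;	/* remaining server runtime */
	int dl_defer;		/* defer mode active? */
};

static long long exec_start;	/* stands in for p->se.exec_start */

/* Mirrors the old dl_server_update_idle_time(): bails out while
 * !dl_defer and only advances exec_start when it accounted. */
static void update_idle_time(struct toy_server *s, long long now)
{
	long long delta;

	if (!s->dl_defer)
		return;			/* exec_start NOT advanced */

	delta = now - exec_start;
	if (delta < 0)
		return;

	s->runtime -= delta;
	exec_start = now;
}

int main(void)
{
	struct toy_server s = { .runtime = 100, .dl_defer = 0 };

	update_idle_time(&s, 50);	/* !dl_defer: skipped entirely */
	s.dl_defer = 1;
	update_idle_time(&s, 60);	/* charges all 60, not just 10 */

	printf("runtime: %lld (want 90)\n", s.runtime);
	return 0;
}

That is, the whole [0, 60) window gets charged even though defer mode
only covered [50, 60).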
Also, the enqueue_task_fair() callsite of dl_server_update_idle_time()
is dodgy as heck; the !nr_running check seems to want to ensure p ==
rq->idle, but I'm not sure it actually does.
So how about something like this for starters?
---
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7b7671060bf9..963b85dbc477 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1166,8 +1166,12 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
sched_clock_tick();
update_rq_clock(rq);
- if (!dl_se->dl_runtime)
- return HRTIMER_NORESTART;
+ /*
+ * Make sure current has propagated its pending runtime into
+ * any relevant server through calling dl_server_update() and
+ * friends.
+ */
+ rq->curr->sched_class->update_curr(rq);
if (dl_se->dl_defer_armed) {
/*
@@ -1543,35 +1547,16 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
* as time available for the fair server, avoiding a penalty for the
* rt scheduler that did not consume that time.
*/
-void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
+void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec)
{
- s64 delta_exec;
-
- if (!rq->fair_server.dl_defer)
- return;
-
- /* no need to discount more */
- if (rq->fair_server.runtime < 0)
- return;
-
- delta_exec = rq_clock_task(rq) - p->se.exec_start;
- if (delta_exec < 0)
- return;
-
- rq->fair_server.runtime -= delta_exec;
-
- if (rq->fair_server.runtime < 0) {
- rq->fair_server.dl_defer_running = 0;
- rq->fair_server.runtime = 0;
- }
-
- p->se.exec_start = rq_clock_task(rq);
+ if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer)
+ update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime)
+ if (dl_se->dl_server_active && dl_se->dl_runtime)
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
}
@@ -1582,6 +1567,11 @@ void dl_server_start(struct sched_dl_entity *dl_se)
if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
+ /*
+ * Update the current task to 'now'.
+ */
+ rq->curr->sched_class->update_curr(rq);
+
if (WARN_ON_ONCE(!cpu_online(cpu_of(rq))))
return;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cee1793e8277..c94c996360e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1239,8 +1239,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
* against fair_server such that it can account for this time
* and possibly avoid running this period.
*/
- if (dl_server_active(&rq->fair_server))
- dl_server_update(&rq->fair_server, delta_exec);
+ dl_server_update(&rq->fair_server, delta_exec);
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -6996,12 +6995,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
h_nr_idle = 1;
}
- if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
- /* Account for idle runtime */
- if (!rq->nr_running)
- dl_server_update_idle_time(rq, rq->curr);
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
- }
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c39b089d4f09..89cfc26ada46 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,9 +452,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
+static void update_curr_idle(struct rq *rq);
+
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- dl_server_update_idle_time(rq, prev);
+ update_curr_idle(rq);
scx_update_idle(rq, false, true);
}
@@ -511,6 +513,17 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
static void update_curr_idle(struct rq *rq)
{
+ struct sched_entity *se = &rq->idle->se;
+ u64 now = rq_clock_task(rq);
+ s64 delta_exec;
+
+ delta_exec = now - se->exec_start;
+ if (unlikely(delta_exec <= 0))
+ return;
+
+ se->exec_start = now;
+
+ dl_server_update_idle(&rq->fair_server, delta_exec);
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1f5d07067f60..3bb1e59c5944 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
* naturally throttled to once per period, avoiding high context switch
* workloads from spamming the hrtimer program/cancel paths.
*/
+extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
extern void dl_server_start(struct sched_dl_entity *dl_se);
extern void dl_server_stop(struct sched_dl_entity *dl_se);
@@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_pick_f pick_task);
extern void sched_init_dl_servers(void);
-extern void dl_server_update_idle_time(struct rq *rq,
- struct task_struct *p);
extern void fair_server_init(struct rq *rq);
extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
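IOW, with the above, idle time reaches the server through (as I read
it):

  put_prev_task_idle() / dl_server_timer() / dl_server_start()
    -> update_curr_idle()	/* when rq->curr is rq->idle */
      -> dl_server_update_idle(&rq->fair_server, delta_exec)
        -> update_curr_dl_se()	/* if active && dl_runtime && dl_defer */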