[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAO7JXPijXstA3Eh_LrRGiK26U1Mfn8C1jSXP+4kfTnQRxSax7g@mail.gmail.com>
Date: Mon, 9 Dec 2024 21:52:35 -0500
From: Vineeth Remanan Pillai <vineeth@...byteword.org>
To: Ilya Maximets <i.maximets@....org>
Cc: Peter Zijlstra <peterz@...radead.org>, Joel Fernandes <joel@...lfernandes.org>,
LKML <linux-kernel@...r.kernel.org>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>, vineethrp@...gle.com,
shraash@...gle.com, marcel.ziswiler@...ethink.co.uk
Subject: Re: [v6.12] WARNING: at kernel/sched/deadline.c:1995
enqueue_dl_entity (task blocked for more than 28262 seconds)
On Mon, Dec 9, 2024 at 7:34 PM Ilya Maximets <i.maximets@....org> wrote:
>
> On 12/9/24 13:56, Peter Zijlstra wrote:
> >
> > Does something like the below make sense?
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d380bffee2ef..abebeb67de4e 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -664,6 +664,7 @@ struct sched_dl_entity {
> > unsigned int dl_non_contending : 1;
> > unsigned int dl_overrun : 1;
> > unsigned int dl_server : 1;
> > + unsigned int dl_server_active : 1;
> > unsigned int dl_defer : 1;
> > unsigned int dl_defer_armed : 1;
> > unsigned int dl_defer_running : 1;
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index d9d5a702f1a6..e2b542f684db 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -1647,6 +1647,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
> > if (!dl_se->dl_runtime)
> > return;
> >
> > + dl_se->dl_server_active = 1;
> > enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
> > if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
> > resched_curr(dl_se->rq);
> > @@ -1661,6 +1662,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
> > hrtimer_try_to_cancel(&dl_se->dl_timer);
> > dl_se->dl_defer_armed = 0;
> > dl_se->dl_throttled = 0;
> > + dl_se->dl_server_active = 0;
> > }
> >
> > void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
> > @@ -2420,8 +2422,10 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
> > if (dl_server(dl_se)) {
> > p = dl_se->server_pick_task(dl_se);
> > if (!p) {
> > - dl_se->dl_yielded = 1;
> > - update_curr_dl_se(rq, dl_se, 0);
> > + if (dl_se->dl_server_active) {
> > + dl_se->dl_yielded = 1;
> > + update_curr_dl_se(rq, dl_se, 0);
> > + }
> > goto again;
> > }
> > rq->dl_server = dl_se;
>
> And I tried this one on top of v6.12, but got a warning after about 1 minute (lucky?).
>
Hmm, strange: I have been running it for about 12 hours and it has not
WARNed so far. I am on 6.13-rc1, but git log did not show any dl_server
related changes between 6.12 and 6.13. I also have another
patch for the double enqueue scenario we were discussing in this
thread (caused by the wrong check in update_curr). Could you please
add the following changes on top of the above patches and see if the
issue is still reproducible?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbdca89c677f..1f4b76c1f032 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1159,8 +1159,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
trace_sched_stat_runtime(p, delta_exec);
account_group_exec_runtime(p, delta_exec);
cgroup_account_cputime(p, delta_exec);
- if (p->dl_server)
- dl_server_update(p->dl_server, delta_exec);
}
static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
@@ -1210,6 +1208,11 @@ s64 update_curr_common(struct rq *rq)
return delta_exec;
}
+static inline bool dl_server_active(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_server_active; /* 1 from dl_server_start() until dl_server_stop() */
+}
+
/*
* Update the current task's runtime statistics.
*/
@@ -1237,11 +1240,16 @@ static void update_curr(struct cfs_rq *cfs_rq)
update_curr_task(p, delta_exec);
/*
- * Any fair task that runs outside of fair_server should
- * account against fair_server such that it can account for
- * this time and possibly avoid running this period.
+ * If the fair_server is active, we need to account for the
+ * fair_server time whether or not the task is running on
+ * behalf of fair_server:
+ * - If the task is running on behalf of fair_server, we need
+ * to limit its time based on the assigned runtime.
+ * - A fair task that runs outside of fair_server should account
+ * against fair_server such that it can account for this time
+ * and possibly avoid running this period.
*/
- if (p->dl_server != &rq->fair_server)
+ if (dl_server_active(&rq->fair_server))
dl_server_update(&rq->fair_server, delta_exec);
}
Thanks for your time testing the fixes :-)
~Vineeth
Powered by blists - more mailing lists