[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <06a86aa50e62911871e95bb36b420cbd4e4fafbc.camel@codethink.co.uk>
Date: Tue, 10 Dec 2024 17:11:24 +0100
From: Marcel Ziswiler <marcel.ziswiler@...ethink.co.uk>
To: Vineeth Remanan Pillai <vineeth@...byteword.org>, Ilya Maximets
<i.maximets@....org>
Cc: Peter Zijlstra <peterz@...radead.org>, Joel Fernandes
<joel@...lfernandes.org>, LKML <linux-kernel@...r.kernel.org>, Ingo Molnar
<mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, vineethrp@...gle.com, shraash@...gle.com
Subject: Re: [v6.12] WARNING: at kernel/sched/deadline.c:1995
enqueue_dl_entity (task blocked for more than 28262 seconds)
On Mon, 2024-12-09 at 21:52 -0500, Vineeth Remanan Pillai wrote:
> On Mon, Dec 9, 2024 at 7:34 PM Ilya Maximets <i.maximets@....org> wrote:
> >
> > On 12/9/24 13:56, Peter Zijlstra wrote:
> > >
> > > Does something like the below make sense?
> > >
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index d380bffee2ef..abebeb67de4e 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -664,6 +664,7 @@ struct sched_dl_entity {
> > > unsigned int dl_non_contending : 1;
> > > unsigned int dl_overrun : 1;
> > > unsigned int dl_server : 1;
> > > + unsigned int dl_server_active : 1;
> > > unsigned int dl_defer : 1;
> > > unsigned int dl_defer_armed : 1;
> > > unsigned int dl_defer_running : 1;
> > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > index d9d5a702f1a6..e2b542f684db 100644
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -1647,6 +1647,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
> > > if (!dl_se->dl_runtime)
> > > return;
> > >
> > > + dl_se->dl_server_active = 1;
> > > enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
> > > if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
> > > resched_curr(dl_se->rq);
> > > @@ -1661,6 +1662,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
> > > hrtimer_try_to_cancel(&dl_se->dl_timer);
> > > dl_se->dl_defer_armed = 0;
> > > dl_se->dl_throttled = 0;
> > > + dl_se->dl_server_active = 0;
> > > }
> > >
> > > void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
> > > @@ -2420,8 +2422,10 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
> > > if (dl_server(dl_se)) {
> > > p = dl_se->server_pick_task(dl_se);
> > > if (!p) {
> > > - dl_se->dl_yielded = 1;
> > > - update_curr_dl_se(rq, dl_se, 0);
> > > + if (dl_se->dl_server_active) {
> > > + dl_se->dl_yielded = 1;
> > > + update_curr_dl_se(rq, dl_se, 0);
> > > + }
> > > goto again;
> > > }
> > > rq->dl_server = dl_se;
> >
> > And I tried this one on top of v6.12, but got a warning after about 1 minute (lucky?).
> >
> Hmm strange, I was running it for about 12 hours and it has not WARNed
> till now. I am on 6.13-rc1 but git log did not show any dlserver
> related changes between 6.12 and 6.13 though. I also have another
> patch for the double enqueue scenario we were discussing in this
> thread (because of the wrong check in update_curr). Could you please
> add the following changes to above patches and see if the issue is
> reproducible?
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index fbdca89c677f..1f4b76c1f032 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1159,8 +1159,6 @@ static inline void update_curr_task(struct
> task_struct *p, s64 delta_exec)
> trace_sched_stat_runtime(p, delta_exec);
> account_group_exec_runtime(p, delta_exec);
> cgroup_account_cputime(p, delta_exec);
> - if (p->dl_server)
> - dl_server_update(p->dl_server, delta_exec);
> }
>
> static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct
> sched_entity *curr)
> @@ -1210,6 +1208,11 @@ s64 update_curr_common(struct rq *rq)
> return delta_exec;
> }
>
> +static inline bool dl_server_active(struct dl_sched_entity *dl_se)
> +{
> + return dl_se->dl_server_active;
> +}
> +
> /*
> * Update the current task's runtime statistics.
> */
> @@ -1237,11 +1240,16 @@ static void update_curr(struct cfs_rq *cfs_rq)
> update_curr_task(p, delta_exec);
>
> /*
> - * Any fair task that runs outside of fair_server should
> - * account against fair_server such that it can account for
> - * this time and possibly avoid running this period.
> + * If the fair_server is active, we need to account for the
> + * fair_server time whether or not the task is running on
> + * behalf of fair_server or not:
> + * - If the task is running on behalf of fair_server, we need
> + * to limit its time based on the assigned runtime.
> + * - Fair task that runs outside of fair_server should account
> + * against fair_server such that it can account for this time
> + * and possibly avoid running this period.
> */
> - if (p->dl_server != &rq->fair_server)
> + if (dl_server_active(&rq->fair_server))
> dl_server_update(&rq->fair_server, delta_exec);
> }
That indeed also fixes it for me.
https://drive.codethink.co.uk/s/s9kZQs2Mz6DpH3X
> Thanks for your time testing the fixes :-)
You are very welcome. Thank you!
> ~Vineeth
Cheers
Marcel
Powered by blists - more mailing lists