lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAO7JXPijXstA3Eh_LrRGiK26U1Mfn8C1jSXP+4kfTnQRxSax7g@mail.gmail.com>
Date: Mon, 9 Dec 2024 21:52:35 -0500
From: Vineeth Remanan Pillai <vineeth@...byteword.org>
To: Ilya Maximets <i.maximets@....org>
Cc: Peter Zijlstra <peterz@...radead.org>, Joel Fernandes <joel@...lfernandes.org>, 
	LKML <linux-kernel@...r.kernel.org>, Ingo Molnar <mingo@...hat.com>, 
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>, vineethrp@...gle.com, 
	shraash@...gle.com, marcel.ziswiler@...ethink.co.uk
Subject: Re: [v6.12] WARNING: at kernel/sched/deadline.c:1995
 enqueue_dl_entity (task blocked for more than 28262 seconds)

On Mon, Dec 9, 2024 at 7:34 PM Ilya Maximets <i.maximets@....org> wrote:
>
> On 12/9/24 13:56, Peter Zijlstra wrote:
> >
> > Does something like the below make sense?
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d380bffee2ef..abebeb67de4e 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -664,6 +664,7 @@ struct sched_dl_entity {
> >       unsigned int                    dl_non_contending : 1;
> >       unsigned int                    dl_overrun        : 1;
> >       unsigned int                    dl_server         : 1;
> > +     unsigned int                    dl_server_active  : 1;
> >       unsigned int                    dl_defer          : 1;
> >       unsigned int                    dl_defer_armed    : 1;
> >       unsigned int                    dl_defer_running  : 1;
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index d9d5a702f1a6..e2b542f684db 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -1647,6 +1647,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
> >       if (!dl_se->dl_runtime)
> >               return;
> >
> > +     dl_se->dl_server_active = 1;
> >       enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP);
> >       if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl))
> >               resched_curr(dl_se->rq);
> > @@ -1661,6 +1662,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
> >       hrtimer_try_to_cancel(&dl_se->dl_timer);
> >       dl_se->dl_defer_armed = 0;
> >       dl_se->dl_throttled = 0;
> > +     dl_se->dl_server_active = 0;
> >  }
> >
> >  void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
> > @@ -2420,8 +2422,10 @@ static struct task_struct *__pick_task_dl(struct rq *rq)
> >       if (dl_server(dl_se)) {
> >               p = dl_se->server_pick_task(dl_se);
> >               if (!p) {
> > -                     dl_se->dl_yielded = 1;
> > -                     update_curr_dl_se(rq, dl_se, 0);
> > +                     if (dl_se->dl_server_active) {
> > +                             dl_se->dl_yielded = 1;
> > +                             update_curr_dl_se(rq, dl_se, 0);
> > +                     }
> >                       goto again;
> >               }
> >               rq->dl_server = dl_se;
>
> And I tried this one on top of v6.12, but got a warning after about 1 minute (lucky?).
>
Hmm, strange: I have been running it for about 12 hours and it has not
WARNed so far. I am on 6.13-rc1, but git log did not show any
dlserver-related changes between 6.12 and 6.13. I also have another
patch for the double enqueue scenario we were discussing in this
thread (because of the wrong check in update_curr). Could you please
add the following changes on top of the above patches and see if the
issue is still reproducible?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fbdca89c677f..1f4b76c1f032 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1159,8 +1159,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
        trace_sched_stat_runtime(p, delta_exec);
        account_group_exec_runtime(p, delta_exec);
        cgroup_account_cputime(p, delta_exec);
-       if (p->dl_server)
-               dl_server_update(p->dl_server, delta_exec);
 }

 static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
@@ -1210,6 +1208,11 @@ s64 update_curr_common(struct rq *rq)
        return delta_exec;
 }

+static inline bool dl_server_active(struct dl_sched_entity *dl_se)
+{
+       return dl_se->dl_server_active;
+}
+
 /*
  * Update the current task's runtime statistics.
  */
@@ -1237,11 +1240,16 @@ static void update_curr(struct cfs_rq *cfs_rq)
                update_curr_task(p, delta_exec);

                /*
-                * Any fair task that runs outside of fair_server should
-                * account against fair_server such that it can account for
-                * this time and possibly avoid running this period.
+                * If the fair_server is active, we need to account for the
+                * fair_server time whether or not the task is running on
+                * behalf of fair_server or not:
+                *  - If the task is running on behalf of fair_server, we need
+                *    to limit its time based on the assigned runtime.
+                *  - Fair task that runs outside of fair_server should account
+                *    against fair_server such that it can account for this time
+                *    and possibly avoid running this period.
                 */
-               if (p->dl_server != &rq->fair_server)
+               if (dl_server_active(&rq->fair_server))
                        dl_server_update(&rq->fair_server, delta_exec);
        }


Thanks for your time testing the fixes :-)

~Vineeth

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ