linux-kernel - Re: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANLsYkyCEBSUtkhA4iWJn+i_dQDVZKK+6itmSeFv3vURfh__0Q@mail.gmail.com>
Date:   Sun, 26 Mar 2017 11:32:59 -0600
From:   Mathieu Poirier <mathieu.poirier@...aro.org>
To:     luca abeni <luca.abeni@...tannapisa.it>
Cc:     "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Juri Lelli <juri.lelli@....com>,
        Claudio Scordino <claudio@...dence.eu.com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Tommaso Cucinotta <tommaso.cucinotta@...up.it>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Joel Fernandes <joelaf@...gle.com>
Subject: Re: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization

On 23 March 2017 at 21:52, luca abeni <luca.abeni@...tannapisa.it> wrote:
> From: Luca Abeni <luca.abeni@...tannapisa.it>
>
> This patch implements a more theoretically sound algorithm for
> tracking active utilization: instead of decreasing it when a
> task blocks, use a timer (the "inactive timer", named after the
> "Inactive" task state of the GRUB algorithm) to decrease the
> active utilization at the so called "0-lag time".
>
> Signed-off-by: Luca Abeni <luca.abeni@...tannapisa.it>
> Tested-by: Claudio Scordino <claudio@...dence.eu.com>
> Tested-by: Daniel Bristot de Oliveira <bristot@...hat.com>
> ---
>  include/linux/sched.h   |  17 ++++
>  kernel/sched/core.c     |   3 +
>  kernel/sched/deadline.c | 208 ++++++++++++++++++++++++++++++++++++++++++++----
>  kernel/sched/sched.h    |   2 +
>  4 files changed, 215 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d67eee8..952cac8 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -445,16 +445,33 @@ struct sched_dl_entity {
>          *
>          * @dl_yielded tells if task gave up the CPU before consuming
>          * all its available runtime during the last job.
> +        *
> +        * @dl_non_contending tells if task is inactive while still
> +        * contributing to the active utilization. In other words, it
> +        * indicates if the inactive timer has been armed and its handler
> +        * has not been executed yet. This flag is useful to avoid race
> +        * conditions between the inactive timer handler and the wakeup
> +        * code.
>          */
>         int                             dl_throttled;
>         int                             dl_boosted;
>         int                             dl_yielded;
> +       int                             dl_non_contending;
>
>         /*
>          * Bandwidth enforcement timer. Each -deadline task has its
>          * own bandwidth to be enforced, thus we need one timer per task.
>          */
>         struct hrtimer                  dl_timer;
> +
> +       /*
> +        * Inactive timer, responsible for decreasing the active utilization
> +        * at the "0-lag time". When a -deadline task blocks, it contributes
> +        * to GRUB's active utilization until the "0-lag time", hence a
> +        * timer is needed to decrease the active utilization at the correct
> +        * time.
> +        */
> +       struct hrtimer inactive_timer;
>  };
>
>  union rcu_special {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 6d6cad9..bf0b0b9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2165,6 +2165,7 @@ void __dl_clear_params(struct task_struct *p)
>
>         dl_se->dl_throttled = 0;
>         dl_se->dl_yielded = 0;
> +       dl_se->dl_non_contending = 0;
>  }
>
>  /*
> @@ -2196,6 +2197,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>
>         RB_CLEAR_NODE(&p->dl.rb_node);
>         init_dl_task_timer(&p->dl);
> +       init_inactive_task_timer(&p->dl);
>         __dl_clear_params(p);
>
>         INIT_LIST_HEAD(&p->rt.run_list);
> @@ -2518,6 +2520,7 @@ static int dl_overflow(struct task_struct *p, int policy,
>                    !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
>                 __dl_clear(dl_b, p->dl.dl_bw);
>                 __dl_add(dl_b, new_bw);
> +               dl_change_utilization(p, new_bw);
>                 err = 0;
>         } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
>                 __dl_clear(dl_b, p->dl.dl_bw);
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index cef9adb..86aed82 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -65,6 +65,107 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
>                 dl_rq->running_bw = 0;
>  }
>
> +void dl_change_utilization(struct task_struct *p, u64 new_bw)
> +{
> +       if (!task_on_rq_queued(p)) {
> +               struct rq *rq = task_rq(p);
> +
> +               if (p->dl.dl_non_contending) {
> +                       sub_running_bw(p->dl.dl_bw, &rq->dl);
> +                       p->dl.dl_non_contending = 0;
> +                       /*
> +                        * If the timer handler is currently running and the
> +                        * timer cannot be cancelled, inactive_task_timer()
> +                        * will see that dl_non_contending is not set, and
> +                        * will not touch the rq's active utilization,
> +                        * so we are still safe.
> +                        */
> +                       if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> +                               put_task_struct(p);
> +               }
> +       }
> +}
> +
> +static void task_non_contending(struct task_struct *p)
> +{
> +       struct sched_dl_entity *dl_se = &p->dl;
> +       struct hrtimer *timer = &dl_se->inactive_timer;
> +       struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> +       struct rq *rq = rq_of_dl_rq(dl_rq);
> +       s64 zerolag_time;
> +
> +       /*
> +        * If this is a non-deadline task that has been boosted,
> +        * do nothing
> +        */
> +       if (dl_se->dl_runtime == 0)
> +               return;
> +
> +       WARN_ON(hrtimer_active(&dl_se->inactive_timer));
> +       WARN_ON(dl_se->dl_non_contending);
> +
> +       zerolag_time = dl_se->deadline -
> +                div64_long((dl_se->runtime * dl_se->dl_period),
> +                       dl_se->dl_runtime);
> +
> +       /*
> +        * Using relative times instead of the absolute "0-lag time"
> +        * allows us to simplify the code
> +        */
> +       zerolag_time -= rq_clock(rq);
> +
> +       /*
> +        * If the "0-lag time" already passed, decrease the active
> +        * utilization now, instead of starting a timer
> +        */
> +       if (zerolag_time < 0) {
> +               if (dl_task(p))
> +                       sub_running_bw(dl_se->dl_bw, dl_rq);
> +               if (!dl_task(p) || p->state == TASK_DEAD)
> +                       __dl_clear_params(p);
> +
> +               return;
> +       }
> +
> +       dl_se->dl_non_contending = 1;
> +       get_task_struct(p);
> +       hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
> +}
> +
> +static void task_contending(struct sched_dl_entity *dl_se)
> +{
> +       struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> +
> +       /*
> +        * If this is a non-deadline task that has been boosted,
> +        * do nothing
> +        */
> +       if (dl_se->dl_runtime == 0)
> +               return;
> +
> +       if (dl_se->dl_non_contending) {
> +               /*
> +                * If the timer handler is currently running and the
> +                * timer cannot be cancelled, inactive_task_timer()
> +                * will see that dl_non_contending is not set, and
> +                * will not touch the rq's active utilization,
> +                * so we are still safe.
> +                */
> +               if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
> +                       put_task_struct(dl_task_of(dl_se));
> +               dl_se->dl_non_contending = 0;
> +       } else {
> +               /*
> +                * Since "dl_non_contending" is not set, the
> +                * task's utilization has already been removed from
> +                * active utilization (either when the task blocked,
> +                * or when the "inactive timer" fired).
> +                * So, add it back.
> +                */
> +               add_running_bw(dl_se->dl_bw, dl_rq);
> +       }
> +}
> +
>  static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
>  {
>         struct sched_dl_entity *dl_se = &p->dl;
> @@ -615,10 +716,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
>          * The task might have changed its scheduling policy to something
>          * different than SCHED_DEADLINE (through switched_from_dl()).
>          */
> -       if (!dl_task(p)) {
> -               __dl_clear_params(p);
> +       if (!dl_task(p))
>                 goto unlock;
> -       }
>
>         /*
>          * The task might have been boosted by someone else and might be in the
> @@ -837,6 +936,49 @@ static void update_curr_dl(struct rq *rq)
>         }
>  }
>
> +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
> +{
> +       struct sched_dl_entity *dl_se = container_of(timer,
> +                                                    struct sched_dl_entity,
> +                                                    inactive_timer);
> +       struct task_struct *p = dl_task_of(dl_se);
> +       struct rq_flags rf;
> +       struct rq *rq;
> +
> +       rq = task_rq_lock(p, &rf);
> +
> +       if (!dl_task(p) || p->state == TASK_DEAD) {
> +               if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
> +                       sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
> +                       dl_se->dl_non_contending = 0;
> +               }
> +               __dl_clear_params(p);
> +
> +               goto unlock;
> +       }
> +       if (dl_se->dl_non_contending == 0)
> +               goto unlock;
> +
> +       sched_clock_tick();
> +       update_rq_clock(rq);
> +
> +       sub_running_bw(dl_se->dl_bw, &rq->dl);
> +       dl_se->dl_non_contending = 0;
> +unlock:
> +       task_rq_unlock(rq, p, &rf);
> +       put_task_struct(p);
> +
> +       return HRTIMER_NORESTART;
> +}
> +
> +void init_inactive_task_timer(struct sched_dl_entity *dl_se)

To be consistent with the other DL-related functions (e.g.
init_dl_task_timer()), please rename this one:

s/init_inactive_task_timer(...)/init_dl_inactive_task_timer(...)/


> +{
> +       struct hrtimer *timer = &dl_se->inactive_timer;
> +
> +       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +       timer->function = inactive_task_timer;
> +}
> +
>  #ifdef CONFIG_SMP
>
>  static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
> @@ -969,9 +1111,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
>          * we want a replenishment of its runtime.
>          */
>         if (flags & ENQUEUE_WAKEUP) {
> -               struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> -
> -               add_running_bw(dl_se->dl_bw, dl_rq);
> +               task_contending(dl_se);
>                 update_dl_entity(dl_se, pi_se);
>         }
>         else if (flags & ENQUEUE_REPLENISH)
> @@ -1040,7 +1180,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>          * add_running_bw().
>          */
>         if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
> -               add_running_bw(p->dl.dl_bw, &rq->dl);
> +               if (flags & ENQUEUE_WAKEUP)
> +                       task_contending(&p->dl);
> +
>                 return;
>         }
>
> @@ -1065,7 +1207,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>                 sub_running_bw(p->dl.dl_bw, &rq->dl);
>
>         /*
> -        * This check allows to decrease the active utilization in two cases:
> +        * This check allows to start the inactive timer (or to immediately
> +        * decrease the active utilization, if needed) in two cases:
>          * when the task blocks and when it is terminating
>          * (p->state == TASK_DEAD). We can handle the two cases in the same
>          * way, because from GRUB's point of view the same thing is happening
> @@ -1073,7 +1216,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
>          * or "inactive")
>          */
>         if (flags & DEQUEUE_SLEEP)
> -               sub_running_bw(p->dl.dl_bw, &rq->dl);
> +               task_non_contending(p);
>  }
>
>  /*
> @@ -1151,6 +1294,28 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
>         return cpu;
>  }
>
> +static void migrate_task_rq_dl(struct task_struct *p)
> +{
> +       if ((p->state == TASK_WAKING) && (p->dl.dl_non_contending)) {
> +               struct rq *rq = task_rq(p);
> +
> +               raw_spin_lock(&rq->lock);
> +               sub_running_bw(p->dl.dl_bw, &rq->dl);
> +               p->dl.dl_non_contending = 0;
> +               /*
> +                * If the timer handler is currently running and the
> +                * timer cannot be cancelled, inactive_task_timer()
> +                * will see that dl_non_contending is not set, and
> +                * will not touch the rq's active utilization,
> +                * so we are still safe.
> +                */
> +               if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> +                       put_task_struct(p);
> +
> +               raw_spin_unlock(&rq->lock);
> +       }
> +}
> +
>  static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
>  {
>         /*
> @@ -1792,13 +1957,23 @@ void __init init_sched_dl_class(void)
>  static void switched_from_dl(struct rq *rq, struct task_struct *p)
>  {
>         /*
> -        * Start the deadline timer; if we switch back to dl before this we'll
> -        * continue consuming our current CBS slice. If we stay outside of
> -        * SCHED_DEADLINE until the deadline passes, the timer will reset the
> -        * task.
> +        * task_non_contending() can start the "inactive timer" (if the 0-lag
> +        * time is in the future). If the task switches back to dl before
> +        * the "inactive timer" fires, it can continue to consume its current
> +        * runtime using its current deadline. If it stays outside of
> +        * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
> +        * will reset the task parameters.
>          */
> -       if (!start_dl_timer(p))
> -               __dl_clear_params(p);
> +       if (task_on_rq_queued(p) && p->dl.dl_runtime)
> +               task_non_contending(p);
> +
> +       /*
> +        * We cannot use inactive_task_timer() to invoke sub_running_bw()
> +        * at the 0-lag time, because the task could have been migrated
> +        * while SCHED_OTHER in the meanwhile.
> +        */
> +       if (p->dl.dl_non_contending)
> +               p->dl.dl_non_contending = 0;
>
>         /*
>          * Since this might be the only -deadline task on the rq,
> @@ -1817,6 +1992,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
>   */
>  static void switched_to_dl(struct rq *rq, struct task_struct *p)
>  {
> +       if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> +               put_task_struct(p);
>
>         /* If p is not queued we will update its parameters at next wakeup. */
>         if (!task_on_rq_queued(p))
> @@ -1891,6 +2068,7 @@ const struct sched_class dl_sched_class = {
>
>  #ifdef CONFIG_SMP
>         .select_task_rq         = select_task_rq_dl,
> +       .migrate_task_rq        = migrate_task_rq_dl,
>         .set_cpus_allowed       = set_cpus_allowed_dl,
>         .rq_online              = rq_online_dl,
>         .rq_offline             = rq_offline_dl,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index caaa7d3..57bb79b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
>                dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
>  }
>
> +void dl_change_utilization(struct task_struct *p, u64 new_bw);
>  extern void init_dl_bw(struct dl_bw *dl_b);
>
>  #ifdef CONFIG_CGROUP_SCHED
> @@ -1490,6 +1491,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
>  extern struct dl_bandwidth def_dl_bandwidth;
>  extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
>  extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
> +extern void init_inactive_task_timer(struct sched_dl_entity *dl_se);
>
>  unsigned long to_ratio(u64 period, u64 runtime);
>
> --
> 2.7.4
>