Message-ID: <CAKfTPtC9bfoMeYhhmjQoZyPRYvWLa=4L4qFC_B4vWUMRh8nz4g@mail.gmail.com>
Date: Fri, 18 Dec 2020 14:12:50 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Xuewen Yan <xuewen.yan94@...il.com>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>,
Juri Lelli <juri.lelli@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
linux-kernel <linux-kernel@...r.kernel.org>,
Patrick Bellasi <patrick.bellasi@....com>,
Chunyan Zhang <zhang.lyra@...il.com>,
王科 (Ke Wang) <Ke.Wang@...soc.com>,
Ryan Y <xuewyan@...mail.com>,
Xuewen Yan <Xuewen.Yan@...soc.com>
Subject: Re: [PATCH v3] sched/fair: Avoid stale CPU util_est value for
schedutil in task dequeue
On Fri, 18 Dec 2020 at 10:28, Xuewen Yan <xuewen.yan94@...il.com> wrote:
>
> From: Xuewen Yan <xuewen.yan@...soc.com>
>
> CPU (root cfs_rq) estimated utilization (util_est) is currently used in
> dequeue_task_fair() to drive frequency selection before it is updated.
>
> With:
>
> CPU_util : rq->cfs.avg.util_avg
> CPU_util_est : rq->cfs.avg.util_est
> CPU_utilization : max(CPU_util, CPU_util_est)
> task_util : p->se.avg.util_avg
> task_util_est : p->se.avg.util_est
>
> dequeue_task_fair():
>
> /* (1) CPU_util and task_util update + inform schedutil about
> CPU_utilization changes */
> for_each_sched_entity() /* 2 loops */
> (dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change()
> -> cpufreq_update_util() -> ... -> sugov_update_[shared|single]
> -> sugov_get_util() -> cpu_util_cfs()
>
> /* (2) CPU_util_est and task_util_est update */
> util_est_dequeue()
>
> cpu_util_cfs() uses CPU_utilization which could lead to a false (too
> high) utilization value for schedutil in task ramp-down or ramp-up
> scenarios during task dequeue.
>
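For readers following along, a minimal sketch of what cpu_util_cfs()
does at this point (simplified from kernel/sched/sched.h, not the
verbatim implementation):

	static inline unsigned long cpu_util_cfs(struct rq *rq)
	{
		/* CPU_util */
		unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);

		/* CPU_utilization = max(CPU_util, CPU_util_est) */
		if (sched_feat(UTIL_EST))
			util = max_t(unsigned long, util,
				     READ_ONCE(rq->cfs.avg.util_est.enqueued));

		return util;
	}

So as long as rq->cfs.avg.util_est.enqueued still carries the dequeued
task's contribution, schedutil can see a stale, too high CPU_utilization.
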
> To mitigate the issue, split the util_est update (2) into:
>
> (A) CPU_util_est update in util_est_dequeue()
> (B) task_util_est update in util_est_update()
>
> Place (A) before (1) and keep (B) where (2) is. The latter is necessary
> since (B) relies on the task_util update done in (1).
>
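With the patch applied, the dequeue path becomes (same notation as the
diagram above; my summary of the resulting flow, not the author's text):

	dequeue_task_fair():

	    /* (A) CPU_util_est update */
	    util_est_dequeue()

	    /* (1) CPU_util and task_util update + inform schedutil about
	       CPU_utilization changes */
	    for_each_sched_entity() /* 2 loops */
	        (dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change()
	          -> cpufreq_update_util() -> ... -> sugov_get_util()
	            -> cpu_util_cfs()

	    /* (B) task_util_est update */
	    util_est_update()

so cpu_util_cfs() now reads a CPU_util_est that no longer includes the
dequeued task.
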
maybe add a
Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT")
> Signed-off-by: Xuewen Yan <xuewen.yan@...soc.com>
> Reviewed-by: Dietmar Eggemann <dietmar.eggemann@....com>
Reviewed-by: Vincent Guittot <vincent.guittot@...aro.org>
> ---
> Changes since v2:
> -modify the comment
> -move util_est_dequeue() above within_margin()
> -fix tab and space whitespace
>
> Changes since v1:
> -change util_est_dequeue()/util_est_update() to inline functions
> -use unsigned int enqueued rather than struct util_est in util_est_dequeue()
> -remove the "cpu" local variable
>
> ---
> kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++---------------
> 1 file changed, 28 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ae7ceba..f3a1b7a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3932,6 +3932,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
> trace_sched_util_est_cfs_tp(cfs_rq);
> }
>
> +static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
> + struct task_struct *p)
> +{
> + unsigned int enqueued;
> +
> + if (!sched_feat(UTIL_EST))
> + return;
> +
> + /* Update root cfs_rq's estimated utilization */
> + enqueued = cfs_rq->avg.util_est.enqueued;
> + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
> + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
> +
> + trace_sched_util_est_cfs_tp(cfs_rq);
> +}
> +
> /*
> * Check if a (signed) value is within a specified (unsigned) margin,
> * based on the observation that:
> @@ -3945,23 +3961,16 @@ static inline bool within_margin(int value, int margin)
> return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
> }
>
> -static void
> -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
> +static inline void util_est_update(struct cfs_rq *cfs_rq,
> + struct task_struct *p,
> + bool task_sleep)
> {
> long last_ewma_diff;
> struct util_est ue;
> - int cpu;
>
> if (!sched_feat(UTIL_EST))
> return;
>
> - /* Update root cfs_rq's estimated utilization */
> - ue.enqueued = cfs_rq->avg.util_est.enqueued;
> - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
> - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
> -
> - trace_sched_util_est_cfs_tp(cfs_rq);
> -
> /*
> * Skip update of task's estimated utilization when the task has not
> * yet completed an activation, e.g. being migrated.
> @@ -4001,8 +4010,7 @@ static inline bool within_margin(int value, int margin)
> * To avoid overestimation of actual task utilization, skip updates if
> * we cannot grant there is idle time in this CPU.
> */
> - cpu = cpu_of(rq_of(cfs_rq));
> - if (task_util(p) > capacity_orig_of(cpu))
> + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
> return;
>
> /*
> @@ -4085,8 +4093,11 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
> util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
>
> static inline void
> -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
> - bool task_sleep) {}
> +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
> +
> +static inline void
> +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
> + bool task_sleep) {}
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
>
> #endif /* CONFIG_SMP */
> @@ -5589,6 +5600,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> int idle_h_nr_running = task_has_idle_policy(p);
> bool was_sched_idle = sched_idle_rq(rq);
>
> + util_est_dequeue(&rq->cfs, p);
> +
> for_each_sched_entity(se) {
> cfs_rq = cfs_rq_of(se);
> dequeue_entity(cfs_rq, se, flags);
> @@ -5639,7 +5652,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> rq->next_balance = jiffies;
>
> dequeue_throttle:
> - util_est_dequeue(&rq->cfs, p, task_sleep);
> + util_est_update(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
> }
>
> --
> 1.9.1
>