linux-kernel - Re: [RFC][PATCH 03/14] sched/fair: Remove se->load.weight from se->avg.load

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtB3vgmBcM=cwDBFWCNVvdk9eupbotYeP7oN98yUeaVioA@mail.gmail.com>
Date:   Wed, 17 May 2017 09:04:47 +0200
From:   Vincent Guittot <vincent.guittot@...aro.org>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     Ingo Molnar <mingo@...nel.org>,
        linux-kernel <linux-kernel@...r.kernel.org>,
        Tejun Heo <tj@...nel.org>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Mike Galbraith <efault@....de>, Paul Turner <pjt@...gle.com>,
        Chris Mason <clm@...com>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Morten Rasmussen <morten.rasmussen@....com>,
        Ben Segall <bsegall@...gle.com>,
        Yuyang Du <yuyang.du@...el.com>
Subject: Re: [RFC][PATCH 03/14] sched/fair: Remove se->load.weight from se->avg.load_sum

Hi Peter,

On 12 May 2017 at 18:44, Peter Zijlstra <peterz@...radead.org> wrote:
> Remove the load from the load_sum for sched_entities, basically
> turning load_sum into runnable_sum.  This prepares for better
> reweighting of group entities.
>
> Since we now have different rules for computing load_avg, split
> ___update_load_avg() into two parts, ___update_load_sum() and
> ___update_load_avg().
>
> So for se:
>
>   ___update_load_sum(.weight = 1)
>   ___upate_load_avg(.weight = se->load.weight)
>
> and for cfs_rq:
>
>   ___update_load_sum(.weight = cfs_rq->load.weight)
>   ___upate_load_avg(.weight = 1)
>
> Since the primary consumable is load_avg, most things will not be
> affected. Only those few sites that initialize/modify load_sum need
> attention.

I wonder if there is a problem with this new way to compute se's
load_avg and cfs_rq's load_avg when a task changes is nice prio before
migrating to another CPU.

se load_avg is now: runnable x current weight
but cfs_rq load_avg keeps the history of the previous weight of the se
When we detach se, we will remove an up to date se's load_avg from
cfs_rq which doesn't have the up to date load_avg in its own load_avg.
So if se's prio decreases just before migrating, some load_avg stays
in prev cfs_rq and if se's prio increases, we will remove too much
load_avg and possibly make the cfs_rq load_avg null whereas other
tasks are running.

Thought ?

I'm able to reproduce the problem with a simple rt-app use case (after
adding a new feature in rt-app)

Vincent

>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>  kernel/sched/fair.c |   91 ++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 64 insertions(+), 27 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -744,7 +744,7 @@ void init_entity_runnable_average(struct
>          */
>         if (entity_is_task(se))
>                 sa->load_avg = scale_load_down(se->load.weight);
> -       sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
> +       sa->load_sum = LOAD_AVG_MAX;
>         /*
>          * At this point, util_avg won't be used in select_task_rq_fair anyway
>          */
> @@ -1967,7 +1967,7 @@ static u64 numa_get_avg_runtime(struct t
>                 delta = runtime - p->last_sum_exec_runtime;
>                 *period = now - p->last_task_numa_placement;
>         } else {
> -               delta = p->se.avg.load_sum / p->se.load.weight;
> +               delta = p->se.avg.load_sum;
>                 *period = LOAD_AVG_MAX;
>         }
>
> @@ -2872,8 +2872,8 @@ accumulate_sum(u64 delta, int cpu, struc
>   *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
>   */
>  static __always_inline int
> -___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> -                 unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> +                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
>  {
>         u64 delta;
>
> @@ -2907,39 +2907,80 @@ ___update_load_avg(u64 now, int cpu, str
>         if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
>                 return 0;
>
> +       return 1;
> +}
> +
> +static __always_inline void
> +___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
> +{
> +       u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> +
>         /*
>          * Step 2: update *_avg.
>          */
> -       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
> +       sa->load_avg = div_u64(weight * sa->load_sum, divider);
>         if (cfs_rq) {
>                 cfs_rq->runnable_load_avg =
> -                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
> +                       div_u64(cfs_rq->runnable_load_sum, divider);
>         }
> -       sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
> +       sa->util_avg = sa->util_sum / divider;
> +}
>
> -       return 1;
> +/*
> + * XXX we want to get rid of this helper and use the full load resolution.
> + */
> +static inline long se_weight(struct sched_entity *se)
> +{
> +       return scale_load_down(se->load.weight);
>  }
>
> +/*
> + * sched_entity:
> + *
> + *   load_sum := runnable_sum
> + *   load_avg = se_weight(se) * runnable_avg
> + *
> + * cfq_rs:
> + *
> + *   load_sum = \Sum se_weight(se) * se->avg.load_sum
> + *   load_avg = \Sum se->avg.load_avg
> + */
> +
>  static int
>  __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
>  {
> -       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
> +       if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
> +               ___update_load_avg(&se->avg, se_weight(se), NULL);
> +               return 1;
> +       }
> +
> +       return 0;
>  }
>
>  static int
>  __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -       return ___update_load_avg(now, cpu, &se->avg,
> -                                 se->on_rq * scale_load_down(se->load.weight),
> -                                 cfs_rq->curr == se, NULL);
> +       if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
> +                               cfs_rq->curr == se, NULL)) {
> +
> +               ___update_load_avg(&se->avg, se_weight(se), NULL);
> +               return 1;
> +       }
> +
> +       return 0;
>  }
>
>  static int
>  __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
>  {
> -       return ___update_load_avg(now, cpu, &cfs_rq->avg,
> -                       scale_load_down(cfs_rq->load.weight),
> -                       cfs_rq->curr != NULL, cfs_rq);
> +       if (___update_load_sum(now, cpu, &cfs_rq->avg,
> +                               scale_load_down(cfs_rq->load.weight),
> +                               cfs_rq->curr != NULL, cfs_rq)) {
> +               ___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
> +               return 1;
> +       }
> +
> +       return 0;
>  }
>
>  /*
> @@ -3110,7 +3151,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
>
>         /* Set new sched_entity's load */
>         se->avg.load_avg = load;
> -       se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
> +       se->avg.load_sum = LOAD_AVG_MAX;
>
>         /* Update parent cfs_rq load */
>         add_positive(&cfs_rq->avg.load_avg, delta);
> @@ -3340,7 +3381,7 @@ static void attach_entity_load_avg(struc
>  {
>         se->avg.last_update_time = cfs_rq->avg.last_update_time;
>         cfs_rq->avg.load_avg += se->avg.load_avg;
> -       cfs_rq->avg.load_sum += se->avg.load_sum;
> +       cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
>         cfs_rq->avg.util_avg += se->avg.util_avg;
>         cfs_rq->avg.util_sum += se->avg.util_sum;
>         set_tg_cfs_propagate(cfs_rq);
> @@ -3360,7 +3401,7 @@ static void detach_entity_load_avg(struc
>  {
>
>         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
> -       sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
> +       sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
>         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
>         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
>         set_tg_cfs_propagate(cfs_rq);
> @@ -3372,12 +3413,10 @@ static void detach_entity_load_avg(struc
>  static inline void
>  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -       struct sched_avg *sa = &se->avg;
> -
> -       cfs_rq->runnable_load_avg += sa->load_avg;
> -       cfs_rq->runnable_load_sum += sa->load_sum;
> +       cfs_rq->runnable_load_avg += se->avg.load_avg;
> +       cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
>
> -       if (!sa->last_update_time) {
> +       if (!se->avg.last_update_time) {
>                 attach_entity_load_avg(cfs_rq, se);
>                 update_tg_load_avg(cfs_rq, 0);
>         }
> @@ -3387,10 +3426,8 @@ enqueue_entity_load_avg(struct cfs_rq *c
>  static inline void
>  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -       cfs_rq->runnable_load_avg =
> -               max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
> -       cfs_rq->runnable_load_sum =
> -               max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
> +       sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
> +       sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
>  }
>
>  #ifndef CONFIG_64BIT
>
>