Message-ID: <CAPM31RLqU79ELpRn5Nq75sJ5oQVLoKUPDJ05eQYnh+wU6Z_PEg@mail.gmail.com>
Date:   Fri, 31 Mar 2017 03:55:40 -0700
From:   Paul Turner <pjt@...gle.com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     Yuyang Du <yuyang.du@...el.com>, Ingo Molnar <mingo@...nel.org>,
        LKML <linux-kernel@...r.kernel.org>,
        Benjamin Segall <bsegall@...gle.com>,
        Morten Rasmussen <morten.rasmussen@....com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Matt Fleming <matt@...eblueprint.co.uk>,
        Mike Galbraith <umgwanakikbuti@...il.com>
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Thu, Mar 30, 2017 at 7:14 AM, Peter Zijlstra <peterz@...radead.org> wrote:
> On Thu, Mar 30, 2017 at 02:16:58PM +0200, Peter Zijlstra wrote:
>> On Thu, Mar 30, 2017 at 04:21:08AM -0700, Paul Turner wrote:
>
>> > > +
>> > > +       if (unlikely(periods >= LOAD_AVG_MAX_N))
>> > >                 return LOAD_AVG_MAX;
>
>> >
>> > Is this correct in the iterated periods > LOAD_AVG_MAX_N case?
>> > I don't think the decay above is guaranteed to return these to zero.
>>
>> Ah!
>>
>> Indeed, so decay_load() needs LOAD_AVG_PERIOD * 63 before it truncates
>> to 0, because every LOAD_AVG_PERIOD we halve the value; lose 1 bit; so
>> 63 of those and we're 0.
>>
>> But __accumulate_sum() OTOH returns LOAD_AVG_MAX after only
>> LOAD_AVG_MAX_N periods, which is < LOAD_AVG_PERIOD * 63.
>>
>> So yes, combined we exceed LOAD_AVG_MAX, which is bad. Let me think what
>> to do about that.
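(For concreteness: with LOAD_AVG_PERIOD = 32 and LOAD_AVG_MAX_N = 345 as in
kernel/sched/fair.c, decay_load() only truncates to 0 after 32 * 63 = 2016
periods, while the sum already saturates at LOAD_AVG_MAX after 345 periods;
so for 345 <= periods < 2016 we add a still-nonzero decayed old sum on top
of LOAD_AVG_MAX and overshoot.)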
>
>
> So at the very least it should be decay_load(LOAD_AVG_MAX, 1) (aka
> LOAD_AVG_MAX - 1024), but that still doesn't account for the !0
> decay_load() of the first segment.
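(That shorthand follows because LOAD_AVG_MAX ~= 1024 \Sum_{n>=0} y^n, so:

   decay_load(LOAD_AVG_MAX, 1) ~= y * 1024 \Sum_{n>=0} y^n
                                = 1024 \Sum_{n>=1} y^n
                                = LOAD_AVG_MAX - 1024 * y^0
                                = LOAD_AVG_MAX - 1024 .)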
>
> I'm thinking we can compute the middle segment by taking the max
> value and chopping off the ends, like:
>

>
>              p
>  c2 = 1024 \Sum y^n
>             n=1
>
>               inf        inf
>     = 1024 ( \Sum y^n - \Sum y^n - y^0 )
>               n=0        n=p
>

Very nice!
Minor nit: the second sum needs to run from n=p+1.
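
Spelled out, with that fix and using LOAD_AVG_MAX ~= 1024 \Sum_{n>=0} y^n,
the middle segment collapses to:

               p
   c2 = 1024 \Sum y^n
              n=1

                inf         inf
      = 1024 ( \Sum y^n  -  \Sum y^n  -  y^0 )
                n=0         n=p+1

      = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, p + 1) - 1024

since 1024 \Sum_{n>=p+1} y^n = y^(p+1) * 1024 \Sum_{n>=0} y^n, which is just
decay_load(LOAD_AVG_MAX, p + 1).  That matches the +1 form noted below.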

>
> Which gives something like the below... Or am I completely off my rocker?
>
> ---
>  kernel/sched/fair.c | 70 ++++++++++++++---------------------------------------
>  1 file changed, 18 insertions(+), 52 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 76f67b3e34d6..4f17ec0a378a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2744,26 +2744,6 @@ static const u32 runnable_avg_yN_inv[] = {
>  };
>
>  /*
> - * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
> - * over-estimates when re-combining.
> - */
> -static const u32 runnable_avg_yN_sum[] = {
> -           0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
> -        9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
> -       17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
> -};
> -
> -/*
> - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
> - * lower integers. See Documentation/scheduler/sched-avg.txt how these
> - * were generated:
> - */
> -static const u32 __accumulated_sum_N32[] = {
> -           0, 23371, 35056, 40899, 43820, 45281,
> -       46011, 46376, 46559, 46650, 46696, 46719,
> -};
> -
> -/*
>   * Approximate:
>   *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
>   */
> @@ -2795,40 +2775,25 @@ static u64 decay_load(u64 val, u64 n)
>         return val;
>  }
>
> -static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
>  {
> -       u32 c1, c2, c3 = remainder; /* y^0 == 1 */
> -
> -       if (!periods)
> -               return remainder - period_contrib;
> -
> -       if (unlikely(periods >= LOAD_AVG_MAX_N))
> -               return LOAD_AVG_MAX;
> +       u32 c1, c2, c3 = d3; /* y^0 == 1 */
>
>         /*
>          * c1 = d1 y^(p+1)
>          */
> -       c1 = decay_load((u64)(1024 - period_contrib), periods);
> +       c1 = decay_load((u64)d1, periods);
>
> -       periods -= 1;
>         /*
> -        * For updates fully spanning n periods, the contribution to runnable
> -        * average will be:
> +        *             p
> +        * c2 = 1024 \Sum y^n
> +        *            n=1
>          *
> -        *   c2 = 1024 \Sum y^n
> -        *
> -        * We can compute this reasonably efficiently by combining:
> -        *
> -        *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
> +        *              inf        inf
> +        *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> +        *              n=0        n=p+1
>          */
> -       if (likely(periods <= LOAD_AVG_PERIOD)) {
> -               c2 = runnable_avg_yN_sum[periods];
> -       } else {
> -               c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
> -               periods %= LOAD_AVG_PERIOD;
> -               c2 = decay_load(c2, periods);
> -               c2 += runnable_avg_yN_sum[periods];
> -       }
> +       c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;

decay_load(LOAD_AVG_MAX, periods + 1)

I computed all the values that the old and new computations produce
against the true value, and the new one is very close.  In absolute
terms it's approximately 2x further off than the previous computation,
e.g. if the old value was -15 (relative to the true value) then the new
computation is -30.
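
For reference, a rough userspace sketch of the new-vs-true half of that
comparison (not kernel code: it models decay_load() with floating point
y = 0.5^(1/32) rather than the runnable_avg_yN_inv[] table, and assumes
LOAD_AVG_MAX = 47742):

#include <stdio.h>
#include <math.h>

#define LOAD_AVG_MAX	47742

static double y;	/* y = 0.5^(1/32), so y^32 == 1/2 */

static double decay(double val, int n)
{
	return val * pow(y, n);
}

int main(void)
{
	int p, n;

	y = pow(0.5, 1.0 / 32.0);

	for (p = 1; p <= 345; p++) {	/* up to LOAD_AVG_MAX_N */
		/* exact middle segment: 1024 * (y^1 + ... + y^p) */
		double exact = 0.0;

		for (n = 1; n <= p; n++)
			exact += 1024.0 * pow(y, n);

		/* closed form from the patch, with the +1 correction */
		double c2 = LOAD_AVG_MAX - decay(LOAD_AVG_MAX, p + 1) - 1024;

		printf("p=%3d exact=%8.1f c2=%8.1f err=%6.1f\n",
		       p, exact, c2, c2 - exact);
	}
	return 0;
}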

This is definitely more than good enough.  If we want more precision,
then adding the correction term:
  +clamp(periods, 0, 45)

makes it almost perfect across the board (and more accurate than the
prior calculation).  Usefully, it results in almost zero error in the
common cases of 0-25ms.
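
Roughly, combined with the +1 fix above, that would be something like
(untested; clamp_t() used to keep the u64/int types agreeable):

	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods + 1) - 1024 +
	     clamp_t(u32, periods, 0, 45);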

Want to fold this with the other patch above?

Reviewed-by: Paul Turner <pjt@...gle.com>

>
>         return c1 + c2 + c3;
>  }
> @@ -2861,8 +2826,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
>                unsigned long weight, int running, struct cfs_rq *cfs_rq)
>  {
>         unsigned long scale_freq, scale_cpu;
> +       u32 contrib = delta;
>         u64 periods;
> -       u32 contrib;
>
>         scale_freq = arch_scale_freq_capacity(NULL, cpu);
>         scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> @@ -2880,13 +2845,14 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
>                                 decay_load(cfs_rq->runnable_load_sum, periods);
>                 }
>                 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> -       }
>
> -       /*
> -        * Step 2
> -        */
> -       delta %= 1024;
> -       contrib = __accumulate_sum(periods, sa->period_contrib, delta);
> +               /*
> +                * Step 2
> +                */
> +               delta %= 1024;
> +               contrib = __accumulate_pelt_segments(periods,
> +                               1024 - sa->period_contrib, delta);
> +       }
>         sa->period_contrib = delta;
>
>         contrib = cap_scale(contrib, scale_freq);
