Date:   Tue, 12 Dec 2017 12:35:20 +0000
From:   Quentin Perret <quentin.perret@....com>
To:     Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...nel.org>,
        linux-kernel <linux-kernel@...r.kernel.org>,
        "rjw@...ysocki.net" <rjw@...ysocki.net>,
        Juri Lelli <juri.lelli@....com>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Morten Rasmussen <Morten.Rasmussen@....com>,
        viresh kumar <viresh.kumar@...aro.org>
Subject: Re: [PATCH v3 1/3] sched/pelt: Move pelt related code in a dedicated
 file

On Tuesday 12 Dec 2017 at 12:21:39 (+0100), Vincent Guittot wrote:
> Hi Quentin,
> 
> On 11 December 2017 at 15:08, Quentin Perret <quentin.perret@....com> wrote:
> > Hi Vincent,
> >
> > Although I agree that moving the PELT code into a dedicated file is
> > probably the cleanest way to achieve what you want, I was wondering if
> > you were able to measure any overhead due to moving the __update_load_avg_*()
> > functions to a different translation unit? This introduces function calls
> > in latency-sensitive paths (wakeup, for example), so I'm just wondering what
> > the impact is in practice.
> 
> To answer your question, I haven't seen any performance difference
> with perf bench sched pipe on a HiKey board with the patchset.
> Then, where do you think this will explicitly introduce a function
> call?

For example, without the patch, __update_load_avg_blocked_se() was static
and the compiler had the opportunity to inline it where appropriate. With
the patch, the function is now in a different translation unit so I think
an actual function call is always needed. I'm not saying it's a big deal,
I was just wondering if that might have a noticeable impact. 
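
As a quick sketch of what I mean (hypothetical names, not the actual PELT
symbols, and assuming no LTO):

/* pelt.c - built as its own object file */
int pelt_update(int delta)
{
        return delta + 1;       /* stand-in for the real accumulation */
}

/* fair.c - only sees the declaration, so the compiler has to emit a
 * real call on the enqueue/wakeup path; when the function was static
 * in fair.c it could simply inline the body. */
int pelt_update(int delta);

int enqueue_path(int delta)
{
        return pelt_update(delta);      /* cross translation unit call */
}

Whether that extra call is actually measurable is of course the question,
so the perf bench numbers are good to know.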

The only alternative I see is to put the rt rq load update functions in
fair.c, but that doesn't quite feel like the right thing to do anyway ...

> It's not obvious to me that the patch that moves the pelt code
> will generate such changes. size vmlinux gives me the exact same
> results with and without the patch that moves the pelt code.
> size vmlinux of latest tip/sched/core sha1 a555e9d86ee3
> $ size vmlinux
>    text    data     bss     dec     hex filename
> 10677259 5931388 402104 17010751 103903f vmlinux_orig
> 
> size vmlinux with patch 1 rebased on latest tip/sched/core
> $ size vmlinux
>    text    data     bss     dec     hex filename
> 10677259 5931388 402104 17010751 103903f vmlinux_patch_rt
> 
> >
> > Thanks,
> > Quentin
> >
> > On Wednesday 22 Nov 2017 at 15:35:53 (+0100), Vincent Guittot wrote:
> >> We want to track rt_rq's utilization as a part of the estimation of the
> >> whole rq's utilization. This is necessary because rt tasks can steal
> >> utilization from cfs tasks and make them look lighter than they are.
> >> As we want to use the same load tracking mechanism for both and prevent
> >> useless dependencies between cfs and rt code, the pelt code is moved into
> >> a dedicated file.
> >>
> >> Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> >> ---
> >>  kernel/sched/Makefile |   2 +-
> >>  kernel/sched/fair.c   | 308 +-------------------------------------------------
> >>  kernel/sched/pelt.c   | 308 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  kernel/sched/pelt.h   |  17 +++
> >>  kernel/sched/sched.h  |  20 ++++
> >>  5 files changed, 347 insertions(+), 308 deletions(-)
> >>  create mode 100644 kernel/sched/pelt.c
> >>  create mode 100644 kernel/sched/pelt.h
> >>
> >> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> >> index e2f9d4f..5a6d1c1 100644
> >> --- a/kernel/sched/Makefile
> >> +++ b/kernel/sched/Makefile
> >> @@ -19,7 +19,7 @@ endif
> >>  obj-y += core.o loadavg.o clock.o cputime.o
> >>  obj-y += idle_task.o fair.o rt.o deadline.o
> >>  obj-y += wait.o wait_bit.o swait.o completion.o idle.o
> >> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
> >> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
> >>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
> >>  obj-$(CONFIG_SCHEDSTATS) += stats.o
> >>  obj-$(CONFIG_SCHED_DEBUG) += debug.o
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 0989676..b88550e 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -270,9 +270,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> >>       return cfs_rq->rq;
> >>  }
> >>
> >> -/* An entity is a task if it doesn't "own" a runqueue */
> >> -#define entity_is_task(se)   (!se->my_q)
> >> -
> >>  static inline struct task_struct *task_of(struct sched_entity *se)
> >>  {
> >>       SCHED_WARN_ON(!entity_is_task(se));
> >> @@ -434,7 +431,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> >>       return container_of(cfs_rq, struct rq, cfs);
> >>  }
> >>
> >> -#define entity_is_task(se)   1
> >>
> >>  #define for_each_sched_entity(se) \
> >>               for (; se; se = NULL)
> >> @@ -707,7 +703,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>  }
> >>
> >>  #ifdef CONFIG_SMP
> >> -
> >> +#include "pelt.h"
> >>  #include "sched-pelt.h"
> >>
> >>  static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
> >> @@ -2723,19 +2719,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>  } while (0)
> >>
> >>  #ifdef CONFIG_SMP
> >> -/*
> >> - * XXX we want to get rid of these helpers and use the full load resolution.
> >> - */
> >> -static inline long se_weight(struct sched_entity *se)
> >> -{
> >> -     return scale_load_down(se->load.weight);
> >> -}
> >> -
> >> -static inline long se_runnable(struct sched_entity *se)
> >> -{
> >> -     return scale_load_down(se->runnable_weight);
> >> -}
> >> -
> >>  static inline void
> >>  enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >>  {
> >> @@ -3038,289 +3021,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
> >>  }
> >>
> >>  #ifdef CONFIG_SMP
> >> -/*
> >> - * Approximate:
> >> - *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> >> - */
> >> -static u64 decay_load(u64 val, u64 n)
> >> -{
> >> -     unsigned int local_n;
> >> -
> >> -     if (unlikely(n > LOAD_AVG_PERIOD * 63))
> >> -             return 0;
> >> -
> >> -     /* after bounds checking we can collapse to 32-bit */
> >> -     local_n = n;
> >> -
> >> -     /*
> >> -      * As y^PERIOD = 1/2, we can combine
> >> -      *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> >> -      * With a look-up table which covers y^n (n<PERIOD)
> >> -      *
> >> -      * To achieve constant time decay_load.
> >> -      */
> >> -     if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> >> -             val >>= local_n / LOAD_AVG_PERIOD;
> >> -             local_n %= LOAD_AVG_PERIOD;
> >> -     }
> >> -
> >> -     val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> >> -     return val;
> >> -}
> >> -
> >> -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> >> -{
> >> -     u32 c1, c2, c3 = d3; /* y^0 == 1 */
> >> -
> >> -     /*
> >> -      * c1 = d1 y^p
> >> -      */
> >> -     c1 = decay_load((u64)d1, periods);
> >> -
> >> -     /*
> >> -      *            p-1
> >> -      * c2 = 1024 \Sum y^n
> >> -      *            n=1
> >> -      *
> >> -      *              inf        inf
> >> -      *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >> -      *              n=0        n=p
> >> -      */
> >> -     c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> >> -
> >> -     return c1 + c2 + c3;
> >> -}
> >> -
> >> -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> >> -
> >> -/*
> >> - * Accumulate the three separate parts of the sum; d1 the remainder
> >> - * of the last (incomplete) period, d2 the span of full periods and d3
> >> - * the remainder of the (incomplete) current period.
> >> - *
> >> - *           d1          d2           d3
> >> - *           ^           ^            ^
> >> - *           |           |            |
> >> - *         |<->|<----------------->|<--->|
> >> - * ... |---x---|------| ... |------|-----x (now)
> >> - *
> >> - *                           p-1
> >> - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> >> - *                           n=1
> >> - *
> >> - *    = u y^p +                                      (Step 1)
> >> - *
> >> - *                     p-1
> >> - *      d1 y^p + 1024 \Sum y^n + d3 y^0              (Step 2)
> >> - *                     n=1
> >> - */
> >> -static __always_inline u32
> >> -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> >> -            unsigned long load, unsigned long runnable, int running)
> >> -{
> >> -     unsigned long scale_freq, scale_cpu;
> >> -     u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> >> -     u64 periods;
> >> -
> >> -     scale_freq = arch_scale_freq_capacity(NULL, cpu);
> >> -     scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> >> -
> >> -     delta += sa->period_contrib;
> >> -     periods = delta / 1024; /* A period is 1024us (~1ms) */
> >> -
> >> -     /*
> >> -      * Step 1: decay old *_sum if we crossed period boundaries.
> >> -      */
> >> -     if (periods) {
> >> -             sa->load_sum = decay_load(sa->load_sum, periods);
> >> -             sa->runnable_load_sum =
> >> -                     decay_load(sa->runnable_load_sum, periods);
> >> -             sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> >> -
> >> -             /*
> >> -              * Step 2
> >> -              */
> >> -             delta %= 1024;
> >> -             contrib = __accumulate_pelt_segments(periods,
> >> -                             1024 - sa->period_contrib, delta);
> >> -     }
> >> -     sa->period_contrib = delta;
> >> -
> >> -     contrib = cap_scale(contrib, scale_freq);
> >> -     if (load)
> >> -             sa->load_sum += load * contrib;
> >> -     if (runnable)
> >> -             sa->runnable_load_sum += runnable * contrib;
> >> -     if (running)
> >> -             sa->util_sum += contrib * scale_cpu;
> >> -
> >> -     return periods;
> >> -}
> >> -
> >> -/*
> >> - * We can represent the historical contribution to runnable average as the
> >> - * coefficients of a geometric series.  To do this we sub-divide our runnable
> >> - * history into segments of approximately 1ms (1024us); label the segment that
> >> - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> >> - *
> >> - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> >> - *      p0            p1           p2
> >> - *     (now)       (~1ms ago)  (~2ms ago)
> >> - *
> >> - * Let u_i denote the fraction of p_i that the entity was runnable.
> >> - *
> >> - * We then designate the fractions u_i as our co-efficients, yielding the
> >> - * following representation of historical load:
> >> - *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> >> - *
> >> - * We choose y based on the width of a reasonable scheduling period, fixing:
> >> - *   y^32 = 0.5
> >> - *
> >> - * This means that the contribution to load ~32ms ago (u_32) will be weighted
> >> - * approximately half as much as the contribution to load within the last ms
> >> - * (u_0).
> >> - *
> >> - * When a period "rolls over" and we have new u_0`, multiplying the previous
> >> - * sum again by y is sufficient to update:
> >> - *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> >> - *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> >> - */
> >> -static __always_inline int
> >> -___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> >> -               unsigned long load, unsigned long runnable, int running)
> >> -{
> >> -     u64 delta;
> >> -
> >> -     delta = now - sa->last_update_time;
> >> -     /*
> >> -      * This should only happen when time goes backwards, which it
> >> -      * unfortunately does during sched clock init when we swap over to TSC.
> >> -      */
> >> -     if ((s64)delta < 0) {
> >> -             sa->last_update_time = now;
> >> -             return 0;
> >> -     }
> >> -
> >> -     /*
> >> -      * Use 1024ns as the unit of measurement since it's a reasonable
> >> -      * approximation of 1us and fast to compute.
> >> -      */
> >> -     delta >>= 10;
> >> -     if (!delta)
> >> -             return 0;
> >> -
> >> -     sa->last_update_time += delta << 10;
> >> -
> >> -     /*
> >> -      * running is a subset of runnable (weight) so running can't be set if
> >> -      * runnable is clear. But there are some corner cases where the current
> >> -      * se has been already dequeued but cfs_rq->curr still points to it.
> >> -      * This means that weight will be 0 but not running for a sched_entity
> >> -      * but also for a cfs_rq if the latter becomes idle. As an example,
> >> -      * this happens during idle_balance() which calls
> >> -      * update_blocked_averages()
> >> -      */
> >> -     if (!load)
> >> -             runnable = running = 0;
> >> -
> >> -     /*
> >> -      * Now we know we crossed measurement unit boundaries. The *_avg
> >> -      * accrues by two steps:
> >> -      *
> >> -      * Step 1: accumulate *_sum since last_update_time. If we haven't
> >> -      * crossed period boundaries, finish.
> >> -      */
> >> -     if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> >> -             return 0;
> >> -
> >> -     return 1;
> >> -}
> >> -
> >> -static __always_inline void
> >> -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> >> -{
> >> -     u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> >> -
> >> -     /*
> >> -      * Step 2: update *_avg.
> >> -      */
> >> -     sa->load_avg = div_u64(load * sa->load_sum, divider);
> >> -     sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
> >> -     sa->util_avg = sa->util_sum / divider;
> >> -}
> >> -
> >> -/*
> >> - * sched_entity:
> >> - *
> >> - *   task:
> >> - *     se_runnable() == se_weight()
> >> - *
> >> - *   group: [ see update_cfs_group() ]
> >> - *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
> >> - *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> >> - *
> >> - *   load_sum := runnable_sum
> >> - *   load_avg = se_weight(se) * runnable_avg
> >> - *
> >> - *   runnable_load_sum := runnable_sum
> >> - *   runnable_load_avg = se_runnable(se) * runnable_avg
> >> - *
> >> - * XXX collapse load_sum and runnable_load_sum
> >> - *
> >> - * cfs_rq:
> >> - *
> >> - *   load_sum = \Sum se_weight(se) * se->avg.load_sum
> >> - *   load_avg = \Sum se->avg.load_avg
> >> - *
> >> - *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> >> - *   runnable_load_avg = \Sum se->avg.runnable_load_avg
> >> - */
> >> -
> >> -static int
> >> -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> >> -{
> >> -     if (entity_is_task(se))
> >> -             se->runnable_weight = se->load.weight;
> >> -
> >> -     if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> >> -             ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> -             return 1;
> >> -     }
> >> -
> >> -     return 0;
> >> -}
> >> -
> >> -static int
> >> -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> -{
> >> -     if (entity_is_task(se))
> >> -             se->runnable_weight = se->load.weight;
> >> -
> >> -     if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> >> -                             cfs_rq->curr == se)) {
> >> -
> >> -             ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> -             return 1;
> >> -     }
> >> -
> >> -     return 0;
> >> -}
> >> -
> >> -static int
> >> -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> >> -{
> >> -     if (___update_load_sum(now, cpu, &cfs_rq->avg,
> >> -                             scale_load_down(cfs_rq->load.weight),
> >> -                             scale_load_down(cfs_rq->runnable_weight),
> >> -                             cfs_rq->curr != NULL)) {
> >> -
> >> -             ___update_load_avg(&cfs_rq->avg, 1, 1);
> >> -             return 1;
> >> -     }
> >> -
> >> -     return 0;
> >> -}
> >> -
> >>  #ifdef CONFIG_FAIR_GROUP_SCHED
> >>  /**
> >>   * update_tg_load_avg - update the tg's load avg
> >> @@ -3831,12 +3531,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
> >>
> >>  #else /* CONFIG_SMP */
> >>
> >> -static inline int
> >> -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> >> -{
> >> -     return 0;
> >> -}
> >> -
> >>  #define UPDATE_TG    0x0
> >>  #define SKIP_AGE_LOAD        0x0
> >>  #define DO_ATTACH    0x0
> >> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> >> new file mode 100644
> >> index 0000000..da6d84f
> >> --- /dev/null
> >> +++ b/kernel/sched/pelt.c
> >> @@ -0,0 +1,308 @@
> >> +/*
> >> + * Per Entity Load Tracking
> >> + *
> >> + *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@...hat.com>
> >> + *
> >> + *  Interactivity improvements by Mike Galbraith
> >> + *  (C) 2007 Mike Galbraith <efault@....de>
> >> + *
> >> + *  Various enhancements by Dmitry Adamushko.
> >> + *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@...il.com>
> >> + *
> >> + *  Group scheduling enhancements by Srivatsa Vaddagiri
> >> + *  Copyright IBM Corporation, 2007
> >> + *  Author: Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
> >> + *
> >> + *  Scaled math optimizations by Thomas Gleixner
> >> + *  Copyright (C) 2007, Thomas Gleixner <tglx@...utronix.de>
> >> + *
> >> + *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
> >> + *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
> >> + *
> >> + *  Move PELT related code from fair.c into this pelt.c file
> >> + *  Author: Vincent Guittot <vincent.guittot@...aro.org>
> >> + */
> >> +
> >> +#include <linux/sched.h>
> >> +#include "sched.h"
> >> +#include "sched-pelt.h"
> >> +
> >> +/*
> >> + * Approximate:
> >> + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> >> + */
> >> +static u64 decay_load(u64 val, u64 n)
> >> +{
> >> +     unsigned int local_n;
> >> +
> >> +     if (unlikely(n > LOAD_AVG_PERIOD * 63))
> >> +             return 0;
> >> +
> >> +     /* after bounds checking we can collapse to 32-bit */
> >> +     local_n = n;
> >> +
> >> +     /*
> >> +      * As y^PERIOD = 1/2, we can combine
> >> +      *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> >> +      * With a look-up table which covers y^n (n<PERIOD)
> >> +      *
> >> +      * To achieve constant time decay_load.
> >> +      */
> >> +     if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> >> +             val >>= local_n / LOAD_AVG_PERIOD;
> >> +             local_n %= LOAD_AVG_PERIOD;
> >> +     }
> >> +
> >> +     val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> >> +     return val;
> >> +}
> >> +
> >> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> >> +{
> >> +     u32 c1, c2, c3 = d3; /* y^0 == 1 */
> >> +
> >> +     /*
> >> +      * c1 = d1 y^p
> >> +      */
> >> +     c1 = decay_load((u64)d1, periods);
> >> +
> >> +     /*
> >> +      *            p-1
> >> +      * c2 = 1024 \Sum y^n
> >> +      *            n=1
> >> +      *
> >> +      *              inf        inf
> >> +      *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> >> +      *              n=0        n=p
> >> +      */
> >> +     c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> >> +
> >> +     return c1 + c2 + c3;
> >> +}
> >> +
> >> +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> >> +
> >> +/*
> >> + * Accumulate the three separate parts of the sum; d1 the remainder
> >> + * of the last (incomplete) period, d2 the span of full periods and d3
> >> + * the remainder of the (incomplete) current period.
> >> + *
> >> + *           d1          d2           d3
> >> + *           ^           ^            ^
> >> + *           |           |            |
> >> + *         |<->|<----------------->|<--->|
> >> + * ... |---x---|------| ... |------|-----x (now)
> >> + *
> >> + *                           p-1
> >> + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> >> + *                           n=1
> >> + *
> >> + *    = u y^p +                                      (Step 1)
> >> + *
> >> + *                     p-1
> >> + *      d1 y^p + 1024 \Sum y^n + d3 y^0              (Step 2)
> >> + *                     n=1
> >> + */
> >> +static __always_inline u32
> >> +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> >> +            unsigned long load, unsigned long runnable, int running)
> >> +{
> >> +     unsigned long scale_freq, scale_cpu;
> >> +     u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> >> +     u64 periods;
> >> +
> >> +     scale_freq = arch_scale_freq_capacity(NULL, cpu);
> >> +     scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> >> +
> >> +     delta += sa->period_contrib;
> >> +     periods = delta / 1024; /* A period is 1024us (~1ms) */
> >> +
> >> +     /*
> >> +      * Step 1: decay old *_sum if we crossed period boundaries.
> >> +      */
> >> +     if (periods) {
> >> +             sa->load_sum = decay_load(sa->load_sum, periods);
> >> +             sa->runnable_load_sum =
> >> +                     decay_load(sa->runnable_load_sum, periods);
> >> +             sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> >> +
> >> +             /*
> >> +              * Step 2
> >> +              */
> >> +             delta %= 1024;
> >> +             contrib = __accumulate_pelt_segments(periods,
> >> +                             1024 - sa->period_contrib, delta);
> >> +     }
> >> +     sa->period_contrib = delta;
> >> +
> >> +     contrib = cap_scale(contrib, scale_freq);
> >> +     if (load)
> >> +             sa->load_sum += load * contrib;
> >> +     if (runnable)
> >> +             sa->runnable_load_sum += runnable * contrib;
> >> +     if (running)
> >> +             sa->util_sum += contrib * scale_cpu;
> >> +
> >> +     return periods;
> >> +}
> >> +
> >> +/*
> >> + * We can represent the historical contribution to runnable average as the
> >> + * coefficients of a geometric series.  To do this we sub-divide our runnable
> >> + * history into segments of approximately 1ms (1024us); label the segment that
> >> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> >> + *
> >> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> >> + *      p0            p1           p2
> >> + *     (now)       (~1ms ago)  (~2ms ago)
> >> + *
> >> + * Let u_i denote the fraction of p_i that the entity was runnable.
> >> + *
> >> + * We then designate the fractions u_i as our co-efficients, yielding the
> >> + * following representation of historical load:
> >> + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> >> + *
> >> + * We choose y based on the width of a reasonable scheduling period, fixing:
> >> + *   y^32 = 0.5
> >> + *
> >> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> >> + * approximately half as much as the contribution to load within the last ms
> >> + * (u_0).
> >> + *
> >> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> >> + * sum again by y is sufficient to update:
> >> + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> >> + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> >> + */
> >> +static __always_inline int
> >> +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> >> +               unsigned long load, unsigned long runnable, int running)
> >> +{
> >> +     u64 delta;
> >> +
> >> +     delta = now - sa->last_update_time;
> >> +     /*
> >> +      * This should only happen when time goes backwards, which it
> >> +      * unfortunately does during sched clock init when we swap over to TSC.
> >> +      */
> >> +     if ((s64)delta < 0) {
> >> +             sa->last_update_time = now;
> >> +             return 0;
> >> +     }
> >> +
> >> +     /*
> >> +      * Use 1024ns as the unit of measurement since it's a reasonable
> >> +      * approximation of 1us and fast to compute.
> >> +      */
> >> +     delta >>= 10;
> >> +     if (!delta)
> >> +             return 0;
> >> +
> >> +     sa->last_update_time += delta << 10;
> >> +
> >> +     /*
> >> +      * running is a subset of runnable (weight) so running can't be set if
> >> +      * runnable is clear. But there are some corner cases where the current
> >> +      * se has been already dequeued but cfs_rq->curr still points to it.
> >> +      * This means that weight will be 0 but not running for a sched_entity
> >> +      * but also for a cfs_rq if the latter becomes idle. As an example,
> >> +      * this happens during idle_balance() which calls
> >> +      * update_blocked_averages()
> >> +      */
> >> +     if (!load)
> >> +             runnable = running = 0;
> >> +
> >> +     /*
> >> +      * Now we know we crossed measurement unit boundaries. The *_avg
> >> +      * accrues by two steps:
> >> +      *
> >> +      * Step 1: accumulate *_sum since last_update_time. If we haven't
> >> +      * crossed period boundaries, finish.
> >> +      */
> >> +     if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> >> +             return 0;
> >> +
> >> +     return 1;
> >> +}
> >> +
> >> +static __always_inline void
> >> +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> >> +{
> >> +     u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> >> +
> >> +     /*
> >> +      * Step 2: update *_avg.
> >> +      */
> >> +     sa->load_avg = div_u64(load * sa->load_sum, divider);
> >> +     sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
> >> +     sa->util_avg = sa->util_sum / divider;
> >> +}
> >> +
> >> +/*
> >> + * sched_entity:
> >> + *
> >> + *   task:
> >> + *     se_runnable() == se_weight()
> >> + *
> >> + *   group: [ see update_cfs_group() ]
> >> + *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
> >> + *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> >> + *
> >> + *   load_sum := runnable_sum
> >> + *   load_avg = se_weight(se) * runnable_avg
> >> + *
> >> + *   runnable_load_sum := runnable_sum
> >> + *   runnable_load_avg = se_runnable(se) * runnable_avg
> >> + *
> >> + * XXX collapse load_sum and runnable_load_sum
> >> + *
> >> + * cfs_rq:
> >> + *
> >> + *   load_sum = \Sum se_weight(se) * se->avg.load_sum
> >> + *   load_avg = \Sum se->avg.load_avg
> >> + *
> >> + *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> >> + *   runnable_load_avg = \Sum se->avg.runnable_load_avg
> >> + */
> >> +
> >> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> >> +{
> >> +     if (entity_is_task(se))
> >> +             se->runnable_weight = se->load.weight;
> >> +
> >> +     if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> >> +             ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> +             return 1;
> >> +     }
> >> +
> >> +     return 0;
> >> +}
> >> +
> >> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> >> +{
> >> +     if (entity_is_task(se))
> >> +             se->runnable_weight = se->load.weight;
> >> +
> >> +     if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> >> +                             cfs_rq->curr == se)) {
> >> +
> >> +             ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> >> +             return 1;
> >> +     }
> >> +
> >> +     return 0;
> >> +}
> >> +
> >> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> >> +{
> >> +     if (___update_load_sum(now, cpu, &cfs_rq->avg,
> >> +                             scale_load_down(cfs_rq->load.weight),
> >> +                             scale_load_down(cfs_rq->runnable_weight),
> >> +                             cfs_rq->curr != NULL)) {
> >> +
> >> +             ___update_load_avg(&cfs_rq->avg, 1, 1);
> >> +             return 1;
> >> +     }
> >> +
> >> +     return 0;
> >> +}
> >> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> >> new file mode 100644
> >> index 0000000..c312d8c
> >> --- /dev/null
> >> +++ b/kernel/sched/pelt.h
> >> @@ -0,0 +1,17 @@
> >> +#ifdef CONFIG_SMP
> >> +
> >> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
> >> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
> >> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
> >> +
> >> +#else
> >> +
> >> +static inline int
> >> +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> >> +{
> >> +     return 0;
> >> +}
> >> +
> >> +#endif
> >> +
> >> +
> >> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> >> index 45ab0bf..6fefef6 100644
> >> --- a/kernel/sched/sched.h
> >> +++ b/kernel/sched/sched.h
> >> @@ -528,6 +528,7 @@ struct rt_rq {
> >>       unsigned long rt_nr_total;
> >>       int overloaded;
> >>       struct plist_head pushable_tasks;
> >> +
> >>  #endif /* CONFIG_SMP */
> >>       int rt_queued;
> >>
> >> @@ -602,7 +603,26 @@ struct dl_rq {
> >>       u64 bw_ratio;
> >>  };
> >>
> >> +#ifdef CONFIG_FAIR_GROUP_SCHED
> >> +/* An entity is a task if it doesn't "own" a runqueue */
> >> +#define entity_is_task(se)   (!se->my_q)
> >> +#else
> >> +#define entity_is_task(se)   1
> >> +#endif
> >> +
> >>  #ifdef CONFIG_SMP
> >> +/*
> >> + * XXX we want to get rid of these helpers and use the full load resolution.
> >> + */
> >> +static inline long se_weight(struct sched_entity *se)
> >> +{
> >> +     return scale_load_down(se->load.weight);
> >> +}
> >> +
> >> +static inline long se_runnable(struct sched_entity *se)
> >> +{
> >> +     return scale_load_down(se->runnable_weight);
> >> +}
> >>
> >>  static inline bool sched_asym_prefer(int a, int b)
> >>  {
> >> --
> >> 2.7.4
> >>
