linux-kernel - Re: [PATCH v3 1/3] sched/pelt: Move pelt related code in a dedicated file

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20171211140838.GA30462@e108498-lin.cambridge.arm.com>
Date:   Mon, 11 Dec 2017 14:08:39 +0000
From:   Quentin Perret <quentin.perret@....com>
To:     Vincent Guittot <vincent.guittot@...aro.org>
Cc:     peterz@...radead.org, mingo@...nel.org,
        linux-kernel@...r.kernel.org, rjw@...ysocki.net,
        juri.lelli@....com, dietmar.eggemann@....com,
        Morten.Rasmussen@....com, viresh.kumar@...aro.org
Subject: Re: [PATCH v3 1/3] sched/pelt: Move pelt related code in a dedicated
 file

Hi Vincent,

Although I agree that moving the PELT code in a dedicated file is
probably the cleanest way to achieve what you want, I was wondering if
you were able no measure any overhead due to moving the __update_load_avg_*()
functions in a different translation unit ? This is introducing function calls
in latency sensitive paths (wakeup for ex) so I'm just wondering what's
the impact in practice.

Thanks,
Quentin

On Wednesday 22 Nov 2017 at 15:35:53 (+0100), Vincent Guittot wrote:
> We want to track rt_rq's utilization as a part of the estimation of the
> whole rq's utilization. This is necessary because rt tasks can steal
> utilization to cfs tasks and make them lighter than they are.
> As we want to use the same load tracking mecanism for both and prevent
> useless dependency between cfs and rt code, pelt code is moved in a
> dedicated file.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> ---
>  kernel/sched/Makefile |   2 +-
>  kernel/sched/fair.c   | 308 +-------------------------------------------------
>  kernel/sched/pelt.c   | 308 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/pelt.h   |  17 +++
>  kernel/sched/sched.h  |  20 ++++
>  5 files changed, 347 insertions(+), 308 deletions(-)
>  create mode 100644 kernel/sched/pelt.c
>  create mode 100644 kernel/sched/pelt.h
> 
> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> index e2f9d4f..5a6d1c1 100644
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -19,7 +19,7 @@ endif
>  obj-y += core.o loadavg.o clock.o cputime.o
>  obj-y += idle_task.o fair.o rt.o deadline.o
>  obj-y += wait.o wait_bit.o swait.o completion.o idle.o
> -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
> +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
>  obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
>  obj-$(CONFIG_SCHEDSTATS) += stats.o
>  obj-$(CONFIG_SCHED_DEBUG) += debug.o
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0989676..b88550e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -270,9 +270,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
>  	return cfs_rq->rq;
>  }
>  
> -/* An entity is a task if it doesn't "own" a runqueue */
> -#define entity_is_task(se)	(!se->my_q)
> -
>  static inline struct task_struct *task_of(struct sched_entity *se)
>  {
>  	SCHED_WARN_ON(!entity_is_task(se));
> @@ -434,7 +431,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
>  	return container_of(cfs_rq, struct rq, cfs);
>  }
>  
> -#define entity_is_task(se)	1
>  
>  #define for_each_sched_entity(se) \
>  		for (; se; se = NULL)
> @@ -707,7 +703,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  }
>  
>  #ifdef CONFIG_SMP
> -
> +#include "pelt.h"
>  #include "sched-pelt.h"
>  
>  static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
> @@ -2723,19 +2719,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  } while (0)
>  
>  #ifdef CONFIG_SMP
> -/*
> - * XXX we want to get rid of these helpers and use the full load resolution.
> - */
> -static inline long se_weight(struct sched_entity *se)
> -{
> -	return scale_load_down(se->load.weight);
> -}
> -
> -static inline long se_runnable(struct sched_entity *se)
> -{
> -	return scale_load_down(se->runnable_weight);
> -}
> -
>  static inline void
>  enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> @@ -3038,289 +3021,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
>  }
>  
>  #ifdef CONFIG_SMP
> -/*
> - * Approximate:
> - *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> - */
> -static u64 decay_load(u64 val, u64 n)
> -{
> -	unsigned int local_n;
> -
> -	if (unlikely(n > LOAD_AVG_PERIOD * 63))
> -		return 0;
> -
> -	/* after bounds checking we can collapse to 32-bit */
> -	local_n = n;
> -
> -	/*
> -	 * As y^PERIOD = 1/2, we can combine
> -	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> -	 * With a look-up table which covers y^n (n<PERIOD)
> -	 *
> -	 * To achieve constant time decay_load.
> -	 */
> -	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> -		val >>= local_n / LOAD_AVG_PERIOD;
> -		local_n %= LOAD_AVG_PERIOD;
> -	}
> -
> -	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> -	return val;
> -}
> -
> -static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> -{
> -	u32 c1, c2, c3 = d3; /* y^0 == 1 */
> -
> -	/*
> -	 * c1 = d1 y^p
> -	 */
> -	c1 = decay_load((u64)d1, periods);
> -
> -	/*
> -	 *            p-1
> -	 * c2 = 1024 \Sum y^n
> -	 *            n=1
> -	 *
> -	 *              inf        inf
> -	 *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> -	 *              n=0        n=p
> -	 */
> -	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> -
> -	return c1 + c2 + c3;
> -}
> -
> -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> -
> -/*
> - * Accumulate the three separate parts of the sum; d1 the remainder
> - * of the last (incomplete) period, d2 the span of full periods and d3
> - * the remainder of the (incomplete) current period.
> - *
> - *           d1          d2           d3
> - *           ^           ^            ^
> - *           |           |            |
> - *         |<->|<----------------->|<--->|
> - * ... |---x---|------| ... |------|-----x (now)
> - *
> - *                           p-1
> - * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> - *                           n=1
> - *
> - *    = u y^p +					(Step 1)
> - *
> - *                     p-1
> - *      d1 y^p + 1024 \Sum y^n + d3 y^0		(Step 2)
> - *                     n=1
> - */
> -static __always_inline u32
> -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> -	       unsigned long load, unsigned long runnable, int running)
> -{
> -	unsigned long scale_freq, scale_cpu;
> -	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> -	u64 periods;
> -
> -	scale_freq = arch_scale_freq_capacity(NULL, cpu);
> -	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> -
> -	delta += sa->period_contrib;
> -	periods = delta / 1024; /* A period is 1024us (~1ms) */
> -
> -	/*
> -	 * Step 1: decay old *_sum if we crossed period boundaries.
> -	 */
> -	if (periods) {
> -		sa->load_sum = decay_load(sa->load_sum, periods);
> -		sa->runnable_load_sum =
> -			decay_load(sa->runnable_load_sum, periods);
> -		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> -
> -		/*
> -		 * Step 2
> -		 */
> -		delta %= 1024;
> -		contrib = __accumulate_pelt_segments(periods,
> -				1024 - sa->period_contrib, delta);
> -	}
> -	sa->period_contrib = delta;
> -
> -	contrib = cap_scale(contrib, scale_freq);
> -	if (load)
> -		sa->load_sum += load * contrib;
> -	if (runnable)
> -		sa->runnable_load_sum += runnable * contrib;
> -	if (running)
> -		sa->util_sum += contrib * scale_cpu;
> -
> -	return periods;
> -}
> -
> -/*
> - * We can represent the historical contribution to runnable average as the
> - * coefficients of a geometric series.  To do this we sub-divide our runnable
> - * history into segments of approximately 1ms (1024us); label the segment that
> - * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> - *
> - * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> - *      p0            p1           p2
> - *     (now)       (~1ms ago)  (~2ms ago)
> - *
> - * Let u_i denote the fraction of p_i that the entity was runnable.
> - *
> - * We then designate the fractions u_i as our co-efficients, yielding the
> - * following representation of historical load:
> - *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> - *
> - * We choose y based on the with of a reasonably scheduling period, fixing:
> - *   y^32 = 0.5
> - *
> - * This means that the contribution to load ~32ms ago (u_32) will be weighted
> - * approximately half as much as the contribution to load within the last ms
> - * (u_0).
> - *
> - * When a period "rolls over" and we have new u_0`, multiplying the previous
> - * sum again by y is sufficient to update:
> - *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> - *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> - */
> -static __always_inline int
> -___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> -		  unsigned long load, unsigned long runnable, int running)
> -{
> -	u64 delta;
> -
> -	delta = now - sa->last_update_time;
> -	/*
> -	 * This should only happen when time goes backwards, which it
> -	 * unfortunately does during sched clock init when we swap over to TSC.
> -	 */
> -	if ((s64)delta < 0) {
> -		sa->last_update_time = now;
> -		return 0;
> -	}
> -
> -	/*
> -	 * Use 1024ns as the unit of measurement since it's a reasonable
> -	 * approximation of 1us and fast to compute.
> -	 */
> -	delta >>= 10;
> -	if (!delta)
> -		return 0;
> -
> -	sa->last_update_time += delta << 10;
> -
> -	/*
> -	 * running is a subset of runnable (weight) so running can't be set if
> -	 * runnable is clear. But there are some corner cases where the current
> -	 * se has been already dequeued but cfs_rq->curr still points to it.
> -	 * This means that weight will be 0 but not running for a sched_entity
> -	 * but also for a cfs_rq if the latter becomes idle. As an example,
> -	 * this happens during idle_balance() which calls
> -	 * update_blocked_averages()
> -	 */
> -	if (!load)
> -		runnable = running = 0;
> -
> -	/*
> -	 * Now we know we crossed measurement unit boundaries. The *_avg
> -	 * accrues by two steps:
> -	 *
> -	 * Step 1: accumulate *_sum since last_update_time. If we haven't
> -	 * crossed period boundaries, finish.
> -	 */
> -	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> -		return 0;
> -
> -	return 1;
> -}
> -
> -static __always_inline void
> -___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> -{
> -	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> -
> -	/*
> -	 * Step 2: update *_avg.
> -	 */
> -	sa->load_avg = div_u64(load * sa->load_sum, divider);
> -	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider);
> -	sa->util_avg = sa->util_sum / divider;
> -}
> -
> -/*
> - * sched_entity:
> - *
> - *   task:
> - *     se_runnable() == se_weight()
> - *
> - *   group: [ see update_cfs_group() ]
> - *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
> - *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> - *
> - *   load_sum := runnable_sum
> - *   load_avg = se_weight(se) * runnable_avg
> - *
> - *   runnable_load_sum := runnable_sum
> - *   runnable_load_avg = se_runnable(se) * runnable_avg
> - *
> - * XXX collapse load_sum and runnable_load_sum
> - *
> - * cfq_rs:
> - *
> - *   load_sum = \Sum se_weight(se) * se->avg.load_sum
> - *   load_avg = \Sum se->avg.load_avg
> - *
> - *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> - *   runnable_load_avg = \Sum se->avg.runable_load_avg
> - */
> -
> -static int
> -__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> -{
> -	if (entity_is_task(se))
> -		se->runnable_weight = se->load.weight;
> -
> -	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> -		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> -		return 1;
> -	}
> -
> -	return 0;
> -}
> -
> -static int
> -__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> -{
> -	if (entity_is_task(se))
> -		se->runnable_weight = se->load.weight;
> -
> -	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> -				cfs_rq->curr == se)) {
> -
> -		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> -		return 1;
> -	}
> -
> -	return 0;
> -}
> -
> -static int
> -__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> -{
> -	if (___update_load_sum(now, cpu, &cfs_rq->avg,
> -				scale_load_down(cfs_rq->load.weight),
> -				scale_load_down(cfs_rq->runnable_weight),
> -				cfs_rq->curr != NULL)) {
> -
> -		___update_load_avg(&cfs_rq->avg, 1, 1);
> -		return 1;
> -	}
> -
> -	return 0;
> -}
> -
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  /**
>   * update_tg_load_avg - update the tg's load avg
> @@ -3831,12 +3531,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
>  
>  #else /* CONFIG_SMP */
>  
> -static inline int
> -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> -{
> -	return 0;
> -}
> -
>  #define UPDATE_TG	0x0
>  #define SKIP_AGE_LOAD	0x0
>  #define DO_ATTACH	0x0
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> new file mode 100644
> index 0000000..da6d84f
> --- /dev/null
> +++ b/kernel/sched/pelt.c
> @@ -0,0 +1,308 @@
> +/*
> + * Per Entity Load Tracking
> + *
> + *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@...hat.com>
> + *
> + *  Interactivity improvements by Mike Galbraith
> + *  (C) 2007 Mike Galbraith <efault@....de>
> + *
> + *  Various enhancements by Dmitry Adamushko.
> + *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@...il.com>
> + *
> + *  Group scheduling enhancements by Srivatsa Vaddagiri
> + *  Copyright IBM Corporation, 2007
> + *  Author: Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
> + *
> + *  Scaled math optimizations by Thomas Gleixner
> + *  Copyright (C) 2007, Thomas Gleixner <tglx@...utronix.de>
> + *
> + *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
> + *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
> + *
> + *  Move PELT related code from fair.c into this pelt.c file
> + *  Author: Vincent Guittot <vincent.guittot@...aro.org>
> + */
> +
> +#include <linux/sched.h>
> +#include "sched.h"
> +#include "sched-pelt.h"
> +
> +/*
> + * Approximate:
> + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
> + */
> +static u64 decay_load(u64 val, u64 n)
> +{
> +	unsigned int local_n;
> +
> +	if (unlikely(n > LOAD_AVG_PERIOD * 63))
> +		return 0;
> +
> +	/* after bounds checking we can collapse to 32-bit */
> +	local_n = n;
> +
> +	/*
> +	 * As y^PERIOD = 1/2, we can combine
> +	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
> +	 * With a look-up table which covers y^n (n<PERIOD)
> +	 *
> +	 * To achieve constant time decay_load.
> +	 */
> +	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
> +		val >>= local_n / LOAD_AVG_PERIOD;
> +		local_n %= LOAD_AVG_PERIOD;
> +	}
> +
> +	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
> +	return val;
> +}
> +
> +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
> +{
> +	u32 c1, c2, c3 = d3; /* y^0 == 1 */
> +
> +	/*
> +	 * c1 = d1 y^p
> +	 */
> +	c1 = decay_load((u64)d1, periods);
> +
> +	/*
> +	 *            p-1
> +	 * c2 = 1024 \Sum y^n
> +	 *            n=1
> +	 *
> +	 *              inf        inf
> +	 *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
> +	 *              n=0        n=p
> +	 */
> +	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
> +
> +	return c1 + c2 + c3;
> +}
> +
> +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
> +
> +/*
> + * Accumulate the three separate parts of the sum; d1 the remainder
> + * of the last (incomplete) period, d2 the span of full periods and d3
> + * the remainder of the (incomplete) current period.
> + *
> + *           d1          d2           d3
> + *           ^           ^            ^
> + *           |           |            |
> + *         |<->|<----------------->|<--->|
> + * ... |---x---|------| ... |------|-----x (now)
> + *
> + *                           p-1
> + * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
> + *                           n=1
> + *
> + *    = u y^p +					(Step 1)
> + *
> + *                     p-1
> + *      d1 y^p + 1024 \Sum y^n + d3 y^0		(Step 2)
> + *                     n=1
> + */
> +static __always_inline u32
> +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
> +	       unsigned long load, unsigned long runnable, int running)
> +{
> +	unsigned long scale_freq, scale_cpu;
> +	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
> +	u64 periods;
> +
> +	scale_freq = arch_scale_freq_capacity(NULL, cpu);
> +	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
> +
> +	delta += sa->period_contrib;
> +	periods = delta / 1024; /* A period is 1024us (~1ms) */
> +
> +	/*
> +	 * Step 1: decay old *_sum if we crossed period boundaries.
> +	 */
> +	if (periods) {
> +		sa->load_sum = decay_load(sa->load_sum, periods);
> +		sa->runnable_load_sum =
> +			decay_load(sa->runnable_load_sum, periods);
> +		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
> +
> +		/*
> +		 * Step 2
> +		 */
> +		delta %= 1024;
> +		contrib = __accumulate_pelt_segments(periods,
> +				1024 - sa->period_contrib, delta);
> +	}
> +	sa->period_contrib = delta;
> +
> +	contrib = cap_scale(contrib, scale_freq);
> +	if (load)
> +		sa->load_sum += load * contrib;
> +	if (runnable)
> +		sa->runnable_load_sum += runnable * contrib;
> +	if (running)
> +		sa->util_sum += contrib * scale_cpu;
> +
> +	return periods;
> +}
> +
> +/*
> + * We can represent the historical contribution to runnable average as the
> + * coefficients of a geometric series.  To do this we sub-divide our runnable
> + * history into segments of approximately 1ms (1024us); label the segment that
> + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
> + *
> + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
> + *      p0            p1           p2
> + *     (now)       (~1ms ago)  (~2ms ago)
> + *
> + * Let u_i denote the fraction of p_i that the entity was runnable.
> + *
> + * We then designate the fractions u_i as our co-efficients, yielding the
> + * following representation of historical load:
> + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
> + *
> + * We choose y based on the with of a reasonably scheduling period, fixing:
> + *   y^32 = 0.5
> + *
> + * This means that the contribution to load ~32ms ago (u_32) will be weighted
> + * approximately half as much as the contribution to load within the last ms
> + * (u_0).
> + *
> + * When a period "rolls over" and we have new u_0`, multiplying the previous
> + * sum again by y is sufficient to update:
> + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
> + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
> + */
> +static __always_inline int
> +___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
> +		  unsigned long load, unsigned long runnable, int running)
> +{
> +	u64 delta;
> +
> +	delta = now - sa->last_update_time;
> +	/*
> +	 * This should only happen when time goes backwards, which it
> +	 * unfortunately does during sched clock init when we swap over to TSC.
> +	 */
> +	if ((s64)delta < 0) {
> +		sa->last_update_time = now;
> +		return 0;
> +	}
> +
> +	/*
> +	 * Use 1024ns as the unit of measurement since it's a reasonable
> +	 * approximation of 1us and fast to compute.
> +	 */
> +	delta >>= 10;
> +	if (!delta)
> +		return 0;
> +
> +	sa->last_update_time += delta << 10;
> +
> +	/*
> +	 * running is a subset of runnable (weight) so running can't be set if
> +	 * runnable is clear. But there are some corner cases where the current
> +	 * se has been already dequeued but cfs_rq->curr still points to it.
> +	 * This means that weight will be 0 but not running for a sched_entity
> +	 * but also for a cfs_rq if the latter becomes idle. As an example,
> +	 * this happens during idle_balance() which calls
> +	 * update_blocked_averages()
> +	 */
> +	if (!load)
> +		runnable = running = 0;
> +
> +	/*
> +	 * Now we know we crossed measurement unit boundaries. The *_avg
> +	 * accrues by two steps:
> +	 *
> +	 * Step 1: accumulate *_sum since last_update_time. If we haven't
> +	 * crossed period boundaries, finish.
> +	 */
> +	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
> +		return 0;
> +
> +	return 1;
> +}
> +
> +static __always_inline void
> +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> +{
> +	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> +
> +	/*
> +	 * Step 2: update *_avg.
> +	 */
> +	sa->load_avg = div_u64(load * sa->load_sum, divider);
> +	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider);
> +	sa->util_avg = sa->util_sum / divider;
> +}
> +
> +/*
> + * sched_entity:
> + *
> + *   task:
> + *     se_runnable() == se_weight()
> + *
> + *   group: [ see update_cfs_group() ]
> + *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
> + *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
> + *
> + *   load_sum := runnable_sum
> + *   load_avg = se_weight(se) * runnable_avg
> + *
> + *   runnable_load_sum := runnable_sum
> + *   runnable_load_avg = se_runnable(se) * runnable_avg
> + *
> + * XXX collapse load_sum and runnable_load_sum
> + *
> + * cfq_rs:
> + *
> + *   load_sum = \Sum se_weight(se) * se->avg.load_sum
> + *   load_avg = \Sum se->avg.load_avg
> + *
> + *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
> + *   runnable_load_avg = \Sum se->avg.runable_load_avg
> + */
> +
> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
> +{
> +	if (entity_is_task(se))
> +		se->runnable_weight = se->load.weight;
> +
> +	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
> +		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +	if (entity_is_task(se))
> +		se->runnable_weight = se->load.weight;
> +
> +	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
> +				cfs_rq->curr == se)) {
> +
> +		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
> +{
> +	if (___update_load_sum(now, cpu, &cfs_rq->avg,
> +				scale_load_down(cfs_rq->load.weight),
> +				scale_load_down(cfs_rq->runnable_weight),
> +				cfs_rq->curr != NULL)) {
> +
> +		___update_load_avg(&cfs_rq->avg, 1, 1);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> new file mode 100644
> index 0000000..c312d8c
> --- /dev/null
> +++ b/kernel/sched/pelt.h
> @@ -0,0 +1,17 @@
> +#ifdef CONFIG_SMP
> +
> +int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
> +int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
> +int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
> +
> +#else
> +
> +static inline int
> +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> +{
> +	return 0;
> +}
> +
> +#endif
> +
> +
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 45ab0bf..6fefef6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -528,6 +528,7 @@ struct rt_rq {
>  	unsigned long rt_nr_total;
>  	int overloaded;
>  	struct plist_head pushable_tasks;
> +
>  #endif /* CONFIG_SMP */
>  	int rt_queued;
>  
> @@ -602,7 +603,26 @@ struct dl_rq {
>  	u64 bw_ratio;
>  };
>  
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +/* An entity is a task if it doesn't "own" a runqueue */
> +#define entity_is_task(se)	(!se->my_q)
> +#else
> +#define entity_is_task(se)	1
> +#endif
> +
>  #ifdef CONFIG_SMP
> +/*
> + * XXX we want to get rid of these helpers and use the full load resolution.
> + */
> +static inline long se_weight(struct sched_entity *se)
> +{
> +	return scale_load_down(se->load.weight);
> +}
> +
> +static inline long se_runnable(struct sched_entity *se)
> +{
> +	return scale_load_down(se->runnable_weight);
> +}
>  
>  static inline bool sched_asym_prefer(int a, int b)
>  {
> -- 
> 2.7.4
>