Message-ID: <877guqykji.fsf@sejong.aot.lge.com>
Date:	Fri, 29 Jun 2012 10:27:29 +0900
From:	Namhyung Kim <namhyung@...nel.org>
To:	Paul Turner <pjt@...gle.com>
Cc:	linux-kernel@...r.kernel.org, Venki Pallipadi <venki@...gle.com>,
	Srivatsa Vaddagiri <vatsa@...ibm.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Nikunj A Dadhania <nikunj@...ux.vnet.ibm.com>,
	Mike Galbraith <efault@....de>,
	Kamalesh Babulal <kamalesh@...ux.vnet.ibm.com>,
	Ben Segall <bsegall@...gle.com>, Ingo Molnar <mingo@...e.hu>,
	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
	Morten Rasmussen <Morten.Rasmussen@....com>,
	Vaidyanathan Srinivasan <svaidy@...ux.vnet.ibm.com>
Subject: Re: [PATCH 04/16] sched: maintain the load contribution of blocked entities

Hi,

On Wed, 27 Jun 2012 19:24:14 -0700, Paul Turner wrote:
> We are currently maintaining:
>   runnable_load(cfs_rq) = \Sum task_load(t)
>
> For all running children t of cfs_rq.  While this can be naturally updated for
> tasks in a runnable state (as they are scheduled); this does not account for
> the load contributed by blocked task entities.
>
> This can be solved by introducing a separate accounting for blocked load:
>   blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)
>
> Obviously we do not want to iterate over all blocked entities to account for
> their decay, we instead observe that:
>   runnable_load(t) = \Sum p_i*y^i
>
> and that to account for an additional idle period we only need to compute:
>   y*runnable_load(t).
>
> This means that we can compute all blocked entities at once by evaluating:
>   blocked_load(cfs_rq)` = y * blocked_load(cfs_rq)
>
> Finally we maintain a decay counter so that when a sleeping entity re-awakens
> we can determine how much of its load should be removed from the blocked sum.
>
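
(An aside for anyone following along: the one-shot decay works because
decay is linear -- y^n * (a + b + c) == y^n*a + y^n*b + y^n*c -- so a
single multiply on the aggregate is equivalent to decaying each blocked
entity individually.  A toy sketch, assuming the series' y^32 ~= 1/2
convention over ~1ms periods; toy_decay is made up for illustration:)

	/* Halve the load once per 32 elapsed periods.  The real
	 * decay_load() also resolves the fractional n % 32 part via a
	 * precomputed table; this sketch keeps only the n / 32 shift. */
	static unsigned long toy_decay(unsigned long load, unsigned int n)
	{
		return load >> (n / 32);
	}
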
> Signed-off-by: Paul Turner <pjt@...gle.com>
> Signed-off-by: Ben Segall <bsegall@...gle.com>
> ---
>  include/linux/sched.h |    1 
>  kernel/sched/core.c   |    3 +
>  kernel/sched/debug.c  |    3 +
>  kernel/sched/fair.c   |  130 ++++++++++++++++++++++++++++++++++++++++++++-----
>  kernel/sched/sched.h  |    4 +-
>  5 files changed, 126 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0c54ce0..842c4df 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1139,6 +1139,7 @@ struct load_weight {
>  struct sched_avg {
>  	u32 runnable_avg_sum, runnable_avg_period;
>  	u64 last_runnable_update;
> +	s64 decay_count;
>  	unsigned long load_avg_contrib;
>  };
>  
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9bb7d28..aeb8e56 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1713,6 +1713,9 @@ static void __sched_fork(struct task_struct *p)
>  	p->se.vruntime			= 0;
>  	INIT_LIST_HEAD(&p->se.group_node);
>  
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +	p->se.avg.decay_count = 0;
> +#endif
>  #ifdef CONFIG_SCHEDSTATS
>  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
>  #endif
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index aeb74e3..2aa60cf 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
>  	P(se->avg.runnable_avg_sum);
>  	P(se->avg.runnable_avg_period);
>  	P(se->avg.load_avg_contrib);
> +	P(se->avg.decay_count);
>  #endif
>  #undef PN
>  #undef P
> @@ -230,6 +231,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>  			atomic_read(&cfs_rq->tg->load_weight));
>  	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
>  			cfs_rq->runnable_load_avg);
> +	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
> +			cfs_rq->blocked_load_avg);
>  #endif
>  
>  	print_cfs_group_stats(m, cpu, cfs_rq->tg);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8229766..6200d20 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1085,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
>  	return decayed;
>  }
>  
> +/* Synchronize an entity's decay with its parentin cfs_rq.*/
s/parentin/parenting/ ?

> +static inline void __synchronize_entity_decay(struct sched_entity *se)
> +{
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +	u64 decays = atomic64_read(&cfs_rq->decay_counter);
> +
> +	decays -= se->avg.decay_count;
> +	if (!decays)
> +		return;
> +
> +	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
> +	se->avg.decay_count += decays;
> +}
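
(For context, the bookkeeping above is plain counter arithmetic; a
sketch with made-up numbers:)

	/*
	 *   cfs_rq->decay_counter : 100   periods elapsed so far
	 *   se->avg.decay_count   :  90   snapshot taken when se blocked
	 *   decays = 100 - 90     =  10   periods se slept through
	 *
	 * load_avg_contrib is scaled by y^10, decay_count catches back
	 * up to 100, and a second call becomes a no-op.
	 */
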
> +
>  /* Compute the current contribution to load_avg by se, return any delta */
>  static long __update_entity_load_avg_contrib(struct sched_entity *se)
>  {
> @@ -1100,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
>  	return se->avg.load_avg_contrib - old_contrib;
>  }
>  
> +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
> +						 long load_contrib)
> +{
> +	if (likely(load_contrib < cfs_rq->blocked_load_avg))
> +		cfs_rq->blocked_load_avg -= load_contrib;
> +	else
> +		cfs_rq->blocked_load_avg = 0;
> +}
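
(The clamp here matters because blocked_load_avg is a u64: the
aggregate and the per-entity contribs are decayed on separate paths
and can drift apart by rounding, so an unclamped subtraction could
wrap.  Made-up numbers:)

	/*
	 *   blocked_load_avg = 100, load_contrib = 120  (rounding drift)
	 *   unclamped: 100 - 120 wraps the u64 to ~2^64
	 *   clamped:   blocked_load_avg = 0
	 */
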
> +
>  /* Update a sched_entity's runnable average */
> -static inline void update_entity_load_avg(struct sched_entity *se)
> +static inline void update_entity_load_avg(struct sched_entity *se,
> +					  int update_cfs_rq)
>  {
>  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  	long contrib_delta;
> @@ -1111,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se)
>  		return;
>  
>  	contrib_delta = __update_entity_load_avg_contrib(se);
> +
> +	if (!update_cfs_rq)
> +		return;
> +
>  	if (se->on_rq)
>  		cfs_rq->runnable_load_avg += contrib_delta;
> +	else
> +		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);

Subtracting a negative delta means an addition, right?


> +}
> +
> +/*
> + * Decay the load contributed by all blocked children and account this so that
> + * they their contribution may appropriately discounted when they wake up.

s/they their/their/ ?


> + */
> +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)

I guess update_cfs_blocked_load would be a more consistent name with
update_cfs_{load,shares}.

Thanks,
Namhyung


> +{
> +	u64 now = rq_of(cfs_rq)->clock_task >> 20;
> +	u64 decays;
> +
> +	decays = now - cfs_rq->last_decay;
> +	if (!decays)
> +		return;
> +
> +	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
> +					      decays);
> +	atomic64_add(decays, &cfs_rq->decay_counter);
> +
> +	cfs_rq->last_decay = now;
>  }
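
(Note the units here: rq->clock_task is in nanoseconds and one decay
period is 1024us = 2^20 ns, hence the ">> 20".  A quick worked
example, made-up values:)

	/*
	 *   rq->clock_task = 5,242,880 ns  ->  now = 5  (5 * 2^20 ns)
	 *   cfs_rq->last_decay = 3         ->  decays = 2
	 *
	 * blocked_load_avg is decayed by y^2 in one shot and
	 * decay_counter advances by 2, staying in step with the
	 * per-entity snapshots above.
	 */
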
>  
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
> @@ -1122,26 +1172,56 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
>  
>  /* Add the load generated by se into cfs_rq's child load-average */
>  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se)
> -{
> -	update_entity_load_avg(se);
> +						  struct sched_entity *se,
> +						  int wakeup)
> +{
> +	/* we track migrations using entity decay_count == 0 */
> +	if (unlikely(!se->avg.decay_count)) {
> +		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
> +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
> +		wakeup = 0;
> +	} else {
> +		__synchronize_entity_decay(se);
> +	}
> +
> +	if (wakeup)
> +		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
> +
> +	update_entity_load_avg(se, 0);
>  	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
> +	update_cfs_rq_blocked_load(cfs_rq);
>  }
>  
> -/* Remove se's load from this cfs_rq child load-average */
> +/*
> + * Remove se's load from this cfs_rq child load-average, if the entity is
> + * transitioning to a blocked state we track its projected decay using
> + * blocked_load_avg.
> + */
>  static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se)
> +						  struct sched_entity *se,
> +						  int sleep)
>  {
> -	update_entity_load_avg(se);
> +	update_entity_load_avg(se, 1);
> +
>  	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
> +	if (sleep) {
> +		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
> +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
> +	} else {
> +		se->avg.decay_count = 0;
> +	}
>  }
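
(Putting the enqueue/dequeue halves together, the lifecycle of a
blocked entity looks roughly like this -- illustrative comments only:)

	/*
	 *   dequeue(sleep=1):  blocked_load_avg += contrib;
	 *                      decay_count = decay_counter   (snapshot)
	 *   ...n periods pass; the blocked sum decays by y^n as a whole...
	 *   enqueue(wakeup=1): __synchronize_entity_decay()  (contrib *= y^n)
	 *                      subtract_blocked_load_contrib()
	 *   dequeue(sleep=0):  decay_count = 0   (migration marker, so the
	 *                      next enqueue reanchors instead of decaying
	 *                      against another cfs_rq's counter)
	 */
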
>  #else
> -static inline void update_entity_load_avg(struct sched_entity *se) {}
> +static inline void update_entity_load_avg(struct sched_entity *se,
> +					  int update_cfs_rq) {}
>  static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
>  static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se) {}
> +					   struct sched_entity *se,
> +					   int wakeup) {}
>  static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
> -						  struct sched_entity *se) {}
> +					   struct sched_entity *se,
> +					   int sleep) {}
> +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
>  #endif
>  
>  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
> @@ -1270,7 +1350,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 */
>  	update_curr(cfs_rq);
>  	update_cfs_load(cfs_rq, 0);
> -	enqueue_entity_load_avg(cfs_rq, se);
> +	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
>  	account_entity_enqueue(cfs_rq, se);
>  	update_cfs_shares(cfs_rq);
>  
> @@ -1345,7 +1425,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	 * Update run-time statistics of the 'current'.
>  	 */
>  	update_curr(cfs_rq);
> -	dequeue_entity_load_avg(cfs_rq, se);
> +	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
>  
>  	update_stats_dequeue(cfs_rq, se);
>  	if (flags & DEQUEUE_SLEEP) {
> @@ -1516,7 +1596,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
>  		/* Put 'current' back into the tree. */
>  		__enqueue_entity(cfs_rq, prev);
>  		/* in !on_rq case, update occurred at dequeue */
> -		update_entity_load_avg(prev);
> +		update_entity_load_avg(prev, 1);
>  	}
>  	cfs_rq->curr = NULL;
>  }
> @@ -1532,7 +1612,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>  	/*
>  	 * Ensure that runnable average is periodically updated.
>  	 */
> -	update_entity_load_avg(curr);
> +	update_entity_load_avg(curr, 1);
> +	update_cfs_rq_blocked_load(cfs_rq);
>  
>  	/*
>  	 * Update share accounting for long-running entities.
> @@ -2391,6 +2472,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  
>  		update_cfs_load(cfs_rq, 0);
>  		update_cfs_shares(cfs_rq);
> +		update_entity_load_avg(se, 1);
>  	}
>  
>  	if (!se) {
> @@ -2452,6 +2534,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  
>  		update_cfs_load(cfs_rq, 0);
>  		update_cfs_shares(cfs_rq);
> +		update_entity_load_avg(se, 1);
>  	}
>  
>  	if (!se) {
> @@ -3557,6 +3640,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
>  
>  	update_rq_clock(rq);
>  	update_cfs_load(cfs_rq, 1);
> +	update_cfs_rq_blocked_load(cfs_rq);
>  
>  	/*
>  	 * We need to update shares after updating tg->load_weight in
> @@ -5379,6 +5463,21 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
>  		place_entity(cfs_rq, se, 0);
>  		se->vruntime -= cfs_rq->min_vruntime;
>  	}
> +
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +	/*
> +	* Remove our load from contribution when we leave sched_fair
> +	* and ensure we don't carry in an old decay_count if we
> +	* switch back.
> +	*/
> +	if (p->se.avg.decay_count) {
> +		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
> +		__synchronize_entity_decay(&p->se);
> +		subtract_blocked_load_contrib(cfs_rq,
> +				p->se.avg.load_avg_contrib);
> +		p->se.avg.decay_count = 0;
> +	}
> +#endif
>  }
>  
>  /*
> @@ -5425,6 +5524,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
>  #ifndef CONFIG_64BIT
>  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
>  #endif
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> +	atomic64_set(&cfs_rq->decay_counter, 1);
> +#endif
>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 26cc36f..a96adf1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -229,7 +229,9 @@ struct cfs_rq {
>  	 * This allows for the description of both thread and group usage (in
>  	 * the FAIR_GROUP_SCHED case).
>  	 */
> -	u64 runnable_load_avg;
> +	u64 runnable_load_avg, blocked_load_avg;
> +	atomic64_t decay_counter;
> +	u64 last_decay;
>  #endif
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
