linux-kernel - Re: [PATCH v3 0/5] sync a se with its cfs

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150820052608.GF24261@byungchulpark-X58A-UD3R>
Date:	Thu, 20 Aug 2015 14:26:08 +0900
From:	Byungchul Park <byungchul.park@....com>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	mingo@...nel.org, linux-kernel@...r.kernel.org, yuyang.du@...el.com
Subject: Re: [PATCH v3 0/5] sync a se with its cfs_rq when att(det)aching it

On Thu, Aug 20, 2015 at 03:17:21AM +0200, Peter Zijlstra wrote:
> 
> I did something like this on top.. please have a look at the XXX and
> integrate.

yes, i will do it.

> 
> ---
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2664,8 +2664,8 @@ static inline u64 cfs_rq_clock_task(stru
>  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
>  static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>  {
> -	int decayed;
>  	struct sched_avg *sa = &cfs_rq->avg;
> +	int decayed;
>  
>  	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>  		long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> @@ -2695,15 +2695,16 @@ static inline int update_cfs_rq_load_avg
>  static inline void update_load_avg(struct sched_entity *se, int update_tg)
>  {
>  	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> -	int cpu = cpu_of(rq_of(cfs_rq));
>  	u64 now = cfs_rq_clock_task(cfs_rq);
> +	int cpu = cpu_of(rq_of(cfs_rq));
>  
>  	/*
>  	 * Track task load average for carrying it to new CPU after migrated, and
>  	 * track group sched_entity load average for task_h_load calc in migration
>  	 */
>  	__update_load_avg(now, cpu, &se->avg,
> -		se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
> +			  se->on_rq * scale_load_down(se->load.weight),
> +			  cfs_rq->curr == se, NULL);
>  
>  	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
>  		update_tg_load_avg(cfs_rq, 0);
> @@ -2718,9 +2719,10 @@ static void attach_entity_load_avg(struc
>  	 * update here. we have to update it with prev cfs_rq just before changing
>  	 * se's cfs_rq, and get here soon.
>  	 */
> -	if (se->avg.last_update_time)
> +	if (se->avg.last_update_time) {
>  		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> -				&se->avg, 0, 0, NULL);
> +				  &se->avg, 0, 0, NULL);
> +	}
>  
>  	se->avg.last_update_time = cfs_rq->avg.last_update_time;
>  	cfs_rq->avg.load_avg += se->avg.load_avg;
> @@ -2732,17 +2734,13 @@ static void attach_entity_load_avg(struc
>  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>  	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
> -			&se->avg, se->on_rq * scale_load_down(se->load.weight),
> -			cfs_rq->curr == se, NULL);
> +			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
> +			  cfs_rq->curr == se, NULL);
>  
> -	cfs_rq->avg.load_avg =
> -		max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> -	cfs_rq->avg.load_sum =
> -		max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
> -	cfs_rq->avg.util_avg =
> -		max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
> -	cfs_rq->avg.util_sum =
> -		max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
> +	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
> +	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
> +	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
> +	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
>  }
>  
>  /* Add the load generated by se into cfs_rq's load average */
> @@ -2751,14 +2749,14 @@ enqueue_entity_load_avg(struct cfs_rq *c
>  {
>  	struct sched_avg *sa = &se->avg;
>  	u64 now = cfs_rq_clock_task(cfs_rq);
> -	int migrated = 0, decayed;
> +	int migrated, decayed;
>  
> -	if (sa->last_update_time == 0)
> -		migrated = 1;
> -	else
> +	migrated = !sa->last_update_time;
> +	if (!migrated) {
>  		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
>  			se->on_rq * scale_load_down(se->load.weight),
>  			cfs_rq->curr == se, NULL);
> +	}
>  
>  	decayed = update_cfs_rq_load_avg(now, cfs_rq);
>  
> @@ -2781,7 +2779,7 @@ dequeue_entity_load_avg(struct cfs_rq *c
>  	cfs_rq->runnable_load_avg =
>  		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
>  	cfs_rq->runnable_load_sum =
> -		max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
> +		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
>  }
>  
>  /*
> @@ -2849,6 +2847,11 @@ static inline void
>  dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
>  static inline void remove_entity_load_avg(struct sched_entity *se) {}
>  
> +static inline void
> +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> +static inline void
> +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
> +
>  static inline int idle_balance(struct rq *rq)
>  {
>  	return 0;
> @@ -7915,29 +7918,50 @@ prio_changed_fair(struct rq *rq, struct
>  		check_preempt_curr(rq, p, 0);
>  }
>  
> -static void detach_task_cfs_rq(struct task_struct *p, int queued)
> +static inline bool vruntime_normalized(struct task_struct *p)
>  {
> +	int queued = task_on_rq_queued(p);
>  	struct sched_entity *se = &p->se;
> -	struct cfs_rq *cfs_rq = cfs_rq_of(se);
>  
>  	/*
> -	 * Ensure the task's vruntime is normalized, so that after attaching
> -	 * back it to a cfs_rq the enqueue_entity() will do the right thing.
> -	 *
>  	 * If it's queued, then the dequeue_entity(.flags=0) will already
> -	 * have normalized the vruntime, if it's !queued, then only when
> -	 * the task is sleeping it still has a non-normalized vruntime.
> -	 *
> +	 * have normalized the vruntime.
> +	 */
> +	if (queued)
> +		return true;
> +
> +	/*
>  	 * When !queued, vruntime of the task has usually NOT been normalized.
>  	 * But there are some cases where it has already been normalized:
> -	 *
> -	 * - Moving a forked child which is waiting for being woken up by
> +	 */
> +
> +	/*
> +	 * - a forked child which is waiting for being woken up by
>  	 *   wake_up_new_task().
> -	 * - Moving a task which has been woken up by try_to_wake_up() and
> +	 */
> +	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
> +		return true;
> +
> +	/*
> +	 * - a task which has been woken up by try_to_wake_up() and
>  	 *   waiting for actually being woken up by sched_ttwu_pending().
> +	 *
> +	 * XXX this appears wrong!! check history, 
> +	 * we appear to always set queued and RUNNING under the same lock instance
> +	 * might be from before TASK_WAKING ?
>  	 */
> -	if (!queued && p->state != TASK_RUNNING &&
> -	    se->sum_exec_runtime && p->state != TASK_WAKING) {
> +	if (p->state == TASK_RUNNING)
> +		return true;
> +
> +	return false;
> +}
> +
> +static void detach_task_cfs_rq(struct task_struct *p)
> +{
> +	struct sched_entity *se = &p->se;
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +	if (!vruntime_normalized(p)) {
>  		/*
>  		 * Fix up our vruntime so that the current sleep doesn't
>  		 * cause 'unlimited' sleep bonus.
> @@ -7946,10 +7970,8 @@ static void detach_task_cfs_rq(struct ta
>  		se->vruntime -= cfs_rq->min_vruntime;
>  	}
>  
> -#ifdef CONFIG_SMP
>  	/* Catch up with the cfs_rq and remove our load when we leave */
>  	detach_entity_load_avg(cfs_rq, se);
> -#endif
>  }
>  
>  static void attach_task_cfs_rq(struct task_struct *p, int queued)
> @@ -7965,49 +7987,23 @@ static void attach_task_cfs_rq(struct ta
>  	se->depth = se->parent ? se->parent->depth + 1 : 0;
>  #endif
>  
> -#ifdef CONFIG_SMP
>  	/* synchronize task with its cfs_rq */
>  	attach_entity_load_avg(cfs_rq, se);
> -#endif
>  
> -	/*
> -	 * Ensure the task has a non-normalized vruntime when attaching it
> -	 * to a cfs_rq, so that enqueue_entity() at wake-up time will do
> -	 * the right thing.
> -	 *
> -	 * If it's queued, then the enqueue_entity(.flags=0) makes the task
> -	 * has non-normalized vruntime, if it's !queued, then it still has
> -	 * a normalized vruntime.
> -	 *
> -	 * When !queued, a task usually should have a non-normalized vruntime
> -	 * after being attached to a cfs_rq. But there are some cases where
> -	 * we should keep it normalized:
> -	 *
> -	 * - Moving a forked child which is waiting for being woken up by
> -	 *   wake_up_new_task().
> -	 * - Moving a task which has been woken up by try_to_wake_up() and
> -	 *   waiting for actually being woken up by sched_ttwu_pending().
> -	 */
> -	if (!queued && p->state != TASK_RUNNING &&
> -	    se->sum_exec_runtime && p->state != TASK_WAKING)
> +	if (!vruntime_normalized(p))
>  		se->vruntime += cfs_rq->min_vruntime;
>  }
>  
>  static void switched_from_fair(struct rq *rq, struct task_struct *p)
>  {
> -	detach_task_cfs_rq(p, task_on_rq_queued(p));
> +	detach_task_cfs_rq(p);
>  }
>  
> -/*
> - * We switched to the sched_fair class.
> - */
>  static void switched_to_fair(struct rq *rq, struct task_struct *p)
>  {
> -	int queued = task_on_rq_queued(p);
> -
> -	attach_task_cfs_rq(p, queued);
> +	attach_task_cfs_rq(p);
>  
> -	if (queued) {
> +	if (task_on_rq_queued(p)) {
>  		/*
>  		 * We were most likely switched from sched_rt, so
>  		 * kick off the schedule if running, otherwise just see
> @@ -8056,8 +8052,7 @@ static void task_move_group_fair(struct
>  {
>  	detach_task_cfs_rq(p, queued);
>  	set_task_rq(p, task_cpu(p));
> -
> -	/* tell se's cfs_rq has been changed */
> +	/* tell se's cfs_rq has been changed -- migrated */
>  	p->se.avg.last_update_time = 0;
>  	attach_task_cfs_rq(p, queued);
>  }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/