Message-ID: <20170331112358.422kb5aosbq3k5tn@hirez.programming.kicks-ass.net>
Date:   Fri, 31 Mar 2017 13:23:58 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Paul Turner <pjt@...gle.com>
Cc:     Yuyang Du <yuyang.du@...el.com>, Ingo Molnar <mingo@...nel.org>,
        LKML <linux-kernel@...r.kernel.org>,
        Benjamin Segall <bsegall@...gle.com>,
        Morten Rasmussen <morten.rasmussen@....com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Matt Fleming <matt@...eblueprint.co.uk>,
        Mike Galbraith <umgwanakikbuti@...il.com>
Subject: Re: [RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

On Fri, Mar 31, 2017 at 02:58:57AM -0700, Paul Turner wrote:
> > So let's pull it out again -- but I don't think we need to undo all of
> > Yuyang's patches for that. So please, look at the patch I proposed for
> > the problem you spotted. Let's fix the current state and take it from
> > there, ok?
> 
> Oh, absolutely.  I'm really not proposing re-vulcanizing any rubber
> here.  Just saying that this particular problem is just one facet of
> the weight mixing.  100% agree on fixing this as is and iterating.  I
> was actually thinking that these changes (which really nicely simplify
> the calculation) greatly simplify restoring the separation there.

So, on top of the previous patch, I've arrived (without testing it or
thinking much about numerical overflow) at the following patch.

It's known to be broken for things like update_cfs_rq_load_avg(), where
doing this unconditionally flattens load_sum, which seems undesired. I
need to think more about this.
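
For reference, the identity the rest of the patch leans on can be shown
with a toy model (not kernel code; the weight, the 1002/1024 decay
approximating y = 0.5^(1/32), and the period count below are purely
illustrative): with a constant weight w, accumulating pure decayed time
in load_sum and applying w at readout gives the same load_avg as
folding w into every period's contribution, up to integer rounding.

#include <stdio.h>

#define LOAD_AVG_MAX	47742ULL	/* ~ max of the 1024us geometric series */

int main(void)
{
	unsigned long long w = 3072, contrib = 1024;	/* example weight, full period */
	unsigned long long weighted_sum = 0, flat_sum = 0;
	int p;

	/* enough periods to effectively saturate the series */
	for (p = 0; p < 345; p++) {
		weighted_sum = weighted_sum * 1002 / 1024 + w * contrib;
		flat_sum     = flat_sum     * 1002 / 1024 + contrib;
	}

	printf("weight in the sum : %llu\n", weighted_sum / LOAD_AVG_MAX);
	printf("weight at readout : %llu\n", w * flat_sum / LOAD_AVG_MAX);
	return 0;
}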

Also, I've pulled out the cpufreq scaling, which I'm not sure is the
right thing to do. Vincent wants to scale time instead, which would be
persistent in the history, unlike the readout-time scaling done here.
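
To illustrate that difference with another toy model (again not kernel
code; the 50%/100% frequency split, period counts and decay constant
are made up): folding the frequency factor into each period's
contribution keeps the low-frequency past visible in the average, while
accumulating raw time and applying only the current factor at readout,
as the patch does, forgets that the task ran slower earlier.

#include <stdio.h>

#define LOAD_AVG_MAX	47742.0

int main(void)
{
	double hist_sum = 0.0, flat_sum = 0.0, scale = 1.0;
	int p;

	/* 32 periods at 50% frequency, then 32 periods at 100% */
	for (p = 0; p < 64; p++) {
		scale = (p < 32) ? 0.5 : 1.0;

		hist_sum = hist_sum * 1002 / 1024 + 1024 * scale;	/* persistent  */
		flat_sum = flat_sum * 1002 / 1024 + 1024;		/* scale later */
	}

	printf("scaled history : avg ~ %.3f\n", hist_sum / LOAD_AVG_MAX);
	printf("scaled readout : avg ~ %.3f\n", flat_sum * scale / LOAD_AVG_MAX);
	return 0;
}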

But yes, this looks doable and simpler than before.

---

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -313,7 +313,7 @@ struct load_weight {
  */
 struct sched_avg {
 	u64				last_update_time;
-	u64				load_sum;
+	u32				load_sum;
 	u32				util_sum;
 	u32				period_contrib;
 	unsigned long			load_avg;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -747,8 +747,8 @@ void init_entity_runnable_average(struct
 	 * nothing has been attached to the task group yet.
 	 */
 	if (entity_is_task(se))
-		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+		sa->load_avg = se->load.weight;
+	sa->load_sum = LOAD_AVG_MAX;
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
 	 */
@@ -2820,15 +2820,11 @@ static u32 __accumulate_pelt_segments(u6
  */
 static __always_inline u32
 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
-	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+	       bool load, bool running, struct cfs_rq *cfs_rq)
 {
-	unsigned long scale_freq, scale_cpu;
-	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+	u32 contrib = (u32)delta;
 	u64 periods;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
 	delta += sa->period_contrib;
 	periods = delta / 1024; /* A period is 1024us (~1ms) */
 
@@ -2852,14 +2848,13 @@ accumulate_sum(u64 delta, int cpu, struc
 	}
 	sa->period_contrib = delta;
 
-	contrib = cap_scale(contrib, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * contrib;
+	if (load) {
+		sa->load_sum += contrib;
 		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * contrib;
+			cfs_rq->runnable_load_sum += contrib;
 	}
 	if (running)
-		sa->util_sum += contrib * scale_cpu;
+		sa->util_sum += contrib;
 
 	return periods;
 }
@@ -2894,9 +2889,10 @@ accumulate_sum(u64 delta, int cpu, struc
  */
 static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
-		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
+		  unsigned long weight, bool running, struct cfs_rq *cfs_rq)
 {
-	u64 delta;
+	unsigned long scale_freq, scale_cpu;
+	u64 delta, load;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2924,18 +2920,22 @@ ___update_load_avg(u64 now, int cpu, str
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+	if (!accumulate_sum(delta, cpu, sa, !!weight, running, cfs_rq))
 		return 0;
 
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
 	/*
 	 * Step 2: update *_avg.
 	 */
-	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	load = cap_scale(weight, scale_freq);
+	sa->load_avg = div_u64(load * sa->load_sum, LOAD_AVG_MAX);
 	if (cfs_rq) {
 		cfs_rq->runnable_load_avg =
-			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+			div_u64(load * cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
-	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	sa->util_avg = (cap_scale(scale_cpu, scale_freq) * sa->util_sum) / LOAD_AVG_MAX;
 
 	return 1;
 }
@@ -2950,7 +2950,7 @@ static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	return ___update_load_avg(now, cpu, &se->avg,
-				  se->on_rq * scale_load_down(se->load.weight),
+				  se->on_rq * se->load.weight,
 				  cfs_rq->curr == se, NULL);
 }
 
@@ -2958,8 +2958,7 @@ static int
 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
 	return ___update_load_avg(now, cpu, &cfs_rq->avg,
-			scale_load_down(cfs_rq->load.weight),
-			cfs_rq->curr != NULL, cfs_rq);
+			cfs_rq->load.weight, cfs_rq->curr != NULL, cfs_rq);
 }
 
 /*
@@ -3130,11 +3129,11 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
 
 	/* Set new sched_entity's load */
 	se->avg.load_avg = load;
-	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+	se->avg.load_sum = LOAD_AVG_MAX;
 
 	/* Update parent cfs_rq load */
 	add_positive(&cfs_rq->avg.load_avg, delta);
-	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+	cfs_rq->avg.load_sum = LOAD_AVG_MAX;
 
 	/*
 	 * If the sched_entity is already enqueued, we also have to update the
@@ -3143,7 +3142,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq
 	if (se->on_rq) {
 		/* Update parent cfs_rq runnable_load_avg */
 		add_positive(&cfs_rq->runnable_load_avg, delta);
-		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+		cfs_rq->runnable_load_sum = LOAD_AVG_MAX;
 	}
 }
 
