Message-Id: <1470788095-2125-7-git-send-email-yuyang.du@intel.com>
Date: Wed, 10 Aug 2016 08:14:51 +0800
From: Yuyang Du <yuyang.du@...el.com>
To: peterz@...radead.org, mingo@...nel.org,
linux-kernel@...r.kernel.org
Cc: bsegall@...gle.com, pjt@...gle.com, morten.rasmussen@....com,
vincent.guittot@...aro.org, dietmar.eggemann@....com,
matt@...eblueprint.co.uk, umgwanakikbuti@...il.com,
Yuyang Du <yuyang.du@...el.com>
Subject: [PATCH v1 06/10] sched/fair: Optimize __update_sched_avg()
__update_sched_avg() has these steps:
1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. add the current incomplete period
5. update averages
Previously, we computed steps 1, 3, and 4 separately, which made each
of them ugly in code and costly in overhead.
Illustration:
            c1          c3           c4
            ^           ^            ^
            |           |            |
          |<->|<----------------->|<--->|
  ... |---x---|------| ... |------|-----x (now)
c1, c3, and c4 are the accumulated (and meanwhile decayed) time
contributions of steps 1, 3, and 4 respectively.
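In code terms, the split can be sketched as follows (illustration
only; split_delta() is a hypothetical helper, not part of this patch):

	/*
	 * Hypothetical sketch: decompose the time since last_update_time
	 * into the c1/c3/c4 parts above. A period is 1024us, and
	 * period_contrib is the time already accumulated in the last
	 * incomplete period.
	 */
	static void split_delta(u64 delta, u32 period_contrib,
				u32 *c1, u64 *c3_periods, u32 *c4)
	{
		delta += period_contrib;
		*c3_periods = delta >> 10;	/* period boundaries crossed */
		if (!*c3_periods) {
			*c1 = 0;
			*c4 = delta - period_contrib;	/* same period still */
			return;
		}
		*c1 = 1024 - period_contrib;	/* step 1: rest of last period */
		*c3_periods -= 1;		/* step 3: full periods between */
		*c4 = delta % 1024;		/* step 4: current incomplete period */
	}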
With them, the accumulated contribution to load_sum, for example, is:
contrib = c1 * weight * freq_scaled;
contrib += c3 * weight * freq_scaled;
contrib += c4 * weight * freq_scaled;
Obviously, we can optimize the above by:
contrib = c1 + c3 + c4;
contrib *= weight * freq_scaled;
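With made-up numbers, the three multiply-accumulate rounds collapse
into two adds and one multiply:

	/* Made-up numbers, for illustration only */
	u32 c1 = 700, c3 = 2048, c4 = 200;
	u64 scale = 1024;			/* weight * freq_scaled */

	u64 old_sum = c1 * scale + c3 * scale + c4 * scale;	/* 3 muls, 2 adds */
	u64 new_sum = (u64)(c1 + c3 + c4) * scale;		/* 2 adds, 1 mul  */
	/* old_sum == new_sum == 3018752 */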
One issue is that c1 must be decayed additionally, rather than being
decayed together with step 2. Performance wise, however, the two
approaches should be on par: one extra __decay_sum() costs about as
much as the round of contrib accumulation it replaces. Overall, we
definitely save tens of instructions, although the gain may not be
observable in benchmarks; code wise, this approach is clearly much
simpler.
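For reference, the additional decay of c1 is one more __decay_sum()
round, which is essentially a shift plus one multiply. A sketch,
mirroring the kernel's existing decay_load() and assuming the
y^32 = 1/2 half-life; the inverse table name below is illustrative:

	/* val * y^n, with y^SCHED_AVG_HALFLIFE = 1/2 */
	static __always_inline u64 __decay_sum(u64 val, u64 n)
	{
		if (!n)
			return val;
		if (unlikely(n > SCHED_AVG_HALFLIFE * 63))
			return 0;			/* shifted out entirely */
		if (unlikely(n >= SCHED_AVG_HALFLIFE)) {
			val >>= n / SCHED_AVG_HALFLIFE;	/* halve per half-life */
			n %= SCHED_AVG_HALFLIFE;
		}
		/* one multiply: y^n (n < 32) precomputed, scaled by 2^32 */
		return mul_u64_u32_shr(val, __decay_inv_multiply_N[n], 32);
	}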
Code size (allyesconfig):
Before: kernel/sched/built-in.o 1031616
After : kernel/sched/built-in.o 1025504 (-6112B)
Signed-off-by: Yuyang Du <yuyang.du@...el.com>
---
kernel/sched/fair.c | 175 ++++++++++++++++++++++++++-------------------------
1 file changed, 89 insertions(+), 86 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b10fb0..283e2c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2649,24 +2649,84 @@ static __always_inline u64 __decay_sum(u64 val, u64 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n (where n < 32)
  */
-static __always_inline u32 __accumulate_sum(u64 n)
+static __always_inline u32
+__accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 contrib;
+
+	if (!periods)
+		return remainder - period_contrib;
-	if (likely(n <= SCHED_AVG_HALFLIFE))
-		return __accumulated_sum_N[n];
-	else if (unlikely(n >= SCHED_AVG_MAX_N))
+	if (unlikely(periods >= SCHED_AVG_MAX_N))
 		return SCHED_AVG_MAX;
-	/* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
-	contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
-	n %= SCHED_AVG_HALFLIFE;
-	contrib = __decay_sum(contrib, n);
-	return contrib + __accumulated_sum_N[n];
+	remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	if (likely(periods <= SCHED_AVG_HALFLIFE))
+		contrib = __accumulated_sum_N[periods];
+	else {
+		contrib = __accumulated_sum_N32[periods/SCHED_AVG_HALFLIFE];
+		periods %= SCHED_AVG_HALFLIFE;
+		contrib = __decay_sum(contrib, periods);
+		contrib += __accumulated_sum_N[periods];
+	}
+
+	return contrib + remainder;
 }
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+	struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+	u32 contrib;
+	u64 periods;
+	unsigned long scale_freq, scale_cpu;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Accumulating *_sum has two steps.
+	 *
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = __decay_sum(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				__decay_sum(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2: accumulate new *_sum since last_update_time. This at most has
+	 * three parts (at least one part): (1) remainder of incomplete last
+	 * period, (2) full periods since (1), and (3) incomplete current period.
+	 *
+	 * Fortunately, we can (and should) do all these three at once.
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
 /*
  * We can represent the historical contribution to sched average as the
  * coefficients of a geometric series. To do this we divide the history
@@ -2699,10 +2759,7 @@ static __always_inline int
 __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta, periods;
-	u32 contrib;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 	delta = now - sa->last_update_time;
 	/*
@@ -2723,81 +2780,27 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 		return 0;
 	sa->last_update_time = now;
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-					weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/* Figure out how many additional periods this update spans */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = __decay_sum(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				__decay_sum(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __accumulate_sum(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_time. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+		return 0;
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
-	return decayed;
+	return 1;
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
--
1.7.9.5