linux-kernel - [RFC PATCH 7/9] sched/fair: Optimize __update_sched

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1463338774-3324-8-git-send-email-yuyang.du@intel.com>
Date:	Mon, 16 May 2016 02:59:32 +0800
From:	Yuyang Du <yuyang.du@...el.com>
To:	peterz@...radead.org, mingo@...nel.org,
	linux-kernel@...r.kernel.org
Cc:	bsegall@...gle.com, pjt@...gle.com, morten.rasmussen@....com,
	vincent.guittot@...aro.org, dietmar.eggemann@....com,
	juri.lelli@....com, Yuyang Du <yuyang.du@...el.com>
Subject: [RFC PATCH 7/9] sched/fair: Optimize __update_sched_avg()

__update_sched_avg() has these steps:

  1. add the remainder of the last incomplete period
  2. decay old sum
  3. accumulate new sum in full periods since last_update_time
  4. add the current incomplete period
  5. update averages

Previously, we separately computed steps 1, 3, and 4, leading to
each one of them ugly in codes and costly in overhead.

Illustation:

             c1          c3           c4
             ^           ^            ^
             |           |            |
           |<->|<----------------->|<--->|
   ... |---x---|------| ... |------|-----x (now)

c1, c3, and c4 are the accumulated (meanwhile decayed) contributions
in timing aspect of steps 1, 3, and 4 respectively.

With them, the accumulated contribution to load_sum, for example, is:

contrib = c1 * weight * freq_scaled;
contrib += c3 * weight * freq_scaled;
contrib += c4 * weight * freq_scaled;

Obviously, we can optimize the above by:

contrib = c1 + c3 + c4;
contrib *= weight * freq_scaled;

One issue is that c1 must be additionally decayed as opposed to
decaying it with step 2. However, peformance wise, the two approaches
should be on par with each other if you compare __decay_sum() with a
round of contrib accumulation. And overall, we definitely will save
tens of instructions, although the performance gain may not be observed
by benchmarks, whereas code wise, this apprach is obviously much simpler.

Code size (allyesconfig):

Before: kernel/sched/built-in.o 1404840
After : kernel/sched/built-in.o 1404016  (-824B)

Signed-off-by: Yuyang Du <yuyang.du@...el.com>
---
 kernel/sched/fair.c |  180 +++++++++++++++++++++++++--------------------------
 1 file changed, 89 insertions(+), 91 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1bbac7e..e1cde19 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -668,7 +668,7 @@ static unsigned long task_h_load(struct task_struct *p);
  */
 #define SCHED_AVG_HALFLIFE 32	/* number of periods as a half-life */
 #define SCHED_AVG_MAX 47742	/* maximum possible sched avg */
-#define SCHED_AVG_MAX_N 345	/* number of full periods to produce SCHED_AVG_MAX */
+#define SCHED_AVG_MAX_N 347	/* number of full periods to produce SCHED_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -2652,24 +2652,83 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n   (where n < 32)
  */
-static __always_inline u32 __accumulate_sum(u32 n)
+static __always_inline u32
+__accumulate_sum(u32 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 contrib;
 
-	if (likely(n <= SCHED_AVG_HALFLIFE))
-		return __accumulated_sum_N[n];
-	else if (unlikely(n >= SCHED_AVG_MAX_N))
+	if (!periods)
+		return remainder - period_contrib;
+
+	if (unlikely(periods >= SCHED_AVG_MAX_N))
 		return SCHED_AVG_MAX;
 
-	/* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
-	contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
-	n %= SCHED_AVG_HALFLIFE;
-	contrib = __decay_sum(contrib, n);
-	return contrib + __accumulated_sum_N[n];
+	remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	if (likely(periods <= SCHED_AVG_HALFLIFE))
+		contrib = __accumulated_sum_N[periods];
+	else {
+		contrib = __accumulated_sum_N32[periods/SCHED_AVG_HALFLIFE];
+		periods %= SCHED_AVG_HALFLIFE;
+		contrib = __decay_sum(contrib, periods);
+		contrib += __accumulated_sum_N[periods];
+	}
+
+	return contrib + remainder;
 }
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+	struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+	u32 contrib, periods;
+	unsigned long scale_freq, scale_cpu;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Accumulating *_sum has two steps.
+	 *
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = __decay_sum(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				__decay_sum(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2: accumulate new *_sum since last_update_time. This at most has
+	 * three parts (at least one part): (1) remainder of incomplete last
+	 * period, (2) full periods since (1), and (3) incomplete current period.
+	 *
+	 * Fortunately, we can (and should) do all these three at once.
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
 /*
  * We can represent the historical contribution to sched average as the
  * coefficients of a geometric series.  To do this we divide the history
@@ -2702,10 +2761,7 @@ static __always_inline int
 __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta;
-	u32 contrib, periods;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2726,85 +2782,27 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 		return 0;
 	sa->last_update_time = now;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/*
-		 * Figure out how many additional periods this update spans.
-		 * A period is 1024*1024ns or ~1ms, so a 32bit integer can hold
-		 * approximately a maximum of 49 (=2^32/1000/3600/24) days.
-		 */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = __decay_sum(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				__decay_sum(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __accumulate_sum(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_time. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+		return 0;
 
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
1.7.9.5