Date:	Wed, 10 Aug 2016 08:14:55 +0800
From:	Yuyang Du <yuyang.du@...el.com>
To:	peterz@...radead.org, mingo@...nel.org,
	linux-kernel@...r.kernel.org
Cc:	bsegall@...gle.com, pjt@...gle.com, morten.rasmussen@....com,
	vincent.guittot@...aro.org, dietmar.eggemann@....com,
	matt@...eblueprint.co.uk, umgwanakikbuti@...il.com,
	Yuyang Du <yuyang.du@...el.com>
Subject: [PATCH v1 10/10] sched/fair: Implement flat hierarchical structure for util_avg

The util_avg is updated along the task group hierarchy, but the util_avgs
of the group entities and of all cfs_rqs except the top cfs_rq are not
needed and never used. More importantly, the top cfs_rq's util_avg does
not promptly reflect the migration of a group task, because the task's
util_avg is accounted to its direct cfs_rq and only slowly propagates to
the top cfs_rq.

This patch therefore proposes a flat hierarchy for util_avg: all cfs tasks
on a rq are accounted directly to that rq, ignoring their task group
hierarchy, so the rq's util is simply the sum of the utils of its cfs
tasks.

Regarding overhead: the rq's util may be updated more often and thus at a
higher cost, but we no longer update any group entity's util, so the net
overhead should roughly even out.
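
Below is a minimal, illustrative userspace sketch (not kernel code; toy_rq,
toy_post_init_util() and toy_migrate() are made-up names) of the two ideas
above: a new task is seeded with half of the remaining utilization budget,
which reproduces the 512, 768, 896, ... series used in the comments below,
and a migration moves the task's contribution between rqs at once because
the rq util is a flat sum:

  /* Illustrative only -- not kernel code; all names here are made up. */
  #include <stdio.h>

  #define CAPACITY 1024L

  struct toy_rq {
          long util_avg;
  };

  /* Seed a new task with half of the rq's remaining utilization budget. */
  static long toy_post_init_util(struct toy_rq *rq)
  {
          long cap = (CAPACITY - rq->util_avg) / 2;

          rq->util_avg += cap;
          return cap;
  }

  /* Flat accounting: a migrating task's util leaves src and lands on dst. */
  static void toy_migrate(struct toy_rq *src, struct toy_rq *dst, long util)
  {
          src->util_avg -= util;
          dst->util_avg += util;
  }

  int main(void)
  {
          struct toy_rq cpu0 = { 0 }, cpu1 = { 0 };
          int i;

          /* Reproduces the series below: 512, 768, 896, 960, ... */
          for (i = 0; i < 4; i++) {
                  long task_util = toy_post_init_util(&cpu0);

                  printf("task %d util: %ld, rq util: %ld\n",
                         i, task_util, cpu0.util_avg);
          }

          /* Move a task worth 512 of util from cpu0 to cpu1. */
          toy_migrate(&cpu0, &cpu1, 512);
          printf("after migration: cpu0=%ld cpu1=%ld\n",
                 cpu0.util_avg, cpu1.util_avg);
          return 0;
  }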

Signed-off-by: Yuyang Du <yuyang.du@...el.com>
---
 kernel/sched/core.c  |    1 +
 kernel/sched/debug.c |   13 +++--
 kernel/sched/fair.c  |  140 +++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    5 +-
 4 files changed, 101 insertions(+), 58 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 91fe97f9..d215d04 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7475,6 +7475,7 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ_FULL
 		rq->last_sched_tick = 0;
 #endif
+		atomic_long_set(&rq->removed_util_avg, 0);
 #endif /* CONFIG_SMP */
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a999..14dc121 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -395,7 +395,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->load.weight);
 #ifdef CONFIG_SMP
 	P(se->avg.load_avg);
-	P(se->avg.util_avg);
 #endif
 #undef PN
 #undef P
@@ -462,6 +461,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 		print_task(m, rq, p);
 	}
 	rcu_read_unlock();
+
+#ifdef CONFIG_SMP
+	SEQ_printf(m, "\nutilization: \n");
+	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
+			rq->avg.util_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
+			atomic_long_read(&rq->removed_util_avg));
+#endif
 }
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -510,12 +517,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
-	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
-			cfs_rq->avg.util_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
 			atomic_long_read(&cfs_rq->removed_load_avg));
-	SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
-			atomic_long_read(&cfs_rq->removed_util_avg));
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
 			cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d4640c..b514129 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -703,47 +703,30 @@ static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *
 
 /*
  * With new tasks being created, their initial util_avgs are extrapolated
- * based on the cfs_rq's current util_avg:
+ * based on the rq's current util_avg. To make the util_avg converge, we
+ * cap the util_avg of successive tasks to only 1/2 of the left utilization
+ * budget:
  *
- *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
- *
- * However, in many cases, the above util_avg does not give a desired
- * value. Moreover, the sum of the util_avgs may be divergent, such
- * as when the series is a harmonic series.
- *
- * To solve this problem, we also cap the util_avg of successive tasks to
- * only 1/2 of the left utilization budget:
- *
- *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *   util_avg = (1024 - rq->avg.util_avg) / 2^n
  *
  * where n denotes the nth task.
  *
  * For example, a simplest series from the beginning would be like:
  *
- *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
- * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
- *
- * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
- * if util_avg > util_avg_cap.
+ *  task util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ *    rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
  */
 void post_init_entity_sched_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct rq *rq = rq_of(cfs_rq);
 	struct sched_avg *sa = &se->avg;
-	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+	long cap = (long)(SCHED_CAPACITY_SCALE - rq->avg.util_avg) / 2;
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	int tg_update;
 
 	if (cap > 0) {
-		if (cfs_rq->avg.util_avg != 0) {
-			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
-			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
-			if (sa->util_avg > cap)
-				sa->util_avg = cap;
-		} else {
-			sa->util_avg = cap;
-		}
+		sa->util_avg = cap;
 		sa->util_sum = sa->util_avg * SCHED_AVG_MAX;
 	}
 
@@ -2676,8 +2659,45 @@ __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
-static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
-	struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+static __always_inline void
+__update_rq_util_avg(struct rq *rq, unsigned long scale_freq, unsigned long scale_cpu)
+{
+	u32 contrib;
+	struct sched_avg *sa = &rq->avg;
+	u64 delta, periods, now = rq_clock_task(rq);
+
+	/*
+	 * Compute the new delta (~us, i.e. ns >> 10) and full ~1ms periods.
+	 */
+	delta = (now - sa->last_update_time) >> 10;
+	if (!delta)
+		return;
+	sa->last_update_time = now;
+
+	delta += sa->period_contrib;
+	periods = delta >> 10;
+
+	/* Step 1: decay */
+	if (periods)
+		sa->util_sum = __decay_sum(sa->util_sum, periods);
+
+	/* Step 2: accumulate */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (rq->cfs.curr != NULL) /* the rq has a running cfs task */
+		sa->util_sum += contrib * scale_cpu;
+
+	/* Step 3: update avg */
+	if (periods)
+		sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+}
+
+static __always_inline u32
+accumulate_sum(u64 delta, struct sched_avg *sa, struct cfs_rq *cfs_rq, int cpu,
+	       unsigned long weight, int running, int update_util)
 {
 	u32 contrib;
 	u64 periods;
@@ -2696,11 +2716,11 @@ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
 	 */
 	if (periods) {
 		sa->load_sum = __decay_sum(sa->load_sum, periods);
-		if (cfs_rq) {
+		if (cfs_rq)
 			cfs_rq->runnable_load_sum =
 				__decay_sum(cfs_rq->runnable_load_sum, periods);
-		}
-		sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+		else if (update_util)
+			sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
 	}
 
 	/*
@@ -2720,9 +2740,12 @@ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
 		if (cfs_rq)
 			cfs_rq->runnable_load_sum += weight * contrib;
 	}
-	if (running)
+	if (running && update_util)
 		sa->util_sum += contrib * scale_cpu;
 
+	if (cfs_rq)
+		__update_rq_util_avg(rq_of(cfs_rq), scale_freq, scale_cpu);
+
 	return periods;
 }
 
@@ -2759,6 +2782,7 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
 	u64 delta;
+	int update_util = 0;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2780,24 +2804,30 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
 	sa->last_update_time = now;
 
 	/*
+	 * We update util_sum together with load_sum iff it is a task
+	 */
+	if (!cfs_rq && entity_is_task(container_of(sa, struct sched_entity, avg)))
+		update_util = 1;
+
+	/*
 	 * Now we know we crossed measurement unit boundaries. The *_avg
 	 * accrues by two steps:
 	 *
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+	if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running, update_util))
 		return 0;
 
 	/*
 	 * Step 2: update *_avg.
 	 */
 	sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
-	if (cfs_rq) {
+	if (cfs_rq)
 		cfs_rq->runnable_load_avg =
 			div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
-	}
-	sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+	else if (update_util)
+		sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
 
 	return 1;
 }
@@ -2897,8 +2927,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 		 *
 		 * See cpu_util().
 		 */
-		cpufreq_update_util(rq_clock(rq),
-				    min(cfs_rq->avg.util_avg, max), max);
+		cpufreq_update_util(rq_clock(rq), min(rq->avg.util_avg, max), max);
 	}
 }
 
@@ -2941,18 +2970,21 @@ update_cfs_rq_sched_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 {
 	struct sched_avg *sa = &cfs_rq->avg;
 	int decayed, removed_load = 0, removed_util = 0;
+	struct rq *rq = rq_of(cfs_rq);
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+
 		sub_positive(&sa->load_avg, r);
 		sub_positive(&sa->load_sum, r * SCHED_AVG_MAX);
 		removed_load = 1;
 	}
 
-	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
-		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sub_positive(&sa->util_avg, r);
-		sub_positive(&sa->util_sum, r * SCHED_AVG_MAX);
+	if (atomic_long_read(&rq->removed_util_avg)) {
+		long r = atomic_long_xchg(&rq->removed_util_avg, 0);
+
+		sub_positive(&rq->avg.util_avg, r);
+		sub_positive(&rq->avg.util_sum, r * SCHED_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2999,6 +3031,8 @@ static inline void update_sched_avg(struct sched_entity *se, int update_tg)
  */
 static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	struct rq *rq = rq_of(cfs_rq);
+
 	if (!sched_feat(ATTACH_AGE_LOAD))
 		goto skip_aging;
 
@@ -3022,8 +3056,8 @@ skip_aging:
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
 	cfs_rq->avg.load_sum += se->avg.load_sum;
-	cfs_rq->avg.util_avg += se->avg.util_avg;
-	cfs_rq->avg.util_sum += se->avg.util_sum;
+	rq->avg.util_avg += se->avg.util_avg;
+	rq->avg.util_sum += se->avg.util_sum;
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3038,14 +3072,16 @@ skip_aging:
  */
 static void detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	struct rq *rq = rq_of(cfs_rq);
+
 	__update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
 			   &se->avg, se->on_rq * se->load.weight,
 			   cfs_rq->curr == se, NULL);
 
 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
-	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
-	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	sub_positive(&rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3117,6 +3153,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 static void remove_entity_sched_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct rq *rq = rq_of(cfs_rq);
 	u64 last_update_time;
 
 	/*
@@ -3134,7 +3171,7 @@ static void remove_entity_sched_avg(struct sched_entity *se)
 	__update_sched_avg(last_update_time, cpu_of(rq_of(cfs_rq)),
 			   &se->avg, 0, 0, NULL);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+	atomic_long_add(se->avg.util_avg, &rq->removed_util_avg);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -5332,7 +5369,7 @@ done:
  * compare the utilization with the capacity of the CPU that is available for
  * CFS task (ie cpu_capacity).
  *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * rq->avg.util_avg is the sum of running time of runnable tasks plus the
  * recent utilization of currently non-runnable tasks on a CPU. It represents
  * the amount of utilization of a CPU in the range [0..capacity_orig] where
  * capacity_orig is the cpu_capacity available at the highest frequency
@@ -5341,9 +5378,9 @@ done:
  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
  * the running time on this CPU scaled by capacity_curr.
  *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * Nevertheless, rq->avg.util_avg can be higher than capacity_curr or even
  * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * rq->avg.util_avg or just after migrating tasks and new task wakeups until
  * the average stabilizes with the new running time. We need to check that the
  * utilization stays within the range of [0..capacity_orig] and cap it if
  * necessary. Without utilization capping, a group could be seen as overloaded
@@ -5354,7 +5391,7 @@ done:
  */
 static int cpu_util(int cpu)
 {
-	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+	unsigned long util = cpu_rq(cpu)->avg.util_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
 
 	return (util >= capacity) ? capacity : util;
@@ -8533,7 +8570,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #endif
 #ifdef CONFIG_SMP
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
-	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4723d4a..f6785c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -398,7 +398,7 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	unsigned long tg_load_avg_contrib;
 #endif
-	atomic_long_t removed_load_avg, removed_util_avg;
+	atomic_long_t removed_load_avg;
 #ifndef CONFIG_64BIT
 	u64 load_last_update_time_copy;
 #endif
@@ -662,6 +662,9 @@ struct rq {
 
 	/* This is used to determine avg_idle's max value */
 	u64 max_idle_balance_cost;
+
+	struct sched_avg avg;
+	atomic_long_t removed_util_avg;
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-- 
1.7.9.5
