Message-ID: <1465891111.1694.13.camel@gmail.com>
Date:	Tue, 14 Jun 2016 09:58:31 +0200
From:	Mike Galbraith <umgwanakikbuti@...il.com>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	Yuyang Du <yuyang.du@...el.com>,
	LKML <linux-kernel@...r.kernel.org>
Subject: [rfc patch] sched/fair: Use instantaneous load for fork/exec
 balancing

SUSE's regression testing noticed that...

0905f04eb21f ("sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task()")

...introduced a hackbench regression, and indeed it did.  I think this
regression has more to do with randomness than anything else, but in
general...

While averaging calms down load balancing, helping to keep migrations
down to a dull roar, it's not completely wonderful when it comes to
things that live in the here and now, hackbench being one such.
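
To put a number on "here and now": the instantaneous weight on a
runqueue (cfs_rq->load.weight) reflects a freshly woken task at full
weight immediately, while the decaying average ramps up over tens of
milliseconds.  A userspace toy that only mimics PELT's half-life and
load scale (an illustration, not kernel code):

#include <stdio.h>
#include <math.h>

/*
 * Toy model of the PELT average (userspace, not kernel code).  History
 * decays geometrically so that a period's contribution halves every 32
 * periods (y^32 = 0.5); full-load scale is 1024, as in the kernel.
 */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);
	double avg = 0.0;
	int t;

	for (t = 1; t <= 64; t++) {
		/* the task runs flat out for the whole ~1ms period */
		avg = avg * y + 1024.0 * (1.0 - y);
		if (t == 1 || t % 16 == 0)
			printf("period %2d: instant load = 1024, avg = %4.0f\n",
			       t, avg);
	}
	return 0;
}

The average crosses 512 - half the task's real weight - only after 32
periods; a short-lived hackbench task may well be gone by then.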

time sh -c 'for i in `seq 1000`; do hackbench -p -P > /dev/null; done'

real    0m55.397s
user    0m8.320s
sys     5m40.789s

echo LB_INSTANTANEOUS_LOAD > /sys/kernel/debug/sched_features

real    0m48.049s
user    0m6.510s
sys     5m6.291s
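
(The first set of numbers is the stock, average-based run; the second
is with the feature enabled - roughly 13% less wall clock and 10% less
system time here.  Like all sched_features bits it's a runtime toggle,
so echoing NO_LB_INSTANTANEOUS_LOAD into the same file switches it back
off.)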

Signed-off-by: Mike Galbraith <umgwanakikbuti@...il.com>
---
 kernel/sched/fair.c     |   54 ++++++++++++++++++++++++------------------------
 kernel/sched/features.h |    1 +
 kernel/sched/sched.h    |    6 ++++++
 3 files changed, 35 insertions(+), 26 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -738,7 +738,7 @@ void post_init_entity_util_avg(struct sc
 	}
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1229,9 +1229,9 @@ bool should_numa_migrate_memory(struct t
 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
-static unsigned long weighted_cpuload(const int cpu);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long weighted_cpuload(const int cpu, int avg);
+static unsigned long source_load(int cpu, int type, int avg);
+static unsigned long target_load(int cpu, int type, int avg);
 static unsigned long capacity_of(int cpu);
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
@@ -1261,7 +1261,7 @@ static void update_numa_stats(struct num
 		struct rq *rq = cpu_rq(cpu);
 
 		ns->nr_running += rq->nr_running;
-		ns->load += weighted_cpuload(cpu);
+		ns->load += weighted_cpuload(cpu, LOAD_AVERAGE);
 		ns->compute_capacity += capacity_of(cpu);
 
 		cpus++;
@@ -3102,8 +3102,10 @@ void remove_entity_load_avg(struct sched
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
 
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq, int avg)
 {
+	if (sched_feat(LB_INSTANTANEOUS_LOAD) && avg == LOAD_INSTANT)
+		return cfs_rq->load.weight;
 	return cfs_rq->runnable_load_avg;
 }
 
@@ -4701,9 +4703,9 @@ static void cpu_load_update(struct rq *t
 }
 
 /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(const int cpu, int avg)
 {
-	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs, avg);
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -4748,7 +4750,7 @@ static void cpu_load_update_idle(struct
 	/*
 	 * bail if there's load or we're actually up-to-date.
 	 */
-	if (weighted_cpuload(cpu_of(this_rq)))
+	if (weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE))
 		return;
 
 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -4769,7 +4771,7 @@ void cpu_load_update_nohz_start(void)
 	 * concurrently we'll exit nohz. And cpu_load write can race with
 	 * cpu_load_update_idle() but both updater would be writing the same.
 	 */
-	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 }
 
 /*
@@ -4784,7 +4786,7 @@ void cpu_load_update_nohz_stop(void)
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
-	load = weighted_cpuload(cpu_of(this_rq));
+	load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 	raw_spin_lock(&this_rq->lock);
 	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -4810,7 +4812,7 @@ static void cpu_load_update_periodic(str
  */
 void cpu_load_update_active(struct rq *this_rq)
 {
-	unsigned long load = weighted_cpuload(cpu_of(this_rq));
+	unsigned long load = weighted_cpuload(cpu_of(this_rq), LOAD_AVERAGE);
 
 	if (tick_nohz_tick_stopped())
 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -4825,10 +4827,10 @@ void cpu_load_update_active(struct rq *t
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int type, int avg)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(cpu, avg);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -4840,10 +4842,10 @@ static unsigned long source_load(int cpu
  * Return a high guess at the load of a migration-target cpu weighted
  * according to the scheduling class and "nice" value.
  */
-static unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int type, int avg)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
+	unsigned long total = weighted_cpuload(cpu, avg);
 
 	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
@@ -4865,7 +4867,7 @@ static unsigned long cpu_avg_load_per_ta
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-	unsigned long load_avg = weighted_cpuload(cpu);
+	unsigned long load_avg = weighted_cpuload(cpu, LOAD_AVERAGE);
 
 	if (nr_running)
 		return load_avg / nr_running;
@@ -5047,8 +5049,8 @@ static int wake_affine(struct sched_doma
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	load	  = source_load(prev_cpu, idx, LOAD_AVERAGE);
+	this_load = target_load(this_cpu, idx, LOAD_AVERAGE);
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -5136,9 +5138,9 @@ find_idlest_group(struct sched_domain *s
 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = source_load(i, load_idx);
+				load = source_load(i, load_idx, LOAD_INSTANT);
 			else
-				load = target_load(i, load_idx);
+				load = target_load(i, load_idx, LOAD_INSTANT);
 
 			avg_load += load;
 		}
@@ -5197,7 +5199,7 @@ find_idlest_cpu(struct sched_group *grou
 				shallowest_idle_cpu = i;
 			}
 		} else if (shallowest_idle_cpu == -1) {
-			load = weighted_cpuload(i);
+			load = weighted_cpuload(i, LOAD_INSTANT);
 			if (load < min_load || (load == min_load && i == this_cpu)) {
 				min_load = load;
 				least_loaded_cpu = i;
@@ -6982,9 +6984,9 @@ static inline void update_sg_lb_stats(st
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group)
-			load = target_load(i, load_idx);
+			load = target_load(i, load_idx, LOAD_AVERAGE);
 		else
-			load = source_load(i, load_idx);
+			load = source_load(i, load_idx, LOAD_AVERAGE);
 
 		sgs->group_load += load;
 		sgs->group_util += cpu_util(i);
@@ -6998,7 +7000,7 @@ static inline void update_sg_lb_stats(st
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
-		sgs->sum_weighted_load += weighted_cpuload(i);
+		sgs->sum_weighted_load += weighted_cpuload(i, LOAD_AVERAGE);
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -7510,7 +7512,7 @@ static struct rq *find_busiest_queue(str
 
 		capacity = capacity_of(i);
 
-		wl = weighted_cpuload(i);
+		wl = weighted_cpuload(i, LOAD_AVERAGE);
 
 		/*
 		 * When comparing with imbalance, use weighted_cpuload()
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,6 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(LB_INSTANTANEOUS_LOAD, false)
 
 /*
  * Decrement CPU capacity based on time not spent running tasks
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1630,6 +1630,12 @@ static inline void double_rq_unlock(stru
 		__release(rq2->lock);
 }
 
+/*
+ * Tell load balancing functions whether we want instant or average load
+ */
+#define LOAD_INSTANT	0
+#define LOAD_AVERAGE	1
+
 #else /* CONFIG_SMP */
 
 /*
