linux-kernel - [RFC PATCH] sched: Improve scalability of select_idle

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20171208200754.4738-1-subhra.mazumdar@oracle.com>
Date:   Fri,  8 Dec 2017 12:07:54 -0800
From:   subhra mazumdar <subhra.mazumdar@...cle.com>
To:     linux-kernel@...r.kernel.org
Cc:     peterz@...radead.org, mingo@...hat.com
Subject: [RFC PATCH] sched: Improve scalability of select_idle_sibling using SMT balance

Current select_idle_sibling first tries to find a fully idle core using
select_idle_cores which can potentially search all cores and if it fails it
finds any idle cpu using select_idle_cpu. select_idle_cpu can potentially
search all cpus in the llc domain. These don't scale for large llc domains
and will only get worse with more cores in future.

This patch uses SMT balancing instead which uses a randomized approach. It
maintains SMT utilization per scheduling group which is the no. of busy
cpus in the group. This is accounted at the time of context switch. In the
fast path of wakeup it compares the target cpu group in select_idle_sibling
with a random group in the LLC domain and chooses the group which has more
SMT capacity (idle cpus) left using the SMT utilization. The balancing
happens top down till the lowest level (SMT core) is reached. Finally it
does a idle cpu search only in that core starting from a random cpu index.
The random number generation needs to be fast and uses a per cpu pseudo
random number generator.

Following are the numbers with various benchmarks on a x86 2 socket system
with 22 cores per socket and 2 hyperthreads per core:

hackbench process:
groups  baseline-rc6(avg)  %stdev  patch(avg)      %stdev
1       0.4797             15.75   0.4324 (+9.86%) 2.23
2       0.4877             9.99    0.4535 (+7.01%) 3.36
4       0.8603             1.09    0.8376 (+2.64%) 0.95
8       1.496              0.60    1.4516 (+2.97%) 1.38
16      2.6642             0.37    2.5857 (+2.95%) 0.68
32      4.6715             0.40    4.5158 (+3.33%) 0.67

uperf pingpong throughput with loopback interface and message size = 8k:
threads baseline-rc6(avg)  %stdev  patch(avg)       %stdev
8       49.47              0.35    51.16 (+3.42%)   0.53
16      95.28              0.77    101.02 (+6.03%)  0.43
32      156.77             1.17    181.52 (+15.79%) 0.96
48      193.24             0.22    212.90 (+10.17%) 0.45
64      216.21             9.33    264.14 (+22.17%) 0.69
128     379.62             10.29   416.36 (+9.68%)  1.04

Oracle DB TPC-C throughput normalized to baseline:
users   baseline-rc6 norm(avg)  %stdev  patch norm(avg)   %stdev
20      1                       0.94    1.0071 (+0.71%)   1.03
40      1                       0.82    1.0126 (+1.26%)   0.65
60      1                       1.10    0.9928 (-0.72%)   0.67
80      1                       0.63    1.003 (+0.30%)    0.64
100     1                       0.82    0.9957 (-0.43%)   0.15
120     1                       0.46    1.0034 (+0.34%)   1.74
140     1                       1.44    1.0247 (+2.47%)   0.15
160     1                       0.85    1.0445 (+4.45%)   0.81
180     1                       0.19    1.0382 (+3.82%)   0.57
200     1                       1.40    1.0295 (+2.95%)   0.94
220     1                       1.02    1.0242 (+2.42%)   0.85

Following are the cost (in us) of context_switch() and
select_idle_sibling() with hackbench 16 groups:

function                 baseline-rc6  %stdev  patch               %stdev

context_switch()         663.8799      4.46    687.4068 (+3.54%)   2.85
select_idle_sibling()    0.556         1.72    0.263 (-52.70%)     0.78

Signed-off-by: subhra mazumdar <subhra.mazumdar@...cle.com>
---
 include/linux/sched/topology.h |   4 +
 kernel/sched/core.c            |  60 ++++++++++
 kernel/sched/fair.c            | 249 +++++++++++++++++++++--------------------
 kernel/sched/idle_task.c       |   1 -
 kernel/sched/sched.h           |  37 +++---
 kernel/sched/topology.c        |  52 ++++++++-
 6 files changed, 257 insertions(+), 146 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065ab..da1f11f 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -146,6 +146,10 @@ struct sched_domain {
 	struct sched_domain_shared *shared;
 
 	unsigned int span_weight;
+	struct sched_group **sg_array;
+	struct sched_group *sg_cpu;
+	int sg_nos;
+	int *cp_array;
 	/*
 	 * Span of all CPUs in this domain.
 	 *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da..55c4931 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2743,6 +2743,21 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
+static inline void
+sd_context_switch(struct sched_domain *sd, struct rq *rq, int util)
+{
+	struct sched_group *sg_cpu;
+
+       /* atomically add/subtract the util */
+	sg_cpu = sd->sg_cpu;
+	if (util > 0)
+		atomic_inc(
+		    (atomic_t *)(&(sg_cpu->utilization)));
+	else
+		atomic_dec(
+		    (atomic_t *)(&(sg_cpu->utilization)));
+}
+
 /*
  * context_switch - switch to the new MM and the new thread's register state.
  */
@@ -2751,6 +2766,51 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next, struct rq_flags *rf)
 {
 	struct mm_struct *mm, *oldmm;
+	int this_cpu = rq->cpu;
+	struct sched_domain *sd;
+	unsigned int cond;
+
+	cond = ((prev != rq->idle) << 1) | (next != rq->idle);
+	sd = rcu_dereference(per_cpu(sd_llc, this_cpu));
+
+	/*
+	 * From sd_llc downward update the SMT utilization.
+	 * Skip the lowest level 0.
+	 */
+	for_each_lower_domain(sd) {
+		if (sd->level == 0)
+			break;
+		if (rq->initial_util == UTIL_UNINITIALIZED) {
+			switch (cond) {
+			case PREV_IDLE_NEXT_NIDLE:
+			case PREV_NIDLE_NEXT_NIDLE:
+				sd_context_switch(sd, rq, SMT_THREAD_UTIL);
+				break;
+			case PREV_NIDLE_NEXT_IDLE:
+			case PREV_IDLE_NEXT_IDLE:
+				break;
+			}
+		} else {
+			switch (cond) {
+			case PREV_IDLE_NEXT_NIDLE:
+				sd_context_switch(sd, rq, SMT_THREAD_UTIL);
+				break;
+			case PREV_NIDLE_NEXT_IDLE:
+				sd_context_switch(sd, rq, -SMT_THREAD_UTIL);
+				break;
+			case PREV_IDLE_NEXT_IDLE:
+			case PREV_NIDLE_NEXT_NIDLE:
+				break;
+			}
+		}
+	}
+
+	if (sd) {
+		if (next == rq->idle)
+			rq->initial_util = UTIL_IDLE;
+		else
+			rq->initial_util = UTIL_BUSY;
+	}
 
 	prepare_task_switch(rq, prev, next);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d3f3094..8d97be4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5631,128 +5631,139 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
 #ifdef CONFIG_SCHED_SMT
 
-static inline void set_idle_cores(int cpu, int val)
+/*
+ * Find an idle cpu in the core starting search from random index.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd)
 {
-	struct sched_domain_shared *sds;
+	int i, rand_index, rand_cpu;
+	int this_cpu = smp_processor_id();
 
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds)
-		WRITE_ONCE(sds->has_idle_cores, val);
-}
+	rand_index = CPU_PSEUDO_RANDOM(this_cpu) % sd->span_weight;
+	rand_cpu = sd->cp_array[rand_index];
 
-static inline bool test_idle_cores(int cpu, bool def)
-{
-	struct sched_domain_shared *sds;
-
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds)
-		return READ_ONCE(sds->has_idle_cores);
+	for_each_cpu_wrap(i, sched_domain_span(sd), rand_cpu) {
+		if (!cpumask_test_cpu(i, &p->cpus_allowed))
+			continue;
+		if (idle_cpu(i))
+			return i;
+	}
 
-	return def;
+	return -1;
 }
 
 /*
- * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
- *
- * Since SMT siblings share all cache levels, inspecting this limited remote
- * state should be fairly cheap.
+ * Compare the SMT utilization of the two groups and select the one which
+ * has more capacity left.
  */
-void __update_idle_core(struct rq *rq)
+static int smt_should_migrate(struct sched_group *here,
+			      struct sched_group *there, int self)
 {
-	int core = cpu_of(rq);
-	int cpu;
+	int here_util, there_util;
+	int this_cpu = smp_processor_id();
 
-	rcu_read_lock();
-	if (test_idle_cores(core, true))
-		goto unlock;
+	here_util = here->utilization;
+	there_util = there->utilization;
 
-	for_each_cpu(cpu, cpu_smt_mask(core)) {
-		if (cpu == core)
-			continue;
+	/* Discount self utilization if it belongs to here or there */
+	if (self > 0) {
+		if (cpumask_test_cpu(this_cpu, sched_group_span(here)))
+			here_util = here_util - self;
+		else if (cpumask_test_cpu(this_cpu, sched_group_span(there)))
+			there_util = there_util - self;
+	}
 
-		if (!idle_cpu(cpu))
-			goto unlock;
+	/* Return true if other group has more capacity left */
+	if (there->group_weight - there_util >
+	    here->group_weight - here_util) {
+		return 1;
 	}
 
-	set_idle_cores(core, 1);
-unlock:
-	rcu_read_unlock();
+	return 0;
 }
 
 /*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
- * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * SMT balancing works by comparing the target cpu group with a random group
+ * in llc domain. It calls smt_should_migrate to decide which group has more
+ * capacity left. The balancing starts top down fom sd_llc till SMT core
+ * level. Finally idle cpu search is only done in the core.
  */
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int smt_balance(struct task_struct *p, int cpu)
 {
-	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int core, cpu;
+	struct sched_domain *sd = NULL, *tmp;
+	struct sched_group *sg, *sg_cpu, *start_sg, *tgt = NULL;
+	int rand_idx, weight;
+	int cpu_orig = cpu, rand_cpu;
+	int this_cpu = smp_processor_id();
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq = cpu_rq(cpu);
+	int self = 0;
+	int balanced = 0;
 
-	if (!static_branch_likely(&sched_smt_present))
-		return -1;
+	if (p == this_rq->curr) {
+		if (rq->initial_util == UTIL_IDLE)
+			self = 0;
+		else if (rq->initial_util == UTIL_BUSY)
+			self = 1;
+	}
 
-	if (!test_idle_cores(target, false))
-		return -1;
+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
 
-	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+	for_each_lower_domain(sd) {
+		if (sd->level == 0)
+			break;
 
-	for_each_cpu_wrap(core, cpus, target) {
-		bool idle = true;
+		/* Pick a random group that has cpus where the thread can run */
+		sg_cpu = sd->sg_cpu;
+		rand_idx = CPU_PSEUDO_RANDOM(this_cpu) %
+		    sd->sg_nos;
+		sg = start_sg = sd->sg_array[rand_idx];
+		do {
+			if (sg != sg_cpu && cpumask_intersects(
+			    sched_group_span(sg),
+			    &p->cpus_allowed)) {
+				tgt = sg;
+				break;
+			}
+		} while (sg = sg->next, sg != start_sg);
 
-		for_each_cpu(cpu, cpu_smt_mask(core)) {
-			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
-				idle = false;
+		/*
+		 * Compare the target group with random group and select the
+		 * one which has more SMT capacity left.
+		 */
+		if (tgt && smt_should_migrate(sg_cpu, tgt, self)) {
+			rand_idx = CPU_PSEUDO_RANDOM(this_cpu) %
+			    tgt->group_weight;
+			rand_cpu = tgt->cp_array[rand_idx];
+			for_each_cpu_wrap(cpu, sched_group_span(tgt),
+			    rand_cpu) {
+				if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+					break;
+			}
+			weight = tgt->group_weight;
+			for_each_domain(cpu, tmp) {
+				if (weight < tmp->span_weight)
+					break;
+			}
+			sd = tmp;
+			balanced = 1;
 		}
-
-		if (idle)
-			return core;
+		tgt = NULL;
 	}
 
-	/*
-	 * Failed to find an idle core; stop looking for one.
-	 */
-	set_idle_cores(target, 0);
-
-	return -1;
-}
-
-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	int cpu;
-
-	if (!static_branch_likely(&sched_smt_present))
-		return -1;
-
-	for_each_cpu(cpu, cpu_smt_mask(target)) {
-		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
-			continue;
-		if (idle_cpu(cpu))
-			return cpu;
+	/* sd is now lowest level SMT core */
+	if (sd && (balanced || !idle_cpu(cpu_orig))) {
+		cpu = select_idle_smt(p, sd);
+		if (cpu < 0)
+			cpu = cpu_orig;
+		return cpu;
 	}
 
-	return -1;
+	return cpu_orig;
 }
 
 #else /* CONFIG_SCHED_SMT */
 
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	return -1;
-}
-
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	return -1;
-}
-
-#endif /* CONFIG_SCHED_SMT */
-
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -5807,41 +5818,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	return cpu;
 }
 
-/*
- * Try and locate an idle core/thread in the LLC cache domain.
- */
-static int select_idle_sibling(struct task_struct *p, int prev, int target)
-{
-	struct sched_domain *sd;
-	int i;
-
-	if (idle_cpu(target))
-		return target;
-
-	/*
-	 * If the previous cpu is cache affine and idle, don't be stupid.
-	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
-		return prev;
-
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	if (!sd)
-		return target;
-
-	i = select_idle_core(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	i = select_idle_cpu(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	i = select_idle_smt(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	return target;
-}
+#endif /* CONFIG_SCHED_SMT */
 
 /*
  * cpu_util returns the amount of capacity of a CPU that is used by CFS
@@ -5924,6 +5901,30 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
+{
+	if (idle_cpu(target))
+		return target;
+
+	/*
+	 * If the previous cpu is cache affine and idle, don't be stupid.
+	 */
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+		return prev;
+
+#ifdef CONFIG_SCHED_SMT
+	return (smt_balance(p, target));
+#else
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, target));
+
+	if (!sd)
+		return target;
+
+	return select_idle_cpu(p, sd, target);
+#endif
+
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172..a3d5a7c 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,7 +27,6 @@ static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	put_prev_task(rq, prev);
-	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76c..0e681d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -47,6 +47,22 @@
 struct rq;
 struct cpuidle_state;
 
+#define	CPU_PSEUDO_RANDOM(cpu)	(((cpu_rq((cpu)))->rotor)++)
+
+/* states of the rq */
+#define UTIL_UNINITIALIZED	0
+#define UTIL_IDLE		1
+#define UTIL_BUSY		2
+
+/* context switch conditions */
+#define	PREV_IDLE_NEXT_IDLE	0
+#define	PREV_IDLE_NEXT_NIDLE	1
+#define	PREV_NIDLE_NEXT_IDLE	2
+#define	PREV_NIDLE_NEXT_NIDLE	3
+
+/* smt utilization of a thread */
+#define	SMT_THREAD_UTIL	1
+
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
 #define TASK_ON_RQ_MIGRATING	2
@@ -737,6 +753,8 @@ struct rq {
 	/* cpu of this runqueue: */
 	int cpu;
 	int online;
+	unsigned int rotor;
+	int initial_util;
 
 	struct list_head cfs_tasks;
 
@@ -808,23 +826,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-
-#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
-	if (static_branch_unlikely(&sched_smt_present))
-		__update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
-#endif
-
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
@@ -1080,6 +1081,8 @@ struct sched_group {
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
 	int asym_prefer_cpu;		/* cpu of highest priority in group */
+	int utilization;
+	int *cp_array;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f3..c1f6fbe 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -333,8 +333,10 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
 			kfree(sg->sgc);
 
-		if (atomic_dec_and_test(&sg->ref))
+		if (atomic_dec_and_test(&sg->ref)) {
+			kfree(sg->cp_array);
 			kfree(sg);
+		}
 		sg = tmp;
 	} while (sg != first);
 }
@@ -350,6 +352,10 @@ static void destroy_sched_domain(struct sched_domain *sd)
 
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
 		kfree(sd->shared);
+
+	kfree(sd->sg_array);
+	kfree(sd->cp_array);
+
 	kfree(sd);
 }
 
@@ -910,17 +916,30 @@ build_sched_groups(struct sched_domain *sd, int cpu)
  * group having more cpu_capacity will pickup more load compared to the
  * group having less cpu_capacity.
  */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu_orig, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
+	int sg_nos = 0, i, cp;
 
 	WARN_ON(!sg);
 
 	do {
 		int cpu, max_cpu = -1;
 
+		sg_nos++;
 		sg->group_weight = cpumask_weight(sched_group_span(sg));
 
+		/* Build the array of cpus for each group */
+		if (!sg->cp_array) {
+			sg->cp_array = kzalloc_node(sg->group_weight *
+			    sizeof(int), GFP_KERNEL, cpu_to_node(cpu_orig));
+			i = 0;
+			for_each_cpu(cp, sched_group_span(sg)) {
+				sg->cp_array[i] = cp;
+				i++;
+			}
+		}
+
 		if (!(sd->flags & SD_ASYM_PACKING))
 			goto next;
 
@@ -936,10 +955,35 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_balance_cpu(sg))
+	/* Build the array of groups for each domain */
+	sd->sg_array = kzalloc_node(sg_nos *
+	    sizeof(struct sched_group *),
+	    GFP_KERNEL, cpu_to_node(cpu_orig));
+	sd->sg_nos = sg_nos;
+	i = 0;
+	do {
+		if (cpumask_test_cpu(cpu_orig, sched_group_span(sg)))
+			sd->sg_cpu = sg;
+		sd->sg_array[i] = sg;
+		i++;
+		sg = sg->next;
+	} while (sg != sd->groups);
+
+	/* Build the array of cpus for the lowest level (SMT core) */
+	if (sd->level == 0) {
+		sd->cp_array = kzalloc_node(sd->span_weight * sizeof(int),
+		    GFP_KERNEL, cpu_to_node(cpu_orig));
+		i = 0;
+		for_each_cpu(cp, sched_domain_span(sd)) {
+			sd->cp_array[i] = cp;
+			i++;
+		}
+	}
+
+	if (cpu_orig != group_balance_cpu(sg))
 		return;
 
-	update_group_capacity(sd, cpu);
+	update_group_capacity(sd, cpu_orig);
 }
 
 /*
-- 
2.9.3