linux-kernel - [RFC PATCH V2] sched: Improve scalability of select_idle

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180108221237.31761-1-subhra.mazumdar@oracle.com>
Date:   Mon,  8 Jan 2018 14:12:37 -0800
From:   subhra mazumdar <subhra.mazumdar@...cle.com>
To:     linux-kernel@...r.kernel.org
Cc:     peterz@...radead.org, mingo@...hat.com, steven.sistare@...cle.com,
        dhaval.giani@...cle.com
Subject: [RFC PATCH V2] sched: Improve scalability of select_idle_sibling using SMT balance

Current select_idle_sibling first tries to find a fully idle core using
select_idle_core which can potentially search all cores and if it fails it
finds any idle cpu using select_idle_cpu. select_idle_cpu can potentially
search all cpus in the llc domain. This doesn't scale for large llc domains
and will only get worse with more cores in future.

This patch solves the scalability problem of potentially searching all
cores or cpus by using a randomized approach called SMT balance. It
maintains a utilization of the SMTs per scheduling group based on the
number of busy CPUs in the group (henceforth referred to as SMT
utilization). This is accounted at the time of context switch. The SMT
utilization is maintained only for levels below the LLC domain, so only for
cores. During context switch each cpu of a core atomically increments or
decrements the SMT utilization variable of that core depending on whether
the cpu is going busy or idle. Since the atomic variable is per core, it
will be updated only by the hyperthreads in a core with minimal contention.
The cost of context_switch() function below with this patch confirms that
the overhead is minimal and mostly within noise margin.

In the fast path of wakeup the scheduler compares the target cpu group of
select_idle_sibling with a random group in the LLC domain w.r.t SMT
utilization to determine a better core to schedule. It chooses the core
which has more SMT capacity (idle cpus) left. The SMT capacity is computed
simply by subtracting SMT utilization from group weight. This comparison
can be done in O(1). Finally it does an idle cpu search only in that core
starting from a random cpu index. The random number generation needs to be
fast and uses a per cpu pseudo random number generator.

Following are the numbers with various benchmarks on a x86 2 socket system
with 22 cores per socket and 2 hyperthreads per core:

hackbench process:
groups  baseline-rc6(avg)  %stdev  patch(avg)      %stdev
1       0.4797             15.75   0.4324 (+9.86%) 2.23
2       0.4877             9.99    0.4535 (+7.01%) 3.36
4       0.8603             1.09    0.8376 (+2.64%) 0.95
8       1.496              0.60    1.4516 (+2.97%) 1.38
16      2.6642             0.37    2.5857 (+2.95%) 0.68
32      4.6715             0.40    4.5158 (+3.33%) 0.67

uperf pingpong throughput with loopback interface and message size = 8k:
threads baseline-rc6(avg)  %stdev  patch(avg)       %stdev
8       49.47              0.35    51.16 (+3.42%)   0.53
16      95.28              0.77    101.02 (+6.03%)  0.43
32      156.77             1.17    181.52 (+15.79%) 0.96
48      193.24             0.22    212.90 (+10.17%) 0.45
64      216.21             9.33    264.14 (+22.17%) 0.69
128     379.62             10.29   416.36 (+9.68%)  1.04

Oracle DB TPC-C throughput normalized to baseline:
users   baseline-rc6 norm(avg)  %stdev  patch norm(avg)   %stdev
20      1                       0.94    1.0071 (+0.71%)   1.03
40      1                       0.82    1.0126 (+1.26%)   0.65
60      1                       1.10    0.9928 (-0.72%)   0.67
80      1                       0.63    1.003 (+0.30%)    0.64
100     1                       0.82    0.9957 (-0.43%)   0.15
120     1                       0.46    1.0034 (+0.34%)   1.74
140     1                       1.44    1.0247 (+2.47%)   0.15
160     1                       0.85    1.0445 (+4.45%)   0.81
180     1                       0.19    1.0382 (+3.82%)   0.57
200     1                       1.40    1.0295 (+2.95%)   0.94
220     1                       1.02    1.0242 (+2.42%)   0.85

Following are the cost (in us) of context_switch() and
select_idle_sibling() with hackbench 16 groups:

function                 baseline-rc6  %stdev  patch               %stdev
context_switch()         663.8799      4.46    687.4068 (+3.54%)   2.85
select_idle_sibling()    0.556         1.72    0.263 (-52.70%)     0.78

Signed-off-by: subhra mazumdar <subhra.mazumdar@...cle.com>
---
 include/linux/sched/topology.h |   2 +
 kernel/sched/core.c            |  38 +++++++
 kernel/sched/fair.c            | 245 ++++++++++++++++++++---------------------
 kernel/sched/idle_task.c       |   1 -
 kernel/sched/sched.h           |  26 ++---
 kernel/sched/topology.c        |  35 +++++-
 6 files changed, 197 insertions(+), 150 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065ab..cd1f129 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -146,6 +146,8 @@ struct sched_domain {
 	struct sched_domain_shared *shared;
 
 	unsigned int span_weight;
+	struct sched_group **sg_array;
+	int sg_num;
 	/*
 	 * Span of all CPUs in this domain.
 	 *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da..805451b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2743,6 +2743,18 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
+static __always_inline void
+sd_context_switch(struct sched_domain *sd, struct rq *rq, int util)
+{
+	struct sched_group *sg_cpu = sd->groups;
+
+       /* atomically add/subtract the util */
+	if (util > 0)
+		atomic_inc((atomic_t *)&sg_cpu->utilization);
+	else
+		atomic_dec((atomic_t *)&sg_cpu->utilization);
+}
+
 /*
  * context_switch - switch to the new MM and the new thread's register state.
  */
@@ -2751,6 +2763,31 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	       struct task_struct *next, struct rq_flags *rf)
 {
 	struct mm_struct *mm, *oldmm;
+	int this_cpu = rq->cpu;
+	struct sched_domain *sd;
+	int prev_busy, next_busy;
+
+	if (rq->curr_util == UTIL_UNINITIALIZED)
+		prev_busy = 0;
+	else
+		prev_busy = (prev != rq->idle);
+	next_busy = (next != rq->idle);
+
+	/*
+	 * From sd_llc downward update the SMT utilization.
+	 * Skip the lowest level 0.
+	 */
+	sd = rcu_dereference_sched(per_cpu(sd_llc, this_cpu));
+	if (next_busy != prev_busy) {
+		for_each_lower_domain(sd) {
+			if (sd->level == 0)
+				break;
+			sd_context_switch(sd, rq, next_busy - prev_busy);
+		}
+	}
+
+	if (sd)
+		rq->curr_util = next_busy;
 
 	prepare_task_switch(rq, prev, next);
 
@@ -5917,6 +5954,7 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+		rq->curr_util = UTIL_UNINITIALIZED;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d3f3094..6313ab1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5631,128 +5631,126 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
 #ifdef CONFIG_SCHED_SMT
 
-static inline void set_idle_cores(int cpu, int val)
+/*
+ * Find an idle cpu in the core starting search from random index.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_group *sg)
 {
-	struct sched_domain_shared *sds;
+	int i, rand_index, rand_cpu;
+	int this_cpu = smp_processor_id();
 
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds)
-		WRITE_ONCE(sds->has_idle_cores, val);
-}
+	rand_index = CPU_PSEUDO_RANDOM(this_cpu) % sg->group_weight;
+	rand_cpu = sg->cp_array[rand_index];
 
-static inline bool test_idle_cores(int cpu, bool def)
-{
-	struct sched_domain_shared *sds;
-
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds)
-		return READ_ONCE(sds->has_idle_cores);
+	for_each_cpu_wrap(i, sched_group_span(sg), rand_cpu) {
+		if (!cpumask_test_cpu(i, &p->cpus_allowed))
+			continue;
+		if (idle_cpu(i))
+			return i;
+	}
 
-	return def;
+	return -1;
 }
 
 /*
- * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
- *
- * Since SMT siblings share all cache levels, inspecting this limited remote
- * state should be fairly cheap.
+ * Compare the SMT utilization of the two groups and select the one which
+ * has more capacity left.
  */
-void __update_idle_core(struct rq *rq)
+static int smt_should_migrate(struct sched_group *here,
+			      struct sched_group *there, int self_util)
 {
-	int core = cpu_of(rq);
-	int cpu;
-
-	rcu_read_lock();
-	if (test_idle_cores(core, true))
-		goto unlock;
-
-	for_each_cpu(cpu, cpu_smt_mask(core)) {
-		if (cpu == core)
-			continue;
+	int this_cpu = smp_processor_id();
+	int here_util = here->utilization, there_util = there->utilization;
 
-		if (!idle_cpu(cpu))
-			goto unlock;
+	/* Discount self utilization if it belongs to here or there */
+	if (self_util > 0) {
+		if (cpumask_test_cpu(this_cpu, sched_group_span(here)))
+			here_util -= self_util;
+		else if (cpumask_test_cpu(this_cpu, sched_group_span(there)))
+			there_util -= self_util;
 	}
 
-	set_idle_cores(core, 1);
-unlock:
-	rcu_read_unlock();
+	/* Return true if other group has more capacity left */
+	return (there->group_weight - there_util >
+		here->group_weight - here_util);
 }
 
 /*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
- * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
- */
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int core, cpu;
-
-	if (!static_branch_likely(&sched_smt_present))
-		return -1;
+ * SMT balancing works by comparing the target cpu group with a random group
+ * in llc domain. It calls smt_should_migrate to decide which group has more
+ * capacity left. The balancing starts top down fom sd_llc to SMT core level.
+ * Finally idle cpu search is only done in the core.
+ */
+static int smt_balance(struct task_struct *p, struct sched_domain *sd, int cpu)
+{
+	struct sched_group *sg, *src_sg, *start_sg, *tgt_sg;
+	struct cpumask *span;
+	int rand_idx, weight;
+	int cpu_orig = cpu;
+	int rand_cpu;
+	int this_cpu = smp_processor_id();
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq = cpu_rq(cpu);
+	int self_util = 0;
+	int balanced = 0;
 
-	if (!test_idle_cores(target, false))
-		return -1;
+	if (p == this_rq->curr)
+		self_util = rq->curr_util;
 
-	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+	for_each_lower_domain(sd) {
+		if (sd->level == 0)
+			break;
 
-	for_each_cpu_wrap(core, cpus, target) {
-		bool idle = true;
+		/* Pick a random group that has cpus where the thread can run */
+		src_sg = sd->groups;
+		tgt_sg = NULL;
+		rand_idx = CPU_PSEUDO_RANDOM(this_cpu) % sd->sg_num;
+		start_sg = sd->sg_array[rand_idx];
+		sg = start_sg;
+		do {
+			span = sched_group_span(sg);
+			if (sg != src_sg &&
+			    cpumask_intersects(span, &p->cpus_allowed)) {
+				tgt_sg = sg;
+				break;
+			}
+			sg = sg->next;
+		} while (sg != start_sg);
 
-		for_each_cpu(cpu, cpu_smt_mask(core)) {
-			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
-				idle = false;
+		/*
+		 * Compare the target group with random group and select the
+		 * one which has more SMT capacity left. Choose a random CPU in
+		 * the group to spread the load, then find the group's parent sd
+		 * so the group's sd is used on the next main loop iteration.
+		 */
+		if (tgt_sg && smt_should_migrate(src_sg, tgt_sg, self_util)) {
+			weight = tgt_sg->group_weight;
+			rand_idx = CPU_PSEUDO_RANDOM(this_cpu) % weight;
+			rand_cpu = tgt_sg->cp_array[rand_idx];
+			for_each_cpu_wrap(cpu, span, rand_cpu) {
+				if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+					break;
+			}
+			for_each_domain(cpu, sd) {
+				if (weight < sd->span_weight)
+					break;
+			}
+			balanced = 1;
 		}
-
-		if (idle)
-			return core;
 	}
 
-	/*
-	 * Failed to find an idle core; stop looking for one.
-	 */
-	set_idle_cores(target, 0);
-
-	return -1;
-}
-
-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	int cpu;
-
-	if (!static_branch_likely(&sched_smt_present))
-		return -1;
-
-	for_each_cpu(cpu, cpu_smt_mask(target)) {
-		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
-			continue;
-		if (idle_cpu(cpu))
+	/* sd is now lowest level SMT core */
+	if (sd->parent && (balanced || !idle_cpu(cpu_orig))) {
+		cpu = select_idle_smt(p, sd->parent->groups);
+		if (cpu >= 0)
 			return cpu;
 	}
 
-	return -1;
+	return cpu_orig;
 }
 
 #else /* CONFIG_SCHED_SMT */
 
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	return -1;
-}
-
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	return -1;
-}
-
-#endif /* CONFIG_SCHED_SMT */
-
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -5807,41 +5805,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	return cpu;
 }
 
-/*
- * Try and locate an idle core/thread in the LLC cache domain.
- */
-static int select_idle_sibling(struct task_struct *p, int prev, int target)
-{
-	struct sched_domain *sd;
-	int i;
-
-	if (idle_cpu(target))
-		return target;
-
-	/*
-	 * If the previous cpu is cache affine and idle, don't be stupid.
-	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
-		return prev;
-
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	if (!sd)
-		return target;
-
-	i = select_idle_core(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	i = select_idle_cpu(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	i = select_idle_smt(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	return target;
-}
+#endif /* CONFIG_SCHED_SMT */
 
 /*
  * cpu_util returns the amount of capacity of a CPU that is used by CFS
@@ -5924,6 +5888,31 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
+{
+	struct sched_domain *sd;
+
+	if (idle_cpu(target))
+		return target;
+
+	/*
+	 * If the previous cpu is cache affine and idle, don't be stupid.
+	 */
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+		return prev;
+
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	if (!sd)
+		return target;
+
+#ifdef CONFIG_SCHED_SMT
+	return smt_balance(p, sd, target);
+#else
+
+	return select_idle_cpu(p, sd, target);
+#endif
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172..a3d5a7c 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,7 +27,6 @@ static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	put_prev_task(rq, prev);
-	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76c..2c681cd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -47,6 +47,11 @@
 struct rq;
 struct cpuidle_state;
 
+#define	CPU_PSEUDO_RANDOM(cpu)	(cpu_rq(cpu)->rotor++)
+
+/* uninitialized state of the rq */
+#define UTIL_UNINITIALIZED	-1
+
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
 #define TASK_ON_RQ_MIGRATING	2
@@ -737,6 +742,8 @@ struct rq {
 	/* cpu of this runqueue: */
 	int cpu;
 	int online;
+	unsigned int rotor;
+	int curr_util;
 
 	struct list_head cfs_tasks;
 
@@ -808,23 +815,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-
-#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
-	if (static_branch_unlikely(&sched_smt_present))
-		__update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
-#endif
-
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
@@ -1080,6 +1070,8 @@ struct sched_group {
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
 	int asym_prefer_cpu;		/* cpu of highest priority in group */
+	int utilization;
+	int *cp_array;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f3..09c982f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -333,8 +333,10 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
 			kfree(sg->sgc);
 
-		if (atomic_dec_and_test(&sg->ref))
+		if (atomic_dec_and_test(&sg->ref)) {
+			kfree(sg->cp_array);
 			kfree(sg);
+		}
 		sg = tmp;
 	} while (sg != first);
 }
@@ -350,6 +352,9 @@ static void destroy_sched_domain(struct sched_domain *sd)
 
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
 		kfree(sd->shared);
+
+	kfree(sd->sg_array);
+
 	kfree(sd);
 }
 
@@ -910,17 +915,29 @@ build_sched_groups(struct sched_domain *sd, int cpu)
  * group having more cpu_capacity will pickup more load compared to the
  * group having less cpu_capacity.
  */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu_orig, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
+	int sg_num = 0, i, cp;
 
 	WARN_ON(!sg);
 
 	do {
 		int cpu, max_cpu = -1;
 
+		sg_num++;
 		sg->group_weight = cpumask_weight(sched_group_span(sg));
 
+		/* Build the array of cpus for each group */
+		if (!sg->cp_array) {
+			sg->cp_array = kzalloc_node(sg->group_weight *
+						    sizeof(int), GFP_KERNEL,
+						    cpu_to_node(cpu_orig));
+			i = 0;
+			for_each_cpu(cp, sched_group_span(sg))
+				sg->cp_array[i++] = cp;
+		}
+
 		if (!(sd->flags & SD_ASYM_PACKING))
 			goto next;
 
@@ -936,10 +953,20 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_balance_cpu(sg))
+	/* Build the array of groups for each domain */
+	sd->sg_array = kzalloc_node(sg_num * sizeof(void *), GFP_KERNEL,
+				    cpu_to_node(cpu_orig));
+	sd->sg_num = sg_num;
+	i = 0;
+	do {
+		sd->sg_array[i++] = sg;
+		sg = sg->next;
+	} while (sg != sd->groups);
+
+	if (cpu_orig != group_balance_cpu(sg))
 		return;
 
-	update_group_capacity(sd, cpu);
+	update_group_capacity(sd, cpu_orig);
 }
 
 /*
-- 
2.9.3