linux-kernel - [PATCH] sched: Make select_idle_sibling search domain configurable

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20200728070131.1629670-1-xii@google.com>
Date:   Tue, 28 Jul 2020 00:01:31 -0700
From:   Xi Wang <xii@...gle.com>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Mel Gorman <mgorman@...e.de>, linux-kernel@...r.kernel.org,
        linux-fsdevel@...r.kernel.org, Xi Wang <xii@...gle.com>
Subject: [PATCH] sched: Make select_idle_sibling search domain configurable

The scope of select_idle_sibling idle cpu search is LLC. This
becomes a problem for the AMD CCX architecture, as the sd_llc is only
4 cores. On a many core machine, the range of search is too small to
reach a satisfactory level of statistical multiplexing / efficient
utilization of short idle time slices.

With this patch idle sibling search is detached from LLC and it
becomes run time configurable. To reduce search and migration
overheads, a presearch domain is added. The presearch domain will be
searched first before the "main search" domain, e.g.:

sysctl_sched_wake_idle_domain == 2 ("MC" domain)
sysctl_sched_wake_idle_presearch_domain == 1 ("DIE" domain)

Presearch will go through 4 cores of a CCX. If no idle cpu is found
during presearch, full search will go through the remaining cores of
a cpu socket.

Heuristics including sd->avg_scan_cost and sds->have_idle_cores
are only active for the main search.

On a 128 core (2 socket * 64 core, 256 hw threads) AMD machine ran
hackbench as

hackbench -g 20 -f 20 --loops 10000

A snapshot of run time was

Baseline: 11.8
With the patch: 7.6    (configured as in the example above)

Signed-off-by: Xi Wang <xii@...gle.com>
---
 block/blk-mq.c                 |   2 +-
 block/blk-softirq.c            |   2 +-
 include/linux/cpuset.h         |  10 +-
 include/linux/sched/topology.h |  11 +-
 kernel/cgroup/cpuset.c         |  32 ++++--
 kernel/sched/core.c            |  10 +-
 kernel/sched/fair.c            | 191 +++++++++++++++++++++------------
 kernel/sched/sched.h           |   9 +-
 kernel/sched/topology.c        |  87 ++++++++++-----
 kernel/sysctl.c                |  25 +++++
 10 files changed, 256 insertions(+), 123 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..20aee9f047e2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -626,7 +626,7 @@ void blk_mq_force_complete_rq(struct request *rq)
 
 	cpu = get_cpu();
 	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-		shared = cpus_share_cache(cpu, ctx->cpu);
+		shared = cpus_share_sis(cpu, ctx->cpu);
 
 	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 		rq->csd.func = __blk_mq_complete_request_remote;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 6e7ec87d49fa..dd38ac0e1f2e 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -108,7 +108,7 @@ void __blk_complete_request(struct request *req)
 	 */
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
 		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
-			shared = cpus_share_cache(cpu, ccpu);
+			shared = cpus_share_sis(cpu, ccpu);
 	} else
 		ccpu = cpu;
 
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 04c20de66afc..8b243aa8462e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -117,6 +117,7 @@ static inline int cpuset_do_slab_mem_spread(void)
 extern bool current_cpuset_is_being_rebound(void);
 
 extern void rebuild_sched_domains(void);
+extern void rebuild_sched_domains_force(void);
 
 extern void cpuset_print_current_mems_allowed(void);
 
@@ -173,7 +174,7 @@ static inline void cpuset_force_rebuild(void) { }
 
 static inline void cpuset_update_active_cpus(void)
 {
-	partition_sched_domains(1, NULL, NULL);
+	partition_sched_domains(1, NULL, NULL, 0);
 }
 
 static inline void cpuset_wait_for_hotplug(void) { }
@@ -259,7 +260,12 @@ static inline bool current_cpuset_is_being_rebound(void)
 
 static inline void rebuild_sched_domains(void)
 {
-	partition_sched_domains(1, NULL, NULL);
+	partition_sched_domains(1, NULL, NULL, 0);
+}
+
+static inline void rebuild_sched_domains_force(void)
+{
+	partition_sched_domains(1, NULL, NULL, 1);
 }
 
 static inline void cpuset_print_current_mems_allowed(void)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..aff9739cf516 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -151,16 +151,17 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 
 extern void partition_sched_domains_locked(int ndoms_new,
 					   cpumask_var_t doms_new[],
-					   struct sched_domain_attr *dattr_new);
+					   struct sched_domain_attr *dattr_new,
+					   int force_update);
 
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-				    struct sched_domain_attr *dattr_new);
+				    struct sched_domain_attr *dattr_new, int force_update);
 
 /* Allocate an array of sched domains, for partition_sched_domains(). */
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
-bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_sis(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -199,7 +200,7 @@ struct sched_domain_attr;
 
 static inline void
 partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
-			       struct sched_domain_attr *dattr_new)
+			       struct sched_domain_attr *dattr_new, int force_update)
 {
 }
 
@@ -209,7 +210,7 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 {
 }
 
-static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+static inline bool cpus_share_sis(int this_cpu, int that_cpu)
 {
 	return true;
 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 642415b8c3c9..5087b90c4c47 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -962,10 +962,10 @@ static void rebuild_root_domains(void)
 
 static void
 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-				    struct sched_domain_attr *dattr_new)
+				    struct sched_domain_attr *dattr_new, int force_update)
 {
 	mutex_lock(&sched_domains_mutex);
-	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new, force_update);
 	rebuild_root_domains();
 	mutex_unlock(&sched_domains_mutex);
 }
@@ -981,7 +981,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  *
  * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_locked(int force_update)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
@@ -1007,23 +1007,33 @@ static void rebuild_sched_domains_locked(void)
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
-	partition_and_rebuild_sched_domains(ndoms, doms, attr);
+	partition_and_rebuild_sched_domains(ndoms, doms, attr, force_update);
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_locked(int force_update)
 {
 }
 #endif /* CONFIG_SMP */
 
-void rebuild_sched_domains(void)
+void __rebuild_sched_domains(int force_update)
 {
 	get_online_cpus();
 	percpu_down_write(&cpuset_rwsem);
-	rebuild_sched_domains_locked();
+	rebuild_sched_domains_locked(force_update);
 	percpu_up_write(&cpuset_rwsem);
 	put_online_cpus();
 }
 
+void rebuild_sched_domains(void)
+{
+	__rebuild_sched_domains(0);
+}
+
+void rebuild_sched_domains_force(void)
+{
+	__rebuild_sched_domains(1);
+}
+
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1437,7 +1447,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_locked(0);
 }
 
 /**
@@ -1837,7 +1847,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_locked();
+			rebuild_sched_domains_locked(0);
 	}
 
 	return 0;
@@ -1903,7 +1913,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_locked(0);
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1994,7 +2004,7 @@ static int update_prstate(struct cpuset *cs, int val)
 	if (parent->child_ecpus_count)
 		update_sibling_cpumasks(parent, cs, &tmp);
 
-	rebuild_sched_domains_locked();
+	rebuild_sched_domains_locked(0);
 out:
 	free_cpumasks(NULL, &tmp);
 	return err;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15543cb8481..e28548fc63f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2350,9 +2350,9 @@ void wake_up_if_idle(int cpu)
 	rcu_read_unlock();
 }
 
-bool cpus_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_sis(int this_cpu, int that_cpu)
 {
-	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+	return per_cpu(sd_sis_id, this_cpu) == per_cpu(sd_sis_id, that_cpu);
 }
 
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
@@ -2361,7 +2361,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 	 * If the CPU does not share cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
 	 */
-	if (!cpus_share_cache(smp_processor_id(), cpu))
+	if (!cpus_share_sis(smp_processor_id(), cpu))
 		return true;
 
 	/*
@@ -6501,7 +6501,7 @@ static void cpuset_cpu_active(void)
 		 * operation in the resume sequence, just build a single sched
 		 * domain, ignoring cpusets.
 		 */
-		partition_sched_domains(1, NULL, NULL);
+		partition_sched_domains(1, NULL, NULL, 0);
 		if (--num_cpus_frozen)
 			return;
 		/*
@@ -6522,7 +6522,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
-		partition_sched_domains(1, NULL, NULL);
+		partition_sched_domains(1, NULL, NULL, 0);
 	}
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..0ed71f2f3a81 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5736,8 +5736,8 @@ static void record_wakee(struct task_struct *p)
  * at a frequency roughly N times higher than one of its wakees.
  *
  * In order to determine whether we should let the load spread vs consolidating
- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.
+ * sis domain, we look for a minimum 'flip' frequency of sis_size in one partner,
+ * and a factor of sis_size higher frequency in the other.
  *
  * With both conditions met, we can be relatively sure that the relationship is
  * non-monogamous, with partner count exceeding socket size.
@@ -5750,7 +5750,7 @@ static int wake_wide(struct task_struct *p)
 {
 	unsigned int master = current->wakee_flips;
 	unsigned int slave = p->wakee_flips;
-	int factor = __this_cpu_read(sd_llc_size);
+	int factor = __this_cpu_read(sd_sis_size);
 
 	if (master < slave)
 		swap(master, slave);
@@ -5786,7 +5786,7 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+	if (available_idle_cpu(this_cpu) && cpus_share_sis(this_cpu, prev_cpu))
 		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -5978,7 +5978,7 @@ static inline void set_idle_cores(int cpu, int val)
 {
 	struct sched_domain_shared *sds;
 
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
 	if (sds)
 		WRITE_ONCE(sds->has_idle_cores, val);
 }
@@ -5987,7 +5987,7 @@ static inline bool test_idle_cores(int cpu, bool def)
 {
 	struct sched_domain_shared *sds;
 
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
 	if (sds)
 		return READ_ONCE(sds->has_idle_cores);
 
@@ -5996,7 +5996,7 @@ static inline bool test_idle_cores(int cpu, bool def)
 
 /*
  * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
+ * information in sd_sis_shared->has_idle_cores.
  *
  * Since SMT siblings share all cache levels, inspecting this limited remote
  * state should be fairly cheap.
@@ -6024,13 +6024,12 @@ void __update_idle_core(struct rq *rq)
 }
 
 /*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * Scan the entire sis domain for idle cores; this dynamically switches off if
  * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * sd_sis->shared->has_idle_cores and enabled through update_idle_core() above.
  */
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_core(struct task_struct *p, struct cpumask *cpus, int target)
 {
-	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	int core, cpu;
 
 	if (!static_branch_likely(&sched_smt_present))
@@ -6039,18 +6038,18 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 	if (!test_idle_cores(target, false))
 		return -1;
 
-	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
 	for_each_cpu_wrap(core, cpus, target) {
 		bool idle = true;
 
+		if (core != cpumask_first(cpu_smt_mask(core)))
+			continue;
+
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			if (!available_idle_cpu(cpu)) {
 				idle = false;
 				break;
 			}
 		}
-		cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 
 		if (idle)
 			return core;
@@ -6099,45 +6098,45 @@ static inline int select_idle_smt(struct task_struct *p, int target)
 #endif /* CONFIG_SCHED_SMT */
 
 /*
- * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * Scan the sis domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct cpumask *cpus,
+	bool main_search, unsigned int span_weight, int target)
 {
-	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	struct sched_domain *this_sd;
 	u64 avg_cost, avg_idle;
 	u64 time;
 	int this = smp_processor_id();
 	int cpu, nr = INT_MAX;
 
-	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-	if (!this_sd)
-		return -1;
+	if (main_search) {
+		this_sd = rcu_dereference(*this_cpu_ptr(&sd_sis));
+		if (!this_sd)
+			return -1;
 
-	/*
-	 * Due to large variance we need a large fuzz factor; hackbench in
-	 * particularly is sensitive here.
-	 */
-	avg_idle = this_rq()->avg_idle / 512;
-	avg_cost = this_sd->avg_scan_cost + 1;
+		/*
+		 * Due to large variance we need a large fuzz factor; hackbench in
+		 * particularly is sensitive here.
+		 */
+		avg_idle = this_rq()->avg_idle / 512;
+		avg_cost = this_sd->avg_scan_cost + 1;
 
-	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
-		return -1;
+		if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
+			return -1;
 
-	if (sched_feat(SIS_PROP)) {
-		u64 span_avg = sd->span_weight * avg_idle;
-		if (span_avg > 4*avg_cost)
-			nr = div_u64(span_avg, avg_cost);
-		else
-			nr = 4;
+		if (sched_feat(SIS_PROP)) {
+			u64 span_avg = span_weight * avg_idle;
+			if (span_avg > 4*avg_cost)
+				nr = div_u64(span_avg, avg_cost);
+			else
+				nr = 4;
+		}
 	}
 
 	time = cpu_clock(this);
 
-	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
-
 	for_each_cpu_wrap(cpu, cpus, target) {
 		if (!--nr)
 			return -1;
@@ -6145,8 +6144,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			break;
 	}
 
-	time = cpu_clock(this) - time;
-	update_avg(&this_sd->avg_scan_cost, time);
+	if (main_search) {
+		time = cpu_clock(this) - time;
+		update_avg(&this_sd->avg_scan_cost, time);
+	}
 
 	return cpu;
 }
@@ -6186,19 +6187,21 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 }
 
 /*
- * Try and locate an idle core/thread in the LLC cache domain.
+ * Try and locate an idle core/thread in the sis domain.
  */
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
-	struct sched_domain *sd;
-	int i, recent_used_cpu;
+	struct sched_domain *sd_asym;
+	struct sched_domain *sd[2];
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int i, r, recent_used_cpu;
 
 	/*
 	 * For asymmetric CPU capacity systems, our domain of interest is
-	 * sd_asym_cpucapacity rather than sd_llc.
+	 * sd_asym_cpucapacity rather than sd_sis.
 	 */
 	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
-		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+		sd_asym = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
 		/*
 		 * On an asymmetric CPU capacity system where an exclusive
 		 * cpuset defines a symmetric island (i.e. one unique
@@ -6207,10 +6210,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
 		 * capacity path.
 		 */
-		if (!sd)
+		if (!sd_asym)
 			goto symmetric;
 
-		i = select_idle_capacity(p, sd, target);
+		i = select_idle_capacity(p, sd_asym, target);
 		return ((unsigned)i < nr_cpumask_bits) ? i : target;
 	}
 
@@ -6221,7 +6224,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) &&
+	if (prev != target && cpus_share_sis(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
 		return prev;
 
@@ -6243,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
+	    cpus_share_sis(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
 		/*
@@ -6254,21 +6257,35 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		return recent_used_cpu;
 	}
 
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	if (!sd)
-		return target;
+	for (i = 0; ; i++) {
+		if (i == 0) {
+			sd[0] = rcu_dereference(per_cpu(sd_sis_pre, target));
+			if (!sd[0])
+				continue;
+			cpumask_and(cpus, sched_domain_span(sd[0]), p->cpus_ptr);
+		} else if (i == 1) {
+			sd[1] = rcu_dereference(per_cpu(sd_sis, target));
+			if (!sd[1])
+				continue;
+			cpumask_and(cpus, sched_domain_span(sd[1]), p->cpus_ptr);
+			if (sd[0])
+				cpumask_andnot(cpus, cpus, sched_domain_span(sd[0]));
+		} else {
+			break;
+		}
 
-	i = select_idle_core(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
+		r = select_idle_core(p, cpus, target);
+		if ((unsigned)r < nr_cpumask_bits)
+			return r;
 
-	i = select_idle_cpu(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
+		r = select_idle_cpu(p, cpus, (i == 1), sd[i]->span_weight, target);
+		if ((unsigned)r < nr_cpumask_bits)
+			return r;
 
-	i = select_idle_smt(p, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
+		r = select_idle_smt(p, target);
+		if ((unsigned)r < nr_cpumask_bits)
+			return r;
+	}
 
 	return target;
 }
@@ -6718,6 +6735,46 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	return new_cpu;
 }
 
+
+#ifdef CONFIG_SMP
+
+extern int sysctl_sched_wake_idle_domain;
+extern int sysctl_sched_wake_idle_presearch_domain;
+
+DEFINE_MUTEX(wake_idle_domain_mutex);
+
+int proc_sched_wake_idle_domain_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp = *table;
+	int *sysctl = tmp.data;
+	int val = *sysctl;
+	int min = -1, max = INT_MAX;
+	int rc;
+
+	tmp.extra1 = &min;
+	tmp.extra2 = &max;
+	tmp.data = &val;
+
+	rc = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+	if (rc || !write)
+		return rc;
+
+	mutex_lock(&wake_idle_domain_mutex);
+	*sysctl = val;
+	rebuild_sched_domains_force();
+	mutex_unlock(&wake_idle_domain_mutex);
+
+	pr_info("Idle cpu search (select_idle_sibling) domains changed to: "
+	  "sched_wake_idle_domain %d sched_wake_idle_presearch domain %d\n",
+	  sysctl_sched_wake_idle_domain, sysctl_sched_wake_idle_presearch_domain);
+
+	return 0;
+}
+
+#endif
+
 static void detach_entity_cfs_rq(struct sched_entity *se);
 
 /*
@@ -10136,21 +10193,21 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * cache use, instead we want to embrace asymmetry and only
 		 * ensure tasks have enough CPU capacity.
 		 *
-		 * Skip the LLC logic because it's not relevant in that case.
+		 * Skip the sis logic because it's not relevant in that case.
 		 */
 		goto unlock;
 	}
 
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	sds = rcu_dereference(per_cpu(sd_sis_shared, cpu));
 	if (sds) {
 		/*
-		 * If there is an imbalance between LLC domains (IOW we could
-		 * increase the overall cache use), we need some less-loaded LLC
+		 * If there is an imbalance between sis domains (IOW we could
+		 * increase the overall cache use), we need some less-loaded sis
 		 * domain to pull some load. Likewise, we may need to spread
-		 * load within the current LLC domain (e.g. packed SMT cores but
+		 * load within the current sis domain (e.g. packed SMT cores but
 		 * other CPUs are idle). We can't really know from here how busy
 		 * the others are - so just get a nohz balance going if it looks
-		 * like this LLC domain has tasks we could move.
+		 * like this sis domain has tasks we could move.
 		 */
 		nr_busy = atomic_read(&sds->nr_busy_cpus);
 		if (nr_busy > 1) {
@@ -10170,7 +10227,7 @@ static void set_cpu_sd_state_busy(int cpu)
 	struct sched_domain *sd;
 
 	rcu_read_lock();
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	sd = rcu_dereference(per_cpu(sd_sis, cpu));
 
 	if (!sd || !sd->nohz_idle)
 		goto unlock;
@@ -10200,7 +10257,7 @@ static void set_cpu_sd_state_idle(int cpu)
 	struct sched_domain *sd;
 
 	rcu_read_lock();
-	sd = rcu_dereference(per_cpu(sd_llc, cpu));
+	sd = rcu_dereference(per_cpu(sd_sis, cpu));
 
 	if (!sd || sd->nohz_idle)
 		goto unlock;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 877fb08eb1b0..641a5bacdf77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1415,10 +1415,11 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 	return sd;
 }
 
-DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
-DECLARE_PER_CPU(int, sd_llc_size);
-DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_sis);
+DECLARE_PER_CPU(int, sd_sis_size);
+DECLARE_PER_CPU(int, sd_sis_id);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_sis_shared);
+DECLARE_PER_CPU(struct sched_domain *, sd_sis_pre);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..bdda783c5148 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -605,41 +605,75 @@ static void destroy_sched_domains(struct sched_domain *sd)
 }
 
 /*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
- *
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * sd_sis is the select_idle_sibling search domain. It is generalized sd_llc
+ * not limited by the SD_SHARE_PKG_RESOURCE flag. With the sysctls sd_sis is
+ * also run time configurable.
+ * To limit overheads from searching / migrating among cores that don't share
+ * llc, a presearch domain can be enabled such that most searches / migrations
+ * still happen inside a smaller domain when the machine is lightly loaded.
+ *
+ * Keep a special pointer for this allows us to avoid some pointer chasing in
+ * select_idle_sibling(). Also keep a unique ID per domain (we use the first CPU
+ * number in the cpumask of the domain), this allows us to quickly tell if
+ * two CPUs are in the same sis domain, see cpus_share_sis().
  */
-DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
-DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_sis);
+DEFINE_PER_CPU(int, sd_sis_size);
+DEFINE_PER_CPU(int, sd_sis_id);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_sis_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
+int sysctl_sched_wake_idle_domain = -1;
+int sysctl_sched_wake_idle_presearch_domain = -1;
+DEFINE_PER_CPU(struct sched_domain *, sd_sis_pre);
+
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
-	struct sched_domain *sd;
+	struct sched_domain *sd, *sdp;
 	int id = cpu;
 	int size = 1;
+	int level;
+
+	if (sysctl_sched_wake_idle_domain < 0) {
+		sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	} else {
+		level = 0;
+		for_each_domain(cpu, sd) {
+			if (level == sysctl_sched_wake_idle_domain)
+				break;
+			level++;
+		}
+	}
 
-	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
 		sds = sd->shared;
 	}
 
-	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
-	per_cpu(sd_llc_size, cpu) = size;
-	per_cpu(sd_llc_id, cpu) = id;
-	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+	rcu_assign_pointer(per_cpu(sd_sis, cpu), sd);
+	per_cpu(sd_sis_size, cpu) = size;
+	per_cpu(sd_sis_id, cpu) = id;
+	rcu_assign_pointer(per_cpu(sd_sis_shared, cpu), sds);
+
+	sdp = NULL;
+	if (sd && sysctl_sched_wake_idle_presearch_domain >= 0) {
+		level = 0;
+		for_each_domain(cpu, sdp) {
+			if (sdp == sd) {
+				sdp = NULL;
+				break;
+			}
+			if (level == sysctl_sched_wake_idle_presearch_domain)
+				break;
+			level++;
+		}
+	}
+	rcu_assign_pointer(per_cpu(sd_sis_pre, cpu), sdp);
 
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -1400,14 +1434,12 @@ sd_init(struct sched_domain_topology_level *tl,
 	}
 
 	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
+	 * Connect sched_domain_shared instances. As sd_sis can be changed at run
+	 * time, link all domains.
 	 */
-	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
-		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
+	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+	atomic_inc(&sd->shared->ref);
+	atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
 
 	sd->private = sdd;
 
@@ -2204,7 +2236,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * Call with hotplug lock and sched_domains_mutex held
  */
 void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
-				    struct sched_domain_attr *dattr_new)
+				    struct sched_domain_attr *dattr_new, int force_update)
 {
 	bool __maybe_unused has_eas = false;
 	int i, j, n;
@@ -2217,6 +2249,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
 
 	/* Let the architecture update CPU core mappings: */
 	new_topology = arch_update_cpu_topology();
+	new_topology |= force_update;
 
 	if (!doms_new) {
 		WARN_ON_ONCE(dattr_new);
@@ -2310,9 +2343,9 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
+			     struct sched_domain_attr *dattr_new, int force_update)
 {
 	mutex_lock(&sched_domains_mutex);
-	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new, 0);
 	mutex_unlock(&sched_domains_mutex);
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..b474851e1a66 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -144,6 +144,10 @@ static const int cap_last_cap = CAP_LAST_CAP;
 #ifdef CONFIG_DETECT_HUNG_TASK
 static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
 #endif
+#ifdef CONFIG_SMP
+extern int sysctl_sched_wake_idle_domain;
+extern int sysctl_sched_wake_idle_presearch_domain;
+#endif
 
 #ifdef CONFIG_INOTIFY_USER
 #include <linux/inotify.h>
@@ -202,6 +206,11 @@ static int max_extfrag_threshold = 1000;
 
 #endif /* CONFIG_SYSCTL */
 
+#ifdef CONFIG_SMP
+int proc_sched_wake_idle_domain_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
 static int bpf_stats_handler(struct ctl_table *table, int write,
 			     void __user *buffer, size_t *lenp,
@@ -1834,6 +1843,22 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+#ifdef CONFIG_SMP
+	{
+		.procname	= "sched_wake_idle_domain",
+		.data		= &sysctl_sched_wake_idle_domain,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_sched_wake_idle_domain_handler,
+	},
+	{
+		.procname	= "sched_wake_idle_presearch_domain",
+		.data		= &sysctl_sched_wake_idle_presearch_domain,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_sched_wake_idle_domain_handler,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
-- 
2.28.0.rc0.142.g3c755180ce-goog