linux-kernel - Re: sched: tweak select_idle_sibling to look for idle threads

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160503113131.GF3448@twins.programming.kicks-ass.net>
Date:	Tue, 3 May 2016 13:31:31 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Ingo Molnar <mingo@...nel.org>
Cc:	Mike Galbraith <mgalbraith@...e.de>, Chris Mason <clm@...com>,
	Matt Fleming <matt@...eblueprint.co.uk>,
	linux-kernel@...r.kernel.org
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Mon, May 02, 2016 at 06:04:04PM +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@...radead.org> wrote:

> > If you want a laugh, modify select_idle_core() to remember the last idle
> > thread it encounters and have it return that when it fails to find an
> > idle core.. I'm still stumped to explain why it behaves the way it does.
> 
> Assuming by 'behaving the way it does' means it improves things, such a dynamic 
> with history/memory could be disrupting escalating feedback loops. Only guessing 
> though.

ha! no :-)


mainline-like:

root@...-ep:~/bench/sysbench# for i in OLD_IDLE NO_ORDER_IDLE IDLE_CORE NO_FORCE_CORE IDLE IDLE_SMT NO_IDLE_LAST NO_IDLE_FIRST ; do echo $i > /debug/sched_features ; done ; ./doit-psql 30 2 5 10 20 40 80
  2: [30 secs]     transactions:                        52336  (1744.47 per sec.)
  5: [30 secs]     transactions:                        121971 (4065.59 per sec.)
 10: [30 secs]     transactions:                        242741 (8091.05 per sec.)
 20: [30 secs]     transactions:                        382357 (12744.58 per sec.)
 40: [30 secs]     transactions:                        537705 (17922.33 per sec.)
 80: [30 secs]     transactions:                        544193 (18137.97 per sec.)


new code with settings that make sense (and aren't that far from current
mainline):

root@...-ep:~/bench/sysbench# for i in NO_OLD_IDLE NO_ORDER_IDLE IDLE_CORE NO_FORCE_CORE IDLE IDLE_SMT NO_IDLE_LAST NO_IDLE_FIRST ; do echo $i > /debug/sched_features ; done ; ./doit-psql 30 2 5 10 20 40 80
  2: [30 secs]     transactions:                        53663  (1788.71 per sec.)
  5: [30 secs]     transactions:                        122529 (4084.16 per sec.)
 10: [30 secs]     transactions:                        239607 (7986.60 per sec.)
 20: [30 secs]     transactions:                        379112 (12636.43 per sec.)
 40: [30 secs]     transactions:                        539161 (17970.83 per sec.)
 80: [30 secs]     transactions:                        544907 (18161.74 per sec.)


Then flip on the last_idle tracking in select_idle_core():

root@...-ep:~/bench/sysbench# for i in NO_OLD_IDLE NO_ORDER_IDLE IDLE_CORE NO_FORCE_CORE IDLE IDLE_SMT IDLE_LAST NO_IDLE_FIRST ; do echo $i > /debug/sched_features ; done ; ./doit-psql 30 2 5 10 20 40 80
  2: [30 secs]     transactions:                        54355  (1811.78 per sec.)
  5: [30 secs]     transactions:                        122609 (4086.81 per sec.)
 10: [30 secs]     transactions:                        238738 (7957.66 per sec.)
 20: [30 secs]     transactions:                        354693 (11822.49 per sec.)
 40: [30 secs]     transactions:                        421807 (14059.32 per sec.)
 80: [30 secs]     transactions:                        427088 (14234.25 per sec.)


And see the top end collapse..

The idea was that the whole has_idle_cores thing would switch off
select_idle_core() under 'high' load, and therefore last_idle tracking
would not affect that.

Clearly something is not quite working out :-)


Also, I think the current sharing of the load_balance_mask is broken.
While all users have BH disabled, the wakeup can come from IRQ context and
hence trample on the LB mask; let me go fix that by allocating more masks
or so.

---
 kernel/sched/fair.c      | 269 +++++++++++++++++++++++++++++++++++++++--------
 kernel/sched/features.h  |   8 ++
 kernel/sched/idle_task.c |   4 +-
 kernel/sched/sched.h     |   1 +
 kernel/time/tick-sched.c |  10 +-
 5 files changed, 243 insertions(+), 49 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33ab..98c2904 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1501,8 +1501,10 @@ balance:
 	 * One idle CPU per node is evaluated for a task numa move.
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
-	if (!cur)
+	if (!cur) {
+		// XXX borken
 		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+	}
 
 assign:
 	assigned = true;
@@ -4491,6 +4493,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * Working cpumask for:
+ *   load_balance,
+ *   load_balance_newidle,
+ *   select_idle_core.
+ *
+ * Assumes softirqs are disabled when in use; XXX wakeups may come from IRQ.
+ */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
@@ -5162,65 +5175,238 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+/* Next set bit in @mask after @n, wrapping once; stops on reaching @start. */
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+	int next;
+
+	for (;;) {
+		next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+		/* Second pass: done once we are back at (or beyond) @start. */
+		if (*wrapped)
+			return next >= start ? nr_cpumask_bits : next;
+		if (next < nr_cpumask_bits)
+			return next;
+
+		/* Ran off the end on the first pass: restart from bit 0. */
+		*wrapped = 1;
+		n = -1;
+	}
+}
+
+/* Visit each cpu in @mask once, beginning at @start and wrapping around. */
+#define for_each_cpu_wrap(cpu, mask, start, wrap)				\
+	for ((wrap) = 0, (cpu) = (start)-1;					\
+		(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),	\
+		(cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
+/* Record in @cpu's sd_busy group that no idle core is left. */
+static inline void clear_idle_cores(int cpu)
+{
+	struct sched_domain *busy = rcu_dereference(per_cpu(sd_busy, cpu));
+
+	if (busy)
+		WRITE_ONCE(busy->groups->sgc->has_idle_cores, 0);
+}
+
+/* Record in @cpu's sd_busy group that an idle core exists. */
+static inline void set_idle_cores(int cpu)
+{
+	struct sched_domain *busy = rcu_dereference(per_cpu(sd_busy, cpu));
+
+	if (busy)
+		WRITE_ONCE(busy->groups->sgc->has_idle_cores, 1);
+}
+
+/* Does @cpu's sd_busy group currently advertise an idle core? */
+static inline bool test_idle_cores(int cpu)
+{
+	struct sched_domain *busy;
+
+	/* FORCE_CORE overrides the hint: always report idle cores. */
+	if (sched_feat(FORCE_CORE))
+		return true;
+
+	// XXX static key for !SMT topologies
+
+	busy = rcu_dereference(per_cpu(sd_busy, cpu));
+	return busy ? READ_ONCE(busy->groups->sgc->has_idle_cores) : false;
+}
+
+/*
+ * Called as @rq's CPU picks the idle task: if every other SMT sibling is
+ * idle too, record that an idle core exists.
+ */
+void update_idle_core(struct rq *rq)
+{
+	int self = cpu_of(rq);
+	int sibling;
+
+	rcu_read_lock();
+	if (!test_idle_cores(self)) {
+		for_each_cpu(sibling, cpu_smt_mask(self)) {
+			if (sibling != self && !idle_cpu(sibling))
+				goto unlock;
+		}
+
+		set_idle_cores(self);
+	}
+unlock:
+	rcu_read_unlock();
+}
+
+/*
+ * Find a fully idle core in @target's LLC domain that @p may run on; with
+ * IDLE_LAST/IDLE_FIRST fall back to an idle thread seen during the scan.
+ */
+static int select_idle_core(struct task_struct *p, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+	int core, cpu, wrap, last_idle = -1, first_idle = -1;
+	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, target));
+
+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+	for_each_cpu_wrap(core, cpus, target, wrap) {
+		bool idle = true;
+
+		for_each_cpu(cpu, cpu_smt_mask(core)) {
+			bool allowed = cpumask_test_and_clear_cpu(cpu, cpus);
+
+			if (!idle_cpu(cpu)) {
+				idle = false;
+			} else if (allowed) {	/* idle and usable by @p */
+				if (sched_feat(IDLE_LAST))
+					last_idle = cpu;
+				if (sched_feat(IDLE_FIRST) && first_idle == -1)
+					first_idle = cpu;
+			}
+		}
+		if (idle)
+			break;
+	}
+	if (sched_feat(IDLE_LAST) && ((unsigned)core >= nr_cpumask_bits))
+		return last_idle;
+	if (sched_feat(IDLE_FIRST) && ((unsigned)core >= nr_cpumask_bits))
+		return first_idle;
+	return core;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+/*
+ * Without SMT there are no sibling threads, so there is no idle-core
+ * state to maintain and no core to find.
+ */
+static inline void clear_idle_cores(int cpu) { }
+static inline void set_idle_cores(int cpu) { }
+static inline bool test_idle_cores(int cpu) { return false; }
+
+void update_idle_core(struct rq *rq) { }
+
+static inline int select_idle_core(struct task_struct *p, int target)
+{
+	return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
 /*
- * Try and locate an idle CPU in the sched_domain.
+ * Try to locate an idle core/thread in the last-level cache (LLC) domain.
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i = task_cpu(p);
+	int wrap, start, i = task_cpu(p);
 
 	if (idle_cpu(target))
 		return target;
 
 	/*
-	 * If the prevous cpu is cache affine and idle, don't be stupid.
+	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
 		return i;
 
-	/*
-	 * Otherwise, iterate the domains and find an eligible idle cpu.
-	 *
-	 * A completely idle sched group at higher domains is more
-	 * desirable than an idle group at a lower level, because lower
-	 * domains have smaller groups and usually share hardware
-	 * resources which causes tasks to contend on them, e.g. x86
-	 * hyperthread siblings in the lowest domain (SMT) can contend
-	 * on the shared cpu pipeline.
-	 *
-	 * However, while we prefer idle groups at higher domains
-	 * finding an idle cpu at the lowest domain is still better than
-	 * returning 'target', which we've already established, isn't
-	 * idle.
-	 */
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
+	start = target;
+	if (sched_feat(ORDER_IDLE))
+		start = per_cpu(sd_llc_id, target); /* first cpu in llc domain */
 
-			/* Ensure the entire group is idle */
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (i == target || !idle_cpu(i))
+	sd = rcu_dereference(per_cpu(sd_llc, start));
+	if (!sd)
+		return target;
+
+	if (sched_feat(OLD_IDLE)) {
+		struct sched_group *sg;
+
+		for_each_lower_domain(sd) {
+			sg = sd->groups;
+			do {
+				if (!cpumask_intersects(sched_group_cpus(sg),
+							tsk_cpus_allowed(p)))
 					goto next;
-			}
+
+				/* Ensure the entire group is idle */
+				for_each_cpu(i, sched_group_cpus(sg)) {
+					if (i == target || !idle_cpu(i))
+						goto next;
+				}
+
+				/*
+				 * It doesn't matter which cpu we pick, the
+				 * whole group is idle.
+				 */
+				target = cpumask_first_and(sched_group_cpus(sg),
+						tsk_cpus_allowed(p));
+				goto done;
+next:
+				sg = sg->next;
+			} while (sg != sd->groups);
+		}
+done:
+		return target;
+	}
+
+	/*
+	 * If the idle-core hint says there are idle cores to be had, go find one.
+	 */
+	if (sched_feat(IDLE_CORE)) {
+		if (test_idle_cores(target)) {
+			i = select_idle_core(p, start);
+			if ((unsigned)i < nr_cpumask_bits)
+				return i;
 
 			/*
-			 * It doesn't matter which cpu we pick, the
-			 * whole group is idle.
+			 * Failed to find an idle core; stop looking for one.
 			 */
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+			clear_idle_cores(target);
+		}
 	}
-done:
+
+	if (!sched_feat(IDLE))
+		return target;
+
+	/*
+	 * Otherwise settle for any idle cpu in the LLC (IDLE_SMT: SMT siblings only).
+	 */
+	if (!sched_feat(IDLE_SMT)) {
+		for_each_cpu_wrap(i, sched_domain_span(sd), start, wrap) {
+			if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+				continue;
+			if (idle_cpu(i))
+				return i;
+		}
+	} else {
+		for_each_cpu(i, cpu_smt_mask(target)) {
+			if (!cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+				continue;
+			if (idle_cpu(i))
+				return i;
+		}
+	}
+
 	return target;
 }
 
@@ -7229,9 +7415,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL	512
 
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa..347f6fe 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,11 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(OLD_IDLE, false)
+SCHED_FEAT(ORDER_IDLE, false)
+SCHED_FEAT(IDLE_CORE, true)
+SCHED_FEAT(FORCE_CORE, false)
+SCHED_FEAT(IDLE_SMT, false)
+SCHED_FEAT(IDLE, true)
+SCHED_FEAT(IDLE_LAST, false)
+SCHED_FEAT(IDLE_FIRST, false)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce949..cb394db 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -23,11 +23,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_curr(rq);
 }
 
+extern void update_idle_core(struct rq *rq);
+
 static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
 	put_prev_task(rq, prev);
-
+	update_idle_core(rq);	/* flag an idle core if all siblings idle */
 	schedstat_inc(rq, sched_goidle);
 	return rq->idle;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fc..5994794 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,6 +866,7 @@ struct sched_group_capacity {
 	 * Number of busy cpus in this group.
 	 */
 	atomic_t nr_busy_cpus;
+	int	has_idle_cores;
 
 	unsigned long cpumask[0]; /* iteration mask */
 };
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31872bc..6e42cd2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void)
 	WARN_ON_ONCE(irqs_disabled());
 
 	/*
- 	 * Update the idle state in the scheduler domain hierarchy
- 	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
- 	 * State will be updated to busy during the first busy tick after
- 	 * exiting idle.
- 	 */
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
 	set_cpu_sd_state_idle();
 
 	local_irq_disable();