linux-kernel - Re: sched: tweak select_idle_sibling to look for idle threads

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160412003044.smr24xzuom3locvo@floor.thefacebook.com>
Date:	Mon, 11 Apr 2016 20:30:44 -0400
From:	Chris Mason <clm@...com>
To:	Mike Galbraith <mgalbraith@...e.de>
CC:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...nel.org>,
	Matt Fleming <matt@...eblueprint.co.uk>,
	<linux-kernel@...r.kernel.org>
Subject: Re: sched: tweak select_idle_sibling to look for idle threads

On Mon, Apr 11, 2016 at 06:54:21AM +0200, Mike Galbraith wrote:
> On Sun, 2016-04-10 at 15:55 -0400, Chris Mason wrote:
> > On Sun, Apr 10, 2016 at 12:04:21PM +0200, Mike Galbraith wrote:
> > > On Sat, 2016-04-09 at 15:05 -0400, Chris Mason wrote:
> > > 
> > > > This does preserve the existing logic to prefer idle cores over idle
> > > > CPU threads, and includes some tests to try and avoid the idle scan when we're
> > > > actually better off sharing a non-idle CPU with someone else.
> > > 
> > > My box says the "oh nevermind" checks aren't selective enough, tbench
> > > dropped 4% at clients=cores, and 2% at clients=threads.
> > 
> > Ok, I was able to reproduce this by stuffing tbench_srv and tbench onto
> > just socket 0.  Version 2 below fixes things for me, but I'm hoping
> > someone can suggest a way to get task_hot() buddy checks without the rq
> > lock.
> > 
> > I haven't run this on production loads yet, but our 4.0 patch for this
> > uses task_hot(), so I'd expect it to be on par.  If this doesn't fix it
> > for you, I'll dig up a similar machine on Monday.
> 
> My box stopped caring.  I personally would be reluctant to apply it
> without a "you asked for it" button or a large pile of benchmark
> results.  Lock banging or not, full scan existing makes me nervous.


We can use a bitmap at the socket level to keep track of which cpus are
idle.  I'm sure there are better places for the array and better ways to
allocate it; this is just a rough cut to make sure the idle tracking works.

-chris

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..1c3b5e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1055,6 +1055,8 @@ struct sched_domain {
 	unsigned int balance_interval;	/* initialise to 1. units in ms. */
 	unsigned int nr_balance_failed; /* initialise to 0 */
 
+	cpumask_var_t idle_cpus_mask;
+
 	/* idle_balance() stats */
 	u64 max_newidle_lb_cost;
 	unsigned long next_decay_max_lb_cost;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41f6b22..237d645 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3204,6 +3204,7 @@ again:
 static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
+	struct sched_domain *package_sd;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
@@ -3270,11 +3270,19 @@ static void __sched notrace __schedule(bool preempt)
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
+
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->clock_skip_update = 0;
 
 	if (likely(prev != next)) {
+		package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+		if (package_sd) {
+			if (prev->policy == SCHED_IDLE && next->policy != SCHED_IDLE)
+				cpumask_clear_cpu(cpu, package_sd->idle_cpus_mask);
+			else if (next->policy == SCHED_IDLE)
+				cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+		}
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -6599,7 +6607,6 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
-
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
@@ -7041,6 +7048,8 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		return child;
 
 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	zalloc_cpumask_var(&sd->idle_cpus_mask, GFP_NOWAIT);
+	cpumask_and(sd->idle_cpus_mask, cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0c76505..cae6bd7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5026,7 +5026,7 @@ next:
 	 * the package.
 	 */
 	if (package_sd && should_scan_idle(p, target)) {
-		for_each_cpu_and(i, sched_domain_span(package_sd),
+		for_each_cpu_and(i, package_sd->idle_cpus_mask,
 				 tsk_cpus_allowed(p)) {
 			if (idle_cpu(i)) {
 				target = i;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 544a713..7e34b42 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -202,6 +202,9 @@ DEFINE_PER_CPU(bool, cpu_dead_idle);
  */
 static void cpu_idle_loop(void)
 {
+	int cpu;
+	struct sched_domain *package_sd;
+
 	while (1) {
 		/*
 		 * If the arch has a polling bit, we maintain an invariant:
@@ -212,10 +215,19 @@ static void cpu_idle_loop(void)
 		 * guaranteed to cause the cpu to reschedule.
 		 */
 
+
 		__current_set_polling();
 		quiet_vmstat();
 		tick_nohz_idle_enter();
 
+		preempt_disable();
+		cpu = smp_processor_id();
+		package_sd = rcu_dereference(per_cpu(sd_llc, cpu));
+		if (package_sd) {
+			cpumask_set_cpu(cpu, package_sd->idle_cpus_mask);
+		}
+		preempt_enable();
+
 		while (!need_resched()) {
 			check_pgt_cache();
 			rmb();
-- 
2.8.0.rc2