Message-ID: <20110929021343.GA12029@linux.vnet.ibm.com>
Date:	Thu, 29 Sep 2011 07:43:43 +0530
From:	Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
To:	Suresh Siddha <suresh.b.siddha@...el.com>
Cc:	Venki Pallipadi <venki@...gle.com>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Paul Turner <pjt@...gle.com>, Ingo Molnar <mingo@...e.hu>,
	Vaidyanathan Srinivasan <svaidy@...ux.vnet.ibm.com>,
	Kamalesh Babulal <kamalesh@...ux.vnet.ibm.com>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v1] sched: fix nohz idle load balancer issues

* Suresh Siddha <suresh.b.siddha@...el.com> [2011-09-28 17:47:43]:

> Anyways, this appended patch is just for your quick experiment. I will
> think a bit more tomorrow morning before sending the patch with complete
> changelog. Thanks.


I ran my usual test to check idle time; here are the results:


Idle time ->		Avg.	Std. Dev   Min      Max

Without your patch	8.9%	 1.9%	   5%	    14%
With your patch		3.5%	 1.2%	   2%	     7%
With my add-on patch	2.8%	 0.5%	   2%	     4%

So your patch does improve things, but there is scope to improve further
with an add-on patch like the one below. I will experiment with some more
changes (such as keeping a list of idle cpus sorted by rq->next_balance; a
rough sketch of that idea follows) and send my v2, to apply on top of your
patch, later.
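
For illustration, here is a rough standalone sketch of that sorted-list
idea (userspace C so it compiles and runs on its own; names like
idle_entry and idle_list are invented here, and a kernel version would
use struct list_head and compare jiffies with time_before() to handle
wraparound):

/*
 * Sketch: keep nohz-idle cpus on a list sorted by their
 * rq->next_balance so the earliest deadline is always at the head.
 */
#include <stdio.h>
#include <stdlib.h>

struct idle_entry {
	int cpu;
	unsigned long next_balance;	/* in jiffy units */
	struct idle_entry *next;
};

static struct idle_entry *idle_list;	/* ascending next_balance */

/* Insert a newly idle cpu, keeping the list sorted. */
static void idle_list_add(int cpu, unsigned long next_balance)
{
	struct idle_entry **pos = &idle_list;
	struct idle_entry *e = malloc(sizeof(*e));

	if (!e)
		return;
	e->cpu = cpu;
	e->next_balance = next_balance;
	while (*pos && (*pos)->next_balance <= next_balance)
		pos = &(*pos)->next;
	e->next = *pos;
	*pos = e;
}

int main(void)
{
	idle_list_add(2, 1000);
	idle_list_add(5, 400);
	idle_list_add(7, 700);
	/* nohz.next_balance would simply be the head of the list. */
	printf("earliest next_balance: %lu (cpu %d)\n",
	       idle_list->next_balance, idle_list->cpu);
	return 0;
}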

Signed-off-by: Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>

---

Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c
+++ current/kernel/sched.c
@@ -1404,6 +1404,12 @@ void wake_up_idle_cpu(int cpu)
 		smp_send_reschedule(cpu);
 }
 
+static void reset_first_second_pick_cpu(int cpu);
+
+#else	/* CONFIG_NO_HZ */
+
+static inline void reset_first_second_pick_cpu(int cpu) { }
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -8359,6 +8365,7 @@ void __init sched_init(void)
 	atomic_set(&nohz.load_balancer, nr_cpu_ids);
 	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
 	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
+	spin_lock_init(&nohz.next_balance_lock);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c
+++ current/kernel/sched_fair.c
@@ -4286,6 +4286,7 @@ static struct {
 	cpumask_var_t idle_cpus_mask;
 	cpumask_var_t grp_idle_mask;
 	unsigned long next_balance;     /* in jiffy units */
+	spinlock_t next_balance_lock;   /* Serialize update to 'next_balance' */
 } nohz ____cacheline_aligned;
 
 int get_nohz_load_balancer(void)
@@ -4427,8 +4428,12 @@ static void nohz_balancer_kick(int cpu)
 
 	ilb_cpu = get_nohz_load_balancer();
 
-	if (ilb_cpu >= nr_cpu_ids) {
-		ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+	/*
+	 * ilb_cpu itself can be attempting to kick another idle cpu. Pick
+	 * another idle cpu in that case.
+	 */
+	if (ilb_cpu >= nr_cpu_ids || ilb_cpu == cpu) {
+		ilb_cpu = cpumask_any_but(nohz.idle_cpus_mask, cpu);
 		if (ilb_cpu >= nr_cpu_ids)
 			return;
 	}
@@ -4440,6 +4445,18 @@ static void nohz_balancer_kick(int cpu)
 	return;
 }
 
+/* Update nohz.next_balance with a new minimum value */
+static inline void set_nohz_next_balance(unsigned long next_balance)
+{
+	if (time_after(next_balance, nohz.next_balance))
+		return;
+
+	spin_lock(&nohz.next_balance_lock);
+	if (time_before(next_balance, nohz.next_balance))
+		nohz.next_balance = next_balance;
+	spin_unlock(&nohz.next_balance_lock);
+}
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4475,11 +4492,6 @@ void select_nohz_load_balancer(int stop_
 
 		cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
 
-		if (atomic_read(&nohz.first_pick_cpu) == cpu)
-			atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
-		if (atomic_read(&nohz.second_pick_cpu) == cpu)
-			atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-
 		if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
 			int new_ilb;
 
@@ -4498,8 +4510,10 @@ void select_nohz_load_balancer(int stop_
 				resched_cpu(new_ilb);
 				return;
 			}
+			set_nohz_next_balance(cpu_rq(cpu)->next_balance);
 			return;
 		}
+		set_nohz_next_balance(cpu_rq(cpu)->next_balance);
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
 			return;
@@ -4615,9 +4629,16 @@ static void nohz_idle_balance(int this_c
 	struct rq *rq;
 	int balance_cpu;
 
-	if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+	if (!this_rq->nohz_balance_kick)
 		return;
 
+	/* Wake up another idle cpu to do idle load balancing if we got busy */
+	if (!idle_cpu(this_cpu)) {
+		nohz_balancer_kick(this_cpu);
+		this_rq->nohz_balance_kick = 0;
+		return;
+	}
+
 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
 		if (balance_cpu == this_cpu)
 			continue;
@@ -4628,7 +4649,7 @@ static void nohz_idle_balance(int this_c
 		 * balancing owner will pick it up.
 		 */
 		if (need_resched()) {
-			this_rq->nohz_balance_kick = 0;
+			nohz_balancer_kick(this_cpu);
 			break;
 		}
 
@@ -4643,7 +4664,7 @@ static void nohz_idle_balance(int this_c
 		if (time_after(this_rq->next_balance, rq->next_balance))
 			this_rq->next_balance = rq->next_balance;
 	}
-	nohz.next_balance = this_rq->next_balance;
+	set_nohz_next_balance(this_rq->next_balance);
 	this_rq->nohz_balance_kick = 0;
 }
 
@@ -4692,9 +4713,24 @@ static inline int nohz_kick_needed(struc
 	}
 	return 0;
 }
-#else
+
+/*
+ * Reset first_pick_cpu or second_pick_cpu identifier in case
+ * corresponding cpu is going idle.
+ */
+static void reset_first_second_pick_cpu(int cpu)
+{
+	if (atomic_read(&nohz.first_pick_cpu) == cpu)
+		atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+	if (atomic_read(&nohz.second_pick_cpu) == cpu)
+		atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+}
+
+#else	/* CONFIG_NO_HZ */
+
 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
-#endif
+
+#endif	/* CONFIG_NO_HZ */
 
 /*
  * run_rebalance_domains is triggered when needed from the scheduler tick.
@@ -4732,7 +4768,7 @@ static inline void trigger_load_balance(
 	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ
-	else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
 		nohz_balancer_kick(cpu);
 #endif
 }
Index: current/kernel/sched_idletask.c
===================================================================
--- current.orig/kernel/sched_idletask.c
+++ current/kernel/sched_idletask.c
@@ -24,6 +24,8 @@ static struct task_struct *pick_next_tas
 {
 	schedstat_inc(rq, sched_goidle);
 	calc_load_account_idle(rq);
+	reset_first_second_pick_cpu(cpu_of(rq));
+
 	return rq->idle;
 }
 

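As an aside, the set_nohz_next_balance() helper introduced above follows
a check / lock / re-check pattern. Here is a minimal standalone userspace
sketch of that pattern (a pthread mutex standing in for the kernel
spinlock, plain comparisons standing in for the wraparound-safe
time_before()/time_after(), and all names and values invented; the
unlocked read is tolerated here the same way the kernel code tolerates
it):

#include <pthread.h>
#include <stdio.h>

static unsigned long next_balance = 1000;	/* current minimum */
static pthread_mutex_t next_balance_lock = PTHREAD_MUTEX_INITIALIZER;

static void set_next_balance_min(unsigned long candidate)
{
	/* Unlocked fast path: most callers return here. */
	if (candidate > next_balance)
		return;

	pthread_mutex_lock(&next_balance_lock);
	/* Re-check: another thread may have stored a smaller value. */
	if (candidate < next_balance)
		next_balance = candidate;
	pthread_mutex_unlock(&next_balance_lock);
}

int main(void)
{
	set_next_balance_min(700);
	set_next_balance_min(900);	/* rejected: not a new minimum */
	printf("next_balance = %lu\n", next_balance);	/* prints 700 */
	return 0;
}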