linux-kernel - Re: [PATCH v3 3/3] sched: update blocked load when newly idle

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon, 12 Feb 2018 13:04:11 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Vincent Guittot <vincent.guittot@...aro.org>
Cc:     mingo@...nel.org, linux-kernel@...r.kernel.org,
        valentin.schneider@....com, morten.rasmussen@...s.arm.com,
        brendan.jackman@....com, dietmar.eggemann@....com
Subject: Re: [PATCH v3 3/3] sched: update blocked load when newly idle

On Mon, Feb 12, 2018 at 09:07:54AM +0100, Vincent Guittot wrote:
> @@ -8863,12 +8866,26 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
>  
>  	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
>  	    !this_rq->rd->overload) {
> +		unsigned long has_blocked = READ_ONCE(nohz.has_blocked);
> +		unsigned long next_blocked = READ_ONCE(nohz.next_blocked);
> +
>  		rcu_read_lock();
>  		sd = rcu_dereference_check_sched_domain(this_rq->sd);
>  		if (sd)
>  			update_next_balance(sd, &next_balance);
>  		rcu_read_unlock();
>  
> +		/*
> +		 * Update blocked idle load if it has not been done for a
> +		 * while. Try to do it locally before entering idle but kick a
> +		 * ilb if it takes too much time and/or might delay next local
> +		 * wake up
> +		 */
> +		if (has_blocked && time_after_eq(jiffies, next_blocked) &&
> +				(this_rq->avg_idle < sysctl_sched_migration_cost ||
> +				!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)))
> +			kick_ilb(NOHZ_STATS_KICK);
> +
>  		goto out;
>  	}

So I really hate this one, also I suspect its broken, because we do this
check before dropping rq->lock and _nohz_idle_balance() will take
rq->lock.

Aside from the above being an unreadable mess, I dislike that it breaks
the various isolation crud, we should not touch CPUs outside of our
domain.

Maybe something like the below? (unfinished)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7839,7 +7839,7 @@ group_type group_classify(struct sched_g
 	return group_other;
 }
 
-static bool update_nohz_stats(struct rq *rq)
+static bool update_nohz_stats(struct rq *rq, bool force)
 {
 #ifdef CONFIG_NO_HZ_COMMON
 	unsigned int cpu = rq->cpu;
@@ -7850,7 +7850,7 @@ static bool update_nohz_stats(struct rq
 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
 		return false;
 
-	if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
 		return true;
 
 	update_blocked_averages(cpu);
@@ -7883,7 +7883,7 @@ static inline void update_sg_lb_stats(st
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq))
+		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
 			env->flags |= LBF_NOHZ_AGAIN;
 
 		/* Bias balancing toward cpus of our domain */
@@ -8857,6 +8857,29 @@ update_next_balance(struct sched_domain
 		*next_balance = next;
 }
 
+static int nohz_age(struct sched_domain *sd)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
+	bool has_blocked_load;
+
+	WRITE_ONCE(nohz.has_blocked, 0);
+
+	smp_mb();
+
+	cpumask_and(cpus, sched_domain_span(sd), nohz.idle_cpus_mask);
+
+	has_blocked_load = cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(sd));
+
+	for_each_cpu(cpu, cpus) {
+		struct rq *rq = cpu_rq(cpu);
+
+		has_blocked_load |= update_nohz_stats(rq, true);
+	}
+
+	if (has_blocked_load)
+		WRITE_ONCE(nohz.has_blocked, 1);
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
@@ -8868,6 +8891,7 @@ static int idle_balance(struct rq *this_
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	u64 curr_cost = 0;
+	bool nohz_blocked = false;
 
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
@@ -8889,8 +8913,8 @@ static int idle_balance(struct rq *this_
 	 */
 	rq_unpin_lock(this_rq, rf);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-	    !this_rq->rd->overload) {
+	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+short_idle:
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
 		if (sd)
@@ -8900,6 +8924,18 @@ static int idle_balance(struct rq *this_
 		goto out;
 	}
 
+	if (!this_rq->rd->overload) {
+#ifdef CONFIG_NO_HZ_COMMON
+		unsigned int has_blocked = READ_ONCE(nohz.has_blocked);
+		unsigned long next_blocked = READ_ONE(nohz.next_blocked);
+
+		if (has_blocked && time_after_eq(jiffies, next_blocked))
+			nohz_blocked = true;
+		else
+#endif
+		goto short_idle;
+	}
+
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
@@ -8919,9 +8955,13 @@ static int idle_balance(struct rq *this_
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			t0 = sched_clock_cpu(this_cpu);
 
-			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE,
-						   &continue_balancing);
+			if (nohz_blocked) {
+				nohz_age(sd);
+			} else {
+				pulled_task = load_balance(this_cpu, this_rq,
+						sd, CPU_NEWLY_IDLE,
+						&continue_balancing);
+			}
 
 			domain_cost = sched_clock_cpu(this_cpu) - t0;
 			if (domain_cost > sd->max_newidle_lb_cost)
@@ -9497,8 +9537,7 @@ static bool nohz_idle_balance(struct rq
 
 		rq = cpu_rq(balance_cpu);
 
-		update_blocked_averages(rq->cpu);
-		has_blocked_load |= rq->has_blocked_load;
+		has_blocked_load |= update_nohz_stats(rq, true);
 
 		/*
 		 * If time for next balance is due,