lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180212151642.GU25201@hirez.programming.kicks-ass.net>
Date:   Mon, 12 Feb 2018 16:16:42 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Jon Maloy <jon.maloy@...csson.com>
Cc:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "mingo@...nel.org" <mingo@...nel.org>,
        "David Miller (davem@...emloft.net)" <davem@...emloft.net>,
        Mike Galbraith <umgwanakikbuti@...il.com>,
        Matt Fleming <matt@...eblueprint.co.uk>
Subject: Re: Serious performance degradation in Linux 4.15

On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote:
> Command for TCP:
> "netperf TCP_STREAM  (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
> Command for TIPC:
> "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"

That looks like identical tests to me. And my netperf (debian testing)
doesn't appear to have -t TIPC_STREAM.

Please try a coherent report and I'll have another look. Don't (again)
forget to mention what kind of setup you're running this on.


On my IVB-EP (2 sockets, 10 cores, 2 threads), performance cpufreq,
PTI=n RETPOLINE=n, I get:


# Scale the number of concurrent netperf streams with the machine's CPU count.
CPUS=$(grep -c '^processor' /proc/cpuinfo)

for bench in TCP_STREAM
do
	# Sweep stream counts: 1, quarter, half, full and double the CPU count.
	for nstreams in 1 $((CPUS / 4)) $((CPUS / 2)) $CPUS $((CPUS * 2))
	do
		printf '%s-%s : ' "$bench" "$nstreams"

		# Run $nstreams 60-second netperf instances in parallel; with
		# -P0 each emits a single result line, which head passes on.
		# awk then averages column 5 (the throughput field).
		(
			j=0
			while [ "$j" -lt "$nstreams" ]
			do
				netperf -t $bench -4 -c -C -l 60 -P0 | head -1 &
				j=$((j + 1))
			done

			wait
		) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }'
	done
done



NO_WA_OLD WA_IDLE WA_WEIGHT:

TCP_STREAM-1 : Avg: 44139.8
TCP_STREAM-10 : Avg: 27301.6
TCP_STREAM-20 : Avg: 12701.5
TCP_STREAM-40 : Avg: 5711.62
TCP_STREAM-80 : Avg: 2870.16


WA_OLD NO_WA_IDLE NO_WA_WEIGHT:

TCP_STREAM-1 : Avg: 25293.1
TCP_STREAM-10 : Avg: 28196.3
TCP_STREAM-20 : Avg: 12463.7
TCP_STREAM-40 : Avg: 5566.83
TCP_STREAM-80 : Avg: 2630.03

---
 include/linux/sched/topology.h |  4 ++
 kernel/sched/fair.c            | 99 +++++++++++++++++++++++++++++++++++++-----
 kernel/sched/features.h        |  2 +
 3 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 26347741ba50..2cb74343c252 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,10 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
+
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..4a561311241a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
+struct llc_stats {
+	unsigned long nr_running;
+	unsigned long load;
+	unsigned long capacity;
+	int		has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+	if (!sds)
+		return false;
+
+	stats->nr_running = READ_ONCE(sds->nr_running);
+	stats->load	  = READ_ONCE(sds->load);
+	stats->capacity	  = READ_ONCE(sds->capacity);
+	stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+	return true;
+}
+
+static int
+wake_affine_old(struct sched_domain *sd, struct task_struct *p,
+		int this_cpu, int prev_cpu, int sync)
+{
+	struct llc_stats prev_stats, this_stats;
+	s64 this_eff_load, prev_eff_load;
+	unsigned long task_load;
+
+	if (!get_llc_stats(&prev_stats, prev_cpu) ||
+	    !get_llc_stats(&this_stats, this_cpu))
+		return nr_cpumask_bits;
+
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+		if (current_load > this_stats.load)
+			return this_cpu;
+
+		this_stats.load -= current_load;
+	}
+
+	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+		return nr_cpumask_bits;
+
+	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+		return this_cpu;
+
+	task_load = task_h_load(p);
+
+	this_eff_load = 100;
+	this_eff_load *= prev_stats.capacity;
+
+	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= this_stats.capacity;
+
+	this_eff_load *= this_stats.load + task_load;
+	prev_eff_load *= prev_stats.load - task_load;
+
+	return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+}
+
 /*
  * The purpose of wake_affine() is to quickly determine on which CPU we can run
  * soonest. For the purpose of speed we only consider the waking and previous
@@ -5756,6 +5818,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	int this_cpu = smp_processor_id();
 	int target = nr_cpumask_bits;
 
+	if (sched_feat(WA_OLD))
+		target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync);
+
 	if (sched_feat(WA_IDLE))
 		target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
@@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate */
-	recent_used_cpu = p->recent_used_cpu;
-	if (recent_used_cpu != prev &&
-	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
-	    idle_cpu(recent_used_cpu) &&
-	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
-		/*
-		 * Replace recent_used_cpu with prev as it is a potential
-		 * candidate for the next wake.
-		 */
-		p->recent_used_cpu = prev;
-		return recent_used_cpu;
+	if (sched_feat(SIS_RECENT)) {
+		recent_used_cpu = p->recent_used_cpu;
+		if (recent_used_cpu != prev &&
+		    recent_used_cpu != target &&
+		    cpus_share_cache(recent_used_cpu, target) &&
+		    idle_cpu(recent_used_cpu) &&
+		    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+			/*
+			 * Replace recent_used_cpu with prev as it is a potential
+			 * candidate for the next wake.
+			 */
+			p->recent_used_cpu = prev;
+			return recent_used_cpu;
+		}
 	}
 
 	sd = rcu_dereference(per_cpu(sd_llc, target));
@@ -7961,6 +8028,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -8032,6 +8100,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
+
+	if (!shared)
+		return;
+
+	WRITE_ONCE(shared->nr_running, sds->total_running);
+	WRITE_ONCE(shared->load, sds->total_load);
+	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..bdb0a66caaae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_RECENT, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
@@ -82,6 +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_OLD, false)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ