Message-ID: <20180212151642.GU25201@hirez.programming.kicks-ass.net>
Date: Mon, 12 Feb 2018 16:16:42 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Jon Maloy <jon.maloy@...csson.com>
Cc: "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"mingo@...nel.org" <mingo@...nel.org>,
"David Miller (davem@...emloft.net)" <davem@...emloft.net>,
Mike Galbraith <umgwanakikbuti@...il.com>,
Matt Fleming <matt@...eblueprint.co.uk>
Subject: Re: Serious performance degradation in Linux 4.15
On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote:
> Command for TCP:
> "netperf TCP_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
> Command for TIPC:
> "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
Those look like identical tests to me. And my netperf (Debian testing)
doesn't appear to have -t TIPC_STREAM.
Please send a coherent report and I'll have another look. Don't forget
(again) to mention what kind of setup you're running this on.
On my IVB-EP (2 sockets, 10 cores/socket, 2 threads/core; 40 CPUs total),
with the performance cpufreq governor, PTI=n and RETPOLINE=n, I get:
CPUS=`grep -c ^processor /proc/cpuinfo`
for test in TCP_STREAM
do
        for i in 1 $((CPUS/4)) $((CPUS/2)) $((CPUS)) $((CPUS*2))
        do
                echo -n $test-$i ": "
                (
                        # launch $i concurrent netperf streams, 60 seconds each
                        for ((j=0; j<i; j++))
                        do
                                netperf -t $test -4 -c -C -l 60 -P0 | head -1 &
                        done
                        wait
                ) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }' # mean of throughput column
        done
done
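Aside: each inner subshell starts $i concurrent netperf streams and the
awk averages the throughput column (field 5 of netperf's -P0 output).
One way to get the performance cpufreq setup mentioned above is the
cpupower utility (a sketch; assumes cpupower is installed):

  # pin all CPUs to the "performance" governor, then verify the policy
  cpupower frequency-set -g performance
  cpupower frequency-info -p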
NO_WA_OLD WA_IDLE WA_WEIGHT:
TCP_STREAM-1 : Avg: 44139.8
TCP_STREAM-10 : Avg: 27301.6
TCP_STREAM-20 : Avg: 12701.5
TCP_STREAM-40 : Avg: 5711.62
TCP_STREAM-80 : Avg: 2870.16
WA_OLD NO_WA_IDLE NO_WA_WEIGHT:
TCP_STREAM-1 : Avg: 25293.1
TCP_STREAM-10 : Avg: 28196.3
TCP_STREAM-20 : Avg: 12463.7
TCP_STREAM-40 : Avg: 5566.83
TCP_STREAM-80 : Avg: 2630.03
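The two blocks above are scheduler feature combinations; with the patch
below applied they can be flipped at runtime through the sched_features
debugfs file (a sketch; assumes debugfs is mounted at /sys/kernel/debug):

  # show the current features (patch default: NO_WA_OLD WA_IDLE WA_WEIGHT)
  cat /sys/kernel/debug/sched_features

  # switch to the old path, i.e. WA_OLD NO_WA_IDLE NO_WA_WEIGHT
  echo WA_OLD > /sys/kernel/debug/sched_features
  echo NO_WA_IDLE > /sys/kernel/debug/sched_features
  echo NO_WA_WEIGHT > /sys/kernel/debug/sched_features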
---
include/linux/sched/topology.h | 4 ++
kernel/sched/fair.c | 99 +++++++++++++++++++++++++++++++++++++-----
kernel/sched/features.h | 2 +
3 files changed, 93 insertions(+), 12 deletions(-)
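In short, the patch below resurrects the old (pre-4.15) LLC-statistics
based wake_affine() as a scheduler feature WA_OLD (default off), makes
the recent_used_cpu shortcut in select_idle_sibling() toggleable via a
new SIS_RECENT feature (default on), and has update_sd_lb_stats()
publish the nr_running/load/capacity aggregates the old path consumes.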
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 26347741ba50..2cb74343c252 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,10 @@ struct sched_domain_shared {
         atomic_t ref;
         atomic_t nr_busy_cpus;
         int has_idle_cores;
+
+        unsigned long nr_running;
+        unsigned long load;
+        unsigned long capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..4a561311241a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p)
         return 1;
 }
 
+struct llc_stats {
+        unsigned long nr_running;
+        unsigned long load;
+        unsigned long capacity;
+        int has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+        struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+        if (!sds)
+                return false;
+
+        stats->nr_running = READ_ONCE(sds->nr_running);
+        stats->load = READ_ONCE(sds->load);
+        stats->capacity = READ_ONCE(sds->capacity);
+        stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+        return true;
+}
+
+static int
+wake_affine_old(struct sched_domain *sd, struct task_struct *p,
+                int this_cpu, int prev_cpu, int sync)
+{
+        struct llc_stats prev_stats, this_stats;
+        s64 this_eff_load, prev_eff_load;
+        unsigned long task_load;
+
+        if (!get_llc_stats(&prev_stats, prev_cpu) ||
+            !get_llc_stats(&this_stats, this_cpu))
+                return nr_cpumask_bits;
+
+        if (sync) {
+                unsigned long current_load = task_h_load(current);
+                if (current_load > this_stats.load)
+                        return this_cpu;
+
+                this_stats.load -= current_load;
+        }
+
+        if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+                return nr_cpumask_bits;
+
+        if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+                return this_cpu;
+
+        task_load = task_h_load(p);
+
+        this_eff_load = 100;
+        this_eff_load *= prev_stats.capacity;
+
+        prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+        prev_eff_load *= this_stats.capacity;
+
+        this_eff_load *= this_stats.load + task_load;
+        prev_eff_load *= prev_stats.load - task_load;
+
+        return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+}
+
 /*
  * The purpose of wake_affine() is to quickly determine on which CPU we can run
  * soonest. For the purpose of speed we only consider the waking and previous
@@ -5756,6 +5818,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         int this_cpu = smp_processor_id();
         int target = nr_cpumask_bits;
 
+        if (sched_feat(WA_OLD))
+                target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync);
+
         if (sched_feat(WA_IDLE))
                 target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
@@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
                 return prev;
 
         /* Check a recently used CPU as a potential idle candidate */
-        recent_used_cpu = p->recent_used_cpu;
-        if (recent_used_cpu != prev &&
-            recent_used_cpu != target &&
-            cpus_share_cache(recent_used_cpu, target) &&
-            idle_cpu(recent_used_cpu) &&
-            cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
-                /*
-                 * Replace recent_used_cpu with prev as it is a potential
-                 * candidate for the next wake.
-                 */
-                p->recent_used_cpu = prev;
-                return recent_used_cpu;
+        if (sched_feat(SIS_RECENT)) {
+                recent_used_cpu = p->recent_used_cpu;
+                if (recent_used_cpu != prev &&
+                    recent_used_cpu != target &&
+                    cpus_share_cache(recent_used_cpu, target) &&
+                    idle_cpu(recent_used_cpu) &&
+                    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+                        /*
+                         * Replace recent_used_cpu with prev as it is a potential
+                         * candidate for the next wake.
+                         */
+                        p->recent_used_cpu = prev;
+                        return recent_used_cpu;
+                }
         }
 
         sd = rcu_dereference(per_cpu(sd_llc, target));
@@ -7961,6 +8028,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+        struct sched_domain_shared *shared = env->sd->shared;
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
@@ -8032,6 +8100,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
                 if (env->dst_rq->rd->overload != overload)
                         env->dst_rq->rd->overload = overload;
         }
+
+        if (!shared)
+                return;
+
+        WRITE_ONCE(shared->nr_running, sds->total_running);
+        WRITE_ONCE(shared->load, sds->total_load);
+        WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..bdb0a66caaae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_RECENT, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
@@ -82,6 +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_OLD, false)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
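For intuition: the final WA_OLD comparison is cross-multiplied
load-per-capacity, with the imbalance bias inflating the previous CPU's
side so that affine wakeups are slightly preferred. The same arithmetic
with made-up numbers (equal capacities of 1024, imbalance_pct 125, this
LLC load 2048, prev LLC load 3072, task load 512):

  awk 'BEGIN {
        imb = 125; this_cap = 1024; prev_cap = 1024;
        this_load = 2048; prev_load = 3072; task_load = 512;
        # this_eff_load = 100 * prev_capacity * (this_load + task_load)
        this_eff = 100 * prev_cap * (this_load + task_load);
        # prev_eff_load = (100 + (imbalance_pct - 100)/2) * this_capacity * (prev_load - task_load)
        prev_eff = (100 + int((imb - 100) / 2)) * this_cap * (prev_load - task_load);
        # prints: 262144000 293601280 this_cpu
        print this_eff, prev_eff, (this_eff <= prev_eff ? "this_cpu" : "prev_cpu");
  }'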