lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <BN6PR15MB15531135BB3A0AAEB78A754F9AF60@BN6PR15MB1553.namprd15.prod.outlook.com>
Date:   Tue, 13 Feb 2018 08:14:24 +0000
From:   Jon Maloy <jon.maloy@...csson.com>
To:     Peter Zijlstra <peterz@...radead.org>
CC:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "mingo@...nel.org" <mingo@...nel.org>,
        "David Miller (davem@...emloft.net)" <davem@...emloft.net>,
        Mike Galbraith <umgwanakikbuti@...il.com>,
        "Matt Fleming" <matt@...eblueprint.co.uk>
Subject: RE: Serious performance degradation in Linux 4.15

The person who reported this is on vacation right now. I will be back with more detailed info in two weeks.

///jon

> -----Original Message-----
> From: netdev-owner@...r.kernel.org [mailto:netdev-
> owner@...r.kernel.org] On Behalf Of Peter Zijlstra
> Sent: Monday, February 12, 2018 16:17
> To: Jon Maloy <jon.maloy@...csson.com>
> Cc: netdev@...r.kernel.org; mingo@...nel.org; David Miller
> (davem@...emloft.net) <davem@...emloft.net>; Mike Galbraith
> <umgwanakikbuti@...il.com>; Matt Fleming <matt@...eblueprint.co.uk>
> Subject: Re: Serious performance degradation in Linux 4.15
> 
> On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote:
> > Command for TCP:
> > "netperf TCP_STREAM  (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
> > Command for TIPC:
> > "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
> 
> That looks like identical tests to me. And my netperf (debian testing) doesn't
> appear to have -t TIPC_STREAM.
> 
> Please try a coherent report and I'll have another look. Don't (again) forget to
> mention what kind of setup you're running this on.
> 
> 
> On my IVB-EP (2 sockets, 10 cores, 2 threads), performance cpufreq, PTI=n
> RETPOLINE=n, I get:
> 
> 
> CPUS=`grep -c ^processor /proc/cpuinfo`
> 
> for test in TCP_STREAM
> do
>         for i in 1 $((CPUS/4)) $((CPUS/2)) $((CPUS)) $((CPUS*2))
>         do
>                 echo -n $test-$i ": "
> 
>                 (
>                   for ((j=0; j<i; j++))
>                   do
>                         netperf -t $test -4 -c -C -l 60 -P0 | head -1 &
>                   done
> 
>                   wait
>                 ) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }'
>         done
> done
> 
> 
> 
> NO_WA_OLD WA_IDLE WA_WEIGHT:
> 
> TCP_STREAM-1 : Avg: 44139.8
> TCP_STREAM-10 : Avg: 27301.6
> TCP_STREAM-20 : Avg: 12701.5
> TCP_STREAM-40 : Avg: 5711.62
> TCP_STREAM-80 : Avg: 2870.16
> 
> 
> WA_OLD NO_WA_IDLE NO_WA_WEIGHT:
> 
> TCP_STREAM-1 : Avg: 25293.1
> TCP_STREAM-10 : Avg: 28196.3
> TCP_STREAM-20 : Avg: 12463.7
> TCP_STREAM-40 : Avg: 5566.83
> TCP_STREAM-80 : Avg: 2630.03
> 
> ---
>  include/linux/sched/topology.h |  4 ++
>  kernel/sched/fair.c            | 99 +++++++++++++++++++++++++++++++++++++-----
>  kernel/sched/features.h        |  2 +
>  3 files changed, 93 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 26347741ba50..2cb74343c252 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -72,6 +72,10 @@ struct sched_domain_shared {
>  	atomic_t	ref;
>  	atomic_t	nr_busy_cpus;
>  	int		has_idle_cores;
> +
> +	unsigned long	nr_running;
> +	unsigned long	load;
> +	unsigned long	capacity;
>  };
> 
>  struct sched_domain {
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5eb3ffc9be84..4a561311241a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p)
>  	return 1;
>  }
> 
> +struct llc_stats {
> +	unsigned long nr_running;
> +	unsigned long load;
> +	unsigned long capacity;
> +	int		has_capacity;
> +};
> +
> +static bool get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> +	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> +	if (!sds)
> +		return false;
> +
> +	stats->nr_running = READ_ONCE(sds->nr_running);
> +	stats->load	  = READ_ONCE(sds->load);
> +	stats->capacity	  = READ_ONCE(sds->capacity);
> +	stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
> +
> +	return true;
> +}
> +
> +static int
> +wake_affine_old(struct sched_domain *sd, struct task_struct *p,
> +		int this_cpu, int prev_cpu, int sync)
> +{
> +	struct llc_stats prev_stats, this_stats;
> +	s64 this_eff_load, prev_eff_load;
> +	unsigned long task_load;
> +
> +	if (!get_llc_stats(&prev_stats, prev_cpu) ||
> +	    !get_llc_stats(&this_stats, this_cpu))
> +		return nr_cpumask_bits;
> +
> +	if (sync) {
> +		unsigned long current_load = task_h_load(current);
> +		if (current_load > this_stats.load)
> +			return this_cpu;
> +
> +		this_stats.load -= current_load;
> +	}
> +
> +	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> +		return nr_cpumask_bits;
> +
> +	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
> +		return this_cpu;
> +
> +	task_load = task_h_load(p);
> +
> +	this_eff_load = 100;
> +	this_eff_load *= prev_stats.capacity;
> +
> +	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> +	prev_eff_load *= this_stats.capacity;
> +
> +	this_eff_load *= this_stats.load + task_load;
> +	prev_eff_load *= prev_stats.load - task_load;
> +
> +	return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
> +}
> +
>  /*
>   * The purpose of wake_affine() is to quickly determine on which CPU we can run
>   * soonest. For the purpose of speed we only consider the waking and previous
> @@ -5756,6 +5818,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
>  	int this_cpu = smp_processor_id();
>  	int target = nr_cpumask_bits;
> 
> +	if (sched_feat(WA_OLD))
> +		target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync);
> +
>  	if (sched_feat(WA_IDLE))
>  		target = wake_affine_idle(this_cpu, prev_cpu, sync);
> 
> @@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  		return prev;
> 
>  	/* Check a recently used CPU as a potential idle candidate */
> -	recent_used_cpu = p->recent_used_cpu;
> -	if (recent_used_cpu != prev &&
> -	    recent_used_cpu != target &&
> -	    cpus_share_cache(recent_used_cpu, target) &&
> -	    idle_cpu(recent_used_cpu) &&
> -	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
> -		/*
> -		 * Replace recent_used_cpu with prev as it is a potential
> -		 * candidate for the next wake.
> -		 */
> -		p->recent_used_cpu = prev;
> -		return recent_used_cpu;
> +	if (sched_feat(SIS_RECENT)) {
> +		recent_used_cpu = p->recent_used_cpu;
> +		if (recent_used_cpu != prev &&
> +		    recent_used_cpu != target &&
> +		    cpus_share_cache(recent_used_cpu, target) &&
> +		    idle_cpu(recent_used_cpu) &&
> +		    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
> +			/*
> +			 * Replace recent_used_cpu with prev as it is a potential
> +			 * candidate for the next wake.
> +			 */
> +			p->recent_used_cpu = prev;
> +			return recent_used_cpu;
> +		}
>  	}
> 
>  	sd = rcu_dereference(per_cpu(sd_llc, target));
> @@ -7961,6 +8028,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
>   */
>  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
>  {
> +	struct sched_domain_shared *shared = env->sd->shared;
>  	struct sched_domain *child = env->sd->child;
>  	struct sched_group *sg = env->sd->groups;
>  	struct sg_lb_stats *local = &sds->local_stat;
> @@ -8032,6 +8100,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>  		if (env->dst_rq->rd->overload != overload)
>  			env->dst_rq->rd->overload = overload;
>  	}
> +
> +	if (!shared)
> +		return;
> +
> +	WRITE_ONCE(shared->nr_running, sds->total_running);
> +	WRITE_ONCE(shared->load, sds->total_load);
> +	WRITE_ONCE(shared->capacity, sds->total_capacity);
>  }
> 
>  /**
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 9552fd5854bf..bdb0a66caaae 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>   */
>  SCHED_FEAT(SIS_AVG_CPU, false)
>  SCHED_FEAT(SIS_PROP, true)
> +SCHED_FEAT(SIS_RECENT, true)
> 
>  /*
>   * Issue a WARN when we do multiple update_rq_clock() calls
> @@ -82,6 +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
>  SCHED_FEAT(LB_MIN, false)
>  SCHED_FEAT(ATTACH_AGE_LOAD, true)
> 
> +SCHED_FEAT(WA_OLD, false)
>  SCHED_FEAT(WA_IDLE, true)
>  SCHED_FEAT(WA_WEIGHT, true)
>  SCHED_FEAT(WA_BIAS, true)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ