Message-Id: <c47155ca-0e51-0060-c8c2-11588a850a06@linux.vnet.ibm.com>
Date: Wed, 27 Sep 2017 12:27:50 -0400
From: Eric Farman <farman@...ux.vnet.ibm.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Rik van Riel <riel@...hat.com>,
王金浦 <jinpuwang@...il.com>,
LKML <linux-kernel@...r.kernel.org>,
Ingo Molnar <mingo@...hat.com>,
Christian Borntraeger <borntraeger@...ibm.com>,
"KVM-ML (kvm@...r.kernel.org)" <kvm@...r.kernel.org>,
vcaputo@...garu.com, Matthew Rosato <mjrosato@...ux.vnet.ibm.com>
Subject: Re: sysbench throughput degradation in 4.13+

On 09/27/2017 05:35 AM, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 12:12:45PM -0400, Eric Farman wrote:
>>
>> MySQL. We've tried a few different configs with both test=oltp and
>> test=threads, but both show the same behavior. What I have settled on for
>> my repro is the following:
>>
>
> Right, didn't even need to run it in a guest to observe a regression.
>
> So the below cures native sysbench and NAS bench for me; does it also
> work for your virt thingy?
>
Ran a quick test this morning with 4.13.0 + 90001d67be2f + a731ebe6f17b,
both with and without this patch. An oltp sysbench run shows that guest
cpu migrations decreased significantly, from ~27K to ~2K over 5 seconds.
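For reference, here is a rough standalone sketch of one way to sample a
task's migration count, assuming CONFIG_SCHED_DEBUG is enabled and the
vCPU thread's PID is known; the parsing below and the 5-second window
are illustrative, not the exact method behind the numbers above.

/*
 * migr_sample.c - illustrative sketch only: read the per-task
 * migration counter exposed in /proc/<pid>/sched (requires
 * CONFIG_SCHED_DEBUG) twice, 5 seconds apart, and print the delta.
 *
 * Usage: ./migr_sample <pid-of-vcpu-thread>
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Pull the "se.nr_migrations : N" line out of /proc/<pid>/sched. */
static long nr_migrations(const char *pid)
{
	char path[64], line[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/sched", pid);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		exit(1);
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "se.nr_migrations", 16)) {
			sscanf(line, "%*s : %ld", &val);
			break;
		}
	}
	fclose(f);
	return val;
}

int main(int argc, char **argv)
{
	long before, after;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	before = nr_migrations(argv[1]);
	sleep(5);
	after = nr_migrations(argv[1]);
	printf("migrations over 5s: %ld\n", after - before);
	return 0;
}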
So, we applied this patch to linux-next (next-20170926) and ran it
against a couple of sysbench tests:
--test=oltp
Baseline: 5655.26 transactions/second
Patched: 9618.13 transactions/second
--test=threads
Baseline: 25482.9 events/sec
Patched: 29577.9 events/sec
That's good! With that...
Tested-by: Eric Farman <farman@...ux.vnet.ibm.com>
Thanks!
- Eric
>
> PRE (current tip/master):
>
> ivb-ex sysbench:
>
> 2: [30 secs] transactions: 64110 (2136.94 per sec.)
> 5: [30 secs] transactions: 143644 (4787.99 per sec.)
> 10: [30 secs] transactions: 274298 (9142.93 per sec.)
> 20: [30 secs] transactions: 418683 (13955.45 per sec.)
> 40: [30 secs] transactions: 320731 (10690.15 per sec.)
> 80: [30 secs] transactions: 355096 (11834.28 per sec.)
>
> hsw-ex NAS:
>
> OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01
> OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89
> OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93
> lu.C.x_threads_144_run_1.log: Time in seconds = 434.68
> lu.C.x_threads_144_run_2.log: Time in seconds = 405.36
> lu.C.x_threads_144_run_3.log: Time in seconds = 433.83
>
>
> POST (+patch):
>
> ivb-ex sysbench:
>
> 2: [30 secs] transactions: 64494 (2149.75 per sec.)
> 5: [30 secs] transactions: 145114 (4836.99 per sec.)
> 10: [30 secs] transactions: 278311 (9276.69 per sec.)
> 20: [30 secs] transactions: 437169 (14571.60 per sec.)
> 40: [30 secs] transactions: 669837 (22326.73 per sec.)
> 80: [30 secs] transactions: 631739 (21055.88 per sec.)
>
> hsw-ex NAS:
>
> lu.C.x_threads_144_run_1.log: Time in seconds = 23.36
> lu.C.x_threads_144_run_2.log: Time in seconds = 22.96
> lu.C.x_threads_144_run_3.log: Time in seconds = 22.52
>
>
> This patch takes out all the shiny wake_affine stuff and goes back to
> utter basics. Rik, was there another NUMA benchmark that wanted your
> fancy stuff? Because NAS isn't it.
>
> (the previous, slightly fancier wake_affine was basically a !idle
> extension of this, in that it would pick the 'shortest' of the 2 queues
> and thereby run quickest, in approximation)
>
> I'll try and run a number of other benchmarks I have around to see if
> there's anything that shows a difference between the below trivial
> wake_affine and the old 2-cpu-load one.
>
> ---
>  include/linux/sched/topology.h |   8 ---
>  kernel/sched/fair.c            | 125 ++---------------------------------------
>  2 files changed, 6 insertions(+), 127 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index d7b6dab956ec..7d065abc7a47 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -71,14 +71,6 @@ struct sched_domain_shared {
> atomic_t ref;
> atomic_t nr_busy_cpus;
> int has_idle_cores;
> -
> - /*
> - * Some variables from the most recent sd_lb_stats for this domain,
> - * used by wake_affine().
> - */
> - unsigned long nr_running;
> - unsigned long load;
> - unsigned long capacity;
> };
>
> struct sched_domain {
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 70ba32e08a23..66930ce338af 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5356,115 +5356,19 @@ static int wake_wide(struct task_struct *p)
> return 1;
> }
>
> -struct llc_stats {
> - unsigned long nr_running;
> - unsigned long load;
> - unsigned long capacity;
> - int has_capacity;
> -};
> -
> -static bool get_llc_stats(struct llc_stats *stats, int cpu)
> -{
> - struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> -
> - if (!sds)
> - return false;
> -
> - stats->nr_running = READ_ONCE(sds->nr_running);
> - stats->load = READ_ONCE(sds->load);
> - stats->capacity = READ_ONCE(sds->capacity);
> - stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
> -
> - return true;
> -}
> -
> -/*
> - * Can a task be moved from prev_cpu to this_cpu without causing a load
> - * imbalance that would trigger the load balancer?
> - *
> - * Since we're running on 'stale' values, we might in fact create an imbalance
> - * but recomputing these values is expensive, as that'd mean iteration 2 cache
> - * domains worth of CPUs.
> - */
> -static bool
> -wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> - int this_cpu, int prev_cpu, int sync)
> -{
> - struct llc_stats prev_stats, this_stats;
> - s64 this_eff_load, prev_eff_load;
> - unsigned long task_load;
> -
> - if (!get_llc_stats(&prev_stats, prev_cpu) ||
> - !get_llc_stats(&this_stats, this_cpu))
> - return false;
> -
> - /*
> - * If sync wakeup then subtract the (maximum possible)
> - * effect of the currently running task from the load
> - * of the current LLC.
> - */
> - if (sync) {
> - unsigned long current_load = task_h_load(current);
> -
> - /* in this case load hits 0 and this LLC is considered 'idle' */
> - if (current_load > this_stats.load)
> - return true;
> -
> - this_stats.load -= current_load;
> - }
> -
> - /*
> - * The has_capacity stuff is not SMT aware, but by trying to balance
> - * the nr_running on both ends we try and fill the domain at equal
> - * rates, thereby first consuming cores before siblings.
> - */
> -
> - /* if the old cache has capacity, stay there */
> - if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> - return false;
> -
> - /* if this cache has capacity, come here */
> - if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
> - return true;
> -
> - /*
> - * Check to see if we can move the load without causing too much
> - * imbalance.
> - */
> - task_load = task_h_load(p);
> -
> - this_eff_load = 100;
> - this_eff_load *= prev_stats.capacity;
> -
> - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> - prev_eff_load *= this_stats.capacity;
> -
> - this_eff_load *= this_stats.load + task_load;
> - prev_eff_load *= prev_stats.load - task_load;
> -
> - return this_eff_load <= prev_eff_load;
> -}
> -
> static int wake_affine(struct sched_domain *sd, struct task_struct *p,
> int prev_cpu, int sync)
> {
> int this_cpu = smp_processor_id();
> - bool affine;
> -
> - /*
> - * Default to no affine wakeups; wake_affine() should not effect a task
> - * placement the load-balancer feels inclined to undo. The conservative
> - * option is therefore to not move tasks when they wake up.
> - */
> - affine = false;
> + bool affine = false;
>
> /*
> - * If the wakeup is across cache domains, try to evaluate if movement
> - * makes sense, otherwise rely on select_idle_siblings() to do
> - * placement inside the cache domain.
> + * If we can run _now_ on the waking CPU, go there, otherwise meh.
> */
> - if (!cpus_share_cache(prev_cpu, this_cpu))
> - affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
> + if (idle_cpu(this_cpu))
> + affine = true;
> + else if (sync && cpu_rq(this_cpu)->nr_running == 1)
> + affine = true;
>
> schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
> if (affine) {
> @@ -7600,7 +7504,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
> */
> static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
> {
> - struct sched_domain_shared *shared = env->sd->shared;
> struct sched_domain *child = env->sd->child;
> struct sched_group *sg = env->sd->groups;
> struct sg_lb_stats *local = &sds->local_stat;
> @@ -7672,22 +7575,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> if (env->dst_rq->rd->overload != overload)
> env->dst_rq->rd->overload = overload;
> }
> -
> - if (!shared)
> - return;
> -
> - /*
> - * Since these are sums over groups they can contain some CPUs
> - * multiple times for the NUMA domains.
> - *
> - * Currently only wake_affine_llc() and find_busiest_group()
> - * uses these numbers, only the last is affected by this problem.
> - *
> - * XXX fix that.
> - */
> - WRITE_ONCE(shared->nr_running, sds->total_running);
> - WRITE_ONCE(shared->load, sds->total_load);
> - WRITE_ONCE(shared->capacity, sds->total_capacity);
> }
>
> /**
>
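For readers comparing the removed heuristic with its replacement, the
following userspace sketch restates both decision rules with invented
inputs (the loads, capacities, and the imbalance_pct value are made up
for illustration; only the arithmetic mirrors the patch above):

/*
 * wake_affine_demo.c - illustration only, not kernel code: contrast
 * the old effective-load rule removed by the patch with the new
 * "is the waking CPU idle / about to be idle" rule it adds.
 */
#include <stdbool.h>
#include <stdio.h>

/* Old rule (simplified from the removed wake_affine_llc()): accept
 * the move if this side's load-after-move, cross-scaled by the other
 * side's capacity, stays within imbalance_pct of prev's. */
static bool old_rule(long this_load, long prev_load, long task_load,
		     long this_cap, long prev_cap, int imbalance_pct)
{
	long long this_eff = 100LL * prev_cap * (this_load + task_load);
	long long prev_eff = (100LL + (imbalance_pct - 100) / 2) *
			     this_cap * (prev_load - task_load);

	return this_eff <= prev_eff;
}

/* New rule (the whole of the patched wake_affine()): pull the task
 * if the waking CPU is idle, or if this is a sync wakeup and the
 * waker is the only runnable task there. */
static bool new_rule(int this_nr_running, bool sync)
{
	if (this_nr_running == 0)	/* stand-in for idle_cpu() */
		return true;
	if (sync && this_nr_running == 1)
		return true;
	return false;
}

int main(void)
{
	/* Invented example: two equally capable LLCs (capacity 1024),
	 * the waking side carries more load but its CPU is idle, so
	 * the two rules disagree. */
	printf("old rule says: %s\n",
	       old_rule(600, 500, 100, 1024, 1024, 117) ? "move" : "stay");
	printf("new rule says: %s\n",
	       new_rule(0, false) ? "move" : "stay");
	return 0;
}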