Message-Id: <6359b6d0-1923-8e6b-0d61-a8c2f8b24cf2@linux.vnet.ibm.com>
Date: Thu, 28 Sep 2017 07:04:07 -0400
From: Eric Farman <farman@...ux.vnet.ibm.com>
To: Rik van Riel <riel@...hat.com>,
Peter Zijlstra <peterz@...radead.org>
Cc: 王金浦 <jinpuwang@...il.com>,
LKML <linux-kernel@...r.kernel.org>,
Ingo Molnar <mingo@...hat.com>,
Christian Borntraeger <borntraeger@...ibm.com>,
"KVM-ML (kvm@...r.kernel.org)" <kvm@...r.kernel.org>,
vcaputo@...garu.com, Matthew Rosato <mjrosato@...ux.vnet.ibm.com>
Subject: Re: sysbench throughput degradation in 4.13+
On 09/27/2017 01:58 PM, Rik van Riel wrote:
> On Wed, 27 Sep 2017 11:35:30 +0200
> Peter Zijlstra <peterz@...radead.org> wrote:
>
>> On Fri, Sep 22, 2017 at 12:12:45PM -0400, Eric Farman wrote:
>>>
>>> MySQL. We've tried a few different configs with both test=oltp and
>>> test=threads, but both show the same behavior. What I have settled on for
>>> my repro is the following:
>>>
>>
>> Right, didn't even need to run it in a guest to observe a regression.
>>
>> So the below cures native sysbench and NAS bench for me; does it also
>> work for your virt thingy?
>>
>>
>> PRE (current tip/master):
>>
>> ivb-ex sysbench:
>>
>> 2: [30 secs] transactions: 64110 (2136.94 per sec.)
>> 5: [30 secs] transactions: 143644 (4787.99 per sec.)
>> 10: [30 secs] transactions: 274298 (9142.93 per sec.)
>> 20: [30 secs] transactions: 418683 (13955.45 per sec.)
>> 40: [30 secs] transactions: 320731 (10690.15 per sec.)
>> 80: [30 secs] transactions: 355096 (11834.28 per sec.)
>>
>> hsw-ex NAS:
>>
>> OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds = 18.01
>> OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds = 17.89
>> OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds = 17.93
>> lu.C.x_threads_144_run_1.log: Time in seconds = 434.68
>> lu.C.x_threads_144_run_2.log: Time in seconds = 405.36
>> lu.C.x_threads_144_run_3.log: Time in seconds = 433.83
>>
>>
>> POST (+patch):
>>
>> ivb-ex sysbench:
>>
>> 2: [30 secs] transactions: 64494 (2149.75 per sec.)
>> 5: [30 secs] transactions: 145114 (4836.99 per sec.)
>> 10: [30 secs] transactions: 278311 (9276.69 per sec.)
>> 20: [30 secs] transactions: 437169 (14571.60 per sec.)
>> 40: [30 secs] transactions: 669837 (22326.73 per sec.)
>> 80: [30 secs] transactions: 631739 (21055.88 per sec.)
>>
>> hsw-ex NAS:
>>
>> lu.C.x_threads_144_run_1.log: Time in seconds = 23.36
>> lu.C.x_threads_144_run_2.log: Time in seconds = 22.96
>> lu.C.x_threads_144_run_3.log: Time in seconds = 22.52
>>
>>
>> This patch takes out all the shiny wake_affine stuff and goes back to
>> utter basics. Rik, was there another NUMA benchmark that wanted your
>> fancy stuff? Because NAS isn't it.
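
(Side note for anyone following along without Peter's patch in front of
them: as I read it, "back to utter basics" means dropping the LLC-wide
statistics and deciding the affine wakeup from a plain load comparison
between the previous CPU and the waking CPU.  A rough sketch of that
kind of check, reusing the existing source_load()/target_load() helpers;
this is illustrative only, not Peter's actual code:

static int wake_affine_basic(struct sched_domain *sd, struct task_struct *p,
			     int this_cpu, int prev_cpu)
{
	unsigned long this_load = target_load(this_cpu, sd->wake_idx);
	unsigned long prev_load = source_load(prev_cpu, sd->wake_idx);

	/*
	 * Pull the task to the waking CPU only if, with the task's load
	 * added, it stays within imbalance_pct of the previous CPU's load.
	 */
	return (this_load + task_h_load(p)) * 100 <=
	       prev_load * sd->imbalance_pct;
}

No per-LLC min/max bookkeeping, no cached shared state -- just the two
runqueues involved in the wakeup.)
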
>
> I like the simplicity of your approach! I hope it does not break
> stuff like netperf...
>
> I have been working on the patch below, which is much less optimistic
> about when to do an affine wakeup than before.
>
> It may be worth testing, in case it works better with some workload,
> though relying on cached values still makes me somewhat uneasy.
>
Here are numbers for our environment, to compare the two patches:

sysbench --test=threads:
  next-20170926:      25470.8
  with Peter's patch: 29559.1
  with Rik's patch:   29283

sysbench --test=oltp:
  next-20170926:      5722.37
  with Peter's patch: 9623.45
  with Rik's patch:   9360.59

I didn't record host CPU migrations in all scenarios, but a spot check
showed a similar reduction with both patches.
- Eric
> I will try to get kernels tested here that implement both approaches,
> to see what ends up working best.
>
> ---8<---
> Subject: sched: make wake_affine_llc less eager
>
> With the wake_affine_llc logic, tasks get moved around too eagerly,
> and then moved back later, leading to poor performance for some
> workloads.
>
> Make wake_affine_llc less eager by comparing the minimum load of
> the source LLC with the maximum load of the destination LLC, similar
> to how source_load and target_load work for regular migration.
>
> Also, get rid of an overly optimistic test that could potentially
> pull across a lot of tasks if the target LLC happened to have fewer
> runnable tasks at load balancing time.
>
> Conversely, sync wakeups could happen without taking LLC loads
> into account, if the waker would leave an idle CPU behind on
> the target LLC.
>
> Signed-off-by: Rik van Riel <riel@...hat.com>
>
> ---
> include/linux/sched/topology.h | 3 ++-
> kernel/sched/fair.c | 56 +++++++++++++++++++++++++++++++++---------
> 2 files changed, 46 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index d7b6dab956ec..0c295ff5049b 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -77,7 +77,8 @@ struct sched_domain_shared {
> * used by wake_affine().
> */
> unsigned long nr_running;
> - unsigned long load;
> + unsigned long min_load;
> + unsigned long max_load;
> unsigned long capacity;
> };
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 86195add977f..7740c6776e08 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5239,6 +5239,23 @@ static unsigned long target_load(int cpu, int type)
> return max(rq->cpu_load[type-1], total);
> }
>
> +static void min_max_load(int cpu, unsigned long *min_load,
> + unsigned long *max_load)
> +{
> + struct rq *rq = cpu_rq(cpu);
> + unsigned long minl = ULONG_MAX;
> + unsigned long maxl = 0;
> + int i;
> +
> + for (i = 0; i < CPU_LOAD_IDX_MAX; i++) {
> + minl = min(minl, rq->cpu_load[i]);
> + maxl = max(maxl, rq->cpu_load[i]);
> + }
> +
> + *min_load = minl;
> + *max_load = maxl;
> +}
> +
> static unsigned long capacity_of(int cpu)
> {
> return cpu_rq(cpu)->cpu_capacity;
> @@ -5310,7 +5327,8 @@ static int wake_wide(struct task_struct *p)
>
> struct llc_stats {
> unsigned long nr_running;
> - unsigned long load;
> + unsigned long min_load;
> + unsigned long max_load;
> unsigned long capacity;
> int has_capacity;
> };
> @@ -5323,7 +5341,8 @@ static bool get_llc_stats(struct llc_stats *stats, int cpu)
> return false;
>
> stats->nr_running = READ_ONCE(sds->nr_running);
> - stats->load = READ_ONCE(sds->load);
> + stats->min_load = READ_ONCE(sds->min_load);
> + stats->max_load = READ_ONCE(sds->max_load);
> stats->capacity = READ_ONCE(sds->capacity);
> stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
>
> @@ -5359,10 +5378,14 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> unsigned long current_load = task_h_load(current);
>
> /* in this case load hits 0 and this LLC is considered 'idle' */
> - if (current_load > this_stats.load)
> + if (current_load > this_stats.max_load)
> + return true;
> +
> + /* allow if the CPU would go idle, regardless of LLC load */
> + if (current_load >= target_load(this_cpu, sd->wake_idx))
> return true;
>
> - this_stats.load -= current_load;
> + this_stats.max_load -= current_load;
> }
>
> /*
> @@ -5375,10 +5398,6 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> return false;
>
> - /* if this cache has capacity, come here */
> - if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
> - return true;
> -
> /*
> * Check to see if we can move the load without causing too much
> * imbalance.
> @@ -5391,8 +5410,8 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> prev_eff_load *= this_stats.capacity;
>
> - this_eff_load *= this_stats.load + task_load;
> - prev_eff_load *= prev_stats.load - task_load;
> + this_eff_load *= this_stats.max_load + task_load;
> + prev_eff_load *= prev_stats.min_load - task_load;
>
> return this_eff_load <= prev_eff_load;
> }
> @@ -7033,6 +7052,8 @@ enum group_type {
> struct sg_lb_stats {
> unsigned long avg_load; /*Avg load across the CPUs of the group */
> unsigned long group_load; /* Total load over the CPUs of the group */
> + unsigned long min_load;
> + unsigned long max_load;
> unsigned long sum_weighted_load; /* Weighted load of group's tasks */
> unsigned long load_per_task;
> unsigned long group_capacity;
> @@ -7059,6 +7080,8 @@ struct sd_lb_stats {
> unsigned long total_load; /* Total load of all groups in sd */
> unsigned long total_capacity; /* Total capacity of all groups in sd */
> unsigned long avg_load; /* Average load across all groups in sd */
> + unsigned long min_load; /* Sum of lowest loadavg on CPUs */
> + unsigned long max_load; /* Sum of highest loadavg on CPUs */
>
> struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
> struct sg_lb_stats local_stat; /* Statistics of the local group */
> @@ -7077,6 +7100,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> .local = NULL,
> .total_running = 0UL,
> .total_load = 0UL,
> + .min_load = 0UL,
> + .max_load = 0UL,
> .total_capacity = 0UL,
> .busiest_stat = {
> .avg_load = 0UL,
> @@ -7358,7 +7383,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> int local_group, struct sg_lb_stats *sgs,
> bool *overload)
> {
> - unsigned long load;
> + unsigned long load, min_load, max_load;
> int i, nr_running;
>
> memset(sgs, 0, sizeof(*sgs));
> @@ -7372,7 +7397,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> else
> load = source_load(i, load_idx);
>
> + min_max_load(i, &min_load, &max_load);
> +
> sgs->group_load += load;
> + sgs->min_load += min_load;
> + sgs->max_load += max_load;
> sgs->group_util += cpu_util(i);
> sgs->sum_nr_running += rq->cfs.h_nr_running;
>
> @@ -7569,6 +7598,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> /* Now, start updating sd_lb_stats */
> sds->total_running += sgs->sum_nr_running;
> sds->total_load += sgs->group_load;
> + sds->min_load += sgs->min_load;
> + sds->max_load += sgs->max_load;
> sds->total_capacity += sgs->group_capacity;
>
> sg = sg->next;
> @@ -7596,7 +7627,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> * XXX fix that.
> */
> WRITE_ONCE(shared->nr_running, sds->total_running);
> - WRITE_ONCE(shared->load, sds->total_load);
> + WRITE_ONCE(shared->min_load, sds->min_load);
> + WRITE_ONCE(shared->max_load, sds->max_load);
> WRITE_ONCE(shared->capacity, sds->total_capacity);
> }
>
>
>
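
P.S. To double-check my reading of the final cut-off in Rik's patch
above, here is a toy user-space program that evaluates the same
comparison with made-up numbers.  It assumes equal capacities on both
LLCs (so the capacity factors cancel) and an imbalance_pct of 117 for
the LLC domain -- both are my assumptions, not something taken from the
patch:

#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 117;	/* assumed LLC value */
	unsigned long task_load     = 200;	/* made-up task load */
	unsigned long this_max_load = 800;	/* destination LLC: summed per-CPU max loads */
	unsigned long prev_min_load = 1200;	/* source LLC: summed per-CPU min loads */

	/* Mirrors: this_eff_load *= this_stats.max_load + task_load; etc. */
	unsigned long this_eff_load = 100 * (this_max_load + task_load);
	unsigned long prev_eff_load = (100 + (imbalance_pct - 100) / 2) *
				      (prev_min_load - task_load);

	printf("affine wakeup allowed: %s\n",
	       this_eff_load <= prev_eff_load ? "yes" : "no");
	return 0;
}

With those numbers it prints "yes" (100000 <= 108000); bump
this_max_load to 900 and it flips to "no", which is the sense in which
the new check is less eager than comparing the plain load sums.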