[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170801192650.GA27425@destiny>
Date: Tue, 1 Aug 2017 15:26:51 -0400
From: Josef Bacik <josef@...icpanda.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Rik van Riel <riel@...hat.com>, linux-kernel@...r.kernel.org,
jhladky@...hat.com, mingo@...nel.org, mgorman@...e.de,
josef@...icpanda.com
Subject: Re: [PATCH] sched/fair: Fix wake_affine() for !NUMA_BALANCING
On Tue, Aug 01, 2017 at 02:19:12PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 27, 2017 at 10:55:58AM -0400, Rik van Riel wrote:
> > On Tue, 2017-06-27 at 07:39 +0200, Peter Zijlstra wrote:
>
> > > If LLC < NUMA or !NUMA_BALANCING we have a region that needs to do
> > > _something_.
> >
> > Agreed. I will fix this. Given that this is a bit
> > of a corner case,
>
>
> Ok, so it turns out all of facebook sits inside that corner case because
> they have !NUMA_BALANCING.
>
> How does the below patch work?
>
> ---
> Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING
> From: Peter Zijlstra <peterz@...radead.org>
> Date: Mon Jul 31 17:50:05 CEST 2017
>
> In commit:
>
> 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
>
> Rik changed wake_affine to consider NUMA information when balancing
> between LLC domains.
>
> There are a number of problems here which this patch tries to address:
>
> - LLC < NODE; in this case we'd use the wrong information to balance
> - !NUMA_BALANCING: in this case, the new code doesn't do any
> balancing at all
> - re-computes the NUMA data for every wakeup, this can mean iterating
> up to 64 CPUs for every wakeup.
> - default affine wakeups inside a cache
>
> We address these by saving the load/capacity values for each
> sched_domain during regular load-balance and using these values in
> wake_affine_llc(). The obvious down-side to using cached values is
> that they can be too old and poorly reflect reality.
>
> But this way we can use LLC wide information and thus not rely on
> assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do
> we have to aggregate two nodes (or even cache domains) worth of CPUs
> for each wakeup.
>
> Cc: Josef Bacik <josef@...icpanda.com>
> Cc: Rik van Riel <riel@...hat.com>
> Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
> include/linux/sched/topology.h | 9 ++
> kernel/sched/fair.c | 182 +++++++++++++++++++++++++----------------
> 2 files changed, 123 insertions(+), 68 deletions(-)
>
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -71,6 +71,15 @@ struct sched_domain_shared {
> atomic_t ref;
> atomic_t nr_busy_cpus;
> int has_idle_cores;
> +
> + /*
> + * Some variables from the most recent sd_lb_stats for this domain.
> + *
> + * used by wake_affine().
> + */
> + unsigned long nr_running;
> + unsigned long load;
> + unsigned long capacity;
> };
>
> struct sched_domain {
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2586,59 +2586,6 @@ void task_tick_numa(struct rq *rq, struc
> }
> }
>
> -/*
> - * Can a task be moved from prev_cpu to this_cpu without causing a load
> - * imbalance that would trigger the load balancer?
> - */
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> - struct task_struct *p, int this_cpu,
> - int prev_cpu, int sync)
> -{
> - struct numa_stats prev_load, this_load;
> - s64 this_eff_load, prev_eff_load;
> -
> - update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
> - update_numa_stats(&this_load, cpu_to_node(this_cpu));
> -
> - /*
> - * If sync wakeup then subtract the (maximum possible)
> - * effect of the currently running task from the load
> - * of the current CPU:
> - */
> - if (sync) {
> - unsigned long current_load = task_h_load(current);
> -
> - if (this_load.load > current_load)
> - this_load.load -= current_load;
> - else
> - this_load.load = 0;
> - }
> -
> - /*
> - * In low-load situations, where this_cpu's node is idle due to the
> - * sync cause above having dropped this_load.load to 0, move the task.
> - * Moving to an idle socket will not create a bad imbalance.
> - *
> - * Otherwise check if the nodes are near enough in load to allow this
> - * task to be woken on this_cpu's node.
> - */
> - if (this_load.load > 0) {
> - unsigned long task_load = task_h_load(p);
> -
> - this_eff_load = 100;
> - this_eff_load *= prev_load.compute_capacity;
> -
> - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> - prev_eff_load *= this_load.compute_capacity;
> -
> - this_eff_load *= this_load.load + task_load;
> - prev_eff_load *= prev_load.load - task_load;
> -
> - return this_eff_load <= prev_eff_load;
> - }
> -
> - return true;
> -}
> #else
> static void task_tick_numa(struct rq *rq, struct task_struct *curr)
> {
> @@ -2652,14 +2599,6 @@ static inline void account_numa_dequeue(
> {
> }
>
> -#ifdef CONFIG_SMP
> -static inline bool numa_wake_affine(struct sched_domain *sd,
> - struct task_struct *p, int this_cpu,
> - int prev_cpu, int sync)
> -{
> - return true;
> -}
> -#endif /* !SMP */
> #endif /* CONFIG_NUMA_BALANCING */
>
> static void
> @@ -5356,20 +5295,110 @@ static int wake_wide(struct task_struct
> return 1;
> }
>
> +struct llc_stats {
> + unsigned long nr_running;
> + unsigned long load;
> + unsigned long capacity;
> + int has_capacity;
> +};
> +
> +static void get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> + stats->nr_running = READ_ONCE(sds->nr_running);
> + stats->load = READ_ONCE(sds->load);
> + stats->capacity = READ_ONCE(sds->capacity);
> + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
> +}
> +
> +/*
> + * Can a task be moved from prev_cpu to this_cpu without causing a load
> + * imbalance that would trigger the load balancer?
> + *
> + * Since we're running on 'stale' values, we might in fact create an imbalance
> + * but recomputing these values is expensive, as that'd mean iterating over 2 cache
> + * domains worth of CPUs.
> + */
> +static bool
> +wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
> + int this_cpu, int prev_cpu, int sync)
> +{
> + struct llc_stats prev_stats, this_stats;
> + s64 this_eff_load, prev_eff_load;
> + unsigned long task_load;
> +
> + get_llc_stats(&prev_stats, prev_cpu);
> + get_llc_stats(&this_stats, this_cpu);
> +
> + /*
> + * If sync wakeup then subtract the (maximum possible)
> + * effect of the currently running task from the load
> + * of the current LLC.
> + */
> + if (sync) {
> + unsigned long current_load = task_h_load(current);
> +
> + /* in this case load hits 0 and this LLC is considered 'idle' */
> + if (current_load > this_stats.load)
> + return true;
> +
> + this_stats.load -= current_load;
> + }
> +
> + /*
> + * The has_capacity stuff is not SMT aware, but by trying to balance
> + * the nr_running on both ends we try and fill the domain at equal
> + * rates, thereby first consuming cores before siblings.
> + */
> +
> + /* if the old cache has capacity, stay there */
> + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> + return false;
> +
> + /* if this cache has capacity, come here */
> + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
> + return true;
> +
> +
> + /*
> + * Check to see if we can move the load without causing too much
> + * imbalance.
> + */
> + task_load = task_h_load(p);
> +
> + this_eff_load = 100;
> + this_eff_load *= prev_stats.capacity;
> +
> + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> + prev_eff_load *= this_stats.capacity;
> +
> + this_eff_load *= this_stats.load + task_load;
> + prev_eff_load *= prev_stats.load - task_load;
> +
> + return this_eff_load <= prev_eff_load;
> +}
> +
> static int wake_affine(struct sched_domain *sd, struct task_struct *p,
> int prev_cpu, int sync)
> {
> int this_cpu = smp_processor_id();
> - bool affine = false;
> + bool affine;
>
> /*
> - * Common case: CPUs are in the same socket, and select_idle_sibling()
> - * will do its thing regardless of what we return:
> + * Default to no affine wakeups; wake_affine() should not effect a task
> + * placement the load-balancer feels inclined to undo. The conservative
> + * option is therefore to not move tasks when they wake up.
> */
> - if (cpus_share_cache(prev_cpu, this_cpu))
> - affine = true;
> - else
> - affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
> + affine = false;
> +
> + /*
> + * If the wakeup is across cache domains, try to evaluate if movement
> + * makes sense, otherwise rely on select_idle_sibling() to do
> + * placement inside the cache domain.
> + */
> + if (!cpus_share_cache(prev_cpu, this_cpu))
> + affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
>
> schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
> if (affine) {
> @@ -7049,6 +7078,7 @@ struct sg_lb_stats {
> struct sd_lb_stats {
> struct sched_group *busiest; /* Busiest group in this sd */
> struct sched_group *local; /* Local group in this sd */
> + unsigned long total_running;
> unsigned long total_load; /* Total load of all groups in sd */
> unsigned long total_capacity; /* Total capacity of all groups in sd */
> unsigned long avg_load; /* Average load across all groups in sd */
> @@ -7068,6 +7098,7 @@ static inline void init_sd_lb_stats(stru
> *sds = (struct sd_lb_stats){
> .busiest = NULL,
> .local = NULL,
> + .total_running = 0UL,
> .total_load = 0UL,
> .total_capacity = 0UL,
> .busiest_stat = {
> @@ -7503,6 +7534,7 @@ static inline enum fbq_type fbq_classify
> */
> static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
> {
> + struct sched_domain_shared *shared = env->sd->shared;
> struct sched_domain *child = env->sd->child;
> struct sched_group *sg = env->sd->groups;
> struct sg_lb_stats *local = &sds->local_stat;
> @@ -7559,6 +7591,7 @@ static inline void update_sd_lb_stats(st
>
> next_group:
> /* Now, start updating sd_lb_stats */
> + sds->total_running += sgs->sum_nr_running;
> sds->total_load += sgs->group_load;
> sds->total_capacity += sgs->group_capacity;
>
> @@ -7574,6 +7607,18 @@ static inline void update_sd_lb_stats(st
> env->dst_rq->rd->overload = overload;
> }
>
> + /*
> + * Since these are sums over groups they can contain some CPUs
> + * multiple times for the NUMA domains.
> + *
> + * Currently only wake_affine_llc() and find_busiest_group()
> + * use these numbers; only the latter is affected by this problem.
> + *
> + * XXX fix that.
> + */
> + WRITE_ONCE(shared->nr_running, sds->total_running);
> + WRITE_ONCE(shared->load, sds->total_load);
> + WRITE_ONCE(shared->capacity, sds->total_capacity);
This panics on boot for me because shared is NULL. The same happens in
select_task_rq_fair() when it tries to do the READ_ONCE(). Here is my .config in
case it's something strange with my config. Thanks,
Josef
View attachment "config" of type "text/plain" (105888 bytes)
Powered by blists - more mailing lists