Message-ID: <CAKfTPtDu33pC-VofPzkcW9=f1KN-LsezAQnupjb93iE2RYkLWw@mail.gmail.com>
Date: Mon, 20 Nov 2017 10:04:42 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Brendan Jackman <brendan.jackman@....com>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>,
Ingo Molnar <mingo@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
linux-kernel <linux-kernel@...r.kernel.org>,
Ingo Molnar <mingo@...hat.com>,
Morten Rasmussen <morten.rasmussen@....com>
Subject: Re: [PATCH 1/2] sched: force update of blocked load of idle cpus
On 24 October 2017 at 14:25, Brendan Jackman <brendan.jackman@....com> wrote:
> From: Vincent Guittot <vincent.guittot@...aro.org>
>
> When idle, the blocked load of CPUs is updated only when an idle
> load balance is triggered, which may never happen. Because the idle
> load balance is not guaranteed to run, the utilization, the load and
> the shares of idle cfs_rqs can stay artificially high and steal
> shares and running time from the busy cfs_rqs of the task group.
> Add a new light idle load balance state which ensures that blocked loads
> are periodically updated and decayed but does not perform any task
> migration.
>
> The remote load updates are rate-limited so that they are not
> performed with a shorter period than LOAD_AVG_PERIOD (i.e. the PELT
> half-life). This is the period after which a stale load carries a
> known 50% error.
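For readers wondering where the 50% figure comes from: PELT decays load
by a factor y per 1ms period, with y chosen so that y^32 == 0.5, and
LOAD_AVG_PERIOD is 32. A standalone sketch of the arithmetic (purely
illustrative, not kernel code):

#include <stdio.h>

int main(void)
{
	double y = 0.9785720621;	/* 2^(-1/32), the PELT decay factor */
	double load = 1024.0;		/* a fully stale blocked load */
	int ms;

	/*
	 * Decaying over one LOAD_AVG_PERIOD (32ms) halves the load, so a
	 * load left un-decayed for that long is off by 50%.
	 */
	for (ms = 0; ms < 32; ms++)
		load *= y;
	printf("1024 decays to %.1f after 32ms\n", load);
	return 0;
}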
>
> Cc: Dietmar Eggemann <dietmar.eggemann@....com>
> Cc: Vincent Guittot <vincent.guittot@...aro.org>
> Cc: Ingo Molnar <mingo@...hat.com>
> Cc: Morten Rasmussen <morten.rasmussen@....com>
> Cc: Peter Zijlstra <peterz@...radead.org>
> Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> [Switched remote update interval to use PELT half life]
> [Moved update_blocked_averages call outside rebalance_domains
> to simplify code]
> Signed-off-by: Brendan Jackman <brendan.jackman@....com>
> ---
> kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++++-------
> kernel/sched/sched.h | 1 +
> 2 files changed, 63 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 85d1ec1c3b39..9085caf49c76 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5976,6 +5976,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
> return min_cap * 1024 < task_util(p) * capacity_margin;
> }
>
> +static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
> +static void nohz_balancer_kick(bool only_update);
> +
> /*
> * select_task_rq_fair: Select target runqueue for the waking task in domains
> * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
> @@ -6074,6 +6077,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
> }
> rcu_read_unlock();
>
> +#ifdef CONFIG_NO_HZ_COMMON
> + if (nohz_kick_needed(cpu_rq(new_cpu), true))
> + nohz_balancer_kick(true);
> +#endif
> +
> return new_cpu;
> }
>
> @@ -8653,6 +8661,7 @@ static struct {
> cpumask_var_t idle_cpus_mask;
> atomic_t nr_cpus;
> unsigned long next_balance; /* in jiffy units */
> + unsigned long next_update; /* in jiffy units */
> } nohz ____cacheline_aligned;
>
> static inline int find_new_ilb(void)
> @@ -8670,7 +8679,7 @@ static inline int find_new_ilb(void)
> * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
> * CPU (if there is one).
> */
> -static void nohz_balancer_kick(void)
> +static void nohz_balancer_kick(bool only_update)
> {
> int ilb_cpu;
>
> @@ -8683,6 +8692,10 @@ static void nohz_balancer_kick(void)
>
> if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
> return;
> +
> + if (only_update)
> + set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
> +
> /*
> * Use smp_send_reschedule() instead of resched_cpu().
> * This way we generate a sched IPI on the target cpu which
> @@ -8770,6 +8783,8 @@ void nohz_balance_enter_idle(int cpu)
> atomic_inc(&nohz.nr_cpus);
> set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
> }
> +#else
> +static inline void nohz_balancer_kick(bool only_update) {}
> #endif
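To spell out the kick protocol as I read it: the kicking CPU marks the
chosen ILB CPU's nohz_flags and sends the sched IPI; the ILB CPU's
softirq then uses NOHZ_STATS_KICK to decide between a stats-only pass
and a full balance. Condensed sketch of the kicker side (not the
literal code):

static void kick_ilb_sketch(int ilb_cpu, bool only_update)
{
	/* Only one kick may be pending per target CPU at a time */
	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
		return;

	/* Tell the target this kick is only for blocked-load updates */
	if (only_update)
		set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));

	/* Sched IPI; the target raises SCHED_SOFTIRQ from the tick path */
	smp_send_reschedule(ilb_cpu);
}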
>
> static DEFINE_SPINLOCK(balancing);
> @@ -8801,8 +8816,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
> int need_serialize, need_decay = 0;
> u64 max_cost = 0;
>
> - update_blocked_averages(cpu);
> -
> rcu_read_lock();
> for_each_domain(cpu, sd) {
> /*
> @@ -8901,6 +8914,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> {
> int this_cpu = this_rq->cpu;
> struct rq *rq;
> + struct sched_domain *sd;
> int balance_cpu;
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;
> @@ -8910,6 +8924,23 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
> goto end;
>
> + /*
> + * This CPU is going to update the blocked load of idle CPUs either
> + * before doing a rebalance or just to keep metrics up to date. We
> + * can safely update the next update timestamp.
> + */
> + rcu_read_lock();
> + sd = rcu_dereference(this_rq->sd);
> + /*
> + * Check whether there is a sched_domain available for this CPU.
> + * The last other CPU may have been unplugged since the ILB was
> + * triggered, and the sched_domain can now be NULL. The idle balance
> + * sequence will quickly be aborted as there are no more idle CPUs.
> + */
> + if (sd)
> + nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
> + rcu_read_unlock();
> +
> for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
> if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
> continue;
> @@ -8936,7 +8967,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> cpu_load_update_idle(rq);
> rq_unlock_irq(rq, &rf);
>
> - rebalance_domains(rq, CPU_IDLE);
> + update_blocked_averages(balance_cpu);
> + /*
> + * This idle load balance softirq may have been
> + * triggered only to update the blocked load and shares
> + * of idle CPUs (which we have just done for
> + * balance_cpu). In that case skip the actual balance.
> + */
> + if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))
> + rebalance_domains(rq, idle);
> }
>
> if (time_after(next_balance, rq->next_balance)) {
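Putting the nohz_idle_balance() hunks together, the new flow on the
kicked CPU reduces to roughly this (condensed sketch, not the literal
code):

	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
			continue;

		/* Always decay the remote CPU's blocked load ... */
		update_blocked_averages(balance_cpu);

		/* ... but only pull tasks on a full (non-stats) kick */
		if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))
			rebalance_domains(cpu_rq(balance_cpu), idle);
	}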
> @@ -8967,7 +9006,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
> * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
> * domain span are idle.
> */
> -static inline bool nohz_kick_needed(struct rq *rq)
> +static inline bool nohz_kick_needed(struct rq *rq, bool only_update)
> {
> unsigned long now = jiffies;
> struct sched_domain_shared *sds;
> @@ -8975,7 +9014,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
> int nr_busy, i, cpu = rq->cpu;
> bool kick = false;
>
> - if (unlikely(rq->idle_balance))
> + if (unlikely(rq->idle_balance) && !only_update)
> return false;
>
> /*
> @@ -8992,6 +9031,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
> if (likely(!atomic_read(&nohz.nr_cpus)))
> return false;
>
> + if (only_update) {
> + if (time_before(now, nohz.next_update))
> + return false;
> + else
> + return true;
> + }
> +
> if (time_before(now, nohz.next_balance))
> return false;
>
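As an aside for anyone unfamiliar with the idiom: time_before() does a
wraparound-safe comparison of jiffies values, so the check above is a
standard jiffies rate limit, and the if/else could collapse to a single
return. A hypothetical standalone helper to show the shape:

static inline bool stats_update_due(void)
{
	/* Wraparound-safe: due once nohz.next_update has passed */
	return !time_before(jiffies, nohz.next_update);
}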
> @@ -9041,6 +9087,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
> }
> #else
> static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
> +static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { return false; }
> #endif
>
> /*
> @@ -9062,7 +9109,12 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
> * and abort nohz_idle_balance altogether if we pull some load.
> */
> nohz_idle_balance(this_rq, idle);
> - rebalance_domains(this_rq, idle);
> + update_blocked_averages(this_rq->cpu);
> + if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)))
The NOHZ_STATS_KICK flag is only defined with CONFIG_NO_HZ_COMMON, so
this test_bit() will break the build when that option is disabled; it
needs the same guard as the clear_bit() below.
> + rebalance_domains(this_rq, idle);
> +#ifdef CONFIG_NO_HZ_COMMON
> + clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
> +#endif
> }
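One way to address my point above without sprinkling #ifdefs would be a
pair of stub helpers; just a sketch of the shape, not a requirement:

#ifdef CONFIG_NO_HZ_COMMON
static inline bool nohz_stats_kick_pending(int cpu)
{
	return test_bit(NOHZ_STATS_KICK, nohz_flags(cpu));
}

static inline void nohz_stats_kick_clear(int cpu)
{
	clear_bit(NOHZ_STATS_KICK, nohz_flags(cpu));
}
#else
static inline bool nohz_stats_kick_pending(int cpu) { return false; }
static inline void nohz_stats_kick_clear(int cpu) { }
#endif

run_rebalance_domains() could then use these unconditionally.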
>
> /*
> @@ -9077,8 +9129,8 @@ void trigger_load_balance(struct rq *rq)
> if (time_after_eq(jiffies, rq->next_balance))
> raise_softirq(SCHED_SOFTIRQ);
> #ifdef CONFIG_NO_HZ_COMMON
> - if (nohz_kick_needed(rq))
> - nohz_balancer_kick();
> + if (nohz_kick_needed(rq, false))
> + nohz_balancer_kick(false);
> #endif
> }
>
> @@ -9657,6 +9709,7 @@ __init void init_sched_fair_class(void)
>
> #ifdef CONFIG_NO_HZ_COMMON
> nohz.next_balance = jiffies;
> + nohz.next_update = jiffies;
> zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> #endif
> #endif /* SMP */
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 14db76cd496f..6f95ef653f73 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1978,6 +1978,7 @@ extern void cfs_bandwidth_usage_dec(void);
> enum rq_nohz_flag_bits {
> NOHZ_TICK_STOPPED,
> NOHZ_BALANCE_KICK,
> + NOHZ_STATS_KICK
> };
>
> #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
> --
> 2.14.1
>