linux-kernel - Re: [RFC PATCH 3/3] sched/fair: Traverse cpufreq policies to detect capacity inversion

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtCawKvhMwJYVUskYcX7eR2K7SziWVzvjGh6JCVB+WT5tQ@mail.gmail.com>
Date:   Fri, 2 Dec 2022 15:57:21 +0100
From:   Vincent Guittot <vincent.guittot@...aro.org>
To:     Qais Yousef <qyousef@...alina.io>
Cc:     Ingo Molnar <mingo@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        "Rafael J. Wysocki" <rafael@...nel.org>,
        Viresh Kumar <viresh.kumar@...aro.org>,
        linux-pm@...r.kernel.org, linux-kernel@...r.kernel.org,
        Lukasz Luba <lukasz.luba@....com>, Wei Wang <wvw@...gle.com>,
        Xuewen Yan <xuewen.yan94@...il.com>,
        Hank <han.lin@...iatek.com>,
        Jonathan JMChen <Jonathan.JMChen@...iatek.com>
Subject: Re: [RFC PATCH 3/3] sched/fair: Traverse cpufreq policies to detect
 capacity inversion

On Sun, 27 Nov 2022 at 15:18, Qais Yousef <qyousef@...alina.io> wrote:
>
> We used performance domains to traverse the list of domains that share
> the same cpufreq policy to detect when this domain is severely impacted
> by thermal pressure to cause it to be lower than another domain in the
> system - capacity inversion.
>
> Since performance domains are only available when the energy model or
> schedutil is present, this makes the detection mechanism unavailable
> for Capacity Aware Scheduling (CAS).
>
> Since we only care about traversing the capacity_orig() of any cpu
> within that domain; export for_each_active_policy() to traverse the
> cpufreq policies instead of performance domains.
>
> Introduce a new for_each_active_policy_safe() to protect against races
> with deletion. Races against additions are fine since we can't
> eliminate the race without having to do heavy handed locking which is
> unacceptable in this path. The policy should be visible in the next
> tick if we missed it.
>
> Fixes: 44c7b80bffc3 ("sched/fair: Detect capacity inversion")
> Signed-off-by: Qais Yousef (Google) <qyousef@...alina.io>
> ---
>
> Rafael, Viresh, I hope it's okay to export these macros in the public header.
> And that my usage is okay; I'm not sure if I missed important locking rules.
>
>
>  drivers/cpufreq/cpufreq.c | 12 +-----------
>  include/linux/cpufreq.h   | 26 ++++++++++++++++++++++++++
>  kernel/sched/fair.c       | 13 +++++--------
>  3 files changed, 32 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 69b3d61852ac..b11e7c545fc1 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -31,17 +31,7 @@
>  #include <linux/units.h>
>  #include <trace/events/power.h>
>
> -static LIST_HEAD(cpufreq_policy_list);
> -
> -/* Macros to iterate over CPU policies */
> -#define for_each_suitable_policy(__policy, __active)                    \
> -       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> -               if ((__active) == !policy_is_inactive(__policy))
> -
> -#define for_each_active_policy(__policy)               \
> -       for_each_suitable_policy(__policy, true)
> -#define for_each_inactive_policy(__policy)             \
> -       for_each_suitable_policy(__policy, false)
> +LIST_HEAD(cpufreq_policy_list);
>
>  /* Iterate over governors */
>  static LIST_HEAD(cpufreq_governor_list);
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index d5595d57f4e5..c3c79d4ad821 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -780,6 +780,32 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
>                         continue;                                               \
>                 else
>
> +#ifdef CONFIG_CPU_FREQ
> +extern struct list_head cpufreq_policy_list;
> +
> +/* Macros to iterate over CPU policies */
> +#define for_each_suitable_policy(__policy, __active)                    \
> +       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list) \
> +               if ((__active) == !policy_is_inactive(__policy))
> +
> +#define for_each_suitable_policy_safe(__policy, __n, __active)                    \
> +       list_for_each_entry_safe(__policy, __n, &cpufreq_policy_list, policy_list) \
> +               if ((__active) == !policy_is_inactive(__policy))
> +#else
> +#define for_each_suitable_policy(__policy, __active)           while (0)
> +#define for_each_suitable_policy_safe(__policy, __n, __active) while (0)
> +#endif
> +
> +#define for_each_active_policy(__policy)               \
> +       for_each_suitable_policy(__policy, true)
> +#define for_each_inactive_policy(__policy)             \
> +       for_each_suitable_policy(__policy, false)
> +
> +#define for_each_active_policy_safe(__policy, __n)             \
> +       for_each_suitable_policy_safe(__policy, __n, true)
> +#define for_each_inactive_policy_safe(__policy, __n)           \
> +       for_each_suitable_policy_safe(__policy, __n, false)
> +
>
>  int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
>                                     struct cpufreq_frequency_table *table);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7c0dd57e562a..4bbbca85134b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8856,23 +8856,20 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
>          *   * Thermal pressure will impact all cpus in this perf domain
>          *     equally.
>          */
> -       if (sched_energy_enabled()) {
> +       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
>                 unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
> -               struct perf_domain *pd = rcu_dereference(rq->rd->pd);
> +               struct cpufreq_policy *policy, __maybe_unused *policy_n;
>
>                 rq->cpu_capacity_inverted = 0;
>
> -               SCHED_WARN_ON(!rcu_read_lock_held());
> -
> -               for (; pd; pd = pd->next) {
> -                       struct cpumask *pd_span = perf_domain_span(pd);
> +               for_each_active_policy_safe(policy, policy_n) {

So you are looping over all cpufreq policies (and, before this, the perf
domains) in the periodic load balance. That's really not something we
should or want to do.


>                         unsigned long pd_cap_orig, pd_cap;
>
>                         /* We can't be inverted against our own pd */
> -                       if (cpumask_test_cpu(cpu_of(rq), pd_span))
> +                       if (cpumask_test_cpu(cpu_of(rq), policy->cpus))
>                                 continue;
>
> -                       cpu = cpumask_any(pd_span);
> +                       cpu = cpumask_any(policy->cpus);
>                         pd_cap_orig = arch_scale_cpu_capacity(cpu);
>
>                         if (capacity_orig < pd_cap_orig)
> --
> 2.25.1
>