[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6inw6vhqe3uuxjozeioqujzdtksmsdr4estzkdsbb762rcxa7z@sn56uogdatjr>
Date: Wed, 17 Apr 2024 18:39:00 -0700
From: Vanshidhar Konda <vanshikonda@...amperecomputing.com>
To: Beata Michalska <beata.michalska@....com>
Cc: linux-kernel@...r.kernel.org, linux-arm-kernel@...ts.infradead.org,
ionela.voinescu@....com, sudeep.holla@....com, will@...nel.org, catalin.marinas@....com,
vincent.guittot@...aro.org, sumitg@...dia.com, yang@...amperecomputing.com,
lihuisong@...wei.com, viresh.kumar@...aro.org
Subject: Re: [PATCH v5 3/5] arm64: Provide an AMU-based version of
arch_freq_get_on_cpu
On Wed, Apr 17, 2024 at 10:38:46AM +0100, Beata Michalska wrote:
>With the Frequency Invariance Engine (FIE) being already wired up with
>sched tick and making use of relevant (core counter and constant
>counter) AMU counters, getting the current frequency for a given CPU,
>can be achieved by utilizing the frequency scale factor which reflects
>an average CPU frequency for the last tick period length.
>
>The solution is partially based on APERF/MPERF implementation of
>arch_freq_get_on_cpu.
>
>Suggested-by: Ionela Voinescu <ionela.voinescu@....com>
>Signed-off-by: Beata Michalska <beata.michalska@....com>
>---
> arch/arm64/kernel/topology.c | 110 +++++++++++++++++++++++++++++++----
> 1 file changed, 100 insertions(+), 10 deletions(-)
>
>diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>index 3c814a278534..475fdbf3032a 100644
>--- a/arch/arm64/kernel/topology.c
>+++ b/arch/arm64/kernel/topology.c
>@@ -17,6 +17,7 @@
> #include <linux/cpufreq.h>
> #include <linux/init.h>
> #include <linux/percpu.h>
>+#include <linux/sched/isolation.h>
>
> #include <asm/cpu.h>
> #include <asm/cputype.h>
>@@ -88,18 +89,28 @@ int __init parse_acpi_topology(void)
> * initialized.
> */
> static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT);
>-static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
>-static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
> static cpumask_var_t amu_fie_cpus;
>
>+struct amu_cntr_sample {
>+ u64 arch_const_cycles_prev;
>+ u64 arch_core_cycles_prev;
>+ unsigned long last_update;
>+};
>+
>+static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples);
>+
> void update_freq_counters_refs(void)
> {
>- this_cpu_write(arch_core_cycles_prev, read_corecnt());
>- this_cpu_write(arch_const_cycles_prev, read_constcnt());
>+ struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
>+
>+ amu_sample->arch_core_cycles_prev = read_corecnt();
>+ amu_sample->arch_const_cycles_prev = read_constcnt();
> }
>
> static inline bool freq_counters_valid(int cpu)
> {
>+ struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
>+
> if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
> return false;
>
>@@ -108,8 +119,8 @@ static inline bool freq_counters_valid(int cpu)
> return false;
> }
>
>- if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
>- !per_cpu(arch_core_cycles_prev, cpu))) {
>+ if (unlikely(!amu_sample->arch_const_cycles_prev ||
>+ !amu_sample->arch_core_cycles_prev)) {
> pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
> return false;
> }
>@@ -152,17 +163,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
>
> static void amu_scale_freq_tick(void)
> {
>+ struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> u64 prev_core_cnt, prev_const_cnt;
> u64 core_cnt, const_cnt, scale;
>
>- prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
>- prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
>+ prev_const_cnt = amu_sample->arch_const_cycles_prev;
>+ prev_core_cnt = amu_sample->arch_core_cycles_prev;
>
> update_freq_counters_refs();
>
>- const_cnt = this_cpu_read(arch_const_cycles_prev);
>- core_cnt = this_cpu_read(arch_core_cycles_prev);
>+ const_cnt = amu_sample->arch_const_cycles_prev;
>+ core_cnt = amu_sample->arch_core_cycles_prev;
>
>+ /*
>+ * This should not happen unless the AMUs have been reset and the
>+ * counter values have not been restored - unlikely
>+ */
> if (unlikely(core_cnt <= prev_core_cnt ||
> const_cnt <= prev_const_cnt))
> return;
>@@ -182,6 +198,8 @@ static void amu_scale_freq_tick(void)
>
> scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> this_cpu_write(arch_freq_scale, (unsigned long)scale);
>+
>+ amu_sample->last_update = jiffies;
> }
>
> static struct scale_freq_data amu_sfd = {
>@@ -189,6 +207,78 @@ static struct scale_freq_data amu_sfd = {
> .set_freq_scale = amu_scale_freq_tick,
> };
>
>+static __always_inline bool amu_fie_cpu_supported(unsigned int cpu)
>+{
>+ return cpumask_available(amu_fie_cpus) &&
>+ cpumask_test_cpu(cpu, amu_fie_cpus);
>+}
>+
>+#define AMU_SAMPLE_EXP_MS 20
>+
>+unsigned int arch_freq_get_on_cpu(int cpu)
>+{
>+ struct amu_cntr_sample *amu_sample;
>+ unsigned int start_cpu = cpu;
>+ unsigned long last_update;
>+ unsigned int freq = 0;
>+ u64 scale;
>+
>+ if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu))
>+ return 0;
>+
>+retry:
>+ amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
>+
>+ last_update = amu_sample->last_update;
>+
>+ /*
>+ * For those CPUs that are in full dynticks mode,
>+ * and those that have not seen tick for a while
>+ * try an alternative source for the counters (and thus freq scale),
While testing this on AmpereOne system I found that the scaling_cur_freq
and cpufreq_cur_freq are inconsistent for nohz_full CPUs that are being
throttled (OS requested freq != HW provided freq).
For the test I ran stress-ng workload on 9 cores. All the other cores
are idle. I then forced the hardware to throttle the active cores - core
won't run at maximum frequency despite a request from the OS. Each core
has an independent cpufreq policy.
For the nohz_full CPUs since arch_freq_get_on_cpu bails out. In
show_scaling_cur_freq() the next check is to see if
cpufreq_driver->set_policy method is implemented. cppc_cpufreq does not
implement this method and we just end up returning the policy->cur
value. As discussed in other threads, it looks like we want the behavior
to be identical to x86 systems. In that case it seems like returning 0
from arch_freq_get_on_cpu is not going to be valid behavior.
Core scaling_cur_freq cpufreq_cur_freq
[0]: 2700000 2700000
[1]: 2750000 2750000
nohz_full=2-7
[2]: 3200000 2691000
[3]: 3200000 2645000
[4]: 3200000 2731000
[5]: 3200000 2714000
[6]: 3200000 2466000
[7]: 3200000 2708000
isolcpus=8-11 (no workload applied to core 10-11)
[8]: 2700000 2700000
[9]: 2550000 2550000
[10]: 1046875 1046875
[11]: 1096875 1096875
Thanks,
Vanshi
>+ * if available, for given policy:
>+ * this boils down to identifying an active cpu within the same freq
>+ * domain, if any.
>+ */
>+ if (!housekeeping_cpu(cpu, HK_TYPE_TICK) ||
>+ time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
>+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
>+ int ref_cpu = cpu;
>+
>+ if (!policy)
>+ goto leave;
>+
>+ if (!policy_is_shared(policy)) {
>+ cpufreq_cpu_put(policy);
>+ goto leave;
>+ }
>+
>+ do {
>+ ref_cpu = cpumask_next_wrap(ref_cpu, policy->cpus,
>+ start_cpu, false);
>+
>+ } while (ref_cpu < nr_cpu_ids && idle_cpu(ref_cpu));
>+
>+ cpufreq_cpu_put(policy);
>+
>+ if (ref_cpu >= nr_cpu_ids)
>+ /* No alternative to pull info from */
>+ goto leave;
>+
>+ cpu = ref_cpu;
>+ goto retry;
>+ }
>+ /*
>+ * Reversed computation to the one used to determine
>+ * the arch_freq_scale value
>+ * (see amu_scale_freq_tick for details)
>+ */
>+ scale = arch_scale_freq_capacity(cpu);
>+ freq = scale * arch_scale_freq_ref(cpu);
>+ freq >>= SCHED_CAPACITY_SHIFT;
>+leave:
>+ return freq;
>+}
>+
> static void amu_fie_setup(const struct cpumask *cpus)
> {
> int cpu;
>--
>2.25.1
>
Powered by blists - more mailing lists