linux-kernel - Re: [PATCH v5 3/5] arm64: Provide an AMU-based version of arch_freq_get_on

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6inw6vhqe3uuxjozeioqujzdtksmsdr4estzkdsbb762rcxa7z@sn56uogdatjr>
Date: Wed, 17 Apr 2024 18:39:00 -0700
From: Vanshidhar Konda <vanshikonda@...amperecomputing.com>
To: Beata Michalska <beata.michalska@....com>
Cc: linux-kernel@...r.kernel.org, linux-arm-kernel@...ts.infradead.org, 
	ionela.voinescu@....com, sudeep.holla@....com, will@...nel.org, catalin.marinas@....com, 
	vincent.guittot@...aro.org, sumitg@...dia.com, yang@...amperecomputing.com, 
	lihuisong@...wei.com, viresh.kumar@...aro.org
Subject: Re: [PATCH v5 3/5] arm64: Provide an AMU-based version of
 arch_freq_get_on_cpu

On Wed, Apr 17, 2024 at 10:38:46AM +0100, Beata Michalska wrote:
>With the Frequency Invariance Engine (FIE) being already wired up with
>sched tick and making use of relevant (core counter and constant
>counter) AMU counters, getting the current frequency for a given CPU,
>can be achieved by utilizing the frequency scale factor which reflects
>an average CPU frequency for the last tick period length.
>
>The solution is partially based on APERF/MPERF implementation of
>arch_freq_get_on_cpu.
>
>Suggested-by: Ionela Voinescu <ionela.voinescu@....com>
>Signed-off-by: Beata Michalska <beata.michalska@....com>
>---
> arch/arm64/kernel/topology.c | 110 +++++++++++++++++++++++++++++++----
> 1 file changed, 100 insertions(+), 10 deletions(-)
>
>diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>index 3c814a278534..475fdbf3032a 100644
>--- a/arch/arm64/kernel/topology.c
>+++ b/arch/arm64/kernel/topology.c
>@@ -17,6 +17,7 @@
> #include <linux/cpufreq.h>
> #include <linux/init.h>
> #include <linux/percpu.h>
>+#include <linux/sched/isolation.h>
>
> #include <asm/cpu.h>
> #include <asm/cputype.h>
>@@ -88,18 +89,28 @@ int __init parse_acpi_topology(void)
>  * initialized.
>  */
> static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) =  1UL << (2 * SCHED_CAPACITY_SHIFT);
>-static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
>-static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
> static cpumask_var_t amu_fie_cpus;
>
>+struct amu_cntr_sample {
>+	u64		arch_const_cycles_prev;
>+	u64		arch_core_cycles_prev;
>+	unsigned long	last_update;
>+};
>+
>+static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples);
>+
> void update_freq_counters_refs(void)
> {
>-	this_cpu_write(arch_core_cycles_prev, read_corecnt());
>-	this_cpu_write(arch_const_cycles_prev, read_constcnt());
>+	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
>+
>+	amu_sample->arch_core_cycles_prev = read_corecnt();
>+	amu_sample->arch_const_cycles_prev = read_constcnt();
> }
>
> static inline bool freq_counters_valid(int cpu)
> {
>+	struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
>+
> 	if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
> 		return false;
>
>@@ -108,8 +119,8 @@ static inline bool freq_counters_valid(int cpu)
> 		return false;
> 	}
>
>-	if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
>-		     !per_cpu(arch_core_cycles_prev, cpu))) {
>+	if (unlikely(!amu_sample->arch_const_cycles_prev ||
>+		     !amu_sample->arch_core_cycles_prev)) {
> 		pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
> 		return false;
> 	}
>@@ -152,17 +163,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
>
> static void amu_scale_freq_tick(void)
> {
>+	struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
> 	u64 prev_core_cnt, prev_const_cnt;
> 	u64 core_cnt, const_cnt, scale;
>
>-	prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
>-	prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
>+	prev_const_cnt = amu_sample->arch_const_cycles_prev;
>+	prev_core_cnt = amu_sample->arch_core_cycles_prev;
>
> 	update_freq_counters_refs();
>
>-	const_cnt = this_cpu_read(arch_const_cycles_prev);
>-	core_cnt = this_cpu_read(arch_core_cycles_prev);
>+	const_cnt = amu_sample->arch_const_cycles_prev;
>+	core_cnt = amu_sample->arch_core_cycles_prev;
>
>+	/*
>+	 * This should not happen unless the AMUs have been reset and the
>+	 * counter values have not been restored - unlikely
>+	 */
> 	if (unlikely(core_cnt <= prev_core_cnt ||
> 		     const_cnt <= prev_const_cnt))
> 		return;
>@@ -182,6 +198,8 @@ static void amu_scale_freq_tick(void)
>
> 	scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
> 	this_cpu_write(arch_freq_scale, (unsigned long)scale);
>+
>+	amu_sample->last_update = jiffies;
> }
>
> static struct scale_freq_data amu_sfd = {
>@@ -189,6 +207,78 @@ static struct scale_freq_data amu_sfd = {
> 	.set_freq_scale = amu_scale_freq_tick,
> };
>
>+static __always_inline bool amu_fie_cpu_supported(unsigned int cpu)
>+{
>+	return cpumask_available(amu_fie_cpus) &&
>+		cpumask_test_cpu(cpu, amu_fie_cpus);
>+}
>+
>+#define AMU_SAMPLE_EXP_MS	20
>+
>+unsigned int arch_freq_get_on_cpu(int cpu)
>+{
>+	struct amu_cntr_sample *amu_sample;
>+	unsigned int start_cpu = cpu;
>+	unsigned long last_update;
>+	unsigned int freq = 0;
>+	u64 scale;
>+
>+	if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu))
>+		return 0;
>+
>+retry:
>+	amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
>+
>+	last_update = amu_sample->last_update;
>+
>+	/*
>+	 * For those CPUs that are in full dynticks mode,
>+	 * and those that have not seen tick for a while
>+	 * try an alternative source for the counters (and thus freq scale),

While testing this on AmpereOne system I found that the scaling_cur_freq
and cpufreq_cur_freq are inconsistent for nohz_full CPUs that are being
throttled (OS requested freq != HW provided freq).

For the test I ran stress-ng workload on 9 cores. All the other cores
are idle. I then forced the hardware to throttle the active cores - core
won't run at maximum frequency despite a request from the OS. Each core
has an independent cpufreq policy.

For the nohz_full CPUs since arch_freq_get_on_cpu bails out. In
show_scaling_cur_freq() the next check is to see if
cpufreq_driver->set_policy method is implemented. cppc_cpufreq does not
implement this method and we just end up returning the policy->cur
value. As discussed in other threads, it looks like we want the behavior
to be identical to x86 systems. In that case it seems like returning 0
from arch_freq_get_on_cpu is not going to be valid behavior.

Core    scaling_cur_freq cpufreq_cur_freq
[0]:    2700000          2700000
[1]:    2750000          2750000

nohz_full=2-7
[2]:    3200000          2691000
[3]:    3200000          2645000
[4]:    3200000          2731000
[5]:    3200000          2714000
[6]:    3200000          2466000
[7]:    3200000          2708000

isolcpus=8-11 (no workload applied to core 10-11)
[8]:    2700000          2700000
[9]:    2550000          2550000
[10]:   1046875          1046875
[11]:   1096875          1096875

Thanks,
Vanshi

>+	 * if available, for given policy:
>+	 * this boils down to identifying an active cpu within the same freq
>+	 * domain, if any.
>+	 */
>+	if (!housekeeping_cpu(cpu, HK_TYPE_TICK) ||
>+	    time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
>+		struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
>+		int ref_cpu = cpu;
>+
>+		if (!policy)
>+			goto leave;
>+
>+		if (!policy_is_shared(policy)) {
>+			cpufreq_cpu_put(policy);
>+			goto leave;
>+		}
>+
>+		do {
>+			ref_cpu = cpumask_next_wrap(ref_cpu, policy->cpus,
>+						    start_cpu, false);
>+
>+		} while (ref_cpu < nr_cpu_ids && idle_cpu(ref_cpu));
>+
>+		cpufreq_cpu_put(policy);
>+
>+		if (ref_cpu >= nr_cpu_ids)
>+			/* No alternative to pull info from */
>+			goto leave;
>+
>+		cpu = ref_cpu;
>+		goto retry;
>+	}
>+	/*
>+	 * Reversed computation to the one used to determine
>+	 * the arch_freq_scale value
>+	 * (see amu_scale_freq_tick for details)
>+	 */
>+	scale = arch_scale_freq_capacity(cpu);
>+	freq = scale * arch_scale_freq_ref(cpu);
>+	freq >>= SCHED_CAPACITY_SHIFT;
>+leave:
>+	return freq;
>+}
>+
> static void amu_fie_setup(const struct cpumask *cpus)
> {
> 	int cpu;
>-- 
>2.25.1
>