lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sat, 17 Jun 2017 02:30:09 +0200
From:   "Rafael J. Wysocki" <rjw@...ysocki.net>
To:     Len Brown <lenb@...nel.org>
Cc:     x86@...nel.org, srinivas.pandruvada@...ux.intel.com,
        hpa@...ux.intel.com, peterz@...radead.org, rafael@...nel.org,
        linux-pm@...r.kernel.org, linux-kernel@...r.kernel.org,
        Len Brown <len.brown@...el.com>
Subject: Re: [PATCH 2/5] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

On Wednesday, June 07, 2017 07:39:13 PM Len Brown wrote:
> From: Len Brown <len.brown@...el.com>
> 
> The goal of this change is to give users a uniform and meaningful
> result when they read /sys/...cpufreq/scaling_cur_freq
> on modern x86 hardware, as compared to what they get today.
> 
> Modern x86 processors include the hardware needed
> to accurately calculate frequency over an interval --
> APERF, MPERF, and the TSC.
> 
> Here we provide an x86 routine to make this calculation
> on supported hardware, and use it in preference to any
> driver driver-specific cpufreq_driver.get() routine.
> 
> MHz is computed like so:
> 
> MHz = base_MHz * delta_APERF / delta_MPERF
> 
> MHz is the average frequency of the busy processor
> over a measurement interval.  The interval is
> defined to be the time between successive invocations
> of aperfmperf_khz_on_cpu(), which are expected to to
> happen on-demand when users read sysfs attribute
> cpufreq/scaling_cur_freq.
> 
> As with previous methods of calculating MHz,
> idle time is excluded.
> 
> base_MHz above is from TSC calibration global "cpu_khz".
> 
> This x86 native method to calculate MHz returns a meaningful result
> no matter if P-states are controlled by hardware or firmware
> and/or if the Linux cpufreq sub-system is or is-not installed.
> 
> When this routine is invoked more frequently, the measurement
> interval becomes shorter.  However, the code limits re-computation
> to 10ms intervals so that average frequency remains meaningful.
> 
> Discerning users are encouraged to take advantage of
> the turbostat(8) utility, which can gracefully handle
> concurrent measurement intervals of arbitrary length.
> 
> Signed-off-by: Len Brown <len.brown@...el.com>
> ---
>  arch/x86/kernel/cpu/Makefile     |  1 +
>  arch/x86/kernel/cpu/aperfmperf.c | 82 ++++++++++++++++++++++++++++++++++++++++
>  drivers/cpufreq/cpufreq.c        |  7 +++-
>  include/linux/cpufreq.h          | 13 +++++++
>  4 files changed, 102 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/kernel/cpu/aperfmperf.c
> 
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 5200001..cdf8249 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -21,6 +21,7 @@ obj-y			+= common.o
>  obj-y			+= rdrand.o
>  obj-y			+= match.o
>  obj-y			+= bugs.o
> +obj-$(CONFIG_CPU_FREQ)	+= aperfmperf.o
>  
>  obj-$(CONFIG_PROC_FS)	+= proc.o
>  obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
> diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
> new file mode 100644
> index 0000000..5ccf63a
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -0,0 +1,82 @@
> +/*
> + * x86 APERF/MPERF KHz calculation for
> + * /sys/.../cpufreq/scaling_cur_freq
> + *
> + * Copyright (C) 2017 Intel Corp.
> + * Author: Len Brown <len.brown@...el.com>
> + *
> + * This file is licensed under GPLv2.
> + */
> +
> +#include <linux/jiffies.h>
> +#include <linux/math64.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct aperfmperf_sample {
> +	unsigned int khz;
> +	unsigned long jiffies;
> +	u64 aperf;
> +	u64 mperf;
> +};
> +
> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
> +
> +/*
> + * aperfmperf_snapshot_khz()
> + * On the current CPU, snapshot APERF, MPERF, and jiffies
> + * unless we already did it within 10ms
> + * calculate kHz, save snapshot
> + */
> +static void aperfmperf_snapshot_khz(void *dummy)
> +{
> +	u64 aperf, aperf_delta;
> +	u64 mperf, mperf_delta;
> +	struct aperfmperf_sample *s = &get_cpu_var(samples);
> +
> +	/* Don't bother re-computing within 10 ms */
> +	if (time_before(jiffies, s->jiffies + HZ/100))
> +		goto out;
> +
> +	rdmsrl(MSR_IA32_APERF, aperf);
> +	rdmsrl(MSR_IA32_MPERF, mperf);
> +
> +	aperf_delta = aperf - s->aperf;
> +	mperf_delta = mperf - s->mperf;
> +
> +	/*
> +	 * There is no architectural guarantee that MPERF
> +	 * increments faster than we can read it.
> +	 */
> +	if (mperf_delta == 0)
> +		goto out;
> +
> +	/*
> +	 * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
> +	 *	khz = (cpu_khz * aperf_delta) / mperf_delta
> +	 */
> +	if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
> +		s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
> +	else	/* khz = aperf_delta / (mperf_delta / cpu_khz) */
> +		s->khz = div64_u64(aperf_delta,
> +			div64_u64(mperf_delta, cpu_khz));
> +	s->jiffies = jiffies;
> +	s->aperf = aperf;
> +	s->mperf = mperf;
> +
> +out:
> +	put_cpu_var(samples);
> +}
> +
> +unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
> +{
> +	if (!cpu_khz)
> +		return 0;
> +
> +	if (!static_cpu_has(X86_FEATURE_APERFMPERF))
> +		return 0;
> +
> +	smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
> +
> +	return per_cpu(samples.khz, cpu);
> +}
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 26b643d..a667fac 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -635,8 +635,13 @@ show_one(scaling_max_freq, max);
>  static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
>  {
>  	ssize_t ret;
> +	unsigned int freq;
>  
> -	if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get)
> +	freq = arch_freq_get_on_cpu(policy->cpu);
> +	if (freq)
> +		ret = sprintf(buf, "%u\n", freq);
> +	else if (cpufreq_driver && cpufreq_driver->setpolicy &&
> +			cpufreq_driver->get)
>  		ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu));
>  	else
>  		ret = sprintf(buf, "%u\n", policy->cur);

I wonder if we could change intel_pstate_get() to simply return
aperfmperf_khz_on_cpu(cpu_num)?

That would allow us to avoid the extra branch here and get rid of the
#ifdef x86 from the header.

> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index a5ce0bbe..cfc6220 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -883,6 +883,19 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
>  }
>  #endif
>  
> +#ifdef CONFIG_X86
> +extern unsigned int aperfmperf_khz_on_cpu(unsigned int cpu);
> +static inline unsigned int arch_freq_get_on_cpu(int cpu)
> +{
> +	return aperfmperf_khz_on_cpu(cpu);
> +}
> +#else
> +static inline unsigned int arch_freq_get_on_cpu(int cpu)
> +{
> +	return 0;
> +}
> +#endif
> +
>  /* the following are really really optional */
>  extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
>  extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
> 

Thanks,
Rafael

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ