APERF/MPERF support for cpu_power. APERF/MPERF is arch defined to be a relative scale of work capacity per logical cpu; this is assumed to include SMT and Turbo mode. APERF/MPERF are specified to both reset to 0 when either counter wraps, which is highly inconvenient, since that'll give a blip when that happens. The manual specifies writing 0 to the counters after each read, but that's 1) too expensive, and 2) destroys the possibility of sharing these counters with other users, so we live with the blip - the other existing user does too. Signed-off-by: Peter Zijlstra Signed-off-by: Dinakar Guniguntala --- arch/x86/kernel/cpu/Makefile | 2 - arch/x86/kernel/cpu/sched.c | 58 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 4 ++ 3 files changed, 63 insertions(+), 1 deletion(-) Index: linux-2.6.31.4-rt14-lb1/arch/x86/kernel/cpu/Makefile =================================================================== --- linux-2.6.31.4-rt14-lb1.orig/arch/x86/kernel/cpu/Makefile 2009-10-21 10:47:15.000000000 -0400 +++ linux-2.6.31.4-rt14-lb1/arch/x86/kernel/cpu/Makefile 2009-10-21 10:49:00.000000000 -0400 @@ -13,7 +13,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o Index: linux-2.6.31.4-rt14-lb1/arch/x86/kernel/cpu/sched.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.31.4-rt14-lb1/arch/x86/kernel/cpu/sched.c 2009-10-21 10:49:00.000000000 -0400 @@ -0,0 +1,58 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static DEFINE_PER_CPU(struct aperfmperf, old_aperfmperf); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf cur, val, *old = &__get_cpu_var(old_aperfmperf); + unsigned long ratio = SCHED_LOAD_SCALE; + unsigned long flags; + + 
local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + cur = val; + cur.aperf -= old->aperf; + cur.mperf -= old->mperf; + *old = val; + + cur.mperf >>= SCHED_LOAD_SHIFT; + if (cur.mperf) + ratio = div_u64(cur.aperf, cur.mperf); + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} Index: linux-2.6.31.4-rt14-lb1/include/linux/sched.h =================================================================== --- linux-2.6.31.4-rt14-lb1.orig/include/linux/sched.h 2009-10-21 10:47:15.000000000 -0400 +++ linux-2.6.31.4-rt14-lb1/include/linux/sched.h 2009-10-21 10:49:00.000000000 -0400 @@ -1047,6 +1047,10 @@ } #endif /* !CONFIG_SMP */ + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); + struct io_context; /* See blkdev.h */ -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/