[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3b0ce952-0674-d01d-3fc0-795d35743723@linaro.org>
Date: Tue, 5 Jul 2022 11:09:09 +0200
From: Daniel Lezcano <daniel.lezcano@...aro.org>
To: Lukasz Luba <lukasz.luba@....com>, linux-kernel@...r.kernel.org,
linux-pm@...r.kernel.org
Cc: amitk@...nel.org, rui.zhang@...el.com, viresh.kumar@...aro.org,
rafael@...nel.org, dietmar.eggemann@....com, nm@...com,
sboyd@...nel.org, sudeep.holla@....com, cristian.marussi@....com,
matthias.bgg@...il.com, linux-mediatek@...ts.infradead.org,
linux-arm-kernel@...ts.infradead.org
Subject: Re: [PATCH 1/4] PM: EM: convert power field to micro-Watts precision
and align drivers
On 22/06/2022 16:57, Lukasz Luba wrote:
> The milli-Watts precision causes rounding errors while calculating
> efficiency cost for each OPP. This is especially visible in the 'simple'
> Energy Model (EM), where the power for each OPP is provided from OPP
> framework. This can cause some OPPs to be marked inefficient, while
> using micro-Watts precision that might not happen.
>
> Update all EM users which access 'power' field and assume the value is
> in milli-Watts.
>
> Solve also an issue with potential overflow in calculation of energy
> estimation on 32bit machine. It's needed now since the power value
> (thus the 'cost' as well) are higher.
>
> Example calculation which shows the rounding error and impact:
>
> power = 'dyn-power-coeff' * volt_mV * volt_mV * freq_MHz
>
> power_a_uW = (100 * 600mW * 600mW * 500MHz) / 10^6 = 18000
> power_a_mW = (100 * 600mW * 600mW * 500MHz) / 10^9 = 18
>
> power_b_uW = (100 * 605mW * 605mW * 600MHz) / 10^6 = 21961
> power_b_mW = (100 * 605mW * 605mW * 600MHz) / 10^9 = 21
>
> max_freq = 2000MHz
>
> cost_a_mW = 18 * 2000MHz/500MHz = 72
> cost_a_uW = 18000 * 2000MHz/500MHz = 72000
>
> cost_b_mW = 21 * 2000MHz/600MHz = 70 // <- artificially better
> cost_b_uW = 21961 * 2000MHz/600MHz = 73203
>
> The 'cost_b_mW' (which is based on old milli-Watts) is misleadingly
> better that the 'cost_b_uW' (this patch uses micro-Watts) and such
> would have impact on the 'inefficient OPPs' information in the Cpufreq
> framework. This patch set removes the rounding issue.
Thanks for this detailed description, it really helps to understand why
this change is needed.
Perhaps it would make sense to add a power_uw in the EM structure and
keeping the old one with the milli-watts in order to reduce the impact
of the change.
It is a suggestion if you find it more convenient. Otherwise I'm fine
with this approach too.
A few comments below.
> Signed-off-by: Lukasz Luba <lukasz.luba@....com>
> ---
> drivers/cpufreq/mediatek-cpufreq-hw.c | 7 +--
> drivers/cpufreq/scmi-cpufreq.c | 6 +++
> drivers/opp/of.c | 15 ++++---
> drivers/powercap/dtpm_cpu.c | 5 +--
> drivers/thermal/cpufreq_cooling.c | 13 +++++-
> drivers/thermal/devfreq_cooling.c | 19 ++++++--
> include/linux/energy_model.h | 63 ++++++++++++++++++++-------
> kernel/power/energy_model.c | 31 ++++++++-----
> 8 files changed, 114 insertions(+), 45 deletions(-)
>
> diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c
> index 813cccbfe934..f0e0a35c7f21 100644
> --- a/drivers/cpufreq/mediatek-cpufreq-hw.c
> +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c
> @@ -51,7 +51,7 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = {
> };
>
[ ... ]
> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
> index b8151d95a806..dc19e7c80751 100644
> --- a/drivers/thermal/cpufreq_cooling.c
> +++ b/drivers/thermal/cpufreq_cooling.c
> @@ -21,6 +21,7 @@
> #include <linux/pm_qos.h>
> #include <linux/slab.h>
> #include <linux/thermal.h>
> +#include <linux/units.h>
>
> #include <trace/events/thermal.h>
>
> @@ -101,6 +102,7 @@ static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
> static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
> u32 freq)
> {
> + unsigned long power_mw;
> int i;
>
> for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
> @@ -108,16 +110,23 @@ static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
> break;
> }
>
> - return cpufreq_cdev->em->table[i + 1].power;
> + power_mw = cpufreq_cdev->em->table[i + 1].power;
> + power_mw /= MICROWATT_PER_MILLIWATT;
Won't this fail with an unresolved symbols on some archs ? I mean may be
do_div should be used instead ?
> +
> + return power_mw;
> }
[ ... ]
> #ifdef CONFIG_64BIT
> -#define em_scale_power(p) ((p) * 1000)
> +#define em_estimate_energy(cost, sum_util, scale_cpu) \
> + (((cost) * (sum_util)) / (scale_cpu))
> #else
> -#define em_scale_power(p) (p)
> +#define em_estimate_energy(cost, sum_util, scale_cpu) \
> + (((cost) / (scale_cpu)) * (sum_util))
> #endif
>
> struct em_data_callback {
> @@ -112,7 +143,7 @@ struct em_data_callback {
> * and frequency.
> *
> * In case of CPUs, the power is the one of a single CPU in the domain,
> - * expressed in milli-Watts or an abstract scale. It is expected to
> + * expressed in micro-Watts or an abstract scale. It is expected to
> * fit in the [0, EM_MAX_POWER] range.
> *
> * Return 0 on success.
> @@ -148,7 +179,7 @@ struct em_perf_domain *em_cpu_get(int cpu);
> struct em_perf_domain *em_pd_get(struct device *dev);
> int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
> struct em_data_callback *cb, cpumask_t *span,
> - bool milliwatts);
> + bool microwatts);
> void em_dev_unregister_perf_domain(struct device *dev);
>
> /**
> @@ -273,7 +304,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
> * pd_nrg = ------------------------ (4)
> * scale_cpu
> */
> - return ps->cost * sum_util / scale_cpu;
> + return em_estimate_energy(ps->cost, sum_util, scale_cpu);
> }
>
> /**
> @@ -297,7 +328,7 @@ struct em_data_callback {};
> static inline
> int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
> struct em_data_callback *cb, cpumask_t *span,
> - bool milliwatts)
> + bool microwatts)
> {
> return -EINVAL;
> }
> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
> index 6c373f2960e7..910668ec8838 100644
> --- a/kernel/power/energy_model.c
> +++ b/kernel/power/energy_model.c
> @@ -108,10 +108,11 @@ static void em_debug_remove_pd(struct device *dev) {}
>
> static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
> int nr_states, struct em_data_callback *cb,
> - unsigned long flags)
> + unsigned long flags, int num_devs)
> {
> unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
> struct em_perf_state *table;
> + unsigned long max_cost = 0;
> int i, ret;
> u64 fmax;
>
> @@ -145,7 +146,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
>
> /*
> * The power returned by active_state() is expected to be
> - * positive and to fit into 16 bits.
> + * positive and be in range.
> */
> if (!power || power > EM_MAX_POWER) {
> dev_err(dev, "EM: invalid power: %lu\n",
> @@ -170,7 +171,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
> goto free_ps_table;
> }
> } else {
> - power_res = em_scale_power(table[i].power);
> + power_res = table[i].power;
> cost = div64_u64(fmax * power_res, table[i].frequency);
> }
>
> @@ -183,6 +184,15 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
> } else {
> prev_cost = table[i].cost;
> }
> +
> + if (max_cost < table[i].cost)
> + max_cost = table[i].cost;
> + }
> +
> + /* Check if it won't overflow during energy estimation. */
> + if (em_validate_cost(max_cost, num_devs)) {
I'm not finding the em_validate_cost() function
> + dev_err(dev, "EM: too big 'cost' value: %lu\n", max_cost);
> + goto free_ps_table;
> }
>
> pd->table = table;
> @@ -199,9 +209,9 @@ static int em_create_pd(struct device *dev, int nr_states,
> struct em_data_callback *cb, cpumask_t *cpus,
> unsigned long flags)
> {
> + int cpu, ret, num_devs = 1;
> struct em_perf_domain *pd;
> struct device *cpu_dev;
> - int cpu, ret;
>
> if (_is_cpu_device(dev)) {
> pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
> @@ -209,13 +219,14 @@ static int em_create_pd(struct device *dev, int nr_states,
> return -ENOMEM;
>
> cpumask_copy(em_span_cpus(pd), cpus);
> + num_devs = cpumask_weight(cpus);
Why is this change needed ? What is the connection with the uW unit change ?
> } else {
> pd = kzalloc(sizeof(*pd), GFP_KERNEL);
> if (!pd)
> return -ENOMEM;
> }
>
> - ret = em_create_perf_table(dev, pd, nr_states, cb, flags);
> + ret = em_create_perf_table(dev, pd, nr_states, cb, flags, num_devs);
> if (ret) {
> kfree(pd);
> return ret;
> @@ -314,13 +325,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
> * @cpus : Pointer to cpumask_t, which in case of a CPU device is
> * obligatory. It can be taken from i.e. 'policy->cpus'. For other
> * type of devices this should be set to NULL.
> - * @milliwatts : Flag indicating that the power values are in milliWatts or
> + * @microwatts : Flag indicating that the power values are in micro-Watts or
> * in some other scale. It must be set properly.
> *
> * Create Energy Model tables for a performance domain using the callbacks
> * defined in cb.
> *
> - * The @milliwatts is important to set with correct value. Some kernel
> + * The @microwatts is important to set with correct value. Some kernel
> * sub-systems might rely on this flag and check if all devices in the EM are
> * using the same scale.
> *
> @@ -331,7 +342,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
> */
> int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
> struct em_data_callback *cb, cpumask_t *cpus,
> - bool milliwatts)
> + bool microwatts)
> {
> unsigned long cap, prev_cap = 0;
> unsigned long flags = 0;
> @@ -381,8 +392,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
> }
> }
>
> - if (milliwatts)
> - flags |= EM_PERF_DOMAIN_MILLIWATTS;
> + if (microwatts)
> + flags |= EM_PERF_DOMAIN_MICROWATTS;
> else if (cb->get_cost)
> flags |= EM_PERF_DOMAIN_ARTIFICIAL;
>
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Powered by blists - more mailing lists