[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <50546456-8b65-4e42-97b1-01e3cc48beac@arm.com>
Date: Tue, 5 Nov 2024 09:08:12 +0000
From: Lukasz Luba <lukasz.luba@....com>
To: "Rafael J. Wysocki" <rafael@...nel.org>
Cc: linux-kernel@...r.kernel.org, linux-pm@...r.kernel.org,
dietmar.eggemann@....com
Subject: Re: [PATCH v3 1/1] PM: EM: Add min/max available performance state
limits
On 11/4/24 22:02, Rafael J. Wysocki wrote:
> On Wed, Oct 30, 2024 at 5:40 PM Lukasz Luba <lukasz.luba@....com> wrote:
>>
>> On some devices there are HW dependencies for shared frequency and voltage
>> between devices. It will impact Energy Aware Scheduler (EAS) decision,
>> where CPUs share the voltage & frequency domain with other CPUs or devices
>> e.g.
>> - Mid CPUs + Big CPU
>> - Little CPU + L3 cache in DSU
>> - some other device + Little CPUs
>>
>> Detailed explanation of one example:
>> When the L3 cache frequency is increased, the affected Little CPUs might
>> run at higher voltage and frequency. That higher voltage causes higher CPU
>> power and thus more energy is used for running the tasks. This is
>> important for background running tasks, which try to run on energy
>> efficient CPUs.
>>
>> Therefore, add performance state limits which are applied for the device
>> (in this case CPU). This is important on SoCs with HW dependencies
>> mentioned above so that the Energy Aware Scheduler (EAS) does not use
>> performance states outside the valid min-max range for energy calculation.
>>
>> Signed-off-by: Lukasz Luba <lukasz.luba@....com>
>> ---
>> include/linux/energy_model.h | 29 ++++++++++++++------
>> kernel/power/energy_model.c | 52 ++++++++++++++++++++++++++++++++++++
>> 2 files changed, 73 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
>> index 1ff52020cf757..752e0b2975820 100644
>> --- a/include/linux/energy_model.h
>> +++ b/include/linux/energy_model.h
>> @@ -55,6 +55,8 @@ struct em_perf_table {
>> * struct em_perf_domain - Performance domain
>> * @em_table: Pointer to the runtime modifiable em_perf_table
>> * @nr_perf_states: Number of performance states
>> + * @min_perf_state: Minimum allowed Performance State index
>> + * @max_perf_state: Maximum allowed Performance State index
>> * @flags: See "em_perf_domain flags"
>> * @cpus: Cpumask covering the CPUs of the domain. It's here
>> * for performance reasons to avoid potential cache
>> @@ -70,6 +72,8 @@ struct em_perf_table {
>> struct em_perf_domain {
>> struct em_perf_table __rcu *em_table;
>> int nr_perf_states;
>> + int min_perf_state;
>> + int max_perf_state;
>> unsigned long flags;
>> unsigned long cpus[];
>> };
>> @@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
>> int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
>> int nr_states);
>> int em_dev_update_chip_binning(struct device *dev);
>> +int em_update_performance_limits(struct em_perf_domain *pd,
>> + unsigned long freq_min_khz, unsigned long freq_max_khz);
>>
>> /**
>> * em_pd_get_efficient_state() - Get an efficient performance state from the EM
>> * @table: List of performance states, in ascending order
>> - * @nr_perf_states: Number of performance states
>> + * @pd: performance domain for which this must be done
>> * @max_util: Max utilization to map with the EM
>> - * @pd_flags: Performance Domain flags
>> *
>> * It is called from the scheduler code quite frequently and as a consequence
>> * doesn't implement any check.
>> @@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
>> * requirement.
>> */
>> static inline int
>> -em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
>> - unsigned long max_util, unsigned long pd_flags)
>> +em_pd_get_efficient_state(struct em_perf_state *table,
>> + struct em_perf_domain *pd, unsigned long max_util)
>> {
>> + unsigned long pd_flags = pd->flags;
>> + int min_ps = pd->min_perf_state;
>> + int max_ps = pd->max_perf_state;
>> struct em_perf_state *ps;
>> int i;
>>
>> - for (i = 0; i < nr_perf_states; i++) {
>> + for (i = min_ps; i <= max_ps; i++) {
>> ps = &table[i];
>> if (ps->performance >= max_util) {
>> if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
>> @@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
>> }
>> }
>>
>> - return nr_perf_states - 1;
>> + return max_ps;
>> }
>>
>> /**
>> @@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
>> * requested performance.
>> */
>> em_table = rcu_dereference(pd->em_table);
>> - i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
>> - max_util, pd->flags);
>> + i = em_pd_get_efficient_state(em_table->state, pd, max_util);
>> ps = &em_table->state[i];
>>
>> /*
>> @@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
>> {
>> return -EINVAL;
>> }
>> +static inline
>> +int em_update_performance_limits(struct em_perf_domain *pd,
>> + unsigned long freq_min_khz, unsigned long freq_max_khz)
>> +{
>> + return -EINVAL;
>> +}
>> #endif
>>
>> #endif
>> diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
>> index 927cc55ba0b3d..d07faf42eace6 100644
>> --- a/kernel/power/energy_model.c
>> +++ b/kernel/power/energy_model.c
>> @@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
>> goto unlock;
>>
>> dev->em_pd->flags |= flags;
>> + dev->em_pd->min_perf_state = 0;
>> + dev->em_pd->max_perf_state = nr_states - 1;
>>
>> em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
>>
>> @@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
>> return em_recalc_and_update(dev, pd, em_table);
>> }
>> EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
>> +
>> +
>> +/**
>> + * em_update_performance_limits() - Update Energy Model with performance
>> + * limits information.
>> + * @pd : Performance Domain with EM that has to be updated.
>> + * @freq_min_khz : New minimum allowed frequency for this device.
>> + * @freq_max_khz : New maximum allowed frequency for this device.
>> + *
>> + * This function allows to update the EM with information about available
>> + * performance levels. It takes the minimum and maximum frequency in kHz
>> + * and does internal translation to performance levels.
>> + * Returns 0 on success or -EINVAL when failed.
>> + */
>> +int em_update_performance_limits(struct em_perf_domain *pd,
>> + unsigned long freq_min_khz, unsigned long freq_max_khz)
>> +{
>> + struct em_perf_state *table;
>> + int min_ps = -1;
>> + int max_ps = -1;
>> + int i;
>> +
>> + if (!pd)
>> + return -EINVAL;
>> +
>> + rcu_read_lock();
>> + table = em_perf_state_from_pd(pd);
>> +
>> + for (i = 0; i < pd->nr_perf_states; i++) {
>> + if (freq_min_khz == table[i].frequency)
>> + min_ps = i;
>> + if (freq_max_khz == table[i].frequency)
>> + max_ps = i;
>> + }
>> + rcu_read_unlock();
>> +
>> + /* Only update when both are found and sane */
>> + if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
>> + return -EINVAL;
>> +
>> +
>> + /* Guard simultaneous updates and make them atomic */
>> + mutex_lock(&em_pd_mutex);
>> + pd->min_perf_state = min_ps;
>> + pd->max_perf_state = max_ps;
>> + mutex_unlock(&em_pd_mutex);
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(em_update_performance_limits);
>> --
>
> Applied as 6.13 material, thanks!
Thank you Rafael!
Powered by blists - more mailing lists