[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <346158ff-688b-4fa1-830d-807eb1a30448@amd.com>
Date: Wed, 23 Apr 2025 19:49:25 +0530
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@....com>
To: Mario Limonciello <mario.limonciello@....com>, gautham.shenoy@....com
Cc: linux-pm@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 2/2] cpufreq/amd-pstate: Add support for the "Requested
CPU Min frequency" BIOS option
On 4/23/2025 7:35 PM, Mario Limonciello wrote:
> On 4/21/2025 11:02 PM, Dhananjay Ugwekar wrote:
>>
>>
>> On 4/21/2025 10:23 PM, Mario Limonciello wrote:
>>> On 4/21/2025 3:04 AM, Dhananjay Ugwekar wrote:
>>>> Initialize lower frequency limit to the "Requested CPU Min frequency"
>>>> BIOS option (if it is set) value as part of the driver->init()
>>>> callback. The BIOS specified value is passed by the PMFW as min_perf in
>>>> CPPC_REQ MSR.
>>>>
>>>> To ensure that we don't mistake a stale min_perf value in CPPC_REQ
>>>> value as the "Requested CPU Min frequency" during a kexec wakeup, reset
>>>> the CPPC_REQ.min_perf value back to the BIOS specified one in the offline,
>>>> exit and suspend callbacks. amd_pstate_target() and
>>>> amd_pstate_epp_update_limit() which are invoked as part of the resume()
>>>> and online() callbacks will take care of restoring the CPPC_REQ back to
>>>> the latest sane values.
>>>>
>>>> Signed-off-by: Dhananjay Ugwekar <dhananjay.ugwekar@....com>
>>>> ---
>>>> Changes in v2:
>>>> * Modify the condition in msr_init_perf to initialize perf.bios_min_perf
>>>> to 0 by default
>>>> * Use READ_ONCE to read cpudata->perf in exit, suspend and offline
>>>> callbacks
>>>> ---
>>>> drivers/cpufreq/amd-pstate.c | 67 +++++++++++++++++++++++++++++-------
>>>> drivers/cpufreq/amd-pstate.h | 2 ++
>>>> 2 files changed, 56 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
>>>> index 02de51001eba..407fdd31fb0b 100644
>>>> --- a/drivers/cpufreq/amd-pstate.c
>>>> +++ b/drivers/cpufreq/amd-pstate.c
>>>> @@ -389,7 +389,8 @@ static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy)
>>>> static int msr_init_perf(struct amd_cpudata *cpudata)
>>>> {
>>>> union perf_cached perf = READ_ONCE(cpudata->perf);
>>>> - u64 cap1, numerator;
>>>> + u64 cap1, numerator, cppc_req;
>>>> + u8 min_perf;
>>>> int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
>>>> &cap1);
>>>> @@ -400,6 +401,22 @@ static int msr_init_perf(struct amd_cpudata *cpudata)
>>>> if (ret)
>>>> return ret;
>>>> + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req);
>>>> + if (ret)
>>>> + return ret;
>>>> +
>>>> + WRITE_ONCE(cpudata->cppc_req_cached, cppc_req);
>>>> + min_perf = FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cppc_req);
>>>> +
>>>> + /*
>>>> + * Clear out the min_perf part to check if the rest of the MSR is 0, if yes, this is an
>>>> + * indication that the min_perf value is the one specified through the BIOS option
>>>> + */
>>>> + cppc_req &= ~(AMD_CPPC_MIN_PERF_MASK);
>>>> +
>>>> + if (!cppc_req)
>>>> + perf.bios_min_perf = min_perf;
>>>> +
>>>> perf.highest_perf = numerator;
>>>> perf.max_limit_perf = numerator;
>>>> perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1);
>>>> @@ -580,20 +597,26 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
>>>> {
>>>> /*
>>>> * Initialize lower frequency limit (i.e.policy->min) with
>>>> - * lowest_nonlinear_frequency which is the most energy efficient
>>>> - * frequency. Override the initial value set by cpufreq core and
>>>> - * amd-pstate qos_requests.
>>>> + * lowest_nonlinear_frequency or the min frequency (if) specified in BIOS,
>>>> + * Override the initial value set by cpufreq core and amd-pstate qos_requests.
>>>> */
>>>> if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
>>>> struct cpufreq_policy *policy __free(put_cpufreq_policy) =
>>>> cpufreq_cpu_get(policy_data->cpu);
>>>> struct amd_cpudata *cpudata;
>>>> + union perf_cached perf;
>>>> if (!policy)
>>>> return -EINVAL;
>>>> cpudata = policy->driver_data;
>>>> - policy_data->min = cpudata->lowest_nonlinear_freq;
>>>> + perf = READ_ONCE(cpudata->perf);
>>>> +
>>>> + if (perf.bios_min_perf)
>>>> + policy_data->min = perf_to_freq(perf, cpudata->nominal_freq,
>>>> + perf.bios_min_perf);
>>>> + else
>>>> + policy_data->min = cpudata->lowest_nonlinear_freq;
>>>> }
>>>> cpufreq_verify_within_cpu_limits(policy_data);
>>>> @@ -1040,6 +1063,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
>>>> static void amd_pstate_cpu_exit(struct cpufreq_policy *policy)
>>>> {
>>>> struct amd_cpudata *cpudata = policy->driver_data;
>>>> + union perf_cached perf = READ_ONCE(cpudata->perf);
>>>> +
>>>> + /* Reset CPPC_REQ MSR to the BIOS value */
>>>> + amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false);
>>>> freq_qos_remove_request(&cpudata->req[1]);
>>>> freq_qos_remove_request(&cpudata->req[0]);
>>>> @@ -1428,7 +1455,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
>>>> struct amd_cpudata *cpudata;
>>>> union perf_cached perf;
>>>> struct device *dev;
>>>> - u64 value;
>>>> int ret;
>>>> /*
>>>> @@ -1493,12 +1519,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
>>>> cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE;
>>>> }
>>>> - if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
>>>> - ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
>>>> - if (ret)
>>>> - return ret;
>>>> - WRITE_ONCE(cpudata->cppc_req_cached, value);
>>>> - }
>>>> ret = amd_pstate_set_epp(policy, cpudata->epp_default);
>>>> if (ret)
>>>> return ret;
>>>> @@ -1518,6 +1538,11 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
>>>> struct amd_cpudata *cpudata = policy->driver_data;
>>>> if (cpudata) {
>>>> + union perf_cached perf = READ_ONCE(cpudata->perf);
>>>> +
>>>> + /* Reset CPPC_REQ MSR to the BIOS value */
>>>> + amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false);
>>>> +
>>>> kfree(cpudata);
>>>> policy->driver_data = NULL;
>>>> }
>>>> @@ -1575,12 +1600,28 @@ static int amd_pstate_cpu_online(struct cpufreq_policy *policy)
>>>> static int amd_pstate_cpu_offline(struct cpufreq_policy *policy)
>>>> {
>>>> - return 0;
>>>> + struct amd_cpudata *cpudata = policy->driver_data;
>>>> + union perf_cached perf = READ_ONCE(cpudata->perf);
>>>> +
>>>> + /*
>>>> + * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified
>>>> + * min_perf value across kexec reboots. If this CPU is just onlined normally after this, the
>>>> + * limits, epp and desired perf will get reset to the cached values in cpudata struct
>>>> + */
>>>> + return amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false);
>>>> }
>>>> static int amd_pstate_suspend(struct cpufreq_policy *policy)
>>>> {
>>>> struct amd_cpudata *cpudata = policy->driver_data;
>>>> + union perf_cached perf = READ_ONCE(cpudata->perf);
>>>> +
>>>> + /*
>>>> + * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified
>>>> + * min_perf value across kexec reboots. If this CPU is just resumed back without kexec,
>>>> + * the limits, epp and desired perf will get reset to the cached values in cpudata struct
>>>> + */
>>>> + amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false);
>>>
>>> In EPP mode this appears it would be OK because the perf value should get reset in the resume for amd_pstate_epp_update_limit() but in passive mode won't this never get reset on resume from suspend?
>>
>> In passive mode, on resume, amd_pstate_target gets invoked through the code flow mentioned below,
>>
>> Cpufreq_resume()->cpufreq_start_governor->(cpufreq_driver->start()&&cpufreq_driver->limits())->amd_pstate_target() [this is for _target() based governors]
>> For schedutil, start_governor will register the update_util hook and it will be called at every util change, which eventually calls adjust_perf()
>>
>> I tested these scenarios using "sudo rtcwake -m mem -s 10" (suspend to mem for 10 seconds) on a server system, within 1-2 mins of the wakeup the CPPC_REQ MSR
>> values of all CPUs get updated to sane ones. It would be helpful if you could test for such scenarios on the client systems as well.
>>
>> That said, there might be a small window between the resume and governor trigger, where the value in CPPC_REQ MSR would be invalid. Are we okay with that ?
>
> For server systems it's probably not much of a big deal since servers aren't frequently suspended, bu this window of time seems untenable for a client machine. As a user that would mean effectively they have wrong limits programmed after wakeup for 1-2 minutes and could potentially have performance gimped as a result.
>
> I'd say let's just flush the right value immediately after resume and then the write 1-2 minutes later becomes a no-op. With the checks we have in the driver now I expect that they don't even turn into MSR writes.
Makes sense, I'll add this and post v3
>
>>
>>>
>>>> /* invalidate to ensure it's rewritten during resume */
>>>> cpudata->cppc_req_cached = 0;
>>>> diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h
>>>> index fbe1c08d3f06..2f7ae364d331 100644
>>>> --- a/drivers/cpufreq/amd-pstate.h
>>>> +++ b/drivers/cpufreq/amd-pstate.h
>>>> @@ -30,6 +30,7 @@
>>>> * @lowest_perf: the absolute lowest performance level of the processor
>>>> * @min_limit_perf: Cached value of the performance corresponding to policy->min
>>>> * @max_limit_perf: Cached value of the performance corresponding to policy->max
>>>> + * @bios_min_perf: Cached perf value corresponding to the "Requested CPU Min Frequency" BIOS option
>>>> */
>>>> union perf_cached {
>>>> struct {
>>>> @@ -39,6 +40,7 @@ union perf_cached {
>>>> u8 lowest_perf;
>>>> u8 min_limit_perf;
>>>> u8 max_limit_perf;
>>>> + u8 bios_min_perf;
>>>> };
>>>> u64 val;
>>>> };
>>>
>>
>
Powered by blists - more mailing lists