[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Z0a0JNRPuRYaVrcI@BLRRASHENOY1.amd.com>
Date: Wed, 27 Nov 2024 11:24:44 +0530
From: "Gautham R. Shenoy" <gautham.shenoy@....com>
To: Patryk Wlazlyn <patryk.wlazlyn@...ux.intel.com>
Cc: x86@...nel.org, linux-kernel@...r.kernel.org, linux-pm@...r.kernel.org,
rafael.j.wysocki@...el.com, peterz@...radead.org,
dave.hansen@...ux.intel.com, tglx@...utronix.de,
len.brown@...el.com, artem.bityutskiy@...ux.intel.com
Subject: Re: [PATCH v5 1/3] x86/smp: Allow calling mwait_play_dead with an
arbitrary hint
Hello Patryk,
On Tue, Nov 26, 2024 at 09:15:37PM +0100, Patryk Wlazlyn wrote:
> The MWAIT instruction needs different hints on different CPUs to reach
> specific idle states. The current hint calculation* in mwait_play_dead()
> code works in practice on current Intel hardware, but it fails on a
> recent one, Intel's Sierra Forest and possibly some future ones. Those
> newer CPUs' power efficiency suffers when the CPU is put offline.
>
> * The current algorithm for looking up the mwait hint for the deepest
> cstate, in mwait_play_dead() code works by inspecting CPUID leaf 0x5 and
> calculates the mwait hint based on the number of reported substates.
> This approach depends on the hints associated with them to be continuous
> in the range [0, NUM_SUBSTATES-1]. This continuity is not documented and
> is not met on the recent Intel platforms.
>
> For example, Intel's Sierra Forest reports two cstates with two substates
> each in cpuid leaf 0x5:
>
> Name* target cstate target subcstate (mwait hint)
> ===========================================================
> C1 0x00 0x00
> C1E 0x00 0x01
>
> -- 0x10 ----
>
> C6S 0x20 0x22
> C6P 0x20 0x23
>
> -- 0x30 ----
>
> /* No more (sub)states all the way down to the end. */
> ===========================================================
>
> * Names of the cstates are not included in the CPUID leaf 0x5, they are
> taken from the product specific documentation.
>
> Notice that hints 0x20 and 0x21 are skipped entirely for the target
> cstate 0x20 (C6), being a cause of the problem for the current cpuid
> leaf 0x5 algorithm.
>
> Allow cpuidle code to call mwait play dead loop with a known hint for
> the deepest idle state on a given platform, skipping the cpuid based
> calculation.
Apologies for what may appear to be bikeshedding, but after this patch the
cpuidle code still won't call any mwait based play dead loop since the
support for enter_dead for FFh based idle states in acpi_idle and
intel_idle only gets added in Patches 2 and 3.
Would it make sense to split this Patch 1 into two patches, 1/4 and 4/4?
1/4 just introduces the mwait_play_dead_with_hint() helper which will
be used by patches 2 and 3.
4/4 gets rid of the logic to find the deepest state from
mwait_play_dead() and modifies native_play_dead() to call
cpuidle_play_dead() followed by hlt_play_dead() thus removing any
reference to mwait_play_dead(). Optionally you can even rename
mwait_play_dead_with_hint() to mwait_play_dead().
That way the changelog that you have for this patch can be used in 4/4
since with the addition of play_dead support for FFh states in both
acpi_idle and intel_idle via patches 2 and 3, the logic to find the
deepest ffh state in mwait_play_dead() is no longer required.
Thoughts ?
--
Thanks and Regards
gautham.
>
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@...ux.intel.com>
> ---
> arch/x86/include/asm/smp.h | 3 +++
> arch/x86/kernel/smpboot.c | 46 +++++---------------------------------
> 2 files changed, 8 insertions(+), 41 deletions(-)
>
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index ca073f40698f..633b4a4aec6b 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
> int wbinvd_on_all_cpus(void);
>
> void smp_kick_mwait_play_dead(void);
> +void mwait_play_dead(unsigned long hint);
>
> void native_smp_send_reschedule(int cpu);
> void native_send_call_func_ipi(const struct cpumask *mask);
> @@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
> {
> return (struct cpumask *)cpumask_of(0);
> }
> +
> +static inline void mwait_play_dead(unsigned long eax_hint) { }
> #endif /* CONFIG_SMP */
>
> #ifdef CONFIG_DEBUG_NMI_SELFTEST
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index b5a8f0891135..5dc143e1d6af 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1276,45 +1276,9 @@ void play_dead_common(void)
> * We need to flush the caches before going to sleep, lest we have
> * dirty data in our caches when we come back up.
> */
> -static inline void mwait_play_dead(void)
> +void __noreturn mwait_play_dead(unsigned long eax_hint)
> {
> struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> - unsigned int eax, ebx, ecx, edx;
> - unsigned int highest_cstate = 0;
> - unsigned int highest_subcstate = 0;
> - int i;
> -
> - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
> - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
> - return;
> - if (!this_cpu_has(X86_FEATURE_MWAIT))
> - return;
> - if (!this_cpu_has(X86_FEATURE_CLFLUSH))
> - return;
> - if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> - return;
> -
> - eax = CPUID_MWAIT_LEAF;
> - ecx = 0;
> - native_cpuid(&eax, &ebx, &ecx, &edx);
> -
> - /*
> - * eax will be 0 if EDX enumeration is not valid.
> - * Initialized below to cstate, sub_cstate value when EDX is valid.
> - */
> - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
> - eax = 0;
> - } else {
> - edx >>= MWAIT_SUBSTATE_SIZE;
> - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
> - if (edx & MWAIT_SUBSTATE_MASK) {
> - highest_cstate = i;
> - highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
> - }
> - }
> - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
> - (highest_subcstate - 1);
> - }
>
> /* Set up state for the kexec() hack below */
> md->status = CPUDEAD_MWAIT_WAIT;
> @@ -1335,7 +1299,7 @@ static inline void mwait_play_dead(void)
> mb();
> __monitor(md, 0, 0);
> mb();
> - __mwait(eax, 0);
> + __mwait(eax_hint, 0);
>
> if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
> /*
> @@ -1407,9 +1371,9 @@ void native_play_dead(void)
> play_dead_common();
> tboot_shutdown(TB_SHUTDOWN_WFS);
>
> - mwait_play_dead();
> - if (cpuidle_play_dead())
> - hlt_play_dead();
> + /* Below returns only on error. */
> + cpuidle_play_dead();
> + hlt_play_dead();
> }
>
> #else /* ... !CONFIG_HOTPLUG_CPU */
> --
> 2.47.1
>
Powered by blists - more mailing lists