linux-kernel - Re: [PATCH RFC] x86/cpu: fix intermittent lockup on poweroff

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <01a44722-931a-7aff-4f4b-75e78855beb1@amd.com>
Date:   Wed, 26 Apr 2023 12:51:00 -0500
From:   Tom Lendacky <thomas.lendacky@....com>
To:     Tony Battersby <tonyb@...ernetics.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Dave Hansen <dave.hansen@...el.com>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org
Cc:     "H. Peter Anvin" <hpa@...or.com>,
        Mario Limonciello <mario.limonciello@....com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Andi Kleen <ak@...ux.intel.com>
Subject: Re: [PATCH RFC] x86/cpu: fix intermittent lockup on poweroff

On 4/26/23 12:37, Tony Battersby wrote:
> On 4/26/23 12:37, Thomas Gleixner wrote:
>> The problem really seems to be that the control CPU goes off before the
>> other CPUs have finished and depending on timing that causes the
>> wreckage. Otherwise the mdelay(100) would not have helped at all.
>>
>> But looking at it, that num_online_cpus() == 1 check in
>> stop_other_cpus() is fragile as hell independent of that wbinvd() issue.
>>
>> Something like the completely untested below should cure that.
>>
>> Thanks,
>>
>>          tglx
>> ---
>>   arch/x86/include/asm/cpu.h |    2 ++
>>   arch/x86/kernel/process.c  |   10 ++++++++++
>>   arch/x86/kernel/smp.c      |   15 ++++++++++++---
>>   3 files changed, 24 insertions(+), 3 deletions(-)
>>
>> --- a/arch/x86/include/asm/cpu.h
>> +++ b/arch/x86/include/asm/cpu.h
>> @@ -98,4 +98,6 @@ extern u64 x86_read_arch_cap_msr(void);
>>   int intel_find_matching_signature(void *mc, unsigned int csig, int cpf);
>>   int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
>>   
>> +extern atomic_t stop_cpus_count;
>> +
>>   #endif /* _ASM_X86_CPU_H */
>> --- a/arch/x86/kernel/process.c
>> +++ b/arch/x86/kernel/process.c
>> @@ -752,6 +752,8 @@ bool xen_set_default_idle(void)
>>   }
>>   #endif
>>   
>> +atomic_t stop_cpus_count;
>> +
>>   void __noreturn stop_this_cpu(void *dummy)
>>   {
>>   	local_irq_disable();
>> @@ -776,6 +778,14 @@ void __noreturn stop_this_cpu(void *dumm
>>   	 */
>>   	if (cpuid_eax(0x8000001f) & BIT(0))
>>   		native_wbinvd();
>> +
>> +	/*
>> +	 * native_stop_other_cpus() will write to @stop_cpus_count after
>> +	 * observing that it went down to zero, which will invalidate the
>> +	 * cacheline on this CPU.
>> +	 */
>> +	atomic_dec(&stop_cpus_count);

This is probably going to pull in a cache line and cause the problem that
the native_wbinvd() is trying to avoid.

Thanks,
Tom

>> +
>>   	for (;;) {
>>   		/*
>>   		 * Use native_halt() so that memory contents don't change
>> --- a/arch/x86/kernel/smp.c
>> +++ b/arch/x86/kernel/smp.c
>> @@ -27,6 +27,7 @@
>>   #include <asm/mmu_context.h>
>>   #include <asm/proto.h>
>>   #include <asm/apic.h>
>> +#include <asm/cpu.h>
>>   #include <asm/idtentry.h>
>>   #include <asm/nmi.h>
>>   #include <asm/mce.h>
>> @@ -171,6 +172,8 @@ static void native_stop_other_cpus(int w
>>   		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
>>   			return;
>>   
>> +		atomic_set(&stop_cpus_count, num_online_cpus() - 1);
>> +
>>   		/* sync above data before sending IRQ */
>>   		wmb();
>>   
>> @@ -183,12 +186,12 @@ static void native_stop_other_cpus(int w
>>   		 * CPUs reach shutdown state.
>>   		 */
>>   		timeout = USEC_PER_SEC;
>> -		while (num_online_cpus() > 1 && timeout--)
>> +		while (atomic_read(&stop_cpus_count) > 0 && timeout--)
>>   			udelay(1);
>>   	}
>>   
>>   	/* if the REBOOT_VECTOR didn't work, try with the NMI */
>> -	if (num_online_cpus() > 1) {
>> +	if (atomic_read(&stop_cpus_count) > 0) {
>>   		/*
>>   		 * If NMI IPI is enabled, try to register the stop handler
>>   		 * and send the IPI. In any case try to wait for the other
>> @@ -208,7 +211,7 @@ static void native_stop_other_cpus(int w
>>   		 * one or more CPUs do not reach shutdown state.
>>   		 */
>>   		timeout = USEC_PER_MSEC * 10;
>> -		while (num_online_cpus() > 1 && (wait || timeout--))
>> +		while (atomic_read(&stop_cpus_count) > 0 && (wait || timeout--))
>>   			udelay(1);
>>   	}
>>   
>> @@ -216,6 +219,12 @@ static void native_stop_other_cpus(int w
>>   	disable_local_APIC();
>>   	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
>>   	local_irq_restore(flags);
>> +
>> +	/*
>> +	 * Ensure that the cache line is invalidated on the other CPUs. See
>> +	 * comment vs. SME in stop_this_cpu().
>> +	 */
>> +	atomic_set(&stop_cpus_count, INT_MAX);
>>   }
>>   
>>   /*
>>
> Tested-by: Tony Battersby <tonyb@...ernetics.com>
> 
> 10 successful poweroffs in a row with wbinvd() enabled.  As I mentioned
> before though, I don't have an AMD CPU to test the SME cache
> invalidation logic.
> 
> I will reply with my patch with an updated title and description.
> 
> Tony
> 
>