lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <943dc66e-0266-4f2c-8154-5a4510fdb843@windriver.com>
Date: Wed, 4 Sep 2024 15:46:47 +0800
From: guocai he <guocai.he.cn@...driver.com>
To: Thomas Gleixner <tglx@...utronix.de>, Jan Kiszka
 <jan.kiszka@...mens.com>,
        Henning Schild <henning.schild@...mens.com>
Cc: Peter Zijlstra <peterz@...radead.org>, x86@...nel.org,
        linux-kernel@...r.kernel.org, Ingo Molnar <mingo@...hat.com>,
        Borislav Petkov <bp@...en8.de>, Guenter Roeck <linux@...ck-us.net>,
        xenomai@...omai.org
Subject: Re: sched: Unexpected reschedule of offline CPU#2!

Thanks very much.

I will let our customer try this patch and let you know the result.


-guocai

On 9/3/24 23:27, Thomas Gleixner wrote:
> CAUTION: This email comes from a non Wind River email account!
> Do not click links or open attachments unless you recognize the sender and know the content is safe.
>
> On Tue, Jul 27 2021 at 10:46, Jan Kiszka wrote:
>
> Picking up this dead thread again.
>
>> What is supposed to prevent the following in mainline:
>>
>> CPU 0                   CPU 1                      CPU 2
>>
>> native_stop_other_cpus                             <INTERRUPT>
>>    send_IPI_allbutself                              ...
>>                          <INTERRUPT>
>>                          sysvec_reboot
>>                            stop_this_cpu
>>                              set_cpu_online(false)
>>                                                     native_smp_send_reschedule(1)
>>                                                       if (cpu_is_offline(1)) ...
> Nothing. And that's what probably happens if I read the stack trace
> correctly.
>
> But we can be slightly smarter about this for the reboot IPI (the NMI
> case does not have that issue).
>
> CPU 0                   CPU 1                      CPU 2
>
> native_stop_other_cpus                             <INTERRUPT>
>     send_IPI_allbutself                              ...
>                           <IPI>
>                           sysvec_reboot
>                             wait_for_others();
>                                                      </INTERRUPT>
>                                                      <IPI>
>                                                      sysvec_reboot
>                                                      wait_for_others();
>                             stop_this_cpu();           stop_this_cpu();
>                               set_cpu_online(false);     set_cpu_online(false);
>
> Something like the uncompiled below.
>
> Thanks,
>
>          tglx
> ---
> --- a/arch/x86/include/asm/cpu.h
> +++ b/arch/x86/include/asm/cpu.h
> @@ -68,5 +68,6 @@ bool intel_find_matching_signature(void
>   int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
>
>   extern struct cpumask cpus_stop_mask;
> +atomic_t cpus_stop_in_ipi;
>
>   #endif /* _ASM_X86_CPU_H */
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -721,7 +721,7 @@ bool xen_set_default_idle(void);
>   #define xen_set_default_idle 0
>   #endif
>
> -void __noreturn stop_this_cpu(void *dummy);
> +void __noreturn stop_this_cpu(bool sync);
>   void microcode_check(struct cpuinfo_x86 *prev_info);
>   void store_cpu_caps(struct cpuinfo_x86 *info);
>
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -791,9 +791,10 @@ bool xen_set_default_idle(void)
>   }
>   #endif
>
> +atomic_t cpus_stop_in_ipi;
>   struct cpumask cpus_stop_mask;
>
> -void __noreturn stop_this_cpu(void *dummy)
> +void __noreturn stop_this_cpu(bool sync)
>   {
>          struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
>          unsigned int cpu = smp_processor_id();
> @@ -801,6 +802,16 @@ void __noreturn stop_this_cpu(void *dumm
>          local_irq_disable();
>
>          /*
> +        * Account this CPU and loop until the other CPUs reached this
> +        * point. If they don't react, the control CPU will raise an NMI.
> +        */
> +       if (sync) {
> +               atomic_dec(&cpus_stop_in_ipi);
> +               while (atomic_read(&cpus_stop_in_ipi))
> +                       cpu_relax();
> +       }
> +
> +       /*
>           * Remove this CPU from the online mask and disable it
>           * unconditionally. This might be redundant in case that the reboot
>           * vector was handled late and stop_other_cpus() sent an NMI.
> --- a/arch/x86/kernel/reboot.c
> +++ b/arch/x86/kernel/reboot.c
> @@ -788,7 +788,7 @@ static void native_machine_halt(void)
>
>          tboot_shutdown(TB_SHUTDOWN_HALT);
>
> -       stop_this_cpu(NULL);
> +       stop_this_cpu(false);
>   }
>
>   static void native_machine_power_off(void)
> --- a/arch/x86/kernel/smp.c
> +++ b/arch/x86/kernel/smp.c
> @@ -125,7 +125,7 @@ static int smp_stop_nmi_callback(unsigne
>                  return NMI_HANDLED;
>
>          cpu_emergency_disable_virtualization();
> -       stop_this_cpu(NULL);
> +       stop_this_cpu(false);
>
>          return NMI_HANDLED;
>   }
> @@ -137,7 +137,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
>   {
>          apic_eoi();
>          cpu_emergency_disable_virtualization();
> -       stop_this_cpu(NULL);
> +       stop_this_cpu(true);
>   }
>
>   static int register_stop_handler(void)
> @@ -189,6 +189,7 @@ static void native_stop_other_cpus(int w
>           */
>          cpumask_copy(&cpus_stop_mask, cpu_online_mask);
>          cpumask_clear_cpu(this_cpu, &cpus_stop_mask);
> +       atomic_set(&cpus_stop_in_ipi, num_online_cpus() - 1);
>
>          if (!cpumask_empty(&cpus_stop_mask)) {
>                  apic_send_IPI_allbutself(REBOOT_VECTOR);
> @@ -235,10 +236,12 @@ static void native_stop_other_cpus(int w
>          local_irq_restore(flags);
>
>          /*
> -        * Ensure that the cpus_stop_mask cache lines are invalidated on
> -        * the other CPUs. See comment vs. SME in stop_this_cpu().
> +        * Ensure that the cpus_stop_mask and cpus_stop_in_ipi cache lines
> +        * are invalidated on the other CPUs. See comment vs. SME in
> +        * stop_this_cpu().
>           */
>          cpumask_clear(&cpus_stop_mask);
> +       atomic_set(&cpus_stop_in_ipi, 0);
>   }
>
>   /*
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ