Message-ID: <ab147205-41cc-4c1b-8280-000e638b6534@seco.com>
Date: Mon, 11 Mar 2024 12:30:36 -0400
From: Sean Anderson <sean.anderson@...o.com>
To: Thomas Gleixner <tglx@...utronix.de>,
 Mirsad Todorovac <mirsad.todorovac@....unizg.hr>,
 linux-kernel@...r.kernel.org
Cc: Frederic Weisbecker <frederic@...nel.org>, Ingo Molnar <mingo@...nel.org>
Subject: Re: [BUG] KCSAN: data-race in tick_nohz_idle_stop_tick /
 tick_nohz_next_event

On 3/11/24 10:48, Thomas Gleixner wrote:
> On Thu, Mar 07 2024 at 14:43, Sean Anderson wrote:
>> On 8/18/23 10:15, Mirsad Todorovac wrote:
>>
>> The justification seems to be in tick_sched_do_timer:
>>
>>         /*
>>          * Check if the do_timer duty was dropped. We don't care about
>>          * concurrency: This happens only when the CPU in charge went
>>          * into a long sleep. If two CPUs happen to assign themselves to
>>          * this duty, then the jiffies update is still serialized by
>>          * jiffies_lock.
>>          *
>>          * If nohz_full is enabled, this should not happen because the
>>          * tick_do_timer_cpu never relinquishes.
>>          */
>>
>> with the other assignment being in tick_nohz_stop_tick. I'm not familiar
>> enough with this code to say whether we should be using READ/WRITE_ONCE
>> or maybe just data_race (as would be implied by the comment above).
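
As an aside, in case it is useful to anyone else reading the thread: my
understanding of the distinction I was asking about is roughly the
following. This is an illustrative toy, not kernel code -- READ_ONCE()
and WRITE_ONCE() mark the accesses as intentionally concurrent *and*
constrain the compiler, while data_race() only tells KCSAN that the race
is benign and makes no promise about the access itself.

    #include <linux/compiler.h>

    /* Toy example: one shared variable accessed locklessly. */
    static int shared_state;

    void writer(int v)
    {
            /* Marked store: KCSAN treats racing marked accesses as intended. */
            WRITE_ONCE(shared_state, v);
    }

    int reader_marked(void)
    {
            /*
             * Marked load: pairs with WRITE_ONCE() above, no KCSAN report,
             * and the compiler may not tear, fuse or refetch it.
             */
            return READ_ONCE(shared_state);
    }

    int reader_benign(void)
    {
            /*
             * data_race() only silences KCSAN ("this race is benign"); it
             * gives no guarantee about the load itself, so it suits
             * diagnostics, not values that control flow depends on.
             */
            return data_race(shared_state);
    }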
>
> It wants to be READ/WRITE_ONCE(). Something like the untested below,
> which applies to -next or the master branch of the tip tree.
>
> Thanks,
>
>         tglx
> ---
>
> --- a/kernel/time/tick-common.c
> +++ b/kernel/time/tick-common.c
> @@ -7,6 +7,7 @@
>   * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
>   * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
>   */
> +#include <linux/compiler.h>
>  #include <linux/cpu.h>
>  #include <linux/err.h>
>  #include <linux/hrtimer.h>
> @@ -84,7 +85,7 @@ int tick_is_oneshot_available(void)
>   */
>  static void tick_periodic(int cpu)
>  {
> -     if (tick_do_timer_cpu == cpu) {
> +     if (READ_ONCE(tick_do_timer_cpu) == cpu) {
>               raw_spin_lock(&jiffies_lock);
>               write_seqcount_begin(&jiffies_seq);
>
> @@ -215,8 +216,8 @@ static void tick_setup_device(struct tic
>                * If no cpu took the do_timer update, assign it to
>                * this cpu:
>                */
> -             if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
> -                     tick_do_timer_cpu = cpu;
> +             if (READ_ONCE(tick_do_timer_cpu) == TICK_DO_TIMER_BOOT) {
> +                     WRITE_ONCE(tick_do_timer_cpu, cpu);
>                       tick_next_period = ktime_get();
>  #ifdef CONFIG_NO_HZ_FULL
>                       /*
> @@ -232,7 +233,7 @@ static void tick_setup_device(struct tic
>                                               !tick_nohz_full_cpu(cpu)) {
>                       tick_take_do_timer_from_boot();
>                       tick_do_timer_boot_cpu = -1;
> -                     WARN_ON(tick_do_timer_cpu != cpu);
> +                     WARN_ON(READ_ONCE(tick_do_timer_cpu) != cpu);
>  #endif
>               }
>
> @@ -406,10 +407,10 @@ void tick_assert_timekeeping_handover(vo
>  int tick_cpu_dying(unsigned int dying_cpu)
>  {
>       /*
> -      * If the current CPU is the timekeeper, it's the only one that
> -      * can safely hand over its duty. Also all online CPUs are in
> -      * stop machine, guaranteed not to be idle, therefore it's safe
> -      * to pick any online successor.
> +      * If the current CPU is the timekeeper, it's the only one that can
> +      * safely hand over its duty. Also all online CPUs are in stop
> +      * machine, guaranteed not to be idle, therefore there is no
> +      * concurrency and it's safe to pick any online successor.
>        */
>       if (tick_do_timer_cpu == dying_cpu)
>               tick_do_timer_cpu = cpumask_first(cpu_online_mask);
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -8,6 +8,7 @@
>   *
>   *  Started by: Thomas Gleixner and Ingo Molnar
>   */
> +#include <linux/compiler.h>
>  #include <linux/cpu.h>
>  #include <linux/err.h>
>  #include <linux/hrtimer.h>
> @@ -204,7 +205,7 @@ static inline void tick_sched_flag_clear
>
>  static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
>  {
> -     int cpu = smp_processor_id();
> +     int tick_cpu, cpu = smp_processor_id();
>
>       /*
>        * Check if the do_timer duty was dropped. We don't care about
> @@ -216,16 +217,18 @@ static void tick_sched_do_timer(struct t
>        * If nohz_full is enabled, this should not happen because the
>        * 'tick_do_timer_cpu' CPU never relinquishes.
>        */
> -     if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
> -         unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
> +     tick_cpu = READ_ONCE(tick_do_timer_cpu);
> +
> +     if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
>  #ifdef CONFIG_NO_HZ_FULL
>               WARN_ON_ONCE(tick_nohz_full_running);
>  #endif
> -             tick_do_timer_cpu = cpu;
> +             WRITE_ONCE(tick_do_timer_cpu, cpu);
> +             tick_cpu = cpu;
>       }
>
>       /* Check if jiffies need an update */
> -     if (tick_do_timer_cpu == cpu)
> +     if (tick_cpu == cpu)
>               tick_do_update_jiffies64(now);
>
>       /*
> @@ -610,7 +613,7 @@ bool tick_nohz_cpu_hotpluggable(unsigned
>        * timers, workqueues, timekeeping, ...) on behalf of full dynticks
>        * CPUs. It must remain online when nohz full is enabled.
>        */
> -     if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
> +     if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu)
>               return false;
>       return true;
>  }
> @@ -890,6 +893,7 @@ static ktime_t tick_nohz_next_event(stru
>  {
>       u64 basemono, next_tick, delta, expires;
>       unsigned long basejiff;
> +     int tick_cpu;
>
>       basemono = get_jiffies_update(&basejiff);
>       ts->last_jiffies = basejiff;
> @@ -946,9 +950,9 @@ static ktime_t tick_nohz_next_event(stru
>        * Otherwise we can sleep as long as we want.
>        */
>       delta = timekeeping_max_deferment();
> -     if (cpu != tick_do_timer_cpu &&
> -         (tick_do_timer_cpu != TICK_DO_TIMER_NONE ||
> -          !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
> +     tick_cpu = READ_ONCE(tick_do_timer_cpu);
> +     if (tick_cpu != cpu &&
> +         (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
>               delta = KTIME_MAX;
>
>       /* Calculate the next expiry time */
> @@ -969,6 +973,7 @@ static void tick_nohz_stop_tick(struct t
>       unsigned long basejiff = ts->last_jiffies;
>       u64 basemono = ts->timer_expires_base;
>       bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
> +     int tick_cpu;
>       u64 expires;
>
>       /* Make sure we won't be trying to stop it twice in a row. */
> @@ -1006,10 +1011,11 @@ static void tick_nohz_stop_tick(struct t
>        * do_timer() never gets invoked. Keep track of the fact that it
>        * was the one which had the do_timer() duty last.
>        */
> -     if (cpu == tick_do_timer_cpu) {
> -             tick_do_timer_cpu = TICK_DO_TIMER_NONE;
> +     tick_cpu = READ_ONCE(tick_do_timer_cpu);
> +     if (tick_cpu == cpu) {
> +             WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
>               tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
> -     } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
> +     } else if (tick_cpu != TICK_DO_TIMER_NONE) {
>               tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
>       }
>
> @@ -1172,15 +1178,17 @@ static bool can_stop_idle_tick(int cpu,
>               return false;
>
>       if (tick_nohz_full_enabled()) {
> +             int tick_cpu = READ_ONCE(tick_do_timer_cpu);
> +
>               /*
>                * Keep the tick alive to guarantee timekeeping progression
>                * if there are full dynticks CPUs around
>                */
> -             if (tick_do_timer_cpu == cpu)
> +             if (tick_cpu == cpu)
>                       return false;
>
>               /* Should not happen for nohz-full */
> -             if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
> +             if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
>                       return false;
>       }

This fixes the issue for me (tested on 6.6; tweaked a bit for conflicts).
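
For anyone skimming the diff, the common shape of the tick-sched.c hunks
is: snapshot the shared variable once with READ_ONCE(), then use the
local copy for every later comparison, so there is a single marked load
per invocation and the compiler cannot reload the variable in between.
Condensed from the tick_nohz_stop_tick() hunk above (paraphrased, not
the literal diff):

    int tick_cpu = READ_ONCE(tick_do_timer_cpu);

    if (tick_cpu == cpu) {
            /*
             * Hand back the duty; the marked store pairs with the marked
             * loads done by the other CPUs.
             */
            WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
            tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
    } else if (tick_cpu != TICK_DO_TIMER_NONE) {
            tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
    }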

--Sean


