linux-kernel - Re: [PATCH v3] sched/clock: Avoid false sharing for sched_clock

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtAhx42QjRWEMULsz3M3AX=fdhhBm6PddcvxkcuuUjZjQA@mail.gmail.com>
Date: Fri, 16 Jan 2026 11:22:45 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: K Prateek Nayak <kprateek.nayak@....com>
Cc: Wangyang Guo <wangyang.guo@...el.com>, Peter Zijlstra <peterz@...radead.org>, 
	Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>, 
	Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, 
	Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, 
	Valentin Schneider <vschneid@...hat.com>, linux-kernel@...r.kernel.org, 
	Shrikanth Hegde <sshegde@...ux.ibm.com>, Benjamin Lei <benjamin.lei@...el.com>, 
	Tim Chen <tim.c.chen@...ux.intel.com>, Tianyou Li <tianyou.li@...el.com>
Subject: Re: [PATCH v3] sched/clock: Avoid false sharing for sched_clock_irqtime

On Fri, 16 Jan 2026 at 10:43, K Prateek Nayak <kprateek.nayak@....com> wrote:
>
> Hello Wangyang,
>
> On 1/16/2026 8:09 AM, Wangyang Guo wrote:
> >   */
> >  DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
> >
> > -int sched_clock_irqtime;
> > -
> >  void enable_sched_clock_irqtime(void)
> >  {
> > -     sched_clock_irqtime = 1;
> > +     static_branch_enable(&sched_clock_irqtime);
> >  }
> >
> > +static void __disable_sched_clock_irqtime(struct work_struct *work)
> > +{
> > +     static_branch_disable(&sched_clock_irqtime);
> > +}
> > +
> > +static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
> > +
> >  void disable_sched_clock_irqtime(void)
> >  {
> > -     sched_clock_irqtime = 0;
> > +     /* disable_sched_clock_irqtime can be called in atomic
> > +      * context with mark_tsc_unstable(), use wq to avoid
> > +      * "sleeping in atomic context" warning.
> > +      */
> > +     if (irqtime_enabled())
> > +             schedule_work(&sched_clock_irqtime_work);
> >  }
>
> Your approach looks good to avoid the scheduling while atomic issue.
> Just a small observation: The only user of disable_sched_clock_irqtime()
> is tsc_.*mark_unstable() which calls clear_sched_clock_stable() just
> before doing disable_sched_clock_irqtime().
>
> It makes me wonder if we can just reuse "sched_clock_work" to also
> disable sched_clock_irqtime()?
>
> Peter, Vincent, do we need to do enable_sched_clock_irqtime() that early
> when we detect TSC freq / sched_clock_register() or can we wait until we
> do __set_sched_clock_stable()?

By default we don't need a workqueue to disable sched clock irq time;
only the tsc clock needs it, just like when it disables
sched_clock_stable.

So the enablement during init should remain the same. Why would all
sched clocks delay their irq time accounting just for tsc?

Furthermore, __set_sched_clock_stable() is under
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

I think that disabling irq time accounting in __sched_clock_work(),
if it was enabled, should be good.

>
> If we can wait until we mark sched_clock() as stable, we can consolidate
> enabling / disabling of irqtime with that of __sched_clock_stable.
> Something like the following on top of Wangyang's patch:
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 7d3e13e14eab..8bb9c0baa93d 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
>         tsc_unstable = 1;
>         if (using_native_sched_clock())
>                 clear_sched_clock_stable();
> -       disable_sched_clock_irqtime();
>         pr_info("Marking TSC unstable due to clocksource watchdog\n");
>  }
>
> @@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason)
>         tsc_unstable = 1;
>         if (using_native_sched_clock())
>                 clear_sched_clock_stable();
> -       disable_sched_clock_irqtime();
>         pr_info("Marking TSC unstable due to %s\n", reason);
>
>         clocksource_mark_unstable(&clocksource_tsc_early);
> @@ -1234,6 +1232,22 @@ bool tsc_clocksource_watchdog_disabled(void)
>                tsc_as_watchdog && !no_tsc_watchdog;
>  }
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +/*
> + * Allow IRQ time accounting if the user hasn't
> + * disabled it and TSC is found to be stable at
> + * the time of late_initcall().
> + *
> + * If the TSC is detected to be unstable later,
> + * the IRQ time accounting will be disabled from
> + * clear_sched_clock_stable().
> + */
> +bool sched_clock_supports_irqtime_acct(void)
> +{
> +       return !no_sched_irq_time && !tsc_unstable;
> +}
> +#endif
> +
>  static void __init check_system_tsc_reliable(void)
>  {
>  #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
> @@ -1551,9 +1565,6 @@ void __init tsc_init(void)
>
>         cyc2ns_init_secondary_cpus();
>
> -       if (!no_sched_irq_time)
> -               enable_sched_clock_irqtime();
> -
>         lpj_fine = get_loops_per_jiffy();
>
>         check_system_tsc_reliable();
> diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
> index 196f0ca351a2..e9b8c88fada5 100644
> --- a/include/linux/sched/clock.h
> +++ b/include/linux/sched/clock.h
> @@ -104,11 +104,7 @@ extern u64 local_clock(void);
>   * The reason for this explicit opt-in is not to have perf penalty with
>   * slow sched_clocks.
>   */
> -extern void enable_sched_clock_irqtime(void);
> -extern void disable_sched_clock_irqtime(void);
> -#else
> -static inline void enable_sched_clock_irqtime(void) {}
> -static inline void disable_sched_clock_irqtime(void) {}
> +bool sched_clock_supports_irqtime_acct(void);
>  #endif
>
>  #endif /* _LINUX_SCHED_CLOCK_H */
> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
> index f5e6dd6a6b3a..4d43eef8c326 100644
> --- a/kernel/sched/clock.c
> +++ b/kernel/sched/clock.c
> @@ -137,6 +137,8 @@ notrace static void __set_sched_clock_stable(void)
>                         scd->tick_gtod, __gtod_offset,
>                         scd->tick_raw,  __sched_clock_offset);
>
> +       if (sched_clock_supports_irqtime_acct())
> +               enable_sched_clock_irqtime();
>         static_branch_enable(&__sched_clock_stable);
>         tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
>  }
> @@ -173,6 +175,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
>                         scd->tick_gtod, __gtod_offset,
>                         scd->tick_raw,  __sched_clock_offset);
>
> +       disable_sched_clock_irqtime();
>         static_branch_disable(&__sched_clock_stable);
>  }
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index a5a8bd0a5ede..83dd9f299ee4 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -32,21 +32,9 @@ void enable_sched_clock_irqtime(void)
>         static_branch_enable(&sched_clock_irqtime);
>  }
>
> -static void __disable_sched_clock_irqtime(struct work_struct *work)
> -{
> -       static_branch_disable(&sched_clock_irqtime);
> -}
> -
> -static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
> -
>  void disable_sched_clock_irqtime(void)
>  {
> -       /* disable_sched_clock_irqtime can be called in atomic
> -        * context with mark_tsc_unstable(), use wq to avoid
> -        * "sleeping in atomic context" warning.
> -        */
> -       if (irqtime_enabled())
> -               schedule_work(&sched_clock_irqtime_work);
> +       static_branch_disable(&sched_clock_irqtime);
>  }
>
>  static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 164ebf47e5fd..3bae8baf7c00 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3333,6 +3333,8 @@ static inline u64 irq_time_read(int cpu)
>         return total;
>  }
>
> +void enable_sched_clock_irqtime(void);
> +void disable_sched_clock_irqtime(void);
>  #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
>
>  static inline int irqtime_enabled(void)
> @@ -3340,6 +3342,13 @@ static inline int irqtime_enabled(void)
>         return 0;
>  }
>
> +static inline bool sched_clock_supports_irqtime_acct(void)
> +{
> +       return false;
> +}
> +
> +static inline void enable_sched_clock_irqtime(void) {}
> +static inline void disable_sched_clock_irqtime(void) {}
>  #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
>
>  #ifdef CONFIG_CPU_FREQ
> diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> index f39111830ca3..1cdd10026279 100644
> --- a/kernel/time/sched_clock.c
> +++ b/kernel/time/sched_clock.c
> @@ -174,6 +174,18 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
>         return HRTIMER_RESTART;
>  }
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +/*
> + * Enable IRQ time accounting if we have a fast enough sched_clock().
> + * This is checked as a part of late_initcall() once all the clock
> + * devices have registered themselves.
> + */
> +bool sched_clock_supports_irqtime_acct(void)
> +{
> +       return irqtime > 0 || (irqtime == -1 && cd.rate >= 1000000);
> +}
> +#endif
> +
>  void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
>  {
>         u64 res, wrap, new_mask, new_epoch, cyc, ns;
> @@ -238,10 +250,6 @@ void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
>         pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
>                 bits, r, r_unit, res, wrap);
>
> -       /* Enable IRQ time accounting if we have a fast enough sched_clock() */
> -       if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
> -               enable_sched_clock_irqtime();
> -
>         local_irq_restore(flags);
>
>         pr_debug("Registered %pS as sched_clock source\n", read);
> ---
>
> I don't know if it is any better (or even correct) but it does reduce
> the scope of {enable,disable}_sched_clock_irqtime() to kernel/sched/.
> Thoughts?
>
> --
> Thanks and Regards,
> Prateek
>