[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtAhx42QjRWEMULsz3M3AX=fdhhBm6PddcvxkcuuUjZjQA@mail.gmail.com>
Date: Fri, 16 Jan 2026 11:22:45 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: K Prateek Nayak <kprateek.nayak@....com>
Cc: Wangyang Guo <wangyang.guo@...el.com>, Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>, linux-kernel@...r.kernel.org,
Shrikanth Hegde <sshegde@...ux.ibm.com>, Benjamin Lei <benjamin.lei@...el.com>,
Tim Chen <tim.c.chen@...ux.intel.com>, Tianyou Li <tianyou.li@...el.com>
Subject: Re: [PATCH v3] sched/clock: Avoid false sharing for sched_clock_irqtime
On Fri, 16 Jan 2026 at 10:43, K Prateek Nayak <kprateek.nayak@....com> wrote:
>
> Hello Wangyang,
>
> On 1/16/2026 8:09 AM, Wangyang Guo wrote:
> > */
> > DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
> >
> > -int sched_clock_irqtime;
> > -
> > void enable_sched_clock_irqtime(void)
> > {
> > - sched_clock_irqtime = 1;
> > + static_branch_enable(&sched_clock_irqtime);
> > }
> >
> > +static void __disable_sched_clock_irqtime(struct work_struct *work)
> > +{
> > + static_branch_disable(&sched_clock_irqtime);
> > +}
> > +
> > +static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
> > +
> > void disable_sched_clock_irqtime(void)
> > {
> > - sched_clock_irqtime = 0;
> > + /* disable_sched_clock_irqtime can be called in atomic
> > + * context with mark_tsc_unstable(), use wq to avoid
> > + * "sleeping in atomic context" warning.
> > + */
> > + if (irqtime_enabled())
> > + schedule_work(&sched_clock_irqtime_work);
> > }
>
> Your approach looks good to avoid the scheduling while atomic issue.
> Just a small observation: The only user of disable_sched_clock_irqtime()
> is tsc_.*mark_unstable() which calls clear_sched_clock_stable() just
> before doing disable_sched_clock_irqtime().
>
> It makes me wonder if we can just reuse "sched_clock_work" to also
> disable sched_clock_irqtime()?
>
> Peter, Vincent, do we need to do enable_sched_clock_irqtime() that early
> when we detect TSC freq / sched_clock_register() or can we wait until we
> do __set_sched_clock_stable()?
By default we don't need a workqueue to disable sched clock irq time;
only the tsc clock needs it, just as it does when it disables
sched_clock_stable.
So the enablement during init should remain the same. Why would all
sched clocks delay their irq time accounting just for tsc?
Furthermore, __set_sched_clock_stable() is under
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK.
I think that disabling irq time accounting, if it was enabled, in
__sched_clock_work() should be good.
>
> If we can wait until we mark sched_clock() as stable, we can consolidate
> enabling / disabling of irqtime with that of __sched_clock_stable.
> Something like the following on top of Wangyang's patch:
>
> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
> index 7d3e13e14eab..8bb9c0baa93d 100644
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
> tsc_unstable = 1;
> if (using_native_sched_clock())
> clear_sched_clock_stable();
> - disable_sched_clock_irqtime();
> pr_info("Marking TSC unstable due to clocksource watchdog\n");
> }
>
> @@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason)
> tsc_unstable = 1;
> if (using_native_sched_clock())
> clear_sched_clock_stable();
> - disable_sched_clock_irqtime();
> pr_info("Marking TSC unstable due to %s\n", reason);
>
> clocksource_mark_unstable(&clocksource_tsc_early);
> @@ -1234,6 +1232,22 @@ bool tsc_clocksource_watchdog_disabled(void)
> tsc_as_watchdog && !no_tsc_watchdog;
> }
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +/*
> + * Allow IRQ time accounting if the user hasn't
> + * disabled it and TSC is found to be stable at
> + * the time of late_initcall().
> + *
> + * If the TSC is detected to be unstable later,
> + * the IRQ time accounting will be disabled from
> + * clear_sched_clock_stable().
> + */
> +bool sched_clock_supports_irqtime_acct(void)
> +{
> + return !no_sched_irq_time && !tsc_unstable;
> +}
> +#endif
> +
> static void __init check_system_tsc_reliable(void)
> {
> #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
> @@ -1551,9 +1565,6 @@ void __init tsc_init(void)
>
> cyc2ns_init_secondary_cpus();
>
> - if (!no_sched_irq_time)
> - enable_sched_clock_irqtime();
> -
> lpj_fine = get_loops_per_jiffy();
>
> check_system_tsc_reliable();
> diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
> index 196f0ca351a2..e9b8c88fada5 100644
> --- a/include/linux/sched/clock.h
> +++ b/include/linux/sched/clock.h
> @@ -104,11 +104,7 @@ extern u64 local_clock(void);
> * The reason for this explicit opt-in is not to have perf penalty with
> * slow sched_clocks.
> */
> -extern void enable_sched_clock_irqtime(void);
> -extern void disable_sched_clock_irqtime(void);
> -#else
> -static inline void enable_sched_clock_irqtime(void) {}
> -static inline void disable_sched_clock_irqtime(void) {}
> +bool sched_clock_supports_irqtime_acct(void);
> #endif
>
> #endif /* _LINUX_SCHED_CLOCK_H */
> diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
> index f5e6dd6a6b3a..4d43eef8c326 100644
> --- a/kernel/sched/clock.c
> +++ b/kernel/sched/clock.c
> @@ -137,6 +137,8 @@ notrace static void __set_sched_clock_stable(void)
> scd->tick_gtod, __gtod_offset,
> scd->tick_raw, __sched_clock_offset);
>
> + if (sched_clock_supports_irqtime_acct())
> + enable_sched_clock_irqtime();
> static_branch_enable(&__sched_clock_stable);
> tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
> }
> @@ -173,6 +175,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
> scd->tick_gtod, __gtod_offset,
> scd->tick_raw, __sched_clock_offset);
>
> + disable_sched_clock_irqtime();
> static_branch_disable(&__sched_clock_stable);
> }
>
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index a5a8bd0a5ede..83dd9f299ee4 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -32,21 +32,9 @@ void enable_sched_clock_irqtime(void)
> static_branch_enable(&sched_clock_irqtime);
> }
>
> -static void __disable_sched_clock_irqtime(struct work_struct *work)
> -{
> - static_branch_disable(&sched_clock_irqtime);
> -}
> -
> -static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
> -
> void disable_sched_clock_irqtime(void)
> {
> - /* disable_sched_clock_irqtime can be called in atomic
> - * context with mark_tsc_unstable(), use wq to avoid
> - * "sleeping in atomic context" warning.
> - */
> - if (irqtime_enabled())
> - schedule_work(&sched_clock_irqtime_work);
> + static_branch_disable(&sched_clock_irqtime);
> }
>
> static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 164ebf47e5fd..3bae8baf7c00 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3333,6 +3333,8 @@ static inline u64 irq_time_read(int cpu)
> return total;
> }
>
> +void enable_sched_clock_irqtime(void);
> +void disable_sched_clock_irqtime(void);
> #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
>
> static inline int irqtime_enabled(void)
> @@ -3340,6 +3342,13 @@ static inline int irqtime_enabled(void)
> return 0;
> }
>
> +static inline bool sched_clock_supports_irqtime_acct(void)
> +{
> + return false;
> +}
> +
> +static inline void enable_sched_clock_irqtime(void) {}
> +static inline void disable_sched_clock_irqtime(void) {}
> #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
>
> #ifdef CONFIG_CPU_FREQ
> diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
> index f39111830ca3..1cdd10026279 100644
> --- a/kernel/time/sched_clock.c
> +++ b/kernel/time/sched_clock.c
> @@ -174,6 +174,18 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
> return HRTIMER_RESTART;
> }
>
> +#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> +/*
> + * Enable IRQ time accounting if we have a fast enough sched_clock().
> + * This is checked as a part of late_initcall() once all the clock
> + * devices have registered themselves.
> + */
> +bool sched_clock_supports_irqtime_acct(void)
> +{
> + return irqtime > 0 || (irqtime == -1 && cd.rate >= 1000000);
> +}
> +#endif
> +
> void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
> {
> u64 res, wrap, new_mask, new_epoch, cyc, ns;
> @@ -238,10 +250,6 @@ void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
> pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
> bits, r, r_unit, res, wrap);
>
> - /* Enable IRQ time accounting if we have a fast enough sched_clock() */
> - if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
> - enable_sched_clock_irqtime();
> -
> local_irq_restore(flags);
>
> pr_debug("Registered %pS as sched_clock source\n", read);
> ---
>
> I don't know if it is any better (or even correct) but it does reduce
> the scope of {enable,disable}_sched_clock_irqtime() to kernel/sched/.
> Thoughts?
>
> --
> Thanks and Regards,
> Prateek
>
Powered by blists - more mailing lists