linux-kernel - Re: [PATCH v3] sched/clock: Avoid false sharing for sched_clock

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <95f2091c-273e-4e36-b543-735748efd5fc@amd.com>
Date: Fri, 16 Jan 2026 15:13:24 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Wangyang Guo <wangyang.guo@...el.com>, Peter Zijlstra
	<peterz@...radead.org>, Vincent Guittot <vincent.guittot@...aro.org>
CC: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
	<linux-kernel@...r.kernel.org>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
	Benjamin Lei <benjamin.lei@...el.com>, Tim Chen <tim.c.chen@...ux.intel.com>,
	Tianyou Li <tianyou.li@...el.com>
Subject: Re: [PATCH v3] sched/clock: Avoid false sharing for
 sched_clock_irqtime

Hello Wangyang,

On 1/16/2026 8:09 AM, Wangyang Guo wrote:
>   */
>  DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
>  
> -int sched_clock_irqtime;
> -
>  void enable_sched_clock_irqtime(void)
>  {
> -	sched_clock_irqtime = 1;
> +	static_branch_enable(&sched_clock_irqtime);
>  }
>  
> +static void __disable_sched_clock_irqtime(struct work_struct *work)
> +{
> +	static_branch_disable(&sched_clock_irqtime);
> +}
> +
> +static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
> +
>  void disable_sched_clock_irqtime(void)
>  {
> -	sched_clock_irqtime = 0;
> +	/* disable_sched_clock_irqtime can be called in atomic
> +	 * context with mark_tsc_unstable(), use wq to avoid
> +	 * "sleeping in atomic context" warning.
> +	 */
> +	if (irqtime_enabled())
> +		schedule_work(&sched_clock_irqtime_work);
>  }

Your approach looks good to avoid the scheduling while atomic issue.
Just a small observation: The only user of disable_sched_clock_irqtime()
is tsc_.*mark_unstable() which calls clear_sched_clock_stable() just
before doing disable_sched_clock_irqtime().

It makes me wonder if we can just reuse "sched_clock_work" to also
disable sched_clock_irqtime()?

Peter, Vincent, do we need to do enable_sched_clock_irqtime() that early
when we detect TSC freq / sched_clock_register() or can we wait until we
do __set_sched_clock_stable()?

If we can wait until we mark sched_clock() as stable, we can consolidate
enabling / disabling of irqtime with that of __sched_clock_stable.
Something like the following on top of Wangyang's patch:

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7d3e13e14eab..8bb9c0baa93d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
 	tsc_unstable = 1;
 	if (using_native_sched_clock())
 		clear_sched_clock_stable();
-	disable_sched_clock_irqtime();
 	pr_info("Marking TSC unstable due to clocksource watchdog\n");
 }
 
@@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason)
 	tsc_unstable = 1;
 	if (using_native_sched_clock())
 		clear_sched_clock_stable();
-	disable_sched_clock_irqtime();
 	pr_info("Marking TSC unstable due to %s\n", reason);
 
 	clocksource_mark_unstable(&clocksource_tsc_early);
@@ -1234,6 +1232,22 @@ bool tsc_clocksource_watchdog_disabled(void)
 	       tsc_as_watchdog && !no_tsc_watchdog;
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Allow IRQ time accounting if the user hasn't
+ * disabled it and TSC is found to be stable at
+ * the time of late_initcall().
+ *
+ * If the TSC is detected to be unstable later,
+ * the IRQ time accounting will be disabled from
+ * clear_sched_clock_stable().
+ */
+bool sched_clock_supports_irqtime_acct(void)
+{
+	return !no_sched_irq_time && !tsc_unstable;
+}
+#endif
+
 static void __init check_system_tsc_reliable(void)
 {
 #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1551,9 +1565,6 @@ void __init tsc_init(void)
 
 	cyc2ns_init_secondary_cpus();
 
-	if (!no_sched_irq_time)
-		enable_sched_clock_irqtime();
-
 	lpj_fine = get_loops_per_jiffy();
 
 	check_system_tsc_reliable();
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 196f0ca351a2..e9b8c88fada5 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -104,11 +104,7 @@ extern u64 local_clock(void);
  * The reason for this explicit opt-in is not to have perf penalty with
  * slow sched_clocks.
  */
-extern void enable_sched_clock_irqtime(void);
-extern void disable_sched_clock_irqtime(void);
-#else
-static inline void enable_sched_clock_irqtime(void) {}
-static inline void disable_sched_clock_irqtime(void) {}
+bool sched_clock_supports_irqtime_acct(void);
 #endif
 
 #endif /* _LINUX_SCHED_CLOCK_H */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6a6b3a..4d43eef8c326 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -137,6 +137,8 @@ notrace static void __set_sched_clock_stable(void)
 			scd->tick_gtod, __gtod_offset,
 			scd->tick_raw,  __sched_clock_offset);
 
+	if (sched_clock_supports_irqtime_acct())
+		enable_sched_clock_irqtime();
 	static_branch_enable(&__sched_clock_stable);
 	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
@@ -173,6 +175,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
 			scd->tick_gtod, __gtod_offset,
 			scd->tick_raw,  __sched_clock_offset);
 
+	disable_sched_clock_irqtime();
 	static_branch_disable(&__sched_clock_stable);
 }
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a5a8bd0a5ede..83dd9f299ee4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -32,21 +32,9 @@ void enable_sched_clock_irqtime(void)
 	static_branch_enable(&sched_clock_irqtime);
 }
 
-static void __disable_sched_clock_irqtime(struct work_struct *work)
-{
-	static_branch_disable(&sched_clock_irqtime);
-}
-
-static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
-
 void disable_sched_clock_irqtime(void)
 {
-	/* disable_sched_clock_irqtime can be called in atomic
-	 * context with mark_tsc_unstable(), use wq to avoid
-	 * "sleeping in atomic context" warning.
-	 */
-	if (irqtime_enabled())
-		schedule_work(&sched_clock_irqtime_work);
+	static_branch_disable(&sched_clock_irqtime);
 }
 
 static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 164ebf47e5fd..3bae8baf7c00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3333,6 +3333,8 @@ static inline u64 irq_time_read(int cpu)
 	return total;
 }
 
+void enable_sched_clock_irqtime(void);
+void disable_sched_clock_irqtime(void);
 #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
 
 static inline int irqtime_enabled(void)
@@ -3340,6 +3342,13 @@ static inline int irqtime_enabled(void)
 	return 0;
 }
 
+static inline bool sched_clock_supports_irqtime_acct(void)
+{
+	return false;
+}
+
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 
 #ifdef CONFIG_CPU_FREQ
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index f39111830ca3..1cdd10026279 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -174,6 +174,18 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 	return HRTIMER_RESTART;
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Enable IRQ time accounting if we have a fast enough sched_clock().
+ * This is checked as a part of late_initcall() once all the clock
+ * devices have registered themselves.
+ */
+bool sched_clock_supports_irqtime_acct(void)
+{
+	return irqtime > 0 || (irqtime == -1 && cd.rate >= 1000000);
+}
+#endif
+
 void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 {
 	u64 res, wrap, new_mask, new_epoch, cyc, ns;
@@ -238,10 +250,6 @@ void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
 		bits, r, r_unit, res, wrap);
 
-	/* Enable IRQ time accounting if we have a fast enough sched_clock() */
-	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
-		enable_sched_clock_irqtime();
-
 	local_irq_restore(flags);
 
 	pr_debug("Registered %pS as sched_clock source\n", read);
---

I don't know if it is any better (or even correct) but it does reduce
the scope of {enable,disable}_sched_clock_irqtime() to kernel/sched/.
Thoughts?

-- 
Thanks and Regards,
Prateek