[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <95f2091c-273e-4e36-b543-735748efd5fc@amd.com>
Date: Fri, 16 Jan 2026 15:13:24 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Wangyang Guo <wangyang.guo@...el.com>, Peter Zijlstra
<peterz@...radead.org>, Vincent Guittot <vincent.guittot@...aro.org>
CC: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
<linux-kernel@...r.kernel.org>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
Benjamin Lei <benjamin.lei@...el.com>, Tim Chen <tim.c.chen@...ux.intel.com>,
Tianyou Li <tianyou.li@...el.com>
Subject: Re: [PATCH v3] sched/clock: Avoid false sharing for
sched_clock_irqtime
Hello Wangyang,
On 1/16/2026 8:09 AM, Wangyang Guo wrote:
> */
> DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
>
> -int sched_clock_irqtime;
> -
> void enable_sched_clock_irqtime(void)
> {
> - sched_clock_irqtime = 1;
> + static_branch_enable(&sched_clock_irqtime);
> }
>
> +static void __disable_sched_clock_irqtime(struct work_struct *work)
> +{
> + static_branch_disable(&sched_clock_irqtime);
> +}
> +
> +static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
> +
> void disable_sched_clock_irqtime(void)
> {
> - sched_clock_irqtime = 0;
> + /* disable_sched_clock_irqtime can be called in atomic
> + * context with mark_tsc_unstable(), use wq to avoid
> + * "sleeping in atomic context" warning.
> + */
> + if (irqtime_enabled())
> + schedule_work(&sched_clock_irqtime_work);
> }
Your approach looks good to avoid the scheduling while atomic issue.
Just a small observation: The only user of disable_sched_clock_irqtime()
is tsc_.*mark_unstable() which calls clear_sched_clock_stable() just
before doing disable_sched_clock_irqtime().
It makes me wonder if we can just reuse "sched_clock_work" to also
disable sched_clock_irqtime()?
Peter, Vincent, do we need to do enable_sched_clock_irqtime() that early
when we detect TSC freq / sched_clock_register() or can we wait until we
do __set_sched_clock_stable()?
If we can wait until we mark sched_clock() as stable, we can consolidate
enabling / disabling of irqtime with that of __sched_clock_stable.
Something like the following on top of Wangyang's patch:
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7d3e13e14eab..8bb9c0baa93d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1143,7 +1143,6 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
@@ -1213,7 +1212,6 @@ void mark_tsc_unstable(char *reason)
tsc_unstable = 1;
if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
clocksource_mark_unstable(&clocksource_tsc_early);
@@ -1234,6 +1232,22 @@ bool tsc_clocksource_watchdog_disabled(void)
tsc_as_watchdog && !no_tsc_watchdog;
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Allow IRQ time accounting if the user hasn't
+ * disabled it and TSC is found to be stable at
+ * the time of late_initcall().
+ *
+ * If the TSC is detected to be unstable later,
+ * the IRQ time accounting will be disabled from
+ * clear_sched_clock_stable().
+ */
+bool sched_clock_supports_irqtime_acct(void)
+{
+ return !no_sched_irq_time && !tsc_unstable;
+}
+#endif
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1551,9 +1565,6 @@ void __init tsc_init(void)
cyc2ns_init_secondary_cpus();
- if (!no_sched_irq_time)
- enable_sched_clock_irqtime();
-
lpj_fine = get_loops_per_jiffy();
check_system_tsc_reliable();
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 196f0ca351a2..e9b8c88fada5 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -104,11 +104,7 @@ extern u64 local_clock(void);
* The reason for this explicit opt-in is not to have perf penalty with
* slow sched_clocks.
*/
-extern void enable_sched_clock_irqtime(void);
-extern void disable_sched_clock_irqtime(void);
-#else
-static inline void enable_sched_clock_irqtime(void) {}
-static inline void disable_sched_clock_irqtime(void) {}
+bool sched_clock_supports_irqtime_acct(void);
#endif
#endif /* _LINUX_SCHED_CLOCK_H */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f5e6dd6a6b3a..4d43eef8c326 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -137,6 +137,8 @@ notrace static void __set_sched_clock_stable(void)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ if (sched_clock_supports_irqtime_acct())
+ enable_sched_clock_irqtime();
static_branch_enable(&__sched_clock_stable);
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
@@ -173,6 +175,7 @@ notrace static void __sched_clock_work(struct work_struct *work)
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
+ disable_sched_clock_irqtime();
static_branch_disable(&__sched_clock_stable);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a5a8bd0a5ede..83dd9f299ee4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -32,21 +32,9 @@ void enable_sched_clock_irqtime(void)
static_branch_enable(&sched_clock_irqtime);
}
-static void __disable_sched_clock_irqtime(struct work_struct *work)
-{
- static_branch_disable(&sched_clock_irqtime);
-}
-
-static DECLARE_WORK(sched_clock_irqtime_work, __disable_sched_clock_irqtime);
-
void disable_sched_clock_irqtime(void)
{
- /* disable_sched_clock_irqtime can be called in atomic
- * context with mark_tsc_unstable(), use wq to avoid
- * "sleeping in atomic context" warning.
- */
- if (irqtime_enabled())
- schedule_work(&sched_clock_irqtime_work);
+ static_branch_disable(&sched_clock_irqtime);
}
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 164ebf47e5fd..3bae8baf7c00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3333,6 +3333,8 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+void enable_sched_clock_irqtime(void);
+void disable_sched_clock_irqtime(void);
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline int irqtime_enabled(void)
@@ -3340,6 +3342,13 @@ static inline int irqtime_enabled(void)
return 0;
}
+static inline bool sched_clock_supports_irqtime_acct(void)
+{
+ return false;
+}
+
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index f39111830ca3..1cdd10026279 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -174,6 +174,18 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Enable IRQ time accounting if we have a fast enough sched_clock().
+ * This is checked as a part of late_initcall() once all the clock
+ * devices have registered themselves.
+ */
+bool sched_clock_supports_irqtime_acct(void)
+{
+ return irqtime > 0 || (irqtime == -1 && cd.rate >= 1000000);
+}
+#endif
+
void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
@@ -238,10 +250,6 @@ void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
- /* Enable IRQ time accounting if we have a fast enough sched_clock() */
- if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
- enable_sched_clock_irqtime();
-
local_irq_restore(flags);
pr_debug("Registered %pS as sched_clock source\n", read);
---
I don't know if it is any better (or even correct) but it does reduce
the scope of {enable,disable}_sched_clock_irqtime() to kernel/sched/.
Thoughts?
--
Thanks and Regards,
Prateek
Powered by blists - more mailing lists