[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250918080206.180399724@infradead.org>
Date: Thu, 18 Sep 2025 09:52:26 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: tglx@...utronix.de
Cc: arnd@...db.de,
anna-maria@...utronix.de,
frederic@...nel.org,
peterz@...radead.org,
luto@...nel.org,
mingo@...hat.com,
juri.lelli@...hat.com,
vincent.guittot@...aro.org,
dietmar.eggemann@....com,
rostedt@...dmis.org,
bsegall@...gle.com,
mgorman@...e.de,
vschneid@...hat.com,
linux-kernel@...r.kernel.org,
oliver.sang@...el.com
Subject: [RFC][PATCH 7/8] entry,hrtimer: Push reprogramming timers into the interrupt return path
Currently hrtimer_interrupt() runs expired timers, which can re-arm
themselves, after which it computes the next expiration time and
re-programs the hardware.
However, things like HRTICK, a highres timer driving preemption,
cannot re-arm itself at the point of running, since the next task has
not been determined yet. The schedule() in the interrupt return path
will switch to the next task, which then causes a new hrtimer to be
programmed.
This then results in reprogramming the hardware at least twice, once
after running the timers, and once upon selecting the new task.
Notably, *both* events happen in the interrupt.
By pushing the hrtimer reprogram all the way into the interrupt return
path, it runs after schedule() and this double reprogram can be
avoided.
XXX: 0-day is unhappy with this patch -- it is reporting lockups that
very much look like a timer goes missing. I am unable to reproduce it.
Notable: the lockup goes away when the workloads are run without perf
monitors.
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
include/asm-generic/thread_info_tif.h | 5 ++++-
include/linux/hrtimer.h | 17 +++++++++++++++++
kernel/entry/common.c | 7 +++++++
kernel/sched/core.c | 6 ++++++
kernel/time/hrtimer.c | 28 ++++++++++++++++++++++++----
5 files changed, 58 insertions(+), 5 deletions(-)
--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -41,8 +41,11 @@
#define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING)
#ifdef HAVE_TIF_RESTORE_SIGMASK
-# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */
+# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal()
# define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK)
#endif
+#define TIF_HRTIMER_REARM 11 // re-arm the timer
+#define _TIF_HRTIMER_REARM BIT(TIF_HRTIMER_REARM)
+
#endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -175,10 +175,27 @@ extern void hrtimer_interrupt(struct clo
extern unsigned int hrtimer_resolution;
+#ifdef TIF_HRTIMER_REARM
+extern void _hrtimer_rearm(void);
+/*
+ * This is to be called on all irqentry_exit() paths; as well as in the context
+ * switch path before switch_to().
+ */
+static inline void hrtimer_rearm(void)
+{
+ if (test_thread_flag(TIF_HRTIMER_REARM))
+ _hrtimer_rearm();
+}
+#else
+static inline void hrtimer_rearm(void) { }
+#endif /* TIF_HRTIMER_REARM */
+
#else
#define hrtimer_resolution (unsigned int)LOW_RES_NSEC
+static inline void hrtimer_rearm(void) { }
+
#endif
static inline ktime_t
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -7,6 +7,7 @@
#include <linux/kmsan.h>
#include <linux/livepatch.h>
#include <linux/tick.h>
+#include <linux/hrtimer.h>
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -71,6 +72,7 @@ noinstr void irqentry_exit_to_user_mode(
{
instrumentation_begin();
exit_to_user_mode_prepare(regs);
+ hrtimer_rearm();
instrumentation_end();
exit_to_user_mode();
}
@@ -183,6 +185,7 @@ noinstr void irqentry_exit(struct pt_reg
*/
if (state.exit_rcu) {
instrumentation_begin();
+ hrtimer_rearm();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare();
@@ -196,10 +199,14 @@ noinstr void irqentry_exit(struct pt_reg
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
+ hrtimer_rearm();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
} else {
+ instrumentation_begin();
+ hrtimer_rearm();
+ instrumentation_end();
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5161,6 +5161,12 @@ prepare_task_switch(struct rq *rq, struc
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
+ /*
+ * Notably, this must be called after pick_next_task() but before
+ * switch_to(), since the new task need not be on the return from
+ * interrupt path.
+ */
+ hrtimer_rearm();
prepare_arch_switch(next);
}
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1892,10 +1892,9 @@ static __latent_entropy void hrtimer_run
* Very similar to hrtimer_force_reprogram(), except it deals with
* in_hrirq and hang_detected.
*/
-static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base,
+ ktime_t now, ktime_t expires_next)
{
- ktime_t expires_next = hrtimer_update_next_event(cpu_base);
-
cpu_base->expires_next = expires_next;
cpu_base->in_hrtirq = 0;
@@ -1970,9 +1969,30 @@ void hrtimer_interrupt(struct clock_even
cpu_base->hang_detected = 1;
}
- __hrtimer_rearm(cpu_base, now);
+#ifdef TIF_HRTIMER_REARM
+ set_thread_flag(TIF_HRTIMER_REARM);
+#else
+ __hrtimer_rearm(cpu_base, now, expires_next);
+#endif
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
+
+#ifdef TIF_HRTIMER_REARM
+void _hrtimer_rearm(void)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t now, expires_next;
+
+ lockdep_assert_irqs_disabled();
+
+ scoped_guard (raw_spinlock, &cpu_base->lock) {
+ now = hrtimer_update_base(cpu_base);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ __hrtimer_rearm(cpu_base, now, expires_next);
+ clear_thread_flag(TIF_HRTIMER_REARM);
+ }
+}
+#endif /* TIF_HRTIMER_REARM */
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
Powered by blists - more mailing lists