Message-ID: <20250918080206.180399724@infradead.org>
Date: Thu, 18 Sep 2025 09:52:26 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: tglx@...utronix.de
Cc: arnd@...db.de,
 anna-maria@...utronix.de,
 frederic@...nel.org,
 peterz@...radead.org,
 luto@...nel.org,
 mingo@...hat.com,
 juri.lelli@...hat.com,
 vincent.guittot@...aro.org,
 dietmar.eggemann@....com,
 rostedt@...dmis.org,
 bsegall@...gle.com,
 mgorman@...e.de,
 vschneid@...hat.com,
 linux-kernel@...r.kernel.org,
 oliver.sang@...el.com
Subject: [RFC][PATCH 7/8] entry,hrtimer: Push timer reprogramming into the interrupt return path

Currently, hrtimer_interrupt() runs the expired timers, which can
re-arm themselves; afterwards it computes the next expiration time and
reprograms the hardware.

However, a timer like HRTICK, the highres timer driving preemption,
cannot re-arm itself at the point where it runs, since the next task
has not been determined yet. The schedule() in the interrupt return
path will switch to the next task, which then causes a new hrtimer to
be programmed.

This then results in reprogramming the hardware at least twice, once
after running the timers, and once upon selecting the new task.

Notably, *both* events happen in the interrupt.
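
To illustrate, the current ordering is roughly as follows (a simplified
sketch, not the literal call chain; __hrtimer_run_queues() and
hrtick_start() are the existing helpers doing the work):

  hrtimer_interrupt()
    __hrtimer_run_queues()        // run expired timers
    __hrtimer_rearm()             // reprogram the hardware (#1)
  irqentry_exit()
    schedule()
      pick_next_task()
        hrtick_start()            // arm HRTICK for the new task,
                                  // reprogramming the hardware (#2)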

Pushing the hrtimer reprogram all the way into the interrupt return
path makes it run after schedule(), so the double reprogram can be
avoided.
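
Schematically, the new ordering becomes (same caveats as above; the
hook points are the ones added by this patch):

  hrtimer_interrupt()
    __hrtimer_run_queues()                // run expired timers
    set_thread_flag(TIF_HRTIMER_REARM)    // defer the reprogram
  irqentry_exit()
    schedule()
      pick_next_task()                    // may queue a new HRTICK
      prepare_task_switch()
        hrtimer_rearm()                   // the single reprogram
    hrtimer_rearm()                       // or here, when not
                                          // rescheduling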

XXX: 0-day is unhappy with this patch -- it is reporting lockups that
very much look like a timer going missing. I have been unable to
reproduce them. Notably, the lockup goes away when the workloads are
run without perf monitors.

Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
 include/asm-generic/thread_info_tif.h |    5 ++++-
 include/linux/hrtimer.h               |   17 +++++++++++++++++
 kernel/entry/common.c                 |    7 +++++++
 kernel/sched/core.c                   |    6 ++++++
 kernel/time/hrtimer.c                 |   28 ++++++++++++++++++++++++----
 5 files changed, 58 insertions(+), 5 deletions(-)

--- a/include/asm-generic/thread_info_tif.h
+++ b/include/asm-generic/thread_info_tif.h
@@ -41,8 +41,11 @@
 #define _TIF_PATCH_PENDING	BIT(TIF_PATCH_PENDING)
 
 #ifdef HAVE_TIF_RESTORE_SIGMASK
-# define TIF_RESTORE_SIGMASK	10	// Restore signal mask in do_signal() */
+# define TIF_RESTORE_SIGMASK	10	// Restore signal mask in do_signal()
 # define _TIF_RESTORE_SIGMASK	BIT(TIF_RESTORE_SIGMASK)
 #endif
 
+#define TIF_HRTIMER_REARM	11	// re-arm the timer
+#define _TIF_HRTIMER_REARM	BIT(TIF_HRTIMER_REARM)
+
 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -175,10 +175,27 @@ extern void hrtimer_interrupt(struct clo
 
 extern unsigned int hrtimer_resolution;
 
+#ifdef TIF_HRTIMER_REARM
+extern void _hrtimer_rearm(void);
+/*
+ * This is to be called on all irqentry_exit() paths, as well as in the context
+ * switch path before switch_to().
+ */
+static inline void hrtimer_rearm(void)
+{
+	if (test_thread_flag(TIF_HRTIMER_REARM))
+		_hrtimer_rearm();
+}
+#else
+static inline void hrtimer_rearm(void) { }
+#endif /* TIF_HRTIMER_REARM */
+
 #else
 
 #define hrtimer_resolution	(unsigned int)LOW_RES_NSEC
 
+static inline void hrtimer_rearm(void) { }
+
 #endif
 
 static inline ktime_t
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -7,6 +7,7 @@
 #include <linux/kmsan.h>
 #include <linux/livepatch.h>
 #include <linux/tick.h>
+#include <linux/hrtimer.h>
 
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -71,6 +72,7 @@ noinstr void irqentry_exit_to_user_mode(
 {
 	instrumentation_begin();
 	exit_to_user_mode_prepare(regs);
+	hrtimer_rearm();
 	instrumentation_end();
 	exit_to_user_mode();
 }
@@ -183,6 +185,7 @@ noinstr void irqentry_exit(struct pt_reg
 		 */
 		if (state.exit_rcu) {
 			instrumentation_begin();
+			hrtimer_rearm();
 			/* Tell the tracer that IRET will enable interrupts */
 			trace_hardirqs_on_prepare();
 			lockdep_hardirqs_on_prepare();
@@ -196,10 +199,14 @@ noinstr void irqentry_exit(struct pt_reg
 		if (IS_ENABLED(CONFIG_PREEMPTION))
 			irqentry_exit_cond_resched();
 
+		hrtimer_rearm();
 		/* Covers both tracing and lockdep */
 		trace_hardirqs_on();
 		instrumentation_end();
 	} else {
+		instrumentation_begin();
+		hrtimer_rearm();
+		instrumentation_end();
 		/*
 		 * IRQ flags state is correct already. Just tell RCU if it
 		 * was not watching on entry.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5161,6 +5161,12 @@ prepare_task_switch(struct rq *rq, struc
 	fire_sched_out_preempt_notifiers(prev, next);
 	kmap_local_sched_out();
 	prepare_task(next);
+	/*
+	 * Notably, this must be called after pick_next_task() but before
+	 * switch_to(), since the new task is not necessarily on the
+	 * return-from-interrupt path.
+	 */
+	hrtimer_rearm();
 	prepare_arch_switch(next);
 }
 
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1892,10 +1892,9 @@ static __latent_entropy void hrtimer_run
  * Very similar to hrtimer_force_reprogram(), except it deals with
 * in_hrtirq and hang_detected.
  */
-static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now)
+static void __hrtimer_rearm(struct hrtimer_cpu_base *cpu_base,
+			    ktime_t now, ktime_t expires_next)
 {
-	ktime_t expires_next = hrtimer_update_next_event(cpu_base);
-
 	cpu_base->expires_next = expires_next;
 	cpu_base->in_hrtirq = 0;
 
@@ -1970,9 +1969,30 @@ void hrtimer_interrupt(struct clock_even
 		cpu_base->hang_detected = 1;
 	}
 
-	__hrtimer_rearm(cpu_base, now);
+#ifdef TIF_HRTIMER_REARM
+	set_thread_flag(TIF_HRTIMER_REARM);
+#else
+	__hrtimer_rearm(cpu_base, now, expires_next);
+#endif
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 }
+
+#ifdef TIF_HRTIMER_REARM
+void _hrtimer_rearm(void)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t now, expires_next;
+
+	lockdep_assert_irqs_disabled();
+
+	scoped_guard (raw_spinlock, &cpu_base->lock) {
+		now = hrtimer_update_base(cpu_base);
+		expires_next = hrtimer_update_next_event(cpu_base);
+		__hrtimer_rearm(cpu_base, now, expires_next);
+		clear_thread_flag(TIF_HRTIMER_REARM);
+	}
+}
+#endif /* TIF_HRTIMER_REARM */
 #endif /* !CONFIG_HIGH_RES_TIMERS */
 
 /*


