[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250813162824.420583910@linutronix.de>
Date: Wed, 13 Aug 2025 18:29:37 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: LKML <linux-kernel@...r.kernel.org>
Cc: Michael Jeanson <mjeanson@...icios.com>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Peter Zijlstra <peterz@...radead.org>,
"Paul E. McKenney" <paulmck@...nel.org>,
Boqun Feng <boqun.feng@...il.com>,
Wei Liu <wei.liu@...nel.org>,
Jens Axboe <axboe@...nel.dk>
Subject: [patch 10/11] rseq: Skip fixup when returning from a syscall
The TIF_NOTIFY_RESUME handler of restartable sequences is invoked, like all
other functionality, unconditionally when TIF_NOTIFY_RESUME is set for
whatever reason.
The invocation is already conditional on the rseq_event_pending bit being
set, but there is further room for improvement.
The heavy lifting of the critical section fixup can be completely avoided
when the exit to user mode loop is entered on return from a syscall, unless
it's a debug kernel. So far there was no way for the RSEQ code to
distinguish that case.
On architectures which enable CONFIG_GENERIC_ENTRY, the information is now
available through a function argument to exit_to_user_notify_resume(),
which tells whether the invocation comes from a return from syscall or a
return from interrupt.
Let the RSEQ code utilize this 'from_irq' argument when
- CONFIG_GENERIC_ENTRY is enabled
- CONFIG_DEBUG_RSEQ is disabled
and skip the critical section fixup when the invocation comes from a
syscall return. The update of CPU and node ID has to happen in both cases,
so the out of line call always has to happen when an event is pending,
whether it's a syscall return or not.
This changes the current behaviour, which just blindly fixes up the
critical section unconditionally in the syscall case. But that's a user
space problem when it invokes a syscall from within a critical section and
expects it to work. That code was clearly never tested on a debug kernel
and user space can keep the pieces.
Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: "Paul E. McKenney" <paulmck@...nel.org>
Cc: Boqun Feng <boqun.feng@...il.com>
---
include/linux/resume_user_mode.h | 2 +-
include/linux/rseq.h | 12 ++++++------
kernel/rseq.c | 22 +++++++++++++++++++++-
3 files changed, 28 insertions(+), 8 deletions(-)
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -60,7 +60,7 @@ static inline void exit_to_user_notify_r
mem_cgroup_handle_over_high(GFP_KERNEL);
blkcg_maybe_throttle_current();
- rseq_handle_notify_resume(regs);
+ rseq_handle_notify_resume(regs, from_irq);
}
#ifndef CONFIG_GENERIC_ENTRY
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -13,19 +13,19 @@ static inline void rseq_set_notify_resum
set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs,
+ bool from_irq);
-static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+static inline void rseq_handle_notify_resume(struct pt_regs *regs, bool from_irq)
{
if (IS_ENABLED(CONFIG_DEBUG_RESQ) || READ_ONCE(current->rseq_event_pending))
- __rseq_handle_notify_resume(NULL, regs);
+ __rseq_handle_notify_resume(NULL, regs, from_irq);
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
+ __rseq_handle_notify_resume(ksig, regs, false);
}
static inline void rseq_notify_event(struct task_struct *t)
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -408,6 +408,22 @@ static int rseq_ip_fixup(struct pt_regs
return 0;
}
+static inline bool rseq_ignore_event(bool from_irq, bool ksig)
+{
+ /*
+ * On architectures which do not select GENERIC_ENTRY,
+ * @from_irq is not usable.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || !IS_ENABLED(CONFIG_GENERIC_ENTRY))
+ return false;
+
+ /*
+ * Avoid the heavy lifting when this is a return from syscall,
+ * i.e. not from interrupt and not from signal delivery.
+ */
+ return !from_irq && !ksig;
+}
+
/*
* This resume handler must always be executed between any of:
* - preemption,
@@ -419,7 +435,8 @@ static int rseq_ip_fixup(struct pt_regs
* respect to other threads scheduled on the same CPU, and with respect
* to signal handlers.
*/
-void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
+void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs,
+ bool from_irq)
{
struct task_struct *t = current;
int ret, sig;
@@ -467,6 +484,9 @@ void __rseq_handle_notify_resume(struct
t->rseq_event_pending = false;
}
+ if (rseq_ignore_event(from_irq, !!ksig))
+ event = false;
+
if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
ret = rseq_ip_fixup(regs, event);
if (unlikely(ret < 0))
Powered by blists - more mailing lists