Message-ID: <9aa047e6-cea5-4f84-b763-36d8d5ad4731@efficios.com>
Date: Mon, 25 Aug 2025 13:36:03 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Jens Axboe <axboe@...nel.dk>, Peter Zijlstra <peterz@...radead.org>,
"Paul E. McKenney" <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
<seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V2 06/37] rseq: Simplify the event notification
On 2025-08-23 12:39, Thomas Gleixner wrote:
> Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
> flags") the bits in task::rseq_event_mask are meaningless and just extra
> work in terms of setting them individually.
>
> Aside of that the only relevant point where an event has to be raised is
> context switch. Neither the CPU nor MM CID can change without going through
> a context switch.
Note: we may want to include the NUMA node id field in this list of
fields as well.
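All of those fields can only change across a context switch, so a
single event raised at switch time covers them. Roughly (untested
sketch, assuming the usual helpers task_cpu(), cpu_to_node() and
task_mm_cid(); error handling omitted):

	/* Fields which can only change across a context switch. */
	static void rseq_update_fields(struct task_struct *t,
				       struct rseq __user *rs)
	{
		u32 cpu = task_cpu(t);

		put_user(cpu, &rs->cpu_id_start);
		put_user(cpu, &rs->cpu_id);
		put_user(cpu_to_node(cpu), &rs->node_id);
		put_user(task_mm_cid(t), &rs->mm_cid);
	}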
>
> Collapse them all into a single boolean which simplifies the code a lot and
> remove the pointless invocations which have been sprinkled all over the
> place for no value.
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> Cc: Peter Zijlstra <peterz@...radead.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
> Cc: "Paul E. McKenney" <paulmck@...nel.org>
> Cc: Boqun Feng <boqun.feng@...il.com>
> ---
> V2: Reduce it to the sched switch event.
> ---
> fs/exec.c | 2 -
> include/linux/rseq.h | 66 +++++++++-------------------------------------
> include/linux/sched.h | 10 +++---
> include/uapi/linux/rseq.h | 21 ++++----------
> kernel/rseq.c | 28 +++++++++++--------
> kernel/sched/core.c | 5 ---
> kernel/sched/membarrier.c | 8 ++---
> 7 files changed, 48 insertions(+), 92 deletions(-)
> ---
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
> force_fatal_sig(SIGSEGV);
>
> sched_mm_cid_after_execve(current);
> - rseq_set_notify_resume(current);
> + rseq_sched_switch_event(current);
> current->in_execve = 0;
>
> return retval;
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -3,38 +3,8 @@
> #define _LINUX_RSEQ_H
>
> #ifdef CONFIG_RSEQ
> -
> -#include <linux/preempt.h>
> #include <linux/sched.h>
>
> -#ifdef CONFIG_MEMBARRIER
> -# define RSEQ_EVENT_GUARD irq
> -#else
> -# define RSEQ_EVENT_GUARD preempt
> -#endif
> -
> -/*
> - * Map the event mask on the user-space ABI enum rseq_cs_flags
> - * for direct mask checks.
> - */
> -enum rseq_event_mask_bits {
> - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
> - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
> - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
> -};
> -
> -enum rseq_event_mask {
> - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
> - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
> - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
> -};
> -
> -static inline void rseq_set_notify_resume(struct task_struct *t)
> -{
> - if (t->rseq)
> - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> -}
> -
> void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
>
> static inline void rseq_handle_notify_resume(struct pt_regs *regs)
> @@ -43,35 +13,27 @@ static inline void rseq_handle_notify_re
> __rseq_handle_notify_resume(NULL, regs);
> }
>
> -static inline void rseq_signal_deliver(struct ksignal *ksig,
> - struct pt_regs *regs)
> +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
> {
> if (current->rseq) {
> - scoped_guard(RSEQ_EVENT_GUARD)
> - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
> + current->rseq_event_pending = true;
> __rseq_handle_notify_resume(ksig, regs);
> }
> }
>
> -/* rseq_preempt() requires preemption to be disabled. */
> -static inline void rseq_preempt(struct task_struct *t)
> +static inline void rseq_sched_switch_event(struct task_struct *t)
> {
> - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
> - rseq_set_notify_resume(t);
> -}
> -
> -/* rseq_migrate() requires preemption to be disabled. */
> -static inline void rseq_migrate(struct task_struct *t)
> -{
> - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
> - rseq_set_notify_resume(t);
> + if (t->rseq) {
> + t->rseq_event_pending = true;
> + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> + }
> }
>
> static __always_inline void rseq_exit_to_user_mode(void)
> {
> if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
> - if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
> - current->rseq_event_mask = 0;
> + if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
> + current->rseq_event_pending = false;
> }
> }
>
> @@ -85,12 +47,12 @@ static inline void rseq_fork(struct task
> t->rseq = NULL;
> t->rseq_len = 0;
> t->rseq_sig = 0;
> - t->rseq_event_mask = 0;
> + t->rseq_event_pending = false;
> } else {
> t->rseq = current->rseq;
> t->rseq_len = current->rseq_len;
> t->rseq_sig = current->rseq_sig;
> - t->rseq_event_mask = current->rseq_event_mask;
> + t->rseq_event_pending = current->rseq_event_pending;
> }
> }
>
> @@ -99,15 +61,13 @@ static inline void rseq_execve(struct ta
> t->rseq = NULL;
> t->rseq_len = 0;
> t->rseq_sig = 0;
> - t->rseq_event_mask = 0;
> + t->rseq_event_pending = false;
> }
>
> #else /* CONFIG_RSEQ */
> -static inline void rseq_set_notify_resume(struct task_struct *t) { }
> static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
> static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
> -static inline void rseq_preempt(struct task_struct *t) { }
> -static inline void rseq_migrate(struct task_struct *t) { }
> +static inline void rseq_sched_switch_event(struct task_struct *t) { }
> static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
> static inline void rseq_execve(struct task_struct *t) { }
> static inline void rseq_exit_to_user_mode(void) { }
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1401,14 +1401,14 @@ struct task_struct {
> #endif /* CONFIG_NUMA_BALANCING */
>
> #ifdef CONFIG_RSEQ
> - struct rseq __user *rseq;
> - u32 rseq_len;
> - u32 rseq_sig;
> + struct rseq __user *rseq;
> + u32 rseq_len;
> + u32 rseq_sig;
> /*
> - * RmW on rseq_event_mask must be performed atomically
> + * RmW on rseq_event_pending must be performed atomically
> * with respect to preemption.
> */
> - unsigned long rseq_event_mask;
> + bool rseq_event_pending;
AFAIU, this rseq_event_pending field is now concurrently set from:

- rseq_signal_deliver() (without any preempt or irqoff guard),
- rseq_sched_switch_event() (with preemption disabled).

Is it safe to concurrently store to a "bool" field within a structure
without any protection against concurrent stores? Typically I've used
an integer field just to be on the safe side in that kind of situation.

AFAIR, a bool type needs to be at least 1 byte. Do all architectures
supported by Linux have a single-byte store instruction, or can we end
up incorrectly storing to other nearby fields? (for instance, on DEC
Alpha?)
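Something along these lines would be more obviously safe IMHO
(untested sketch):

	/*
	 * Use a word-sized field rather than a bool, so the store
	 * cannot touch neighboring members on architectures lacking
	 * single-byte stores, and make the concurrent accesses
	 * explicit with READ_ONCE()/WRITE_ONCE().
	 */
	unsigned int rseq_event_pending;	/* in struct task_struct */

	/* Setters (signal delivery, sched switch): */
	WRITE_ONCE(t->rseq_event_pending, 1);

	/* Consumer (__rseq_handle_notify_resume): */
	scoped_guard(RSEQ_EVENT_GUARD) {
		event = READ_ONCE(t->rseq_event_pending);
		WRITE_ONCE(t->rseq_event_pending, 0);
	}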
> # ifdef CONFIG_DEBUG_RSEQ
> /*
> * This is a place holder to save a copy of the rseq fields for
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -114,20 +114,13 @@ struct rseq {
> /*
> * Restartable sequences flags field.
> *
> - * This field should only be updated by the thread which
> - * registered this data structure. Read by the kernel.
> - * Mainly used for single-stepping through rseq critical sections
> - * with debuggers.
> - *
> - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
> - * Inhibit instruction sequence block restart on preemption
> - * for this thread.
> - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
> - * Inhibit instruction sequence block restart on signal
> - * delivery for this thread.
> - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> - * Inhibit instruction sequence block restart on migration for
> - * this thread.
> + * This field was initialy intended to allow event masking for for
initialy -> initially
for for -> for
> + * single-stepping through rseq critical sections with debuggers.
> + * The kernel does not support this anymore and the relevant bits
> + * are checked for being always false:
> + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
> + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
> + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> */
> __u32 flags;
>
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -78,6 +78,12 @@
> #define CREATE_TRACE_POINTS
> #include <trace/events/rseq.h>
>
> +#ifdef CONFIG_MEMBARRIER
> +# define RSEQ_EVENT_GUARD irq
> +#else
> +# define RSEQ_EVENT_GUARD preempt
> +#endif
> +
> /* The original rseq structure size (including padding) is 32 bytes. */
> #define ORIG_RSEQ_SIZE 32
>
> @@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct
> */
> if (regs) {
> /*
> - * Read and clear the event mask first. If the task was not
> - * preempted or migrated or a signal is on the way, there
> - * is no point in doing any of the heavy lifting here on
> - * production kernels. In that case TIF_NOTIFY_RESUME was
> - * raised by some other functionality.
> + * Read and clear the event pending bit first. If the task
> + * was not preempted or migrated or a signal is on the way,
> + * there is no point in doing any of the heavy lifting here
> + * on production kernels. In that case TIF_NOTIFY_RESUME
> + * was raised by some other functionality.
> *
> * This is correct because the read/clear operation is
> * guarded against scheduler preemption, which makes it CPU
> @@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct
> * with the result handed in to allow the detection of
> * inconsistencies.
> */
> - u32 event_mask;
> + bool event;
>
> scoped_guard(RSEQ_EVENT_GUARD) {
> - event_mask = t->rseq_event_mask;
> - t->rseq_event_mask = 0;
> + event = t->rseq_event_pending;
> + t->rseq_event_pending = false;
> }
>
> - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
> - ret = rseq_ip_fixup(regs, !!event_mask);
> + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
> + ret = rseq_ip_fixup(regs, event);
> if (unlikely(ret < 0))
> goto error;
> }
> @@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
> * registered, ensure the cpu_id_start and cpu_id fields
> * are updated before returning to user-space.
> */
> - rseq_set_notify_resume(current);
> + rseq_sched_switch_event(current);
>
> return 0;
> }
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3364,7 +3364,6 @@ void set_task_cpu(struct task_struct *p,
> if (p->sched_class->migrate_task_rq)
> p->sched_class->migrate_task_rq(p, new_cpu);
> p->se.nr_migrations++;
> - rseq_migrate(p);
OK yes, all the rseq_migrate() calls can go away: a migration implies
going through a context switch with preemption disabled, so the
migrate event is folded into the same state.
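For the record, my understanding of why this is safe: a task has to be
scheduled out before it can be migrated, and prepare_task_switch()
already raises the event for the outgoing task:

	__schedule()
	  prepare_task_switch(rq, prev, next)
	    rseq_sched_switch_event(prev)	/* raised at switch out */
	...
	set_task_cpu(p, new_cpu)		/* task is not running here */

so the event is already pending by the time the task resumes on the
new CPU and returns to user space.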
Thanks,
Mathieu
> sched_mm_cid_migrate_from(p);
> perf_event_task_migrate(p);
> }
> @@ -4795,7 +4794,6 @@ int sched_cgroup_fork(struct task_struct
> p->sched_task_group = tg;
> }
> #endif
> - rseq_migrate(p);
> /*
> * We're setting the CPU for the first time, we don't migrate,
> * so use __set_task_cpu().
> @@ -4859,7 +4857,6 @@ void wake_up_new_task(struct task_struct
> * as we're not fully set-up yet.
> */
> p->recent_used_cpu = task_cpu(p);
> - rseq_migrate(p);
> __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
> rq = __task_rq_lock(p, &rf);
> update_rq_clock(rq);
> @@ -5153,7 +5150,7 @@ prepare_task_switch(struct rq *rq, struc
> kcov_prepare_switch(prev);
> sched_info_switch(rq, prev, next);
> perf_event_task_sched_out(prev, next);
> - rseq_preempt(prev);
> + rseq_sched_switch_event(prev);
> fire_sched_out_preempt_notifiers(prev, next);
> kmap_local_sched_out();
> prepare_task(next);
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
> * is negligible.
> */
> smp_mb();
> - rseq_preempt(current);
> + rseq_sched_switch_event(current);
> }
>
> static void ipi_sync_rq_state(void *info)
> @@ -407,9 +407,9 @@ static int membarrier_private_expedited(
> * membarrier, we will end up with some thread in the mm
> * running without a core sync.
> *
> - * For RSEQ, don't rseq_preempt() the caller. User code
> - * is not supposed to issue syscalls at all from inside an
> - * rseq critical section.
> + * For RSEQ, don't invoke rseq_sched_switch_event() on the
> + * caller. User code is not supposed to issue syscalls at
> + * all from inside an rseq critical section.
> */
> if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
> preempt_disable();
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com