Message-ID: <9aa047e6-cea5-4f84-b763-36d8d5ad4731@efficios.com>
Date: Mon, 25 Aug 2025 13:36:03 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Jens Axboe <axboe@...nel.dk>, Peter Zijlstra <peterz@...radead.org>,
 "Paul E. McKenney" <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
 Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
 <seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
 Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
 Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
 Christian Borntraeger <borntraeger@...ux.ibm.com>,
 Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
 Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V2 06/37] rseq: Simplify the event notification

On 2025-08-23 12:39, Thomas Gleixner wrote:
> Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_*
> flags") the bits in task::rseq_event_mask are meaningless and just extra
> work in terms of setting them individually.
> 
> Aside of that the only relevant point where an event has to be raised is
> context switch. Neither the CPU nor MM CID can change without going through
> a context switch.

Note: we may want to include the NUMA node ID field in this list of
fields as well.
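
(For context, a rough sketch from memory of the kernel-updated fields in
the userspace struct rseq, per include/uapi/linux/rseq.h; none of them can
change for a task without it going through a context switch:)

	__u32 cpu_id_start;	/* current CPU number, always a valid CPU number */
	__u32 cpu_id;		/* current CPU number, or a negative special value */
	__u32 node_id;		/* NUMA node id of the current CPU */
	__u32 mm_cid;		/* concurrency id of the task within its mm */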

> 
> Collapse them all into a single boolean which simplifies the code a lot and
> remove the pointless invocations which have been sprinkled all over the
> place for no value.
> 
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> Cc: Peter Zijlstra <peterz@...radead.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
> Cc: "Paul E. McKenney" <paulmck@...nel.org>
> Cc: Boqun Feng <boqun.feng@...il.com>
> ---
> V2: Reduce it to the sched switch event.
> ---
>   fs/exec.c                 |    2 -
>   include/linux/rseq.h      |   66 +++++++++-------------------------------------
>   include/linux/sched.h     |   10 +++---
>   include/uapi/linux/rseq.h |   21 ++++----------
>   kernel/rseq.c             |   28 +++++++++++--------
>   kernel/sched/core.c       |    5 ---
>   kernel/sched/membarrier.c |    8 ++---
>   7 files changed, 48 insertions(+), 92 deletions(-)
> ---
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
>   		force_fatal_sig(SIGSEGV);
>   
>   	sched_mm_cid_after_execve(current);
> -	rseq_set_notify_resume(current);
> +	rseq_sched_switch_event(current);
>   	current->in_execve = 0;
>   
>   	return retval;
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -3,38 +3,8 @@
>   #define _LINUX_RSEQ_H
>   
>   #ifdef CONFIG_RSEQ
> -
> -#include <linux/preempt.h>
>   #include <linux/sched.h>
>   
> -#ifdef CONFIG_MEMBARRIER
> -# define RSEQ_EVENT_GUARD	irq
> -#else
> -# define RSEQ_EVENT_GUARD	preempt
> -#endif
> -
> -/*
> - * Map the event mask on the user-space ABI enum rseq_cs_flags
> - * for direct mask checks.
> - */
> -enum rseq_event_mask_bits {
> -	RSEQ_EVENT_PREEMPT_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
> -	RSEQ_EVENT_SIGNAL_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
> -	RSEQ_EVENT_MIGRATE_BIT	= RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
> -};
> -
> -enum rseq_event_mask {
> -	RSEQ_EVENT_PREEMPT	= (1U << RSEQ_EVENT_PREEMPT_BIT),
> -	RSEQ_EVENT_SIGNAL	= (1U << RSEQ_EVENT_SIGNAL_BIT),
> -	RSEQ_EVENT_MIGRATE	= (1U << RSEQ_EVENT_MIGRATE_BIT),
> -};
> -
> -static inline void rseq_set_notify_resume(struct task_struct *t)
> -{
> -	if (t->rseq)
> -		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> -}
> -
>   void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
>   
>   static inline void rseq_handle_notify_resume(struct pt_regs *regs)
> @@ -43,35 +13,27 @@ static inline void rseq_handle_notify_re
>   		__rseq_handle_notify_resume(NULL, regs);
>   }
>   
> -static inline void rseq_signal_deliver(struct ksignal *ksig,
> -				       struct pt_regs *regs)
> +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
>   {
>   	if (current->rseq) {
> -		scoped_guard(RSEQ_EVENT_GUARD)
> -			__set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
> +		current->rseq_event_pending = true;
>   		__rseq_handle_notify_resume(ksig, regs);
>   	}
>   }
>   
> -/* rseq_preempt() requires preemption to be disabled. */
> -static inline void rseq_preempt(struct task_struct *t)
> +static inline void rseq_sched_switch_event(struct task_struct *t)
>   {
> -	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
> -	rseq_set_notify_resume(t);
> -}
> -
> -/* rseq_migrate() requires preemption to be disabled. */
> -static inline void rseq_migrate(struct task_struct *t)
> -{
> -	__set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
> -	rseq_set_notify_resume(t);
> +	if (t->rseq) {
> +		t->rseq_event_pending = true;
> +		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +	}
>   }
>   
>   static __always_inline void rseq_exit_to_user_mode(void)
>   {
>   	if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
> -		if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask))
> -			current->rseq_event_mask = 0;
> +		if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending))
> +			current->rseq_event_pending = false;
>   	}
>   }
>   
> @@ -85,12 +47,12 @@ static inline void rseq_fork(struct task
>   		t->rseq = NULL;
>   		t->rseq_len = 0;
>   		t->rseq_sig = 0;
> -		t->rseq_event_mask = 0;
> +		t->rseq_event_pending = false;
>   	} else {
>   		t->rseq = current->rseq;
>   		t->rseq_len = current->rseq_len;
>   		t->rseq_sig = current->rseq_sig;
> -		t->rseq_event_mask = current->rseq_event_mask;
> +		t->rseq_event_pending = current->rseq_event_pending;
>   	}
>   }
>   
> @@ -99,15 +61,13 @@ static inline void rseq_execve(struct ta
>   	t->rseq = NULL;
>   	t->rseq_len = 0;
>   	t->rseq_sig = 0;
> -	t->rseq_event_mask = 0;
> +	t->rseq_event_pending = false;
>   }
>   
>   #else /* CONFIG_RSEQ */
> -static inline void rseq_set_notify_resume(struct task_struct *t) { }
>   static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
>   static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
> -static inline void rseq_preempt(struct task_struct *t) { }
> -static inline void rseq_migrate(struct task_struct *t) { }
> +static inline void rseq_sched_switch_event(struct task_struct *t) { }
>   static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
>   static inline void rseq_execve(struct task_struct *t) { }
>   static inline void rseq_exit_to_user_mode(void) { }
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1401,14 +1401,14 @@ struct task_struct {
>   #endif /* CONFIG_NUMA_BALANCING */
>   
>   #ifdef CONFIG_RSEQ
> -	struct rseq __user *rseq;
> -	u32 rseq_len;
> -	u32 rseq_sig;
> +	struct rseq __user		*rseq;
> +	u32				rseq_len;
> +	u32				rseq_sig;
>   	/*
> -	 * RmW on rseq_event_mask must be performed atomically
> +	 * RmW on rseq_event_pending must be performed atomically
>   	 * with respect to preemption.
>   	 */
> -	unsigned long rseq_event_mask;
> +	bool				rseq_event_pending;

AFAIU, this rseq_event_pending field is now concurrently set from:

- rseq_signal_deliver() (without any preempt or irqoff guard)
- rseq_sched_switch_event() (with preemption disabled)

Is it safe to concurrently store to a "bool" field within a structure
without any protection against concurrent stores? I have typically used
an integer field just to be on the safe side in that kind of situation.

AFAIR, a bool type needs to be at least 1 byte. Do all architectures
supported by Linux have a single-byte store instruction, or can we end
up incorrectly storing to other nearby fields? (for instance, DEC
Alpha?)
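
(To illustrate the concern, a hypothetical sketch, not actual generated
code, assuming a little-endian layout with event_pending in the low byte:
on a CPU without single-byte stores, such as pre-BWX Alpha, a one-byte
store has to be emulated with a read-modify-write of the containing word,
which can overwrite a concurrent update to a neighbouring field:)

	#include <stdbool.h>

	struct s {
		bool event_pending;	/* written from one context */
		unsigned char other;	/* written concurrently from another context */
	};

	/* Roughly what "p->event_pending = true;" may become without byte stores: */
	static void emulated_byte_store(struct s *p)
	{
		unsigned long *word = (unsigned long *)(void *)p;
		unsigned long tmp = *word;	/* load the whole containing word */

		tmp &= ~0xffUL;			/* clear the byte of event_pending */
		tmp |= 1UL;			/* insert the new value */
		*word = tmp;			/* plain store back: can overwrite a
						 * concurrent update of 'other' with
						 * the stale copy loaded above */
	}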

>   # ifdef CONFIG_DEBUG_RSEQ
>   	/*
>   	 * This is a place holder to save a copy of the rseq fields for
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -114,20 +114,13 @@ struct rseq {
>   	/*
>   	 * Restartable sequences flags field.
>   	 *
> -	 * This field should only be updated by the thread which
> -	 * registered this data structure. Read by the kernel.
> -	 * Mainly used for single-stepping through rseq critical sections
> -	 * with debuggers.
> -	 *
> -	 * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
> -	 *     Inhibit instruction sequence block restart on preemption
> -	 *     for this thread.
> -	 * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
> -	 *     Inhibit instruction sequence block restart on signal
> -	 *     delivery for this thread.
> -	 * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
> -	 *     Inhibit instruction sequence block restart on migration for
> -	 *     this thread.
> +	 * This field was initialy intended to allow event masking for for

initially

for for -> for

> +	 * single-stepping through rseq critical sections with debuggers.
> +	 * The kernel does not support this anymore and the relevant bits
> +	 * are checked for being always false:
> +	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
> +	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
> +	 *	- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
>   	 */
>   	__u32 flags;
>   
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -78,6 +78,12 @@
>   #define CREATE_TRACE_POINTS
>   #include <trace/events/rseq.h>
>   
> +#ifdef CONFIG_MEMBARRIER
> +# define RSEQ_EVENT_GUARD	irq
> +#else
> +# define RSEQ_EVENT_GUARD	preempt
> +#endif
> +
>   /* The original rseq structure size (including padding) is 32 bytes. */
>   #define ORIG_RSEQ_SIZE		32
>   
> @@ -430,11 +436,11 @@ void __rseq_handle_notify_resume(struct
>   	 */
>   	if (regs) {
>   		/*
> -		 * Read and clear the event mask first. If the task was not
> -		 * preempted or migrated or a signal is on the way, there
> -		 * is no point in doing any of the heavy lifting here on
> -		 * production kernels. In that case TIF_NOTIFY_RESUME was
> -		 * raised by some other functionality.
> +		 * Read and clear the event pending bit first. If the task
> +		 * was not preempted or migrated or a signal is on the way,
> +		 * there is no point in doing any of the heavy lifting here
> +		 * on production kernels. In that case TIF_NOTIFY_RESUME
> +		 * was raised by some other functionality.
>   		 *
>   		 * This is correct because the read/clear operation is
>   		 * guarded against scheduler preemption, which makes it CPU
> @@ -447,15 +453,15 @@ void __rseq_handle_notify_resume(struct
>   		 * with the result handed in to allow the detection of
>   		 * inconsistencies.
>   		 */
> -		u32 event_mask;
> +		bool event;
>   
>   		scoped_guard(RSEQ_EVENT_GUARD) {
> -			event_mask = t->rseq_event_mask;
> -			t->rseq_event_mask = 0;
> +			event = t->rseq_event_pending;
> +			t->rseq_event_pending = false;
>   		}
>   
> -		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
> -			ret = rseq_ip_fixup(regs, !!event_mask);
> +		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) {
> +			ret = rseq_ip_fixup(regs, event);
>   			if (unlikely(ret < 0))
>   				goto error;
>   		}
> @@ -584,7 +590,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
>   	 * registered, ensure the cpu_id_start and cpu_id fields
>   	 * are updated before returning to user-space.
>   	 */
> -	rseq_set_notify_resume(current);
> +	rseq_sched_switch_event(current);
>   
>   	return 0;
>   }
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3364,7 +3364,6 @@ void set_task_cpu(struct task_struct *p,
>   		if (p->sched_class->migrate_task_rq)
>   			p->sched_class->migrate_task_rq(p, new_cpu);
>   		p->se.nr_migrations++;
> -		rseq_migrate(p);

OK yes, all rseq_migrate() calls can go away because a migration always
implies a preemption, and both events are now combined into the same state.
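
(To spell out the flow, a rough sketch, call chain from memory, using the
names introduced in this patch:)

	/*
	 * A task can only observe a new CPU or mm_cid after being scheduled
	 * out and back in, so the single event raised at context switch also
	 * covers migration:
	 *
	 *   set_task_cpu(p, new_cpu)          - migration, no event raised here
	 *     ...
	 *   prepare_task_switch(rq, prev, next)
	 *     rseq_sched_switch_event(prev)   - rseq_event_pending = true,
	 *                                       TIF_NOTIFY_RESUME set
	 *     ...
	 *   return to user space, TIF_NOTIFY_RESUME handling
	 *     rseq_handle_notify_resume()
	 *       __rseq_handle_notify_resume() - ip fixup, cpu/cid update,
	 *                                       event cleared
	 */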

Thanks,

Mathieu

>   		sched_mm_cid_migrate_from(p);
>   		perf_event_task_migrate(p);
>   	}
> @@ -4795,7 +4794,6 @@ int sched_cgroup_fork(struct task_struct
>   		p->sched_task_group = tg;
>   	}
>   #endif
> -	rseq_migrate(p);
>   	/*
>   	 * We're setting the CPU for the first time, we don't migrate,
>   	 * so use __set_task_cpu().
> @@ -4859,7 +4857,6 @@ void wake_up_new_task(struct task_struct
>   	 * as we're not fully set-up yet.
>   	 */
>   	p->recent_used_cpu = task_cpu(p);
> -	rseq_migrate(p);
>   	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
>   	rq = __task_rq_lock(p, &rf);
>   	update_rq_clock(rq);
> @@ -5153,7 +5150,7 @@ prepare_task_switch(struct rq *rq, struc
>   	kcov_prepare_switch(prev);
>   	sched_info_switch(rq, prev, next);
>   	perf_event_task_sched_out(prev, next);
> -	rseq_preempt(prev);
> +	rseq_sched_switch_event(prev);
>   	fire_sched_out_preempt_notifiers(prev, next);
>   	kmap_local_sched_out();
>   	prepare_task(next);
> --- a/kernel/sched/membarrier.c
> +++ b/kernel/sched/membarrier.c
> @@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
>   	 * is negligible.
>   	 */
>   	smp_mb();
> -	rseq_preempt(current);
> +	rseq_sched_switch_event(current);
>   }
>   
>   static void ipi_sync_rq_state(void *info)
> @@ -407,9 +407,9 @@ static int membarrier_private_expedited(
>   		 * membarrier, we will end up with some thread in the mm
>   		 * running without a core sync.
>   		 *
> -		 * For RSEQ, don't rseq_preempt() the caller.  User code
> -		 * is not supposed to issue syscalls at all from inside an
> -		 * rseq critical section.
> +		 * For RSEQ, don't invoke rseq_sched_switch_event() on the
> +		 * caller.  User code is not supposed to issue syscalls at
> +		 * all from inside an rseq critical section.
>   		 */
>   		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
>   			preempt_disable();
> 


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
