Message-ID: <6e0bcd7c-f842-4db0-b30b-5f6857b45b66@efficios.com>
Date: Thu, 11 Sep 2025 10:03:06 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Michael Jeanson <mjeanson@...icios.com>, Jens Axboe <axboe@...nel.dk>,
 Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney"
 <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
 Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
 <seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
 Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
 Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
 Christian Borntraeger <borntraeger@...ux.ibm.com>,
 Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
 Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V4 26/36] rseq: Optimize event setting

On 2025-09-08 17:32, Thomas Gleixner wrote:
> After removing the various condition bits earlier it turns out that one
> extra piece of information is needed to avoid setting event::sched_switch
> and TIF_NOTIFY_RESUME unconditionally on every context switch.
> 
> The update of the RSEQ user space memory is only required, when either
> 
>    the task was interrupted in user space and schedules
> 
> or
> 
>    the CPU or MM CID changes in schedule() independent of the entry mode
> 
> Right now only the interrupt from user information is available.
> 
> Add a event flag, which is set when the CPU or MM CID or both change.

a event -> an event

> 
> Evaluate this event in the scheduler to decide whether the sched_switch
> event and the TIF bit need to be set.
> 
> It's an extra conditional in context_switch(), but the downside of
> unconditionally handling RSEQ after a context switch to user is way more
> significant. The utilized boolean logic minimizes this to a single
> conditional branch.
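
For reference, a minimal sketch of the branch collapse described above
(hypothetical illustration only; raise_notify_resume() stands in for the
real TIF machinery):

	/* Naive form: up to three conditional branches. */
	if (ev->has_rseq) {
		if (ev->user_irq || ev->ids_changed)
			raise_notify_resume();
	}

	/*
	 * Boolean form: the flag bytes only ever hold 0 or 1, so OR/AND
	 * them first and test the result once -- a single branch.
	 */
	if ((ev->user_irq | ev->ids_changed) & ev->has_rseq)
		raise_notify_resume();
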
> 
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> 
> ---
>   fs/exec.c                  |    2 -
>   include/linux/rseq.h       |   81 +++++++++++++++++++++++++++++++++++++++++----
>   include/linux/rseq_types.h |   11 +++++-
>   kernel/rseq.c              |    2 -
>   kernel/sched/core.c        |    7 +++
>   kernel/sched/sched.h       |    5 ++
>   6 files changed, 95 insertions(+), 13 deletions(-)
> 
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
>   		force_fatal_sig(SIGSEGV);
>   
>   	sched_mm_cid_after_execve(current);
> -	rseq_sched_switch_event(current);
> +	rseq_force_update();
>   	current->in_execve = 0;
>   
>   	return retval;
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -9,7 +9,8 @@ void __rseq_handle_notify_resume(struct
>   
>   static inline void rseq_handle_notify_resume(struct pt_regs *regs)
>   {
> -	if (current->rseq.event.has_rseq)
> +	/* '&' is intentional to spare one conditional branch */
> +	if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)

I wonder... except for the corner case of rseq unregistration,
when can we have sched_switch set but not has_rseq?

We could remove a load and the AND from the fast path if we
cleared the sched_switch flag on rseq unregistration.
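
Something along these lines (an untested sketch of the suggestion, not a
patch):

	/* On rseq unregistration, also clear any pending event ... */
	current->rseq.event.sched_switch = false;

	/* ... which lets the fast path drop the has_rseq load and the AND: */
	static inline void rseq_handle_notify_resume(struct pt_regs *regs)
	{
		if (current->rseq.event.sched_switch)
			__rseq_handle_notify_resume(regs);
	}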

Thanks,

Mathieu

>   		__rseq_handle_notify_resume(regs);
>   }
>   
> @@ -31,12 +32,75 @@ static inline void rseq_signal_deliver(s
>   	}
>   }
>   
> -/* Raised from context switch and exevce to force evaluation on exit to user */
> -static inline void rseq_sched_switch_event(struct task_struct *t)
> +static inline void rseq_raise_notify_resume(struct task_struct *t)
>   {
> -	if (t->rseq.event.has_rseq) {
> -		t->rseq.event.sched_switch = true;
> -		set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +	set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +}
> +
> +/* Invoked from context switch to force evaluation on exit to user */
> +static __always_inline void rseq_sched_switch_event(struct task_struct *t)
> +{
> +	struct rseq_event *ev = &t->rseq.event;
> +
> +	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
> +		/*
> +		 * Avoid a boat load of conditionals by using simple logic
> +		 * to determine whether NOTIFY_RESUME needs to be raised.
> +		 *
> +		 * It's required when the CPU or MM CID has changed or
> +		 * the entry was from user space.
> +		 */
> +		bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
> +
> +		if (raise) {
> +			ev->sched_switch = true;
> +			rseq_raise_notify_resume(t);
> +		}
> +	} else {
> +		if (ev->has_rseq) {
> +			t->rseq.event.sched_switch = true;
> +			rseq_raise_notify_resume(t);
> +		}
> +	}
> +}
> +
> +/*
> + * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
> + * update.
> + *
> + * This does not raise TIF_NOTIFY_RESUME as that happens in
> + * rseq_sched_switch_event().
> + */
> +static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
> +{
> +	t->rseq.event.ids_changed = true;
> +}
> +
> +/*
> + * Invoked from switch_mm_cid() in context switch when the task gets a MM
> + * CID assigned.
> + *
> + * This does not raise TIF_NOTIFY_RESUME as that happens in
> + * rseq_sched_switch_event().
> + */
> +static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
> +{
> +	/*
> +	 * Requires a comparison as the switch_mm_cid() code does not
> +	 * provide a conditional for it readily. So avoid excessive updates
> +	 * when nothing changes.
> +	 */
> +	if (t->rseq.ids.mm_cid != cid)
> +		t->rseq.event.ids_changed = true;
> +}
> +
> +/* Enforce a full update after RSEQ registration and when execve() failed */
> +static inline void rseq_force_update(void)
> +{
> +	if (current->rseq.event.has_rseq) {
> +		current->rseq.event.ids_changed = true;
> +		current->rseq.event.sched_switch = true;
> +		rseq_raise_notify_resume(current);
>   	}
>   }
>   
> @@ -53,7 +117,7 @@ static inline void rseq_sched_switch_eve
>   static inline void rseq_virt_userspace_exit(void)
>   {
>   	if (current->rseq.event.sched_switch)
> -		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> +		rseq_raise_notify_resume(current);
>   }
>   
>   static inline void rseq_reset(struct task_struct *t)
> @@ -85,6 +149,9 @@ static inline void rseq_fork(struct task
>   static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
>   static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
>   static inline void rseq_sched_switch_event(struct task_struct *t) { }
> +static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
> +static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
> +static inline void rseq_force_update(void) { }
>   static inline void rseq_virt_userspace_exit(void) { }
>   static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
>   static inline void rseq_execve(struct task_struct *t) { }
> --- a/include/linux/rseq_types.h
> +++ b/include/linux/rseq_types.h
> @@ -11,20 +11,27 @@ struct rseq;
>    * struct rseq_event - Storage for rseq related event management
>    * @all:		Compound to initialize and clear the data efficiently
>    * @events:		Compound to access events with a single load/store
> - * @sched_switch:	True if the task was scheduled out
> + * @sched_switch:	True if the task was scheduled and needs update on
> + *			exit to user
> + * @ids_changed:	Indicator that IDs need to be updated
>    * @user_irq:		True on interrupt entry from user mode
>    * @has_rseq:		True if the task has a rseq pointer installed
>    * @error:		Compound error code for the slow path to analyze
>    * @fatal:		User space data corrupted or invalid
> + *
> + * @sched_switch and @ids_changed must be adjacent and the combo must be
> + * 16bit aligned to allow a single store, when both are set at the same
> + * time in the scheduler.
>    */
>   struct rseq_event {
>   	union {
>   		u64				all;
>   		struct {
>   			union {
> -				u16		events;
> +				u32		events;
>   				struct {
>   					u8	sched_switch;
> +					u8	ids_changed;
>   					u8	user_irq;
>   				};
>   			};
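
Side note on the alignment requirement documented in the struct comment
above: keeping sched_switch and ids_changed adjacent and 16bit aligned
lets the scheduler set both flags with a single store. A hypothetical
illustration (0x0101 deposits a 0x01 byte into each flag on both little
and big endian):

	*(u16 *)&ev->sched_switch = 0x0101;	/* both flags set at once */
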
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -456,7 +456,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
>   	 * are updated before returning to user-space.
>   	 */
>   	current->rseq.event.has_rseq = true;
> -	rseq_sched_switch_event(current);
> +	rseq_force_update();
>   
>   	return 0;
>   }
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5150,7 +5150,6 @@ prepare_task_switch(struct rq *rq, struc
>   	kcov_prepare_switch(prev);
>   	sched_info_switch(rq, prev, next);
>   	perf_event_task_sched_out(prev, next);
> -	rseq_sched_switch_event(prev);
>   	fire_sched_out_preempt_notifiers(prev, next);
>   	kmap_local_sched_out();
>   	prepare_task(next);
> @@ -5348,6 +5347,12 @@ context_switch(struct rq *rq, struct tas
>   	/* switch_mm_cid() requires the memory barriers above. */
>   	switch_mm_cid(rq, prev, next);
>   
> +	/*
> +	 * Tell rseq that the task was scheduled in. Must be after
> +	 * switch_mm_cid() to get the TIF flag set.
> +	 */
> +	rseq_sched_switch_event(next);
> +
>   	prepare_lock_switch(rq, next, rf);
>   
>   	/* Here we just switch the register state and the stack. */
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2181,6 +2181,7 @@ static inline void __set_task_cpu(struct
>   	smp_wmb();
>   	WRITE_ONCE(task_thread_info(p)->cpu, cpu);
>   	p->wake_cpu = cpu;
> +	rseq_sched_set_task_cpu(p, cpu);
>   #endif /* CONFIG_SMP */
>   }
>   
> @@ -3778,8 +3779,10 @@ static inline void switch_mm_cid(struct
>   		mm_cid_put_lazy(prev);
>   		prev->mm_cid = -1;
>   	}
> -	if (next->mm_cid_active)
> +	if (next->mm_cid_active) {
>   		next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
> +		rseq_sched_set_task_mm_cid(next, next->mm_cid);
> +	}
>   }
>   
>   #else /* !CONFIG_SCHED_MM_CID: */
> 


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
