Message-ID: <6e0bcd7c-f842-4db0-b30b-5f6857b45b66@efficios.com>
Date: Thu, 11 Sep 2025 10:03:06 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Michael Jeanson <mjeanson@...icios.com>, Jens Axboe <axboe@...nel.dk>,
Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney"
<paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
<seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V4 26/36] rseq: Optimize event setting
On 2025-09-08 17:32, Thomas Gleixner wrote:
> After removing the various condition bits earlier it turns out that one
> extra information is needed to avoid setting event::sched_switch and
> TIF_NOTIFY_RESUME unconditionally on every context switch.
>
> The update of the RSEQ user space memory is only required, when either
>
> the task was interrupted in user space and schedules
>
> or
>
> the CPU or MM CID changes in schedule() independent of the entry mode
>
> Right now only the interrupt from user information is available.
>
> Add a event flag, which is set when the CPU or MM CID or both change.
a event -> an event
>
> Evaluate this event in the scheduler to decide whether the sched_switch
> event and the TIF bit need to be set.
>
> It's an extra conditional in context_switch(), but the downside of
> unconditionally handling RSEQ after a context switch to user is way more
> significant. The utilized boolean logic minimizes this to a single
> conditional branch.
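
As a stand-alone illustration of why the bitwise form needs only one
branch (simplified struct and helper name, not code from this patch;
written so it also compiles in user space):

#include <stdbool.h>

struct rseq_event_sketch {
	unsigned char sched_switch;
	unsigned char ids_changed;
	unsigned char user_irq;
	unsigned char has_rseq;
};

static inline bool rseq_needs_notify(const struct rseq_event_sketch *ev)
{
	/*
	 * '|' and '&' load and combine all three u8 fields
	 * unconditionally, so the only branch left is the test of the
	 * final result. With '||'/'&&' the compiler has to preserve
	 * short-circuit semantics and may emit one branch per operator.
	 */
	return (ev->user_irq | ev->ids_changed) & ev->has_rseq;
}
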
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
>
> ---
> fs/exec.c | 2 -
> include/linux/rseq.h | 81 +++++++++++++++++++++++++++++++++++++++++----
> include/linux/rseq_types.h | 11 +++++-
> kernel/rseq.c | 2 -
> kernel/sched/core.c | 7 +++
> kernel/sched/sched.h | 5 ++
> 6 files changed, 95 insertions(+), 13 deletions(-)
>
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1775,7 +1775,7 @@ static int bprm_execve(struct linux_binp
> force_fatal_sig(SIGSEGV);
>
> sched_mm_cid_after_execve(current);
> - rseq_sched_switch_event(current);
> + rseq_force_update();
> current->in_execve = 0;
>
> return retval;
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -9,7 +9,8 @@ void __rseq_handle_notify_resume(struct
>
> static inline void rseq_handle_notify_resume(struct pt_regs *regs)
> {
> - if (current->rseq.event.has_rseq)
> + /* '&' is intentional to spare one conditional branch */
> + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
I wonder: except for the corner case of rseq unregistration,
when can we have sched_switch set but not has_rseq?
We could remove a load from the fast path, and drop the AND, if we
clear the sched_switch flag on rseq unregistration.
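
Something along these lines (untested sketch; the unregistration hunk
is not part of this patch, so the exact placement is an assumption):

	/* In the sys_rseq() unregistration path, next to clearing has_rseq: */
	current->rseq.event.has_rseq = false;
	current->rseq.event.sched_switch = false;

	/* The exit-to-user fast path could then drop the extra load and the AND: */
	static inline void rseq_handle_notify_resume(struct pt_regs *regs)
	{
		if (current->rseq.event.sched_switch)
			__rseq_handle_notify_resume(regs);
	}
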
Thanks,
Mathieu
> __rseq_handle_notify_resume(regs);
> }
>
> @@ -31,12 +32,75 @@ static inline void rseq_signal_deliver(s
> }
> }
>
> -/* Raised from context switch and exevce to force evaluation on exit to user */
> -static inline void rseq_sched_switch_event(struct task_struct *t)
> +static inline void rseq_raise_notify_resume(struct task_struct *t)
> {
> - if (t->rseq.event.has_rseq) {
> - t->rseq.event.sched_switch = true;
> - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
> +}
> +
> +/* Invoked from context switch to force evaluation on exit to user */
> +static __always_inline void rseq_sched_switch_event(struct task_struct *t)
> +{
> + struct rseq_event *ev = &t->rseq.event;
> +
> + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
> + /*
> + * Avoid a boat load of conditionals by using simple logic
> + * to determine whether NOTIFY_RESUME needs to be raised.
> + *
> + * It's required when the CPU or MM CID has changed or
> + * the entry was from user space.
> + */
> + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
> +
> + if (raise) {
> + ev->sched_switch = true;
> + rseq_raise_notify_resume(t);
> + }
> + } else {
> + if (ev->has_rseq) {
> + t->rseq.event.sched_switch = true;
> + rseq_raise_notify_resume(t);
> + }
> + }
> +}
> +
> +/*
> + * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
> + * update.
> + *
> + * This does not raise TIF_NOTIFY_RESUME as that happens in
> + * rseq_sched_switch_event().
> + */
> +static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
> +{
> + t->rseq.event.ids_changed = true;
> +}
> +
> +/*
> + * Invoked from switch_mm_cid() in context switch when the task gets a MM
> + * CID assigned.
> + *
> + * This does not raise TIF_NOTIFY_RESUME as that happens in
> + * rseq_sched_switch_event().
> + */
> +static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
> +{
> + /*
> + * Requires a comparison as the switch_mm_cid() code does not
> + * provide a conditional for it readily. So avoid excessive updates
> + * when nothing changes.
> + */
> + if (t->rseq.ids.mm_cid != cid)
> + t->rseq.event.ids_changed = true;
> +}
> +
> +/* Enforce a full update after RSEQ registration and when execve() failed */
> +static inline void rseq_force_update(void)
> +{
> + if (current->rseq.event.has_rseq) {
> + current->rseq.event.ids_changed = true;
> + current->rseq.event.sched_switch = true;
> + rseq_raise_notify_resume(current);
> }
> }
>
> @@ -53,7 +117,7 @@ static inline void rseq_sched_switch_eve
> static inline void rseq_virt_userspace_exit(void)
> {
> if (current->rseq.event.sched_switch)
> - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
> + rseq_raise_notify_resume(current);
> }
>
> static inline void rseq_reset(struct task_struct *t)
> @@ -85,6 +149,9 @@ static inline void rseq_fork(struct task
> static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { }
> static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
> static inline void rseq_sched_switch_event(struct task_struct *t) { }
> +static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
> +static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
> +static inline void rseq_force_update(void) { }
> static inline void rseq_virt_userspace_exit(void) { }
> static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
> static inline void rseq_execve(struct task_struct *t) { }
> --- a/include/linux/rseq_types.h
> +++ b/include/linux/rseq_types.h
> @@ -11,20 +11,27 @@ struct rseq;
> * struct rseq_event - Storage for rseq related event management
> * @all: Compound to initialize and clear the data efficiently
> * @events: Compound to access events with a single load/store
> - * @sched_switch: True if the task was scheduled out
> + * @sched_switch: True if the task was scheduled and needs update on
> + * exit to user
> + * @ids_changed: Indicator that IDs need to be updated
> * @user_irq: True on interrupt entry from user mode
> * @has_rseq: True if the task has a rseq pointer installed
> * @error: Compound error code for the slow path to analyze
> * @fatal: User space data corrupted or invalid
> + *
> + * @sched_switch and @ids_changed must be adjacent and the combo must be
> + * 16bit aligned to allow a single store, when both are set at the same
> + * time in the scheduler.
> */
> struct rseq_event {
> union {
> u64 all;
> struct {
> union {
> - u16 events;
> + u32 events;
> struct {
> u8 sched_switch;
> + u8 ids_changed;
> u8 user_irq;
> };
> };
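
Side note on the "single store" remark in the kerneldoc above: with
sched_switch and ids_changed adjacent and 16-bit aligned, both flags can
be set with one store. A stand-alone illustration (the union below exists
only for this example, it is not code from the patch):

#include <stdint.h>

union switch_and_ids {
	uint16_t word;
	struct {
		uint8_t sched_switch;
		uint8_t ids_changed;
	};
};

static inline void set_both_flags(union switch_and_ids *f)
{
	/*
	 * 0x0101 puts 0x01 into both bytes regardless of endianness,
	 * so a single 16-bit store sets sched_switch and ids_changed
	 * at the same time.
	 */
	f->word = 0x0101;
}
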
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -456,7 +456,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
> * are updated before returning to user-space.
> */
> current->rseq.event.has_rseq = true;
> - rseq_sched_switch_event(current);
> + rseq_force_update();
>
> return 0;
> }
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5150,7 +5150,6 @@ prepare_task_switch(struct rq *rq, struc
> kcov_prepare_switch(prev);
> sched_info_switch(rq, prev, next);
> perf_event_task_sched_out(prev, next);
> - rseq_sched_switch_event(prev);
> fire_sched_out_preempt_notifiers(prev, next);
> kmap_local_sched_out();
> prepare_task(next);
> @@ -5348,6 +5347,12 @@ context_switch(struct rq *rq, struct tas
> /* switch_mm_cid() requires the memory barriers above. */
> switch_mm_cid(rq, prev, next);
>
> + /*
> + * Tell rseq that the task was scheduled in. Must be after
> + * switch_mm_cid() to get the TIF flag set.
> + */
> + rseq_sched_switch_event(next);
> +
> prepare_lock_switch(rq, next, rf);
>
> /* Here we just switch the register state and the stack. */
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2181,6 +2181,7 @@ static inline void __set_task_cpu(struct
> smp_wmb();
> WRITE_ONCE(task_thread_info(p)->cpu, cpu);
> p->wake_cpu = cpu;
> + rseq_sched_set_task_cpu(p, cpu);
> #endif /* CONFIG_SMP */
> }
>
> @@ -3778,8 +3779,10 @@ static inline void switch_mm_cid(struct
> mm_cid_put_lazy(prev);
> prev->mm_cid = -1;
> }
> - if (next->mm_cid_active)
> + if (next->mm_cid_active) {
> next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
> + rseq_sched_set_task_mm_cid(next, next->mm_cid);
> + }
> }
>
> #else /* !CONFIG_SCHED_MM_CID: */
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com