Message-ID: <e87fb53b-329b-44dc-a14e-e8c7a49d9adf@efficios.com>
Date: Tue, 26 Aug 2025 10:52:37 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Jens Axboe <axboe@...nel.dk>, Peter Zijlstra <peterz@...radead.org>,
"Paul E. McKenney" <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
<seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V2 23/37] rseq: Provide and use rseq_set_uids()
On 2025-08-23 12:40, Thomas Gleixner wrote:
> Provide a new and straight forward implementation to set the IDs (CPU ID,
> Node ID and MM CID), which can be later inlined into the fast path.
>
> It does all operations in one user_rw_masked_begin() section and retrieves
> also the critical section member (rseq::cs_rseq) from user space to avoid
> another user..begin/end() pair. This is in preparation for optimizing the
> fast path to avoid extra work when not required.
>
> Use it to replace the whole related zoo in rseq.c
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> ---
> fs/binfmt_elf.c | 2
> include/linux/rseq_entry.h | 95 ++++++++++++++++++++
> include/linux/rseq_types.h | 2
> include/linux/sched.h | 10 --
> kernel/rseq.c | 208 ++++++---------------------------------------
> 5 files changed, 130 insertions(+), 187 deletions(-)
>
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -46,7 +46,7 @@
> #include <linux/cred.h>
> #include <linux/dax.h>
> #include <linux/uaccess.h>
> -#include <linux/rseq.h>
> +#include <uapi/linux/rseq.h>
> #include <asm/param.h>
> #include <asm/page.h>
>
> --- a/include/linux/rseq_entry.h
> +++ b/include/linux/rseq_entry.h
> @@ -38,6 +38,8 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_
> #include <linux/rseq.h>
> #include <linux/uaccess.h>
>
> +#include <uapi/linux/rseq.h>
> +
> #include <linux/tracepoint-defs.h>
>
> #ifdef CONFIG_TRACEPOINTS
> @@ -77,6 +79,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEB
> #endif
>
> bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
> +bool rseq_debug_validate_uids(struct task_struct *t);
>
> static __always_inline void rseq_note_user_irq_entry(void)
> {
> @@ -198,6 +201,44 @@ bool rseq_debug_update_user_cs(struct ta
> user_access_end();
> return false;
> }
> +
> +/*
> + * On debug kernels validate that user space did not mess with it if
> + * DEBUG_RSEQ is enabled, but don't on the first exit to user space. In
> + * that case cpu_cid is ~0. See fork/execve.
> + */
> +bool rseq_debug_validate_uids(struct task_struct *t)
Typically "UIDs" are a well known term (user identity) associated with
getuid(2).
I understand that you're trying to use "uids" for "userspace IDs" here,
but I'm concerned about the TLA clash.
perhaps we should name this "rseq_debug_validate_user_fields" instead ?
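i.e. just the rename, keeping the signature from this patch:

bool rseq_debug_validate_user_fields(struct task_struct *t);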
> +{
> + u32 cpu_id, uval, node_id = cpu_to_node(task_cpu(t));
> + struct rseq __user *rseq = t->rseq;
> +
> + if (t->rseq_ids.cpu_cid == ~0)
> + return true;
> +
> + if (!user_read_masked_begin(rseq))
> + return false;
> +
> + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
> + if (cpu_id != t->rseq_ids.cpu_id)
> + goto die;
> + unsafe_get_user(uval, &rseq->cpu_id, efault);
> + if (uval != cpu_id)
> + goto die;
> + unsafe_get_user(uval, &rseq->node_id, efault);
> + if (uval != node_id)
> + goto die;
AFAIU, when a task migrates across NUMA nodes, user space will have a
stale node_id value and this check will fail, thus killing the process.
To fix this you'd need to derive "node_id" from
cpu_to_node(t->rseq_ids.cpu_id).

But even that will not work on powerpc, where the mapping between
node_id and cpu_id can change dynamically; AFAIU this can still kill
processes even though user space did not alter the node_id behind the
kernel's back.

The difference from the preexisting code is that this compares the
user space node_id against the current node_id obtained from
cpu_to_node(task_cpu(t)), whereas the preexisting
rseq_validate_ro_fields() compares the user space node_id against the
prior node_id copy kept in the kernel.
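For reference, deriving it from the cached cpu_id would look something
like this (untested sketch on top of this patch):

	/* Compare against the node of the last CPU written to user space */
	u32 node_id = cpu_to_node(t->rseq_ids.cpu_id);

	unsafe_get_user(uval, &rseq->node_id, efault);
	if (uval != node_id)
		goto die;

but as explained above this is still not robust on powerpc. Matching
the preexisting rseq_validate_ro_fields() semantics would rather
require comparing against a node_id copy cached by the kernel (e.g. a
new rseq_ids::node_id field, which this patch does not have).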
> + unsafe_get_user(uval, &rseq->mm_cid, efault);
> + if (uval != t->rseq_ids.mm_cid)
> + goto die;
> + user_access_end();
> + return true;
> +die:
> + t->rseq_event.fatal = true;
> +efault:
> + user_access_end();
> + return false;
> +}
> +
> #endif /* RSEQ_BUILD_SLOW_PATH */
>
> /*
> @@ -268,6 +309,60 @@ rseq_update_user_cs(struct task_struct *
> user_access_end();
> return false;
> }
> +
> +/*
> + * Updates CPU ID, Node ID and MM CID and reads the critical section
> + * address, when @csaddr != NULL. This allows to put the ID update and the
> + * read under the same uaccess region to spare a seperate begin/end.
separate
> + *
> + * As this is either invoked from a C wrapper with @csaddr = NULL or from
> + * the fast path code with a valid pointer, a clever compiler should be
> + * able to optimize the read out. Spares a duplicate implementation.
> + *
> + * Returns true, if the operation was successful, false otherwise.
> + *
> + * In the failure case task::rseq_event::fatal is set when invalid data
> + * was found on debug kernels. It's clear when the failure was an unresolved page
> + * fault.
> + *
> + * If inlined into the exit to user path with interrupts disabled, the
> + * caller has to protect against page faults with pagefault_disable().
> + *
> + * In preemptible task context this would be counterproductive as the page
> + * faults could not be fully resolved. As a consequence unresolved page
> + * faults in task context are fatal too.
> + */
> +static rseq_inline
> +bool rseq_set_uids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
> + u32 node_id, u64 *csaddr)
> +{
> + struct rseq __user *rseq = t->rseq;
> +
> + if (static_branch_unlikely(&rseq_debug_enabled)) {
> + if (!rseq_debug_validate_uids(t))
> + return false;
> + }
> +
> + if (!user_rw_masked_begin(rseq))
> + return false;
> +
> + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
> + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
> + unsafe_put_user(node_id, &rseq->node_id, efault);
> + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
> + if (csaddr)
> + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
> + user_access_end();
> +
> + /* Cache the new values */
> + t->rseq_ids.cpu_cid = ids->cpu_cid;
I may be missing something, but I think we're missing updates to
t->rseq_ids.mm_cid and we may want to keep track of t->rseq_ids.node_id
as well.
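Something along these lines is what I would have expected (untested,
assuming cpu_cid is not meant to alias the cpu_id/mm_cid pair, and
assuming a new rseq_ids::node_id field which this patch does not
introduce):

	/* Cache the values just written to user space */
	t->rseq_ids.cpu_id = ids->cpu_id;
	t->rseq_ids.mm_cid = ids->mm_cid;
	t->rseq_ids.node_id = node_id;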
Thanks,
Mathieu
> + rseq_stat_inc(rseq_stats.ids);
> + rseq_trace_update(t, ids);
> + return true;
> +efault:
> + user_access_end();
> + return false;
> +}
>
> static __always_inline void rseq_exit_to_user_mode(void)
> {
> --- a/include/linux/rseq_types.h
> +++ b/include/linux/rseq_types.h
> @@ -3,6 +3,8 @@
> #define _LINUX_RSEQ_TYPES_H
>
> #include <linux/types.h>
> +/* Forward declaration for the sched.h */
> +struct rseq;
>
> /*
> * struct rseq_event - Storage for rseq related event management
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -42,7 +42,6 @@
> #include <linux/posix-timers_types.h>
> #include <linux/restart_block.h>
> #include <linux/rseq_types.h>
> -#include <uapi/linux/rseq.h>
> #include <linux/seqlock_types.h>
> #include <linux/kcsan.h>
> #include <linux/rv.h>
> @@ -1407,15 +1406,6 @@ struct task_struct {
> u32 rseq_sig;
> struct rseq_event rseq_event;
> struct rseq_ids rseq_ids;
> -# ifdef CONFIG_DEBUG_RSEQ
> - /*
> - * This is a place holder to save a copy of the rseq fields for
> - * validation of read-only fields. The struct rseq has a
> - * variable-length array at the end, so it cannot be used
> - * directly. Reserve a size large enough for the known fields.
> - */
> - char rseq_fields[sizeof(struct rseq)];
> -# endif
> #endif
>
> #ifdef CONFIG_SCHED_MM_CID
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -88,13 +88,6 @@
> # define RSEQ_EVENT_GUARD preempt
> #endif
>
> -/* The original rseq structure size (including padding) is 32 bytes. */
> -#define ORIG_RSEQ_SIZE 32
> -
> -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
> - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
> - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
> -
> DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
>
> static inline void rseq_control_debug(bool on)
> @@ -227,159 +220,9 @@ static int __init rseq_debugfs_init(void
> __initcall(rseq_debugfs_init);
> #endif /* CONFIG_DEBUG_FS */
>
> -#ifdef CONFIG_DEBUG_RSEQ
> -static struct rseq *rseq_kernel_fields(struct task_struct *t)
> -{
> - return (struct rseq *) t->rseq_fields;
> -}
> -
> -static int rseq_validate_ro_fields(struct task_struct *t)
> -{
> - static DEFINE_RATELIMIT_STATE(_rs,
> - DEFAULT_RATELIMIT_INTERVAL,
> - DEFAULT_RATELIMIT_BURST);
> - u32 cpu_id_start, cpu_id, node_id, mm_cid;
> - struct rseq __user *rseq = t->rseq;
> -
> - /*
> - * Validate fields which are required to be read-only by
> - * user-space.
> - */
> - if (!user_read_access_begin(rseq, t->rseq_len))
> - goto efault;
> - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
> - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
> - unsafe_get_user(node_id, &rseq->node_id, efault_end);
> - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
> - user_read_access_end();
> -
> - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
> - cpu_id != rseq_kernel_fields(t)->cpu_id ||
> - node_id != rseq_kernel_fields(t)->node_id ||
> - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
> -
> - pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
> - "\tcpu_id_start: %u ?= %u\n"
> - "\tcpu_id: %u ?= %u\n"
> - "\tnode_id: %u ?= %u\n"
> - "\tmm_cid: %u ?= %u\n",
> - t->pid, t->comm,
> - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
> - cpu_id, rseq_kernel_fields(t)->cpu_id,
> - node_id, rseq_kernel_fields(t)->node_id,
> - mm_cid, rseq_kernel_fields(t)->mm_cid);
> - }
> -
> - /* For now, only print a console warning on mismatch. */
> - return 0;
> -
> -efault_end:
> - user_read_access_end();
> -efault:
> - return -EFAULT;
> -}
> -
> -/*
> - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
> - * state.
> - */
> -#define rseq_unsafe_put_user(t, value, field, error_label) \
> - do { \
> - unsafe_put_user(value, &t->rseq->field, error_label); \
> - rseq_kernel_fields(t)->field = value; \
> - } while (0)
> -
> -#else
> -static int rseq_validate_ro_fields(struct task_struct *t)
> -{
> - return 0;
> -}
> -
> -#define rseq_unsafe_put_user(t, value, field, error_label) \
> - unsafe_put_user(value, &t->rseq->field, error_label)
> -#endif
> -
> -static int rseq_update_cpu_node_id(struct task_struct *t)
> -{
> - struct rseq __user *rseq = t->rseq;
> - u32 cpu_id = raw_smp_processor_id();
> - u32 node_id = cpu_to_node(cpu_id);
> - u32 mm_cid = task_mm_cid(t);
> -
> - rseq_stat_inc(rseq_stats.ids);
> -
> - /* Validate read-only rseq fields on debug kernels */
> - if (rseq_validate_ro_fields(t))
> - goto efault;
> - WARN_ON_ONCE((int) mm_cid < 0);
> -
> - if (!user_write_access_begin(rseq, t->rseq_len))
> - goto efault;
> -
> - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
> - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
> - rseq_unsafe_put_user(t, node_id, node_id, efault_end);
> - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
> -
> - /* Cache the user space values */
> - t->rseq_ids.cpu_id = cpu_id;
> - t->rseq_ids.mm_cid = mm_cid;
> -
> - /*
> - * Additional feature fields added after ORIG_RSEQ_SIZE
> - * need to be conditionally updated only if
> - * t->rseq_len != ORIG_RSEQ_SIZE.
> - */
> - user_write_access_end();
> - trace_rseq_update(t);
> - return 0;
> -
> -efault_end:
> - user_write_access_end();
> -efault:
> - return -EFAULT;
> -}
> -
> -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
> +static bool rseq_set_uids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
> {
> - struct rseq __user *rseq = t->rseq;
> - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
> - mm_cid = 0;
> -
> - /*
> - * Validate read-only rseq fields.
> - */
> - if (rseq_validate_ro_fields(t))
> - goto efault;
> -
> - if (!user_write_access_begin(rseq, t->rseq_len))
> - goto efault;
> -
> - /*
> - * Reset all fields to their initial state.
> - *
> - * All fields have an initial state of 0 except cpu_id which is set to
> - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
> - * unregistration can figure out that rseq needs to be registered
> - * again.
> - */
> - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
> - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
> - rseq_unsafe_put_user(t, node_id, node_id, efault_end);
> - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
> -
> - /*
> - * Additional feature fields added after ORIG_RSEQ_SIZE
> - * need to be conditionally reset only if
> - * t->rseq_len != ORIG_RSEQ_SIZE.
> - */
> - user_write_access_end();
> - return 0;
> -
> -efault_end:
> - user_write_access_end();
> -efault:
> - return -EFAULT;
> + return rseq_set_uids_get_csaddr(t, ids, node_id, NULL);
> }
>
> static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
> @@ -407,6 +250,8 @@ static bool rseq_handle_cs(struct task_s
> void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> {
> struct task_struct *t = current;
> + struct rseq_ids ids;
> + u32 node_id;
> bool event;
> int sig;
>
> @@ -453,6 +298,8 @@ void __rseq_handle_notify_resume(struct
> scoped_guard(RSEQ_EVENT_GUARD) {
> event = t->rseq_event.sched_switch;
> t->rseq_event.sched_switch = false;
> + ids.cpu_id = task_cpu(t);
> + ids.mm_cid = task_mm_cid(t);
> }
>
> if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
> @@ -461,7 +308,8 @@ void __rseq_handle_notify_resume(struct
> if (!rseq_handle_cs(t, regs))
> goto error;
>
> - if (unlikely(rseq_update_cpu_node_id(t)))
> + node_id = cpu_to_node(ids.cpu_id);
> + if (!rseq_set_uids(t, &ids, node_id))
> goto error;
> return;
>
> @@ -502,13 +350,33 @@ void rseq_syscall(struct pt_regs *regs)
> }
> #endif
>
> +static bool rseq_reset_ids(void)
> +{
> + struct rseq_ids ids = {
> + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
> + .mm_cid = 0,
> + };
> +
> + /*
> + * If this fails, terminate it because this leaves the kernel in
> + * stupid state as exit to user space will try to fixup the ids
> + * again.
> + */
> + if (rseq_set_uids(current, &ids, 0))
> + return true;
> +
> + force_sig(SIGSEGV);
> + return false;
> +}
> +
> +/* The original rseq structure size (including padding) is 32 bytes. */
> +#define ORIG_RSEQ_SIZE 32
> +
> /*
> * sys_rseq - setup restartable sequences for caller thread.
> */
> SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
> {
> - int ret;
> -
> if (flags & RSEQ_FLAG_UNREGISTER) {
> if (flags & ~RSEQ_FLAG_UNREGISTER)
> return -EINVAL;
> @@ -519,9 +387,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
> return -EINVAL;
> if (current->rseq_sig != sig)
> return -EPERM;
> - ret = rseq_reset_rseq_cpu_node_id(current);
> - if (ret)
> - return ret;
> + if (!rseq_reset_ids())
> + return -EFAULT;
> current->rseq = NULL;
> current->rseq_sig = 0;
> current->rseq_len = 0;
> @@ -574,17 +441,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user
> if (put_user_masked_u64(0UL, &rseq->rseq_cs))
> return -EFAULT;
>
> -#ifdef CONFIG_DEBUG_RSEQ
> - /*
> - * Initialize the in-kernel rseq fields copy for validation of
> - * read-only fields.
> - */
> - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
> - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
> - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
> - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
> - return -EFAULT;
> -#endif
> /*
> * Activate the registration by setting the rseq area address, length
> * and signature in the task struct.
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com