Message-ID: <885d6f67-f0e7-43c1-a173-740586cc0da5@efficios.com>
Date: Mon, 25 Aug 2025 14:34:57 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Jens Axboe <axboe@...nel.dk>, Peter Zijlstra <peterz@...radead.org>,
"Paul E. McKenney" <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
<seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V2 17/37] rseq: Expose lightweight statistics in debugfs
On 2025-08-23 12:39, Thomas Gleixner wrote:
> Analyzing the call frequency without actually using tracing is helpful for
> analysis of this infrastructure. The overhead is minimal as it just
> increments a per-CPU counter associated with each operation.
>
> The debugfs readout provides a racy sum of all counters.
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
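
For reference, a minimal userspace sketch of reading the racy snapshot
back. It assumes debugfs is mounted at the conventional /sys/kernel/debug
location, which puts the file created below at /sys/kernel/debug/rseq/stats:

/* Dump the aggregated rseq statistics (racy snapshot, summed over CPUs). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/rseq/stats", "r");
	char line[128];

	if (!f) {
		perror("fopen /sys/kernel/debug/rseq/stats");
		return 1;
	}
	/* One "name: count" line per operation, as emitted by rseq_debug_show() */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}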
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> ---
> include/linux/rseq.h | 16 ---------
> include/linux/rseq_entry.h | 49 +++++++++++++++++++++++++++
> init/Kconfig | 12 ++++++
> kernel/rseq.c | 79 +++++++++++++++++++++++++++++++++++++++++----
> 4 files changed, 133 insertions(+), 23 deletions(-)
>
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -29,21 +29,6 @@ static inline void rseq_sched_switch_eve
> }
> }
>
> -static __always_inline void rseq_exit_to_user_mode(void)
> -{
> - struct rseq_event *ev = &current->rseq_event;
> -
> - if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
> - WARN_ON_ONCE(ev->sched_switch);
> -
> - /*
> - * Ensure that event (especially user_irq) is cleared when the
> - * interrupt did not result in a schedule and therefore the
> - * rseq processing did not clear it.
> - */
> - ev->events = 0;
> -}
> -
> /*
> * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
> * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
> @@ -97,7 +82,6 @@ static inline void rseq_sched_switch_eve
> static inline void rseq_virt_userspace_exit(void) { }
> static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
> static inline void rseq_execve(struct task_struct *t) { }
> -static inline void rseq_exit_to_user_mode(void) { }
> #endif /* !CONFIG_RSEQ */
>
> #ifdef CONFIG_DEBUG_RSEQ
> --- a/include/linux/rseq_entry.h
> +++ b/include/linux/rseq_entry.h
> @@ -2,6 +2,37 @@
> #ifndef _LINUX_RSEQ_ENTRY_H
> #define _LINUX_RSEQ_ENTRY_H
>
> +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
> +#ifdef CONFIG_RSEQ_STATS
> +#include <linux/percpu.h>
> +
> +struct rseq_stats {
> + unsigned long exit;
> + unsigned long signal;
> + unsigned long slowpath;
> + unsigned long ids;
> + unsigned long cs;
> + unsigned long clear;
> + unsigned long fixup;
> +};
> +
> +DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
> +
> +/*
> + * Slow path has interrupts and preemption enabled, but the fast path
> + * runs with interrupts disabled so there is no point in having the
> + * preemption checks implied in __this_cpu_inc() for every operation.
> + */
> +#ifdef RSEQ_BUILD_SLOW_PATH
> +#define rseq_stat_inc(which) this_cpu_inc((which))
> +#else
> +#define rseq_stat_inc(which) raw_cpu_inc((which))
> +#endif
> +
> +#else /* CONFIG_RSEQ_STATS */
> +#define rseq_stat_inc(x) do { } while (0)
> +#endif /* !CONFIG_RSEQ_STATS */
> +
> #ifdef CONFIG_RSEQ
> #include <linux/rseq.h>
>
> @@ -41,8 +72,26 @@ static __always_inline void rseq_note_us
> current->rseq_event.user_irq = true;
> }
>
> +static __always_inline void rseq_exit_to_user_mode(void)
> +{
> + struct rseq_event *ev = &current->rseq_event;
> +
> + rseq_stat_inc(rseq_stats.exit);
> +
> + if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
> + WARN_ON_ONCE(ev->sched_switch);
> +
> + /*
> + * Ensure that event (especially user_irq) is cleared when the
> + * interrupt did not result in a schedule and therefore the
> + * rseq processing did not clear it.
> + */
> + ev->events = 0;
> +}
> +
> #else /* CONFIG_RSEQ */
> static inline void rseq_note_user_irq_entry(void) { }
> +static inline void rseq_exit_to_user_mode(void) { }
> #endif /* !CONFIG_RSEQ */
>
> #endif /* _LINUX_RSEQ_ENTRY_H */
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1883,6 +1883,18 @@ config RSEQ
>
> If unsure, say Y.
>
> +config RSEQ_STATS
> + default n
> + bool "Enable lightweight statistics of restartable sequences" if EXPERT
> + depends on RSEQ && DEBUG_FS
> + help
> + Enable lightweight counters which expose information about the
> + frequency of RSEQ operations via debugfs. Mostly interesting for
> + kernel debugging or performance analysis. While lightweight, it
> + still adds code to the user/kernel mode transitions.
> +
> + If unsure, say N.
> +
> config DEBUG_RSEQ
> default n
> bool "Enable debugging of rseq() system call" if EXPERT
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -67,12 +67,16 @@
> * F1. <failure>
> */
>
> +/* Required to select the proper per_cpu ops for rseq_stat_inc() */
> +#define RSEQ_BUILD_SLOW_PATH
> +
> +#include <linux/debugfs.h>
> +#include <linux/ratelimit.h>
> +#include <linux/rseq_entry.h>
> #include <linux/sched.h>
> -#include <linux/uaccess.h>
> #include <linux/syscalls.h>
> -#include <linux/rseq.h>
> +#include <linux/uaccess.h>
> #include <linux/types.h>
> -#include <linux/ratelimit.h>
> #include <asm/ptrace.h>
>
> #define CREATE_TRACE_POINTS
> @@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long
> }
> #endif /* CONFIG_TRACEPOINTS */
>
> +#ifdef CONFIG_RSEQ_STATS
> +DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
> +
> +static int rseq_debug_show(struct seq_file *m, void *p)
> +{
> + struct rseq_stats stats = { };
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
> + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
> + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
> + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
> + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
> + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
> + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
> + }
> +
> + seq_printf(m, "exit: %16lu\n", stats.exit);
> + seq_printf(m, "signal: %16lu\n", stats.signal);
> + seq_printf(m, "slowp: %16lu\n", stats.slowpath);
> + seq_printf(m, "ids: %16lu\n", stats.ids);
> + seq_printf(m, "cs: %16lu\n", stats.cs);
> + seq_printf(m, "clear: %16lu\n", stats.clear);
> + seq_printf(m, "fixup: %16lu\n", stats.fixup);
> + return 0;
> +}
> +
> +static int rseq_debug_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, rseq_debug_show, inode->i_private);
> +}
> +
> +static const struct file_operations dfs_ops = {
> + .open = rseq_debug_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static int __init rseq_debugfs_init(void)
> +{
> + struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
> +
> + debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
> + return 0;
> +}
> +__initcall(rseq_debugfs_init);
> +#endif /* CONFIG_RSEQ_STATS */
> +
> #ifdef CONFIG_DEBUG_RSEQ
> static struct rseq *rseq_kernel_fields(struct task_struct *t)
> {
> @@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struc
> u32 node_id = cpu_to_node(cpu_id);
> u32 mm_cid = task_mm_cid(t);
>
> - /*
> - * Validate read-only rseq fields.
> - */
> + rseq_stat_inc(rseq_stats.ids);
> +
> + /* Validate read-only rseq fields on debug kernels */
> if (rseq_validate_ro_fields(t))
> goto efault;
> WARN_ON_ONCE((int) mm_cid < 0);
> +
> if (!user_write_access_begin(rseq, t->rseq_len))
> goto efault;
>
> @@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs
> struct rseq_cs rseq_cs;
> int ret;
>
> + rseq_stat_inc(rseq_stats.cs);
> +
> ret = rseq_get_rseq_cs(t, &rseq_cs);
> if (ret)
> return ret;
> @@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs
> * If not nested over a rseq critical section, restart is useless.
> * Clear the rseq_cs pointer and return.
> */
> - if (!in_rseq_cs(ip, &rseq_cs))
> + if (!in_rseq_cs(ip, &rseq_cs)) {
> + rseq_stat_inc(rseq_stats.clear);
> return clear_rseq_cs(t->rseq);
> + }
> ret = rseq_check_flags(t, rseq_cs.flags);
> if (ret < 0)
> return ret;
> @@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs
> ret = clear_rseq_cs(t->rseq);
> if (ret)
> return ret;
> + rseq_stat_inc(rseq_stats.fixup);
> trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
> rseq_cs.abort_ip);
> instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
> @@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct
> if (unlikely(t->flags & PF_EXITING))
> return;
>
> + if (ksig)
> + rseq_stat_inc(rseq_stats.signal);
> + else
> + rseq_stat_inc(rseq_stats.slowpath);
> +
> /*
> * Read and clear the event pending bit first. If the task
> * was not preempted or migrated or a signal is on the way,
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com