Message-ID: <885d6f67-f0e7-43c1-a173-740586cc0da5@efficios.com>
Date: Mon, 25 Aug 2025 14:34:57 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Thomas Gleixner <tglx@...utronix.de>, LKML <linux-kernel@...r.kernel.org>
Cc: Jens Axboe <axboe@...nel.dk>, Peter Zijlstra <peterz@...radead.org>,
"Paul E. McKenney" <paulmck@...nel.org>, Boqun Feng <boqun.feng@...il.com>,
Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson
<seanjc@...gle.com>, Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>, x86@...nel.org,
Arnd Bergmann <arnd@...db.de>, Heiko Carstens <hca@...ux.ibm.com>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>, Huacai Chen <chenhuacai@...nel.org>,
Paul Walmsley <paul.walmsley@...ive.com>, Palmer Dabbelt <palmer@...belt.com>
Subject: Re: [patch V2 17/37] rseq: Expose lightweight statistics in debugfs
On 2025-08-23 12:39, Thomas Gleixner wrote:
> Analyzing the call frequency without actually using tracing is helpful for
> analysis of this infrastructure. The overhead is minimal as it just
> increments a per-CPU counter associated with each operation.
>
> The debugfs readout provides a racy sum of all counters.
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
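
For reference, a minimal userspace sketch of reading the racy snapshot
back. It assumes debugfs is mounted at the conventional /sys/kernel/debug
location, which puts the file created below at /sys/kernel/debug/rseq/stats:

/* Dump the aggregated rseq statistics (racy snapshot, summed over CPUs). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/rseq/stats", "r");
	char line[128];

	if (!f) {
		perror("fopen /sys/kernel/debug/rseq/stats");
		return 1;
	}
	/* One "name: count" line per operation, as emitted by rseq_debug_show() */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}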
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> ---
> include/linux/rseq.h | 16 ---------
> include/linux/rseq_entry.h | 49 +++++++++++++++++++++++++++
> init/Kconfig | 12 ++++++
> kernel/rseq.c | 79 +++++++++++++++++++++++++++++++++++++++++----
> 4 files changed, 133 insertions(+), 23 deletions(-)
>
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -29,21 +29,6 @@ static inline void rseq_sched_switch_eve
> }
> }
>
> -static __always_inline void rseq_exit_to_user_mode(void)
> -{
> - struct rseq_event *ev = &current->rseq_event;
> -
> - if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
> - WARN_ON_ONCE(ev->sched_switch);
> -
> - /*
> - * Ensure that event (especially user_irq) is cleared when the
> - * interrupt did not result in a schedule and therefore the
> - * rseq processing did not clear it.
> - */
> - ev->events = 0;
> -}
> -
> /*
> * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
> * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
> @@ -97,7 +82,6 @@ static inline void rseq_sched_switch_eve
> static inline void rseq_virt_userspace_exit(void) { }
> static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
> static inline void rseq_execve(struct task_struct *t) { }
> -static inline void rseq_exit_to_user_mode(void) { }
> #endif /* !CONFIG_RSEQ */
>
> #ifdef CONFIG_DEBUG_RSEQ
> --- a/include/linux/rseq_entry.h
> +++ b/include/linux/rseq_entry.h
> @@ -2,6 +2,37 @@
> #ifndef _LINUX_RSEQ_ENTRY_H
> #define _LINUX_RSEQ_ENTRY_H
>
> +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
> +#ifdef CONFIG_RSEQ_STATS
> +#include <linux/percpu.h>
> +
> +struct rseq_stats {
> + unsigned long exit;
> + unsigned long signal;
> + unsigned long slowpath;
> + unsigned long ids;
> + unsigned long cs;
> + unsigned long clear;
> + unsigned long fixup;
> +};
> +
> +DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
> +
> +/*
> + * Slow path has interrupts and preemption enabled, but the fast path
> + * runs with interrupts disabled so there is no point in having the
> + * preemption checks implied in __this_cpu_inc() for every operation.
> + */
> +#ifdef RSEQ_BUILD_SLOW_PATH
> +#define rseq_stat_inc(which) this_cpu_inc((which))
> +#else
> +#define rseq_stat_inc(which) raw_cpu_inc((which))
> +#endif
> +
> +#else /* CONFIG_RSEQ_STATS */
> +#define rseq_stat_inc(x) do { } while (0)
> +#endif /* !CONFIG_RSEQ_STATS */
> +
> #ifdef CONFIG_RSEQ
> #include <linux/rseq.h>
>
> @@ -41,8 +72,26 @@ static __always_inline void rseq_note_us
> current->rseq_event.user_irq = true;
> }
>
> +static __always_inline void rseq_exit_to_user_mode(void)
> +{
> + struct rseq_event *ev = &current->rseq_event;
> +
> + rseq_stat_inc(rseq_stats.exit);
> +
> + if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
> + WARN_ON_ONCE(ev->sched_switch);
> +
> + /*
> + * Ensure that event (especially user_irq) is cleared when the
> + * interrupt did not result in a schedule and therefore the
> + * rseq processing did not clear it.
> + */
> + ev->events = 0;
> +}
> +
> #else /* CONFIG_RSEQ */
> static inline void rseq_note_user_irq_entry(void) { }
> +static inline void rseq_exit_to_user_mode(void) { }
> #endif /* !CONFIG_RSEQ */
>
> #endif /* _LINUX_RSEQ_ENTRY_H */
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1883,6 +1883,18 @@ config RSEQ
>
> If unsure, say Y.
>
> +config RSEQ_STATS
> + default n
> + bool "Enable lightweight statistics of restartable sequences" if EXPERT
> + depends on RSEQ && DEBUG_FS
> + help
> + Enable lightweight counters which expose information about the
> + frequency of RSEQ operations via debugfs. Mostly interesting for
> + kernel debugging or performance analysis. While lightweight, it
> + still adds code to the user/kernel mode transitions.
> +
> + If unsure, say N.
> +
> config DEBUG_RSEQ
> default n
> bool "Enable debugging of rseq() system call" if EXPERT
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -67,12 +67,16 @@
> * F1. <failure>
> */
>
> +/* Required to select the proper per_cpu ops for rseq_stat_inc() */
> +#define RSEQ_BUILD_SLOW_PATH
> +
> +#include <linux/debugfs.h>
> +#include <linux/ratelimit.h>
> +#include <linux/rseq_entry.h>
> #include <linux/sched.h>
> -#include <linux/uaccess.h>
> #include <linux/syscalls.h>
> -#include <linux/rseq.h>
> +#include <linux/uaccess.h>
> #include <linux/types.h>
> -#include <linux/ratelimit.h>
> #include <asm/ptrace.h>
>
> #define CREATE_TRACE_POINTS
> @@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long
> }
> #endif /* CONFIG_TRACEPOINTS */
>
> +#ifdef CONFIG_RSEQ_STATS
> +DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
> +
> +static int rseq_debug_show(struct seq_file *m, void *p)
> +{
> + struct rseq_stats stats = { };
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
> + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
> + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
> + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
> + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
> + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
> + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
> + }
> +
> + seq_printf(m, "exit: %16lu\n", stats.exit);
> + seq_printf(m, "signal: %16lu\n", stats.signal);
> + seq_printf(m, "slowp: %16lu\n", stats.slowpath);
> + seq_printf(m, "ids: %16lu\n", stats.ids);
> + seq_printf(m, "cs: %16lu\n", stats.cs);
> + seq_printf(m, "clear: %16lu\n", stats.clear);
> + seq_printf(m, "fixup: %16lu\n", stats.fixup);
> + return 0;
> +}
> +
> +static int rseq_debug_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, rseq_debug_show, inode->i_private);
> +}
> +
> +static const struct file_operations dfs_ops = {
> + .open = rseq_debug_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static int __init rseq_debugfs_init(void)
> +{
> + struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
> +
> + debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
> + return 0;
> +}
> +__initcall(rseq_debugfs_init);
> +#endif /* CONFIG_RSEQ_STATS */
> +
> #ifdef CONFIG_DEBUG_RSEQ
> static struct rseq *rseq_kernel_fields(struct task_struct *t)
> {
> @@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struc
> u32 node_id = cpu_to_node(cpu_id);
> u32 mm_cid = task_mm_cid(t);
>
> - /*
> - * Validate read-only rseq fields.
> - */
> + rseq_stat_inc(rseq_stats.ids);
> +
> + /* Validate read-only rseq fields on debug kernels */
> if (rseq_validate_ro_fields(t))
> goto efault;
> WARN_ON_ONCE((int) mm_cid < 0);
> +
> if (!user_write_access_begin(rseq, t->rseq_len))
> goto efault;
>
> @@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs
> struct rseq_cs rseq_cs;
> int ret;
>
> + rseq_stat_inc(rseq_stats.cs);
> +
> ret = rseq_get_rseq_cs(t, &rseq_cs);
> if (ret)
> return ret;
> @@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs
> * If not nested over a rseq critical section, restart is useless.
> * Clear the rseq_cs pointer and return.
> */
> - if (!in_rseq_cs(ip, &rseq_cs))
> + if (!in_rseq_cs(ip, &rseq_cs)) {
> + rseq_stat_inc(rseq_stats.clear);
> return clear_rseq_cs(t->rseq);
> + }
> ret = rseq_check_flags(t, rseq_cs.flags);
> if (ret < 0)
> return ret;
> @@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs
> ret = clear_rseq_cs(t->rseq);
> if (ret)
> return ret;
> + rseq_stat_inc(rseq_stats.fixup);
> trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
> rseq_cs.abort_ip);
> instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
> @@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct
> if (unlikely(t->flags & PF_EXITING))
> return;
>
> + if (ksig)
> + rseq_stat_inc(rseq_stats.signal);
> + else
> + rseq_stat_inc(rseq_stats.slowpath);
> +
> /*
> * Read and clear the event pending bit first. If the task
> * was not preempted or migrated or a signal is on the way,
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com