[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250823161654.357235187@linutronix.de>
Date: Sat, 23 Aug 2025 18:39:48 +0200 (CEST)
From: Thomas Gleixner <tglx@...utronix.de>
To: LKML <linux-kernel@...r.kernel.org>
Cc: Jens Axboe <axboe@...nel.dk>,
Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
Peter Zijlstra <peterz@...radead.org>,
"Paul E. McKenney" <paulmck@...nel.org>,
Boqun Feng <boqun.feng@...il.com>,
Paolo Bonzini <pbonzini@...hat.com>,
Sean Christopherson <seanjc@...gle.com>,
Wei Liu <wei.liu@...nel.org>,
Dexuan Cui <decui@...rosoft.com>,
x86@...nel.org,
Arnd Bergmann <arnd@...db.de>,
Heiko Carstens <hca@...ux.ibm.com>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>,
Huacai Chen <chenhuacai@...nel.org>,
Paul Walmsley <paul.walmsley@...ive.com>,
Palmer Dabbelt <palmer@...belt.com>
Subject: [patch V2 17/37] rseq: Expose lightweight statistics in debugfs
Analyzing the call frequency without actually using tracing is helpful for
analysis of this infrastructure. The overhead is minimal as it just
increments a per-CPU counter associated with each operation.
The debugfs readout provides a racy sum of all counters.
Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
---
include/linux/rseq.h | 16 ---------
include/linux/rseq_entry.h | 49 +++++++++++++++++++++++++++
init/Kconfig | 12 ++++++
kernel/rseq.c | 79 +++++++++++++++++++++++++++++++++++++++++----
4 files changed, 133 insertions(+), 23 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -29,21 +29,6 @@ static inline void rseq_sched_switch_eve
}
}
-static __always_inline void rseq_exit_to_user_mode(void)
-{
- struct rseq_event *ev = &current->rseq_event;
-
- if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
- WARN_ON_ONCE(ev->sched_switch);
-
- /*
- * Ensure that event (especially user_irq) is cleared when the
- * interrupt did not result in a schedule and therefore the
- * rseq processing did not clear it.
- */
- ev->events = 0;
-}
-
/*
* KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
* which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
@@ -97,7 +82,6 @@ static inline void rseq_sched_switch_eve
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
-static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -2,6 +2,37 @@
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H
+/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
+#ifdef CONFIG_RSEQ_STATS
+#include <linux/percpu.h>
+
+struct rseq_stats {
+ unsigned long exit;
+ unsigned long signal;
+ unsigned long slowpath;
+ unsigned long ids;
+ unsigned long cs;
+ unsigned long clear;
+ unsigned long fixup;
+};
+
+DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
+
+/*
+ * Slow path has interrupts and preemption enabled, but the fast path
+ * runs with interrupts disabled so there is no point in having the
+ * preemption checks implied in __this_cpu_inc() for every operation.
+ */
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_stat_inc(which) this_cpu_inc((which))
+#else
+#define rseq_stat_inc(which) raw_cpu_inc((which))
+#endif
+
+#else /* CONFIG_RSEQ_STATS */
+#define rseq_stat_inc(x) do { } while (0)
+#endif /* !CONFIG_RSEQ_STATS */
+
#ifdef CONFIG_RSEQ
#include <linux/rseq.h>
@@ -41,8 +72,26 @@ static __always_inline void rseq_note_us
current->rseq_event.user_irq = true;
}
+static __always_inline void rseq_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = &current->rseq_event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
+}
+
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
+static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1883,6 +1883,18 @@ config RSEQ
If unsure, say Y.
+config RSEQ_STATS
+ default n
+ bool "Enable lightweight statistics of restartable sequences" if EXPERT
+ depends on RSEQ && DEBUG_FS
+ help
+ Enable lightweight counters which expose information about the
+ frequency of RSEQ operations via debugfs. Mostly interesting for
+ kernel debugging or performance analysis. While lightweight it's
+ still adding code into the user/kernel mode transitions.
+
+ If unsure, say N.
+
config DEBUG_RSEQ
default n
bool "Enable debugging of rseq() system call" if EXPERT
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -67,12 +67,16 @@
* F1. <failure>
*/
+/* Required to select the proper per_cpu ops for rseq_stat_inc() */
+#define RSEQ_BUILD_SLOW_PATH
+
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/rseq_entry.h>
#include <linux/sched.h>
-#include <linux/uaccess.h>
#include <linux/syscalls.h>
-#include <linux/rseq.h>
+#include <linux/uaccess.h>
#include <linux/types.h>
-#include <linux/ratelimit.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long
}
#endif /* CONFIG_TRACEPOINTS */
+#ifdef CONFIG_RSEQ_STATS
+DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
+
+static int rseq_debug_show(struct seq_file *m, void *p)
+{
+ struct rseq_stats stats = { };
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
+ stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
+ stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
+ stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
+ stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
+ stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
+ stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
+ }
+
+ seq_printf(m, "exit: %16lu\n", stats.exit);
+ seq_printf(m, "signal: %16lu\n", stats.signal);
+ seq_printf(m, "slowp: %16lu\n", stats.slowpath);
+ seq_printf(m, "ids: %16lu\n", stats.ids);
+ seq_printf(m, "cs: %16lu\n", stats.cs);
+ seq_printf(m, "clear: %16lu\n", stats.clear);
+ seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ return 0;
+}
+
+static int rseq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = rseq_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_debugfs_init(void)
+{
+ struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
+
+ debugfs_create_file("stats", 0444, root_dir, NULL, &dfs_ops);
+ return 0;
+}
+__initcall(rseq_debugfs_init);
+#endif /* CONFIG_RSEQ_STATS */
+
#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
@@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struc
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
- /*
- * Validate read-only rseq fields.
- */
+ rseq_stat_inc(rseq_stats.ids);
+
+ /* Validate read-only rseq fields on debug kernels */
if (rseq_validate_ro_fields(t))
goto efault;
WARN_ON_ONCE((int) mm_cid < 0);
+
if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
@@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs
struct rseq_cs rseq_cs;
int ret;
+ rseq_stat_inc(rseq_stats.cs);
+
ret = rseq_get_rseq_cs(t, &rseq_cs);
if (ret)
return ret;
@@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs
* If not nested over a rseq critical section, restart is useless.
* Clear the rseq_cs pointer and return.
*/
- if (!in_rseq_cs(ip, &rseq_cs))
+ if (!in_rseq_cs(ip, &rseq_cs)) {
+ rseq_stat_inc(rseq_stats.clear);
return clear_rseq_cs(t->rseq);
+ }
ret = rseq_check_flags(t, rseq_cs.flags);
if (ret < 0)
return ret;
@@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs
ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
+ rseq_stat_inc(rseq_stats.fixup);
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
rseq_cs.abort_ip);
instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
@@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct
if (unlikely(t->flags & PF_EXITING))
return;
+ if (ksig)
+ rseq_stat_inc(rseq_stats.signal);
+ else
+ rseq_stat_inc(rseq_stats.slowpath);
+
/*
* Read and clear the event pending bit first. If the task
* was not preempted or migrated or a signal is on the way,
Powered by blists - more mailing lists