lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251015173548.710051410@kernel.org>
Date: Wed, 15 Oct 2025 13:32:18 -0400
From: Steven Rostedt <rostedt@...nel.org>
To: linux-kernel@...r.kernel.org,
 linux-trace-kernel@...r.kernel.org,
 linux-perf-users@...r.kernel.org
Cc: Masami Hiramatsu <mhiramat@...nel.org>,
 Mark Rutland <mark.rutland@....com>,
 Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
 Andrew Morton <akpm@...ux-foundation.org>,
 Peter Zijlstra <peterz@...radead.org>,
 Namhyung Kim <namhyung@...nel.org>,
 Takaya Saeki <takayas@...gle.com>,
 Tom Zanussi <zanussi@...nel.org>,
 Thomas Gleixner <tglx@...utronix.de>,
 Ian Rogers <irogers@...gle.com>,
 Douglas Raillard <douglas.raillard@....com>,
 Arnaldo Carvalho de Melo <acme@...nel.org>,
 Jiri Olsa <jolsa@...nel.org>,
 Adrian Hunter <adrian.hunter@...el.com>,
 Ingo Molnar <mingo@...hat.com>
Subject: [PATCH v3 04/13] perf: tracing: Have perf system calls read user space

From: Steven Rostedt <rostedt@...dmis.org>

Allow some of the system call events to read user space buffers. Instead
of just showing the pointer into user space, allow perf events to also
record the content that those pointers reference. For example, the file
name string is now shown right after the filename pointer:

  # perf record -e syscalls:sys_enter_openat ls /usr/bin
  [..]
  # perf script
      ls    1024 [005]    52.902721: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbae321c "/etc/ld.so.cache", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.902899: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae140 "/lib/x86_64-linux-gnu/libselinux.so.1", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.903471: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae690 "/lib/x86_64-linux-gnu/libcap.so.2", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.903946: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaebe0 "/lib/x86_64-linux-gnu/libc.so.6", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.904629: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaf110 "/lib/x86_64-linux-gnu/libpcre2-8.so.0", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.906985: syscalls:sys_enter_openat: dfd: 0xffffffffffffff9c, filename: 0x7fc1dba92904 "/proc/filesystems", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.907323: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dba19490 "/usr/lib/locale/locale-archive", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.907746: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x556fb888dcd0 "/usr/bin", flags: 0x00090800, mode: 0x00000000

Signed-off-by: Steven Rostedt (Google) <rostedt@...dmis.org>
---
 kernel/trace/trace_syscalls.c | 135 ++++++++++++++++++++++------------
 1 file changed, 89 insertions(+), 46 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8f3432014da4..c1dfc3208a12 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -468,6 +468,58 @@ static char *sys_fault_user(struct syscall_metadata *sys_data,
 	return buf;
 }
 
+static int
+syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
+		 char **buffer, int *size, int *user_size)
+{
+	struct syscall_user_buffer *sbuf;
+
+	/* If the syscall_buffer is NULL, tracing is being shutdown */
+	sbuf = READ_ONCE(syscall_buffer);
+	if (!sbuf)
+		return -1;
+
+	*buffer = sys_fault_user(sys_data, sbuf, args, user_size);
+	/*
+	 * user_size is the amount of data to append.
+	 * Need to add 4 for the meta field that points to
+	 * the user memory at the end of the event and also
+	 * stores its size.
+	 */
+	*size = 4 + *user_size;
+	return 0;
+}
+
+static void syscall_put_data(struct syscall_metadata *sys_data,
+			     struct syscall_trace_enter *entry,
+			     char *buffer, int size)
+{
+	void *ptr;
+	int val;
+
+	/*
+	 * Set the pointer to point to the meta data of the event
+	 * that has information about the stored user space memory.
+	 */
+	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+	/*
+	 * The meta data will store the offset of the user data from
+	 * the beginning of the event.
+	 */
+	val  = (ptr - (void *)entry) + 4;
+
+	/* Store the offset and the size into the meta data */
+	*(int *)ptr = val | (size << 16);
+
+	/* Nothing to do if the user space was empty or faulted */
+	if (size) {
+		/* Now store the user space data into the event */
+		ptr += 4;
+		memcpy(ptr, buffer, size);
+	}
+}
+
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
 	struct trace_array *tr = data;
@@ -511,21 +563,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	syscall_get_arguments(current, regs, args);
 
 	if (mayfault) {
-		struct syscall_user_buffer *sbuf;
-
-		/* If the syscall_buffer is NULL, tracing is being shutdown */
-		sbuf = READ_ONCE(syscall_buffer);
-		if (!sbuf)
+		if (syscall_get_data(sys_data, args, &user_ptr,
+				     &size, &user_size) < 0)
 			return;
-
-		user_ptr = sys_fault_user(sys_data, sbuf, args, &user_size);
-		/*
-		 * user_size is the amount of data to append.
-		 * Need to add 4 for the meta field that points to
-		 * the user memory at the end of the event and also
-		 * stores its size.
-		 */
-		size = 4 + user_size;
 	}
 
 	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
@@ -539,32 +579,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 
 	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
-	if (mayfault) {
-		void *ptr;
-		int val;
-
-		/*
-		 * Set the pointer to point to the meta data of the event
-		 * that has information about the stored user space memory.
-		 */
-		ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
-
-		/*
-		 * The meta data will store the offset of the user data from
-		 * the beginning of the event.
-		 */
-		val  = (ptr - (void *)entry) + 4;
-
-		/* Store the offset and the size into the meta data */
-		*(int *)ptr = val | (user_size << 16);
-
-		/* Nothing to do if the user space was empty or faulted */
-		if (user_size) {
-			/* Now store the user space data into the event */
-			ptr += 4;
-			memcpy(ptr, user_ptr, user_size);
-		}
-	}
+	if (mayfault)
+		syscall_put_data(sys_data, entry, user_ptr, user_size);
 
 	trace_event_buffer_commit(&fbuffer);
 }
@@ -956,9 +972,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	struct hlist_head *head;
 	unsigned long args[6];
 	bool valid_prog_array;
+	bool mayfault;
+	char *user_ptr;
 	int syscall_nr;
+	int user_size;
 	int rctx;
-	int size;
+	int size = 0;
 
 	/*
 	 * Syscall probe called with preemption enabled, but the ring
@@ -977,13 +996,24 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	syscall_get_arguments(current, regs, args);
+
+	/* Check if this syscall event faults in user space memory */
+	mayfault = sys_data->user_mask != 0;
+
+	if (mayfault) {
+		if (syscall_get_data(sys_data, args, &user_ptr,
+				     &size, &user_size) < 0)
+			return;
+	}
+
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
 	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
 	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
-	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
@@ -992,9 +1022,11 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 		return;
 
 	rec->nr = syscall_nr;
-	syscall_get_arguments(current, regs, args);
 	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
 
+	if (mayfault)
+		syscall_put_data(sys_data, rec, user_ptr, user_size);
+
 	if ((valid_prog_array &&
 	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
 	    hlist_empty(head)) {
@@ -1009,35 +1041,46 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 static int perf_sysenter_enable(struct trace_event_call *call)
 {
+	struct syscall_metadata *sys_data = call->data;
 	int ret;
 	int num;
 
-	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	num = sys_data->syscall_nr;
 
 	guard(mutex)(&syscall_trace_lock);
+	if (sys_data->user_mask) {
+		ret = syscall_fault_buffer_enable();
+		if (ret < 0)
+			return ret;
+	}
 	if (!sys_perf_refcount_enter) {
 		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
 		if (ret) {
 			pr_info("event trace: Could not activate syscall entry trace point");
+			if (sys_data->user_mask)
+				syscall_fault_buffer_disable();
 			return ret;
 		}
 	}
 	set_bit(num, enabled_perf_enter_syscalls);
 	sys_perf_refcount_enter++;
-	return ret;
+	return 0;
 }
 
 static void perf_sysenter_disable(struct trace_event_call *call)
 {
+	struct syscall_metadata *sys_data = call->data;
 	int num;
 
-	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	num = sys_data->syscall_nr;
 
 	guard(mutex)(&syscall_trace_lock);
 	sys_perf_refcount_enter--;
 	clear_bit(num, enabled_perf_enter_syscalls);
 	if (!sys_perf_refcount_enter)
 		unregister_trace_sys_enter(perf_syscall_enter, NULL);
+	if (sys_data->user_mask)
+		syscall_fault_buffer_disable();
 }
 
 static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
-- 
2.51.0



Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ