Message-Id: <20240827092013.1596-3-howardchu95@gmail.com>
Date: Tue, 27 Aug 2024 17:20:13 +0800
From: Howard Chu <howardchu95@...il.com>
To: acme@...nel.org
Cc: namhyung@...nel.org,
irogers@...gle.com,
jolsa@...nel.org,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com,
linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org,
Howard Chu <howardchu95@...il.com>
Subject: [PATCH v1 2/2] perf trace: Use pid to index perf_event in BPF
Currently, perf trace -p <PID> is broken for some syscalls. This patch
fixes it.
Before:
perf $ perf trace -e open -p 79768
? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory)
? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory)
? ( ): ... [continued]: open()) = -1 ENOENT (No such file or directory)
After:
perf $ ./perf trace -e open -p 79768
0.000 ( 0.019 ms): open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory)
1000.187 ( 0.031 ms): open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory)
2000.377 ( 0.019 ms): open(filename: "DINGZHEN", flags: WRONLY) = -1 ENOENT (No such file or directory)
This is because when using -p <PID> in perf trace, we mmap the pids
instead of the cpus. But in BPF, we tend to use a per-cpu mapped
perf_event to output the augmented data (e.g. with BPF_F_CURRENT_CPU),
which means the perf_event map is indexed by cpu.
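For reference, the per-cpu output path on the BPF side looks roughly like
this (a minimal sketch, not the exact code in augmented_raw_syscalls.bpf.c):

    /* perf_event map indexed by cpu: map[cpu] = bpf-output event fd */
    struct {
            __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
            __uint(key_size, sizeof(int));
            __uint(value_size, sizeof(__u32));
            __uint(max_entries, MAX_CPUS);
    } __augmented_syscalls__ SEC(".maps");

    /* BPF_F_CURRENT_CPU tells the helper to index the map with the current cpu */
    bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);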
When we use -p <PID>, the perf_event is opened with cpu = -1, pid = <PID>:
perf_event_map
[-1] = target_perf_event_of_this_pid
This -1 index will never work in BPF. My original solution was to map
every cpu to this single pid's perf_event, that is:
perf_event_map
[0] = target_perf_event_of_this_pid
[1] = target_perf_event_of_this_pid
[2] = target_perf_event_of_this_pid
[3] = target_perf_event_of_this_pid
But that would result in <number-of-pids> * <number-of-cpus> calls to
sys_perf_event_open().
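In user space that means opening one event per (pid, cpu) pair, roughly
(illustrative pseudo-C, not the actual tools/perf code):

    for (int t = 0; t < nr_pids; t++)
            for (int c = 0; c < nr_cpus; c++)
                    fd[t][c] = sys_perf_event_open(&attr, pids[t], c,
                                                   -1 /* group_fd */, 0 /* flags */);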
So Namhyung's solution is to introduce a new map, which I call
pid2perf_event:
pid2perf_event_map
[pid] = perf_event_index
and then:
perf_event_map
[perf_event_index] = target_perf_event_of_this_pid
We use the pid to get the correct index into the perf_event map, and then
retrieve the correct perf_event using that index.
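On the BPF side, the lookup then boils down to (simplified from the actual
change below):

    pid_t pid = bpf_get_current_pid_tgid();
    u32 *perf_event = bpf_map_lookup_elem(&pid2perf_event, &pid);

    if (perf_event)
            bpf_perf_event_output(ctx, &__augmented_syscalls__, *perf_event, data, len);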
Suggested-by: Namhyung Kim <namhyung@...nel.org>
Signed-off-by: Howard Chu <howardchu95@...il.com>
---
tools/perf/builtin-trace.c | 55 +++++++++++++++----
.../bpf_skel/augmented_raw_syscalls.bpf.c | 33 +++++++++--
tools/perf/util/evlist.c | 2 +-
3 files changed, 72 insertions(+), 18 deletions(-)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d38e0b919e8e..f9ff65c3d4d2 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3920,6 +3920,7 @@ static int trace__set_allowed_pids(struct trace *trace)
struct strlist *pids_slist = strlist__new(trace->opts.target.pid, NULL);
trace->skel->bss->task_specific = false;
+ trace->skel->bss->is_workload = false;
if (pids_slist) {
strlist__for_each_entry(pos, pids_slist) {
@@ -3944,6 +3945,7 @@ static int trace__set_allowed_pids(struct trace *trace)
return err;
trace->skel->bss->task_specific = true;
+ trace->skel->bss->is_workload = true;
}
strlist__delete(pids_slist);
@@ -4321,18 +4323,49 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
goto out_error_open;
#ifdef HAVE_BPF_SKEL
if (trace->syscalls.events.bpf_output) {
- struct perf_cpu cpu;
+ if (trace->opts.target.pid) {
+ /*
+ * The perf_event map is supposed to be a cpu to perf_event mapping, but that
+ * is not the case when -p is specified, as the event is opened with cpu = -1,
+ * pid = <PID>. In this case, we treat the perf_event map as a plain array,
+ * ignore the cpu mapping side of it, and use the pid to retrieve the index of
+ * the corresponding perf_event.
+ */
+ int j = 0;
+ struct perf_thread_map *threads;
+ struct evsel *evsel_aug_sys = evlist__find_evsel_by_str(trace->evlist, "__augmented_syscalls__");
- /*
- * Set up the __augmented_syscalls__ BPF map to hold for each
- * CPU the bpf-output event's file descriptor.
- */
- perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
- bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
- &cpu.cpu, sizeof(int),
- xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
- cpu.cpu, 0),
- sizeof(__u32), BPF_ANY);
+ if (evsel_aug_sys == NULL)
+ goto out_error;
+
+ threads = evsel_aug_sys->core.threads;
+
+ for (int thread = 0; thread < perf_thread_map__nr(threads); thread++, j++) {
+ pid_t pid = perf_thread_map__pid(threads, thread);
+
+ bpf_map__update_elem(trace->skel->maps.pid2perf_event, &pid, sizeof(pid_t),
+ &j, sizeof(int), BPF_ANY);
+
+ bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
+ &j, sizeof(int),
+ xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
+ 0, j),
+ sizeof(__u32), BPF_ANY);
+ }
+ } else {
+ struct perf_cpu cpu;
+
+ /*
+ * Set up the __augmented_syscalls__ BPF map to hold for each
+ * CPU the bpf-output event's file descriptor.
+ */
+ perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
+ bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
+ &cpu.cpu, sizeof(int),
+ xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
+ cpu.cpu, 0),
+ sizeof(__u32), BPF_ANY);
+ }
}
}
#endif
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 1ab0a56c8f35..ef8aa0bd2275 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -25,6 +25,7 @@
#define MAX_CPUS 4096
volatile bool task_specific;
+volatile bool is_workload;
/* bpf-output associated map */
struct __augmented_syscalls__ {
@@ -90,6 +91,13 @@ struct pids_allowed {
__uint(max_entries, 512);
} pids_allowed SEC(".maps");
+struct pid2perf_event {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, pid_t);
+ __type(value, int);
+ __uint(max_entries, MAX_CPUS);
+} pid2perf_event SEC(".maps");
+
/*
* Desired design of maximum size and alignment (see RFC2553)
*/
@@ -154,6 +162,11 @@ struct beauty_payload_enter_map {
__uint(max_entries, 1);
} beauty_payload_enter_map SEC(".maps");
+static pid_t getpid(void)
+{
+ return bpf_get_current_pid_tgid();
+}
+
static inline struct augmented_args_payload *augmented_args_payload(void)
{
int key = 0;
@@ -168,7 +181,20 @@ static inline int augmented__output(void *ctx, struct augmented_args_payload *ar
static inline int augmented__beauty_output(void *ctx, void *data, int len)
{
- return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+ /*
+ * When it's cpu = -1, pid = <PID>, look up the perf_event for this pid. A
+ * workload is per-cpu mapped, so no lookup is needed in that case.
+ */
+ if (task_specific && !is_workload) {
+ pid_t pid = getpid();
+ u32 *perf_event = bpf_map_lookup_elem(&pid2perf_event, &pid);
+ if (perf_event)
+ return bpf_perf_event_output(ctx, &__augmented_syscalls__, *perf_event, data, len);
+ } else {
+ return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+ }
+
+ return -1;
}
static inline
@@ -397,11 +423,6 @@ int sys_enter_nanosleep(struct syscall_enter_args *args)
return 1; /* Failure: don't filter */
}
-static pid_t getpid(void)
-{
- return bpf_get_current_pid_tgid();
-}
-
static inline bool should_filter()
{
pid_t pid = getpid();
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index f14b7e6ff1dc..ef58a7764318 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1067,7 +1067,7 @@ int evlist__create_maps(struct evlist *evlist, struct target *target)
if (!threads)
return -1;
- if (target__uses_dummy_map(target) && !evlist__has_bpf_output(evlist))
+ if (target__uses_dummy_map(target))
cpus = perf_cpu_map__new_any_cpu();
else
cpus = perf_cpu_map__new(target->cpu_list);
--
2.46.0