linux-kernel - [RFC PATCH 10/10] perf synthetic-events: use workqueue parallel

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <b430c981f027cb03c1c20f6f7ee4800aded7a810.1626177381.git.rickyman7@gmail.com>
Date:   Tue, 13 Jul 2021 14:11:21 +0200
From:   Riccardo Mancini <rickyman7@...il.com>
To:     Arnaldo Carvalho de Melo <acme@...nel.org>
Cc:     Ian Rogers <irogers@...gle.com>,
        Namhyung Kim <namhyung@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Mark Rutland <mark.rutland@....com>,
        Jiri Olsa <jolsa@...hat.com>, linux-kernel@...r.kernel.org,
        linux-perf-users@...r.kernel.org,
        Riccardo Mancini <rickyman7@...il.com>
Subject: [RFC PATCH 10/10] perf synthetic-events: use workqueue parallel_for

To generate synthetic events, perf has the option to use multiple
threads. These threads are created manually using pthread_created.

This patch replaces the manual pthread_create with a workqueue,
using the parallel_for utility.

Experimental results show that workqueue has a higher overhead, but
this is repayed by the improved work balancing among threads.

Results of perf bench before and after are reported below:
Command: sudo ./perf bench internals synthesize -t
Average synthesis time in usec is reported.

Laptop (dual core i7 w/ hyperthreading), avg num events ~14200:
 N    pthread (before)        workqueue (after)
 1  70714.400 +-  908.789   73306.000 +- 1597.868
 2  77426.700 +- 2986.579   46782.300 +-  326.221
 3  53176.300 +- 3405.635   41614.100 +-  239.827
 4  50760.900 +-  702.623   41071.300 +-  230.200

VM (16 vCPUs over 16 core Intel Xeon E5-2630L v3), avg num events ~2760:
 N    pthread (before)        workqueue (after)
 1  30309.500 +-  578.283   34252.000 +-  839.474
 2  23815.200 +- 1339.102   28487.200 +- 1423.481
 3  20644.300 +-  311.573   19220.200 +- 1436.024
 4  19091.500 +-  446.109   15048.600 +-  319.138
 5  17574.000 +-  988.612   14938.500 +-  411.078
 6  18908.900 +-  520.676   13997.600 +-  358.668
 7  19275.700 +-  631.989   11371.400 +-  365.038
 8  15671.200 +-  306.727   11964.800 +-  338.021
 9  14660.900 +-  333.218   11762.800 +-  652.763
10  12490.200 +-  579.211   11832.300 +-  200.601
11  18052.900 +-  941.578   13166.900 +-  704.318
12  14253.600 +-  354.332   12012.000 +-  309.724
13  12219.000 +-  516.438   12023.800 +-  273.626
14  15896.600 +-  442.419   11764.600 +-  353.961
15  15087.200 +-  337.612   11942.600 +-  304.102
16  15368.700 +-  336.785   13625.200 +-  715.125

Signed-off-by: Riccardo Mancini <rickyman7@...il.com>
---
 tools/perf/util/synthetic-events.c | 131 ++++++++++++-----------------
 1 file changed, 56 insertions(+), 75 deletions(-)

diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index 35aa0c0f7cd955b2..a55c7fa41b4f86d3 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -41,6 +41,7 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include "util/workqueue/workqueue.h"
 
 #define DEFAULT_PROC_MAP_PARSE_TIMEOUT 500
 
@@ -882,16 +883,13 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool,
 					    perf_event__handler_t process,
 					    struct machine *machine,
 					    bool mmap_data,
-					    struct dirent **dirent,
-					    int start,
-					    int num)
+					    char *d_name)
 {
 	union perf_event *comm_event, *mmap_event, *fork_event;
 	union perf_event *namespaces_event;
 	int err = -1;
 	char *end;
 	pid_t pid;
-	int i;
 
 	comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
 	if (comm_event == NULL)
@@ -911,24 +909,22 @@ static int __perf_event__synthesize_threads(struct perf_tool *tool,
 	if (namespaces_event == NULL)
 		goto out_free_fork;
 
-	for (i = start; i < start + num; i++) {
-		if (!isdigit(dirent[i]->d_name[0]))
-			continue;
+	if (!isdigit(d_name[0]))
+		goto out_free_namespaces;
 
-		pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
-		/* only interested in proper numerical dirents */
-		if (*end)
-			continue;
-		/*
-		 * We may race with exiting thread, so don't stop just because
-		 * one thread couldn't be synthesized.
-		 */
-		__event__synthesize_thread(comm_event, mmap_event, fork_event,
-					   namespaces_event, pid, 1, process,
-					   tool, machine, mmap_data);
-	}
+	pid = (pid_t)strtol(d_name, &end, 10);
+	/* only interested in proper numerical dirents */
+	if (*end)
+		goto out_free_namespaces;
+	/*
+	 * We may race with exiting thread, so don't stop just because
+	 * one thread couldn't be synthesized.
+	 */
+	__event__synthesize_thread(comm_event, mmap_event, fork_event,
+					namespaces_event, pid, 1, process,
+					tool, machine, mmap_data);
 	err = 0;
-
+out_free_namespaces:
 	free(namespaces_event);
 out_free_fork:
 	free(fork_event);
@@ -946,19 +942,15 @@ struct synthesize_threads_arg {
 	struct machine *machine;
 	bool mmap_data;
 	struct dirent **dirent;
-	int num;
-	int start;
 };
 
-static void *synthesize_threads_worker(void *arg)
+static void synthesize_threads_worker(int i, void *arg)
 {
 	struct synthesize_threads_arg *args = arg;
 
 	__perf_event__synthesize_threads(args->tool, args->process,
 					 args->machine, args->mmap_data,
-					 args->dirent,
-					 args->start, args->num);
-	return NULL;
+					 args->dirent[i]->d_name);
 }
 
 int perf_event__synthesize_threads(struct perf_tool *tool,
@@ -967,15 +959,14 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
 				   bool mmap_data,
 				   unsigned int nr_threads_synthesize)
 {
-	struct synthesize_threads_arg *args = NULL;
-	pthread_t *synthesize_threads = NULL;
+	struct synthesize_threads_arg args;
 	char proc_path[PATH_MAX];
 	struct dirent **dirent;
-	int num_per_thread;
-	int m, n, i, j;
+	int n, i;
 	int thread_nr;
-	int base = 0;
-	int err = -1;
+	int err = -1, ret;
+	struct threadpool_struct *pool;
+	struct workqueue_struct *wq;
 
 
 	if (machine__is_default_guest(machine))
@@ -992,54 +983,44 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
 		thread_nr = nr_threads_synthesize;
 
 	if (thread_nr <= 1) {
-		err = __perf_event__synthesize_threads(tool, process,
-						       machine, mmap_data,
-						       dirent, base, n);
+		for (i = 0; i < n; i++)
+			err = __perf_event__synthesize_threads(tool, process,
+								machine, mmap_data,
+								dirent[i]->d_name);
 		goto free_dirent;
 	}
-	if (thread_nr > n)
-		thread_nr = n;
 
-	synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
-	if (synthesize_threads == NULL)
+	pool = create_threadpool(thread_nr);
+	if (!pool)
 		goto free_dirent;
 
-	args = calloc(sizeof(*args), thread_nr);
-	if (args == NULL)
-		goto free_threads;
-
-	num_per_thread = n / thread_nr;
-	m = n % thread_nr;
-	for (i = 0; i < thread_nr; i++) {
-		args[i].tool = tool;
-		args[i].process = process;
-		args[i].machine = machine;
-		args[i].mmap_data = mmap_data;
-		args[i].dirent = dirent;
-	}
-	for (i = 0; i < m; i++) {
-		args[i].num = num_per_thread + 1;
-		args[i].start = i * args[i].num;
-	}
-	if (i != 0)
-		base = args[i-1].start + args[i-1].num;
-	for (j = i; j < thread_nr; j++) {
-		args[j].num = num_per_thread;
-		args[j].start = base + (j - i) * args[i].num;
-	}
-
-	for (i = 0; i < thread_nr; i++) {
-		if (pthread_create(&synthesize_threads[i], NULL,
-				   synthesize_threads_worker, &args[i]))
-			goto out_join;
-	}
-	err = 0;
-out_join:
-	for (i = 0; i < thread_nr; i++)
-		pthread_join(synthesize_threads[i], NULL);
-	free(args);
-free_threads:
-	free(synthesize_threads);
+	err = start_threadpool(pool);
+	if (err)
+		goto free_pool;
+
+	wq = create_workqueue(pool);
+	if (!wq)
+		goto stop_pool;
+
+	args.tool = tool;
+	args.process = process;
+	args.machine = machine;
+	args.mmap_data = mmap_data;
+	args.dirent = dirent;
+
+	ret = parallel_for(wq, 0, n, 1, synthesize_threads_worker, &args);
+	if (ret)
+		err = ret;
+
+	ret = destroy_workqueue(wq);
+	if (ret)
+		err = ret;
+stop_pool:
+	ret = stop_threadpool(pool);
+	if (ret)
+		err = ret;
+free_pool:
+	destroy_threadpool(pool);
 free_dirent:
 	for (i = 0; i < n; i++)
 		zfree(&dirent[i]);
-- 
2.31.1