linux-kernel - [PATCH] perf tooling: Simplify 'perf bench syscall'

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160201074849.GA9129@gmail.com>
Date:	Mon, 1 Feb 2016 08:48:49 +0100
From:	Ingo Molnar <mingo@...nel.org>
To:	riel@...hat.com
Cc:	linux-kernel@...r.kernel.org, fweisbec@...il.com,
	tglx@...utronix.de, luto@...capital.net, peterz@...radead.org,
	clark@...hat.com, Arnaldo Carvalho de Melo <acme@...radead.org>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'


* Ingo Molnar <mingo@...nel.org> wrote:

> [...]
> 
> I kept the process, threading and memory allocation bits of numa.c, just in case 
> we need them to measure more complex syscalls. Maybe we could keep the threading 
> bits and remove the memory allocation parameters, to simplify the benchmark?

So the patch below removes NUMA details: convergence measurement and memory access 
pattern details. This reduces the linecount by about 30%. Should be combined with 
the previous patch I suspect.

Thanks,

	Ingo

==================>
>From a992aecebe12a195ffa74e09fcbe6b48db4430e3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@...nel.org>
Date: Mon, 1 Feb 2016 08:46:39 +0100
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'

Remove NUMA legacies.

Signed-off-by: Ingo Molnar <mingo@...nel.org>
---
 tools/perf/bench/syscall.c | 316 +--------------------------------------------
 1 file changed, 5 insertions(+), 311 deletions(-)

diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c
index 5a4ef02176d1..fabac462bde5 100644
--- a/tools/perf/bench/syscall.c
+++ b/tools/perf/bench/syscall.c
@@ -81,11 +81,6 @@ struct params {
 	double			mb_thread;
 
 	/* Access patterns to the working set: */
-	bool			data_reads;
-	bool			data_writes;
-	bool			data_backwards;
-	bool			data_zero_memset;
-	bool			data_rand_walk;
 	u32			nr_loops;
 	u32			nr_secs;
 	u32			sleep_usecs;
@@ -108,10 +103,6 @@ struct params {
 	int			nr_tasks;
 	bool			show_quiet;
 
-	bool			show_convergence;
-	bool			measure_convergence;
-
-	int			perturb_secs;
 	int			nr_cpus;
 	int			nr_nodes;
 
@@ -139,8 +130,6 @@ struct global_info {
 
 	struct thread_data	*threads;
 
-	/* Convergence latency measurement: */
-	bool			all_converged;
 	bool			stop_work;
 
 	int			print_once;
@@ -168,23 +157,13 @@ static const struct option options[] = {
 	OPT_UINTEGER('s', "nr_secs"	, &p0.nr_secs,		"max number of seconds to run (default: 5 secs)"),
 	OPT_UINTEGER('u', "usleep"	, &p0.sleep_usecs,	"usecs to sleep per loop iteration"),
 
-	OPT_BOOLEAN('R', "data_reads"	, &p0.data_reads,	"access the data via writes (can be mixed with -W)"),
-	OPT_BOOLEAN('W', "data_writes"	, &p0.data_writes,	"access the data via writes (can be mixed with -R)"),
-	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards,	"access the data backwards as well"),
-	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
-	OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk,	"access the data with random (32bit LFSR) walk"),
-
-
 	OPT_BOOLEAN('z', "init_zero"	, &p0.init_zero,	"bzero the initial allocations"),
 	OPT_BOOLEAN('I', "init_random"	, &p0.init_random,	"randomize the contents of the initial allocations"),
 	OPT_BOOLEAN('0', "init_cpu0"	, &p0.init_cpu0,	"do the initial allocations on CPU#0"),
-	OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs,	"perturb thread 0/0 every X secs, to test convergence stability"),
 
 	OPT_INCR   ('d', "show_details"	, &p0.show_details,	"Show details"),
 	OPT_INCR   ('a', "all"		, &p0.run_all,		"Run all tests in the suite"),
 	OPT_INTEGER('H', "thp"		, &p0.thp,		"MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
-	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
-	OPT_BOOLEAN('m', "measure_convergence",	&p0.measure_convergence, "measure convergence latency"),
 	OPT_BOOLEAN('q', "quiet"	, &p0.show_quiet,	"quiet mode"),
 	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
 
@@ -208,32 +187,6 @@ static const char * const syscall_usage[] = {
 	NULL
 };
 
-static cpu_set_t bind_to_cpu(int target_cpu)
-{
-	cpu_set_t orig_mask, mask;
-	int ret;
-
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
-
-	CPU_ZERO(&mask);
-
-	if (target_cpu == -1) {
-		int cpu;
-
-		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
-	} else {
-		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
-		CPU_SET(target_cpu, &mask);
-	}
-
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
-
-	return orig_mask;
-}
-
 static cpu_set_t bind_to_node(int target_node)
 {
 	int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
@@ -699,222 +652,11 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
 	prctl(0, bytes_worked);
 }
 
-#define MAX_NR_NODES	64
-
-/*
- * Count the number of nodes a process's threads
- * are spread out on.
- *
- * A count of 1 means that the process is compressed
- * to a single node. A count of g->p.nr_nodes means it's
- * spread out on the whole system.
- */
-static int count_process_nodes(int process_nr)
-{
-	char node_present[MAX_NR_NODES] = { 0, };
-	int nodes;
-	int n, t;
-
-	for (t = 0; t < g->p.nr_threads; t++) {
-		struct thread_data *td;
-		int task_nr;
-		int node;
-
-		task_nr = process_nr*g->p.nr_threads + t;
-		td = g->threads + task_nr;
-
-		node = numa_node_of_cpu(td->curr_cpu);
-		if (node < 0) /* curr_cpu was likely still -1 */
-			return 0;
-
-		node_present[node] = 1;
-	}
-
-	nodes = 0;
-
-	for (n = 0; n < MAX_NR_NODES; n++)
-		nodes += node_present[n];
-
-	return nodes;
-}
-
-/*
- * Count the number of distinct process-threads a node contains.
- *
- * A count of 1 means that the node contains only a single
- * process. If all nodes on the system contain at most one
- * process then we are well-converged.
- */
-static int count_node_processes(int node)
-{
-	int processes = 0;
-	int t, p;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		for (t = 0; t < g->p.nr_threads; t++) {
-			struct thread_data *td;
-			int task_nr;
-			int n;
-
-			task_nr = p*g->p.nr_threads + t;
-			td = g->threads + task_nr;
-
-			n = numa_node_of_cpu(td->curr_cpu);
-			if (n == node) {
-				processes++;
-				break;
-			}
-		}
-	}
-
-	return processes;
-}
-
-static void calc_convergence_compression(int *strong)
-{
-	unsigned int nodes_min, nodes_max;
-	int p;
-
-	nodes_min = -1;
-	nodes_max =  0;
-
-	for (p = 0; p < g->p.nr_proc; p++) {
-		unsigned int nodes = count_process_nodes(p);
-
-		if (!nodes) {
-			*strong = 0;
-			return;
-		}
-
-		nodes_min = min(nodes, nodes_min);
-		nodes_max = max(nodes, nodes_max);
-	}
-
-	/* Strong convergence: all threads compress on a single node: */
-	if (nodes_min == 1 && nodes_max == 1) {
-		*strong = 1;
-	} else {
-		*strong = 0;
-		tprintf(" {%d-%d}", nodes_min, nodes_max);
-	}
-}
-
-static void calc_convergence(double runtime_ns_max, double *convergence)
-{
-	unsigned int loops_done_min, loops_done_max;
-	int process_groups;
-	int nodes[MAX_NR_NODES];
-	int distance;
-	int nr_min;
-	int nr_max;
-	int strong;
-	int sum;
-	int nr;
-	int node;
-	int cpu;
-	int t;
-
-	if (!g->p.show_convergence && !g->p.measure_convergence)
-		return;
-
-	for (node = 0; node < g->p.nr_nodes; node++)
-		nodes[node] = 0;
-
-	loops_done_min = -1;
-	loops_done_max = 0;
-
-	for (t = 0; t < g->p.nr_tasks; t++) {
-		struct thread_data *td = g->threads + t;
-		unsigned int loops_done;
-
-		cpu = td->curr_cpu;
-
-		/* Not all threads have written it yet: */
-		if (cpu < 0)
-			continue;
-
-		node = numa_node_of_cpu(cpu);
-
-		nodes[node]++;
-
-		loops_done = td->loops_done;
-		loops_done_min = min(loops_done, loops_done_min);
-		loops_done_max = max(loops_done, loops_done_max);
-	}
-
-	nr_max = 0;
-	nr_min = g->p.nr_tasks;
-	sum = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		nr = nodes[node];
-		nr_min = min(nr, nr_min);
-		nr_max = max(nr, nr_max);
-		sum += nr;
-	}
-	BUG_ON(nr_min > nr_max);
-
-	BUG_ON(sum > g->p.nr_tasks);
-
-	if (0 && (sum < g->p.nr_tasks))
-		return;
-
-	/*
-	 * Count the number of distinct process groups present
-	 * on nodes - when we are converged this will decrease
-	 * to g->p.nr_proc:
-	 */
-	process_groups = 0;
-
-	for (node = 0; node < g->p.nr_nodes; node++) {
-		int processes = count_node_processes(node);
-
-		nr = nodes[node];
-		tprintf(" %2d/%-2d", nr, processes);
-
-		process_groups += processes;
-	}
-
-	distance = nr_max - nr_min;
-
-	tprintf(" [%2d/%-2d]", distance, process_groups);
-
-	tprintf(" l:%3d-%-3d (%3d)",
-		loops_done_min, loops_done_max, loops_done_max-loops_done_min);
-
-	if (loops_done_min && loops_done_max) {
-		double skew = 1.0 - (double)loops_done_min/loops_done_max;
-
-		tprintf(" [%4.1f%%]", skew * 100.0);
-	}
-
-	calc_convergence_compression(&strong);
-
-	if (strong && process_groups == g->p.nr_proc) {
-		if (!*convergence) {
-			*convergence = runtime_ns_max;
-			tprintf(" (%6.1fs converged)\n", *convergence/1e9);
-			if (g->p.measure_convergence) {
-				g->all_converged = true;
-				g->stop_work = true;
-			}
-		}
-	} else {
-		if (*convergence) {
-			tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
-			*convergence = 0;
-		}
-		tprintf("\n");
-	}
-}
-
-static void show_summary(double runtime_ns_max, int l, double *convergence)
+static void show_summary(double runtime_ns_max, int l)
 {
 	tprintf("\r #  %5.1f%%  [%.1f mins]",
 		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
 
-	calc_convergence(runtime_ns_max, convergence);
-
 	if (g->p.show_details >= 0)
 		fflush(stdout);
 }
@@ -925,11 +667,9 @@ static void *worker_thread(void *__tdata)
 	struct timeval start0, start, stop, diff;
 	int process_nr = td->process_nr;
 	int thread_nr = td->thread_nr;
-	unsigned long last_perturbance;
 	int task_nr = td->task_nr;
 	int details = g->p.show_details;
-	int first_task, last_task;
-	double convergence = 0;
+	int last_task;
 	u64 val = td->val;
 	double runtime_ns_max;
 	u8 *global_data;
@@ -955,10 +695,6 @@ static void *worker_thread(void *__tdata)
 	if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
 		last_task = 1;
 
-	first_task = 0;
-	if (process_nr == 0 && thread_nr == 0)
-		first_task = 1;
-
 	if (details >= 2) {
 		printf("#  thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
 			process_nr, thread_nr, global_data, process_data, thread_data);
@@ -983,7 +719,6 @@ static void *worker_thread(void *__tdata)
 	gettimeofday(&start0, NULL);
 
 	start = stop = start0;
-	last_perturbance = start.tv_sec;
 
 	for (l = 0; l < g->p.nr_loops; l++) {
 		start = stop;
@@ -1015,7 +750,7 @@ static void *worker_thread(void *__tdata)
 		update_curr_cpu(task_nr, work_done);
 		bytes_done += work_done;
 
-		if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+		if (details < 0 && !g->p.nr_secs)
 			continue;
 
 		td->loops_done = l;
@@ -1035,37 +770,6 @@ static void *worker_thread(void *__tdata)
 		if (start.tv_sec == stop.tv_sec)
 			continue;
 
-		/*
-		 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
-		 * by migrating to CPU#0:
-		 */
-		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
-			cpu_set_t orig_mask;
-			int target_cpu;
-			int this_cpu;
-
-			last_perturbance = stop.tv_sec;
-
-			/*
-			 * Depending on where we are running, move into
-			 * the other half of the system, to create some
-			 * real disturbance:
-			 */
-			this_cpu = g->threads[task_nr].curr_cpu;
-			if (this_cpu < g->p.nr_cpus/2)
-				target_cpu = g->p.nr_cpus-1;
-			else
-				target_cpu = 0;
-
-			orig_mask = bind_to_cpu(target_cpu);
-
-			/* Here we are running on the target CPU already */
-			if (details >= 1)
-				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
-
-			bind_to_cpumask(orig_mask);
-		}
-
 		if (details >= 3) {
 			timersub(&stop, &start, &diff);
 			runtime_ns_max = diff.tv_sec * 1000000000;
@@ -1084,7 +788,7 @@ static void *worker_thread(void *__tdata)
 		runtime_ns_max = diff.tv_sec * 1000000000ULL;
 		runtime_ns_max += diff.tv_usec * 1000ULL;
 
-		show_summary(runtime_ns_max, l, &convergence);
+		show_summary(runtime_ns_max, l);
 	}
 
 	gettimeofday(&stop, NULL);
@@ -1226,8 +930,7 @@ static int init(void)
 
 	g->p.nr_nodes = numa_max_node() + 1;
 
-	/* char array in count_process_nodes(): */
-	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+	BUG_ON(g->p.nr_nodes < 0);
 
 	if (g->p.show_quiet && !g->p.show_details)
 		g->p.show_details = -1;
@@ -1427,11 +1130,6 @@ static int __bench_syscall(const char *name)
 	bytes = g->bytes_done;
 	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
 
-	if (g->p.measure_convergence) {
-		print_res(name, runtime_sec_max,
-			"secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
-	}
-
 	print_res(name, runtime_sec_max,
 		"secs,", "runtime-max/thread",	"secs slowest (max) thread-runtime");
 
@@ -1517,10 +1215,6 @@ static void init_params(struct params *p, const char *name, int argc, const char
 	/* Initialize nonzero defaults: */
 
 	p->serialize_startup		= 1;
-	p->data_reads			= true;
-	p->data_writes			= true;
-	p->data_backwards		= true;
-	p->data_rand_walk		= true;
 	p->nr_loops			= 10000000;
 	p->init_random			= true;
 	p->mb_global_str		= "1";