[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160201074849.GA9129@gmail.com>
Date: Mon, 1 Feb 2016 08:48:49 +0100
From: Ingo Molnar <mingo@...nel.org>
To: riel@...hat.com
Cc: linux-kernel@...r.kernel.org, fweisbec@...il.com,
tglx@...utronix.de, luto@...capital.net, peterz@...radead.org,
clark@...hat.com, Arnaldo Carvalho de Melo <acme@...radead.org>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'
* Ingo Molnar <mingo@...nel.org> wrote:
> [...]
>
> I kept the process, threading and memory allocation bits of numa.c, just in case
> we need them to measure more complex syscalls. Maybe we could keep the threading
> bits and remove the memory allocation parameters, to simplify the benchmark?
So the patch below removes the NUMA details: the convergence measurement and the
memory access pattern details. This reduces the line count by about 30%. I suspect
it should be combined with the previous patch.
Thanks,
Ingo
==================>
From a992aecebe12a195ffa74e09fcbe6b48db4430e3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@...nel.org>
Date: Mon, 1 Feb 2016 08:46:39 +0100
Subject: [PATCH] perf tooling: Simplify 'perf bench syscall'
Remove NUMA legacies.
Signed-off-by: Ingo Molnar <mingo@...nel.org>
---
tools/perf/bench/syscall.c | 316 +--------------------------------------------
1 file changed, 5 insertions(+), 311 deletions(-)
diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c
index 5a4ef02176d1..fabac462bde5 100644
--- a/tools/perf/bench/syscall.c
+++ b/tools/perf/bench/syscall.c
@@ -81,11 +81,6 @@ struct params {
double mb_thread;
/* Access patterns to the working set: */
- bool data_reads;
- bool data_writes;
- bool data_backwards;
- bool data_zero_memset;
- bool data_rand_walk;
u32 nr_loops;
u32 nr_secs;
u32 sleep_usecs;
@@ -108,10 +103,6 @@ struct params {
int nr_tasks;
bool show_quiet;
- bool show_convergence;
- bool measure_convergence;
-
- int perturb_secs;
int nr_cpus;
int nr_nodes;
@@ -139,8 +130,6 @@ struct global_info {
struct thread_data *threads;
- /* Convergence latency measurement: */
- bool all_converged;
bool stop_work;
int print_once;
@@ -168,23 +157,13 @@ static const struct option options[] = {
OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"),
OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
- OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"),
- OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
- OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
- OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
- OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
-
-
OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
- OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"),
OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"),
OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
- OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
- OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
@@ -208,32 +187,6 @@ static const char * const syscall_usage[] = {
NULL
};
-static cpu_set_t bind_to_cpu(int target_cpu)
-{
- cpu_set_t orig_mask, mask;
- int ret;
-
- ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
- BUG_ON(ret);
-
- CPU_ZERO(&mask);
-
- if (target_cpu == -1) {
- int cpu;
-
- for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
- CPU_SET(cpu, &mask);
- } else {
- BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
- CPU_SET(target_cpu, &mask);
- }
-
- ret = sched_setaffinity(0, sizeof(mask), &mask);
- BUG_ON(ret);
-
- return orig_mask;
-}
-
static cpu_set_t bind_to_node(int target_node)
{
int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
@@ -699,222 +652,11 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
prctl(0, bytes_worked);
}
-#define MAX_NR_NODES 64
-
-/*
- * Count the number of nodes a process's threads
- * are spread out on.
- *
- * A count of 1 means that the process is compressed
- * to a single node. A count of g->p.nr_nodes means it's
- * spread out on the whole system.
- */
-static int count_process_nodes(int process_nr)
-{
- char node_present[MAX_NR_NODES] = { 0, };
- int nodes;
- int n, t;
-
- for (t = 0; t < g->p.nr_threads; t++) {
- struct thread_data *td;
- int task_nr;
- int node;
-
- task_nr = process_nr*g->p.nr_threads + t;
- td = g->threads + task_nr;
-
- node = numa_node_of_cpu(td->curr_cpu);
- if (node < 0) /* curr_cpu was likely still -1 */
- return 0;
-
- node_present[node] = 1;
- }
-
- nodes = 0;
-
- for (n = 0; n < MAX_NR_NODES; n++)
- nodes += node_present[n];
-
- return nodes;
-}
-
-/*
- * Count the number of distinct process-threads a node contains.
- *
- * A count of 1 means that the node contains only a single
- * process. If all nodes on the system contain at most one
- * process then we are well-converged.
- */
-static int count_node_processes(int node)
-{
- int processes = 0;
- int t, p;
-
- for (p = 0; p < g->p.nr_proc; p++) {
- for (t = 0; t < g->p.nr_threads; t++) {
- struct thread_data *td;
- int task_nr;
- int n;
-
- task_nr = p*g->p.nr_threads + t;
- td = g->threads + task_nr;
-
- n = numa_node_of_cpu(td->curr_cpu);
- if (n == node) {
- processes++;
- break;
- }
- }
- }
-
- return processes;
-}
-
-static void calc_convergence_compression(int *strong)
-{
- unsigned int nodes_min, nodes_max;
- int p;
-
- nodes_min = -1;
- nodes_max = 0;
-
- for (p = 0; p < g->p.nr_proc; p++) {
- unsigned int nodes = count_process_nodes(p);
-
- if (!nodes) {
- *strong = 0;
- return;
- }
-
- nodes_min = min(nodes, nodes_min);
- nodes_max = max(nodes, nodes_max);
- }
-
- /* Strong convergence: all threads compress on a single node: */
- if (nodes_min == 1 && nodes_max == 1) {
- *strong = 1;
- } else {
- *strong = 0;
- tprintf(" {%d-%d}", nodes_min, nodes_max);
- }
-}
-
-static void calc_convergence(double runtime_ns_max, double *convergence)
-{
- unsigned int loops_done_min, loops_done_max;
- int process_groups;
- int nodes[MAX_NR_NODES];
- int distance;
- int nr_min;
- int nr_max;
- int strong;
- int sum;
- int nr;
- int node;
- int cpu;
- int t;
-
- if (!g->p.show_convergence && !g->p.measure_convergence)
- return;
-
- for (node = 0; node < g->p.nr_nodes; node++)
- nodes[node] = 0;
-
- loops_done_min = -1;
- loops_done_max = 0;
-
- for (t = 0; t < g->p.nr_tasks; t++) {
- struct thread_data *td = g->threads + t;
- unsigned int loops_done;
-
- cpu = td->curr_cpu;
-
- /* Not all threads have written it yet: */
- if (cpu < 0)
- continue;
-
- node = numa_node_of_cpu(cpu);
-
- nodes[node]++;
-
- loops_done = td->loops_done;
- loops_done_min = min(loops_done, loops_done_min);
- loops_done_max = max(loops_done, loops_done_max);
- }
-
- nr_max = 0;
- nr_min = g->p.nr_tasks;
- sum = 0;
-
- for (node = 0; node < g->p.nr_nodes; node++) {
- nr = nodes[node];
- nr_min = min(nr, nr_min);
- nr_max = max(nr, nr_max);
- sum += nr;
- }
- BUG_ON(nr_min > nr_max);
-
- BUG_ON(sum > g->p.nr_tasks);
-
- if (0 && (sum < g->p.nr_tasks))
- return;
-
- /*
- * Count the number of distinct process groups present
- * on nodes - when we are converged this will decrease
- * to g->p.nr_proc:
- */
- process_groups = 0;
-
- for (node = 0; node < g->p.nr_nodes; node++) {
- int processes = count_node_processes(node);
-
- nr = nodes[node];
- tprintf(" %2d/%-2d", nr, processes);
-
- process_groups += processes;
- }
-
- distance = nr_max - nr_min;
-
- tprintf(" [%2d/%-2d]", distance, process_groups);
-
- tprintf(" l:%3d-%-3d (%3d)",
- loops_done_min, loops_done_max, loops_done_max-loops_done_min);
-
- if (loops_done_min && loops_done_max) {
- double skew = 1.0 - (double)loops_done_min/loops_done_max;
-
- tprintf(" [%4.1f%%]", skew * 100.0);
- }
-
- calc_convergence_compression(&strong);
-
- if (strong && process_groups == g->p.nr_proc) {
- if (!*convergence) {
- *convergence = runtime_ns_max;
- tprintf(" (%6.1fs converged)\n", *convergence/1e9);
- if (g->p.measure_convergence) {
- g->all_converged = true;
- g->stop_work = true;
- }
- }
- } else {
- if (*convergence) {
- tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
- *convergence = 0;
- }
- tprintf("\n");
- }
-}
-
-static void show_summary(double runtime_ns_max, int l, double *convergence)
+static void show_summary(double runtime_ns_max, int l)
{
tprintf("\r # %5.1f%% [%.1f mins]",
(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
- calc_convergence(runtime_ns_max, convergence);
-
if (g->p.show_details >= 0)
fflush(stdout);
}
@@ -925,11 +667,9 @@ static void *worker_thread(void *__tdata)
struct timeval start0, start, stop, diff;
int process_nr = td->process_nr;
int thread_nr = td->thread_nr;
- unsigned long last_perturbance;
int task_nr = td->task_nr;
int details = g->p.show_details;
- int first_task, last_task;
- double convergence = 0;
+ int last_task;
u64 val = td->val;
double runtime_ns_max;
u8 *global_data;
@@ -955,10 +695,6 @@ static void *worker_thread(void *__tdata)
if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
last_task = 1;
- first_task = 0;
- if (process_nr == 0 && thread_nr == 0)
- first_task = 1;
-
if (details >= 2) {
printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
process_nr, thread_nr, global_data, process_data, thread_data);
@@ -983,7 +719,6 @@ static void *worker_thread(void *__tdata)
gettimeofday(&start0, NULL);
start = stop = start0;
- last_perturbance = start.tv_sec;
for (l = 0; l < g->p.nr_loops; l++) {
start = stop;
@@ -1015,7 +750,7 @@ static void *worker_thread(void *__tdata)
update_curr_cpu(task_nr, work_done);
bytes_done += work_done;
- if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+ if (details < 0 && !g->p.nr_secs)
continue;
td->loops_done = l;
@@ -1035,37 +770,6 @@ static void *worker_thread(void *__tdata)
if (start.tv_sec == stop.tv_sec)
continue;
- /*
- * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
- * by migrating to CPU#0:
- */
- if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
- cpu_set_t orig_mask;
- int target_cpu;
- int this_cpu;
-
- last_perturbance = stop.tv_sec;
-
- /*
- * Depending on where we are running, move into
- * the other half of the system, to create some
- * real disturbance:
- */
- this_cpu = g->threads[task_nr].curr_cpu;
- if (this_cpu < g->p.nr_cpus/2)
- target_cpu = g->p.nr_cpus-1;
- else
- target_cpu = 0;
-
- orig_mask = bind_to_cpu(target_cpu);
-
- /* Here we are running on the target CPU already */
- if (details >= 1)
- printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
-
- bind_to_cpumask(orig_mask);
- }
-
if (details >= 3) {
timersub(&stop, &start, &diff);
runtime_ns_max = diff.tv_sec * 1000000000;
@@ -1084,7 +788,7 @@ static void *worker_thread(void *__tdata)
runtime_ns_max = diff.tv_sec * 1000000000ULL;
runtime_ns_max += diff.tv_usec * 1000ULL;
- show_summary(runtime_ns_max, l, &convergence);
+ show_summary(runtime_ns_max, l);
}
gettimeofday(&stop, NULL);
@@ -1226,8 +930,7 @@ static int init(void)
g->p.nr_nodes = numa_max_node() + 1;
- /* char array in count_process_nodes(): */
- BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+ BUG_ON(g->p.nr_nodes < 0);
if (g->p.show_quiet && !g->p.show_details)
g->p.show_details = -1;
@@ -1427,11 +1130,6 @@ static int __bench_syscall(const char *name)
bytes = g->bytes_done;
runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
- if (g->p.measure_convergence) {
- print_res(name, runtime_sec_max,
- "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
- }
-
print_res(name, runtime_sec_max,
"secs,", "runtime-max/thread", "secs slowest (max) thread-runtime");
@@ -1517,10 +1215,6 @@ static void init_params(struct params *p, const char *name, int argc, const char
/* Initialize nonzero defaults: */
p->serialize_startup = 1;
- p->data_reads = true;
- p->data_writes = true;
- p->data_backwards = true;
- p->data_rand_walk = true;
p->nr_loops = 10000000;
p->init_random = true;
p->mb_global_str = "1";
Powered by blists - more mailing lists