lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Tue, 28 Jul 2020 08:51:52 -0300
From:   Arnaldo Carvalho de Melo <acme@...nel.org>
To:     Ian Rogers <irogers@...gle.com>
Cc:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Mark Rutland <mark.rutland@....com>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Jiri Olsa <jolsa@...hat.com>,
        Namhyung Kim <namhyung@...nel.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        Andi Kleen <ak@...ux.intel.com>, linux-kernel@...r.kernel.org,
        Stephane Eranian <eranian@...gle.com>
Subject: Re: [PATCH] perf bench: Add benchmark of find_next_bit

Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> for_each_set_bit, or similar functions like for_each_cpu, may be hot
> within the kernel. If many bits were set then one could imagine on
> Intel a "bt" instruction with every bit may be faster than the function
> call and word length find_next_bit logic. Add a benchmark to measure
> this.

Thanks, applied.

- Arnaldo
 
> This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> option except for very small bitmaps.
> 
> Signed-off-by: Ian Rogers <irogers@...gle.com>
> ---
>  tools/perf/bench/Build            |   1 +
>  tools/perf/bench/bench.h          |   1 +
>  tools/perf/bench/find-bit-bench.c | 135 ++++++++++++++++++++++++++++++
>  tools/perf/builtin-bench.c        |   1 +
>  4 files changed, 138 insertions(+)
>  create mode 100644 tools/perf/bench/find-bit-bench.c
> 
> diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> index 768e408757a0..fb114bca3a8d 100644
> --- a/tools/perf/bench/Build
> +++ b/tools/perf/bench/Build
> @@ -10,6 +10,7 @@ perf-y += epoll-wait.o
>  perf-y += epoll-ctl.o
>  perf-y += synthesize.o
>  perf-y += kallsyms-parse.o
> +perf-y += find-bit-bench.o
>  
>  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
>  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
> diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
> index 61cae4966cae..3291b0ddddfe 100644
> --- a/tools/perf/bench/bench.h
> +++ b/tools/perf/bench/bench.h
> @@ -35,6 +35,7 @@ int bench_sched_messaging(int argc, const char **argv);
>  int bench_sched_pipe(int argc, const char **argv);
>  int bench_mem_memcpy(int argc, const char **argv);
>  int bench_mem_memset(int argc, const char **argv);
> +int bench_mem_find_bit(int argc, const char **argv);
>  int bench_futex_hash(int argc, const char **argv);
>  int bench_futex_wake(int argc, const char **argv);
>  int bench_futex_wake_parallel(int argc, const char **argv);
> diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c
> new file mode 100644
> index 000000000000..1aadbd9d7e86
> --- /dev/null
> +++ b/tools/perf/bench/find-bit-bench.c
> @@ -0,0 +1,135 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Benchmark find_next_bit and related bit operations.
> + *
> + * Copyright 2020 Google LLC.
> + */
> +#include <stdlib.h>
> +#include "bench.h"
> +#include "../util/stat.h"
> +#include <linux/bitmap.h>
> +#include <linux/bitops.h>
> +#include <linux/time64.h>
> +#include <subcmd/parse-options.h>
> +
> +static unsigned int outer_iterations = 5;
> +static unsigned int inner_iterations = 100000;
> +
> +static const struct option options[] = {
> +	OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
> +		"Number of outerer iterations used"),
> +	OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
> +		"Number of outerer iterations used"),
> +	OPT_END()
> +};
> +
> +static const char *const bench_usage[] = {
> +	"perf bench mem find_bit <options>",
> +	NULL
> +};
> +
> +static unsigned int accumulator;
> +static unsigned int use_of_val;
> +
> +static noinline void workload(int val)
> +{
> +	use_of_val += val;
> +	accumulator++;
> +}
> +
> +#if defined(__i386__) || defined(__x86_64__)
> +static bool asm_test_bit(long nr, const unsigned long *addr)
> +{
> +	bool oldbit;
> +
> +	asm volatile("bt %2,%1"
> +		     : "=@ccc" (oldbit)
> +		     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> +
> +	return oldbit;
> +}
> +#else
> +#define asm_test_bit test_bit
> +#endif
> +
> +static int do_for_each_set_bit(unsigned int num_bits)
> +{
> +	unsigned long *to_test = bitmap_alloc(num_bits);
> +	struct timeval start, end, diff;
> +	u64 runtime_us;
> +	struct stats fb_time_stats, tb_time_stats;
> +	double time_average, time_stddev;
> +	unsigned int bit, i, j;
> +	unsigned int set_bits, skip;
> +	unsigned int old;
> +
> +	init_stats(&fb_time_stats);
> +	init_stats(&tb_time_stats);
> +
> +	for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
> +		bitmap_zero(to_test, num_bits);
> +		skip = num_bits / set_bits;
> +		for (i = 0; i < num_bits; i += skip)
> +			set_bit(i, to_test);
> +
> +		for (i = 0; i < outer_iterations; i++) {
> +			old = accumulator;
> +			gettimeofday(&start, NULL);
> +			for (j = 0; j < inner_iterations; j++) {
> +				for_each_set_bit(bit, to_test, num_bits)
> +					workload(bit);
> +			}
> +			gettimeofday(&end, NULL);
> +			assert(old + (inner_iterations * set_bits) == accumulator);
> +			timersub(&end, &start, &diff);
> +			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
> +			update_stats(&fb_time_stats, runtime_us);
> +
> +			old = accumulator;
> +			gettimeofday(&start, NULL);
> +			for (j = 0; j < inner_iterations; j++) {
> +				for (bit = 0; bit < num_bits; bit++) {
> +					if (asm_test_bit(bit, to_test))
> +						workload(bit);
> +				}
> +			}
> +			gettimeofday(&end, NULL);
> +			assert(old + (inner_iterations * set_bits) == accumulator);
> +			timersub(&end, &start, &diff);
> +			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
> +			update_stats(&tb_time_stats, runtime_us);
> +		}
> +
> +		printf("%d operations %d bits set of %d bits\n",
> +			inner_iterations, set_bits, num_bits);
> +		time_average = avg_stats(&fb_time_stats);
> +		time_stddev = stddev_stats(&fb_time_stats);
> +		printf("  Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n",
> +			time_average, time_stddev);
> +		time_average = avg_stats(&tb_time_stats);
> +		time_stddev = stddev_stats(&tb_time_stats);
> +		printf("  Average test_bit loop took:    %.3f usec (+- %.3f usec)\n",
> +			time_average, time_stddev);
> +
> +		if (use_of_val == accumulator)  /* Try to avoid compiler tricks. */
> +			printf("\n");
> +	}
> +	bitmap_free(to_test);
> +	return 0;
> +}
> +
> +int bench_mem_find_bit(int argc, const char **argv)
> +{
> +	int err = 0, i;
> +
> +	argc = parse_options(argc, argv, options, bench_usage, 0);
> +	if (argc) {
> +		usage_with_options(bench_usage, options);
> +		exit(EXIT_FAILURE);
> +	}
> +
> +	for (i = 1; i <= 2048; i <<= 1)
> +		do_for_each_set_bit(i);
> +
> +	return err;
> +}
> diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
> index cad31b1d3438..690eee1120a7 100644
> --- a/tools/perf/builtin-bench.c
> +++ b/tools/perf/builtin-bench.c
> @@ -52,6 +52,7 @@ static struct bench sched_benchmarks[] = {
>  static struct bench mem_benchmarks[] = {
>  	{ "memcpy",	"Benchmark for memcpy() functions",		bench_mem_memcpy	},
>  	{ "memset",	"Benchmark for memset() functions",		bench_mem_memset	},
> +	{ "find_bit",	"Benchmark for find_bit() functions",		bench_mem_find_bit	},
>  	{ "all",	"Run all memory access benchmarks",		NULL			},
>  	{ NULL,		NULL,						NULL			}
>  };
> -- 
> 2.28.0.rc0.142.g3c755180ce-goog
> 

-- 

- Arnaldo

Powered by blists - more mailing lists