[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <175509897596.2755384.18413775753563966331.stgit@firesoul>
Date: Wed, 13 Aug 2025 17:29:35 +0200
From: Jesper Dangaard Brouer <hawk@...nel.org>
To: matt@...dmodwrite.com, bpf@...r.kernel.org,
Alexei Starovoitov <ast@...nel.org>
Cc: Jesper Dangaard Brouer <hawk@...nel.org>, mfleming@...udflare.com,
Daniel Borkmann <borkmann@...earbox.net>, netdev@...r.kernel.org,
kernel-team@...udflare.com
Subject: [PATCH RFC bpf-next] selftests/bpf: Extend bench for LPM trie with
noop and baseline
This patch extends [0] with some adjustments [1].
[0] https://lore.kernel.org/all/20250722150152.1158205-1-matt@readmodwrite.com/
[1] https://github.com/xdp-project/xdp-project/blob/main/areas/bench/patches/bench-lpm-trie-V3-adjusted.patch
The 'noop' bench measures the overhead of the harness.
Meaning the bpf_prog_test_run that calls bpf_loop with 10000
(NR_LOOPS) iterations in the lpm_producer loop.
CPU: AMD EPYC 9684X
sudo ./bench lpm-trie-noop --nr_entries=1 --producers=1 --affinity
Setting up benchmark 'lpm-trie-noop'...
Benchmark 'lpm-trie-noop' started.
Iter 0 ( 42.501us): hits 74.567M/s ( 74.567M/prod)
Iter 1 ( -5.155us): hits 74.630M/s ( 74.630M/prod)
Iter 2 ( 0.123us): hits 74.620M/s ( 74.620M/prod)
Iter 3 ( -7.127us): hits 74.611M/s ( 74.611M/prod)
Iter 4 ( 7.334us): hits 74.609M/s ( 74.609M/prod)
Iter 5 ( 0.163us): hits 74.620M/s ( 74.620M/prod)
Iter 6 ( 0.213us): hits 74.610M/s ( 74.610M/prod)
Summary: throughput 74.617 ± 0.008 M ops/s ( 74.617M ops/prod), latency 13.402 ns/op
The 'baseline' bench measures the overhead of getting a random number
and computing a modulo, which can be used as a baseline comparison
against lpm-trie-lookup and lpm-trie-update.
sudo ./bench lpm-trie-baseline --nr_entries=1 --producers=1 --affinity
Setting up benchmark 'lpm-trie-baseline'...
Benchmark 'lpm-trie-baseline' started.
Iter 0 ( 44.996us): hits 36.308M/s ( 36.308M/prod)
Iter 1 ( -1.535us): hits 36.330M/s ( 36.330M/prod)
Iter 2 ( -3.919us): hits 36.310M/s ( 36.310M/prod)
Iter 3 ( -1.004us): hits 36.330M/s ( 36.330M/prod)
Iter 4 ( -1.476us): hits 36.320M/s ( 36.320M/prod)
Iter 5 ( 0.468us): hits 36.330M/s ( 36.330M/prod)
Iter 6 ( -0.304us): hits 36.330M/s ( 36.330M/prod)
Summary: throughput 36.325 ± 0.008 M ops/s ( 36.325M ops/prod), latency 27.529 ns/op
Thus, the overhead of bpf_get_prandom_u32() plus the modulo is
14.1 nanoseconds (27.529 - 13.402 ns/op).
Signed-off-by: Jesper Dangaard Brouer <hawk@...nel.org>
---
tools/testing/selftests/bpf/bench.c | 4 ++
.../selftests/bpf/benchs/bench_lpm_trie_map.c | 40 +++++++++++++++++++-
tools/testing/selftests/bpf/progs/lpm_trie_bench.c | 31 ++++++++++++++--
3 files changed, 70 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index fd15f60fd5a8..8a41aec89479 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -560,6 +560,8 @@ extern const struct bench bench_htab_mem;
extern const struct bench bench_crypto_encrypt;
extern const struct bench bench_crypto_decrypt;
extern const struct bench bench_sockmap;
+extern const struct bench bench_lpm_trie_noop;
+extern const struct bench bench_lpm_trie_baseline;
extern const struct bench bench_lpm_trie_lookup;
extern const struct bench bench_lpm_trie_update;
extern const struct bench bench_lpm_trie_delete;
@@ -631,6 +633,8 @@ static const struct bench *benchs[] = {
&bench_crypto_encrypt,
&bench_crypto_decrypt,
&bench_sockmap,
+ &bench_lpm_trie_noop,
+ &bench_lpm_trie_baseline,
&bench_lpm_trie_lookup,
&bench_lpm_trie_update,
&bench_lpm_trie_delete,
diff --git a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c
index 32a46c2402ea..4e0f12e359ba 100644
--- a/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c
+++ b/tools/testing/selftests/bpf/benchs/bench_lpm_trie_map.c
@@ -87,7 +87,7 @@ static void __lpm_validate(void)
};
}
-enum { OP_LOOKUP = 1, OP_UPDATE, OP_DELETE, OP_FREE };
+enum { OP_NOOP=0, OP_BASELINE, OP_LOOKUP, OP_UPDATE, OP_DELETE, OP_FREE };
static void lpm_delete_validate(void)
{
@@ -175,6 +175,18 @@ static void lpm_setup(void)
fill_map(fd);
}
+static void lpm_noop_setup(void)
+{
+ __lpm_setup();
+ ctx.bench->bss->op = OP_NOOP;
+}
+
+static void lpm_baseline_setup(void)
+{
+ __lpm_setup();
+ ctx.bench->bss->op = OP_BASELINE;
+}
+
static void lpm_lookup_setup(void)
{
lpm_setup();
@@ -208,7 +220,7 @@ static void lpm_measure(struct bench_res *res)
res->duration_ns = atomic_swap(&ctx.bench->bss->duration_ns, 0);
}
-/* For LOOKUP, UPDATE, and DELETE */
+/* For NOOP, BASELINE, LOOKUP, UPDATE, and DELETE */
static void *lpm_producer(void *unused __always_unused)
{
int err;
@@ -310,6 +322,30 @@ static void free_ops_report_final(struct bench_res res[], int res_cnt)
latency / lat_divisor / env.producer_cnt, unit);
}
+/* noop bench measures harness-overhead */
+const struct bench bench_lpm_trie_noop = {
+ .name = "lpm-trie-noop",
+ .argp = &bench_lpm_trie_map_argp,
+ .validate = __lpm_validate,
+ .setup = lpm_noop_setup,
+ .producer_thread = lpm_producer,
+ .measure = lpm_measure,
+ .report_progress = ops_report_progress,
+ .report_final = ops_report_final,
+};
+
+/* baseline overhead for lookup and update */
+const struct bench bench_lpm_trie_baseline = {
+ .name = "lpm-trie-baseline",
+ .argp = &bench_lpm_trie_map_argp,
+ .validate = __lpm_validate,
+ .setup = lpm_baseline_setup,
+ .producer_thread = lpm_producer,
+ .measure = lpm_measure,
+ .report_progress = ops_report_progress,
+ .report_final = ops_report_final,
+};
+
const struct bench bench_lpm_trie_lookup = {
.name = "lpm-trie-lookup",
.argp = &bench_lpm_trie_map_argp,
diff --git a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c
index 522e1cbef490..e4a5cecd6560 100644
--- a/tools/testing/selftests/bpf/progs/lpm_trie_bench.c
+++ b/tools/testing/selftests/bpf/progs/lpm_trie_bench.c
@@ -6,6 +6,7 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include "bpf_misc.h"
+#include "bpf_atomic.h"
#define BPF_OBJ_NAME_LEN 16U
#define MAX_ENTRIES 100000000
@@ -84,12 +85,30 @@ int BPF_PROG(trie_free_exit, struct work_struct *work)
return 0;
}
-static void gen_random_key(struct trie_key *key)
+static __always_inline void gen_random_key(struct trie_key *key)
{
key->prefixlen = prefixlen;
key->data = bpf_get_prandom_u32() % nr_entries;
}
+static int noop(__u32 index, __u32 *unused)
+{
+ return 0;
+}
+
+static int baseline(__u32 index, __u32 *unused)
+{
+ struct trie_key key;
+ __s64 blackbox;
+
+ gen_random_key(&key);
+ /* Avoid compiler optimizing out the modulo */
+ barrier_var(blackbox);
+ blackbox = READ_ONCE(key.data);
+
+ return 0;
+}
+
static int lookup(__u32 index, __u32 *unused)
{
struct trie_key key;
@@ -148,13 +167,19 @@ int BPF_PROG(run_bench)
start = bpf_ktime_get_ns();
switch (op) {
+ case 0:
+ loops = bpf_loop(NR_LOOPS, noop, NULL, 0);
+ break;
case 1:
- loops = bpf_loop(NR_LOOPS, lookup, NULL, 0);
+ loops = bpf_loop(NR_LOOPS, baseline, NULL, 0);
break;
case 2:
- loops = bpf_loop(NR_LOOPS, update, NULL, 0);
+ loops = bpf_loop(NR_LOOPS, lookup, NULL, 0);
break;
case 3:
+ loops = bpf_loop(NR_LOOPS, update, NULL, 0);
+ break;
+ case 4:
loops = bpf_loop(NR_LOOPS, delete, &need_refill, 0);
break;
default:
Powered by blists - more mailing lists