Message-Id: <20250703121521.1874196-3-dongml2@chinatelecom.cn>
Date: Thu,  3 Jul 2025 20:15:05 +0800
From: Menglong Dong <menglong8.dong@...il.com>
To: alexei.starovoitov@...il.com,
	rostedt@...dmis.org,
	jolsa@...nel.org
Cc: bpf@...r.kernel.org,
	Menglong Dong <dongml2@...natelecom.cn>,
	"H. Peter Anvin" <hpa@...or.com>,
	Martin KaFai Lau <martin.lau@...ux.dev>,
	Eduard Zingerman <eddyz87@...il.com>,
	Song Liu <song@...nel.org>,
	Yonghong Song <yonghong.song@...ux.dev>,
	John Fastabend <john.fastabend@...il.com>,
	KP Singh <kpsingh@...nel.org>,
	Stanislav Fomichev <sdf@...ichev.me>,
	Hao Luo <haoluo@...gle.com>,
	linux-kernel@...r.kernel.org,
	netdev@...r.kernel.org
Subject: [PATCH bpf-next v2 02/18] x86,bpf: add bpf_global_caller for global trampoline

Implement the bpf global trampoline "bpf_global_caller" for x86_64. Thanks
to Alexei's advice, we implement most of the global trampoline in C
instead of asm.

We implement the entry of the trampoline as a "__naked" function, which
saves the regs to an array on the stack and then calls
bpf_global_caller_run(). The entry passes the address of that array and
the address of the saved rip to bpf_global_caller_run().

In bpf_global_caller_run(), we look up the metadata by the function ip.
In the origin-call case, we call kfunc_md_enter() to protect the metadata,
which is similar to __bpf_tramp_enter(). Then we run all the BPF progs,
just like a BPF trampoline does.

Without an origin call, bpf_global_caller_run() returns 0 and the entry
restores the regs and returns. In the origin-call case, it returns 1 and
the entry makes RSP skip the saved rip before returning.
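
For reference while reading the asm below, here is a rough sketch of the
stack frame the entry builds. The struct and field names are purely
illustrative; the real code addresses the frame through the FUNC_ARGS_*
offsets, and the return-value slot is args[nr_args], which only coincides
with the slot right below the rip for the 6-argument caller:

	/* illustrative only, lowest address first */
	struct gcaller_frame_sketch {
		unsigned long origin_ip;  /* *(args - 2), for bpf_get_func_ip() */
		unsigned long nr_args;    /* *(args - 1) */
		unsigned long args[6];    /* saved rdi, rsi, rdx, rcx, r8, r9 */
		unsigned long retval;     /* args[nr_args], shown for nr_args == 6 */
		unsigned long rip;        /* return address into the traced function */
	};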

In the FENTRY case, the performance of the global trampoline is ~10%
slower than the BPF trampoline. The global trampoline is optimized by
inlining some function calls, such as __bpf_prog_enter_recur and
__bpf_prog_exit_recur. However, bpf_global_caller still uses more
conditions, branches and memory reads.

In the FEXIT and MODIFY_RETURN cases, the performance of the global
trampoline is the same as (or even better than) the BPF trampoline. This
makes sense, as the calls to __bpf_tramp_enter and __bpf_tramp_exit are
also inlined in bpf_global_caller.

In fact, we can optimize bpf_global_caller further. For example, we can
define more bpf_global_caller_xx_run() functions and make the
"if (prog->sleepable)" and "if (do_origin_call)" conditions fixed at
compile time. That can be done in a follow-up series. After such
optimization, I believe the performance of FENTRY_MULTI can get close or
equal to FENTRY, and for the FEXIT/MODIFY_RETURN cases it can be even
better.
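
As a rough illustration of that idea (the names below are hypothetical and
not part of this patch), the specialization boils down to passing the two
conditions as compile-time constants so the compiler can drop the untaken
branches in each instantiation:

	/* hypothetical sketch, not part of this patch */
	static __always_inline notrace int
	__bpf_global_caller_run(unsigned long *args, unsigned long *ip,
				int nr_args, bool sleepable, bool do_origin_call)
	{
		/*
		 * Same body as bpf_global_caller_run(), except that the
		 * "if (prog->sleepable)" and "if (do_origin_call)" tests
		 * use these constant parameters, so each instantiation
		 * keeps only the branches it actually needs.
		 */
		return do_origin_call ? 1 : 0;
	}

	/* one thin wrapper per combination, for example: */
	static __always_used notrace int
	bpf_global_caller_2_nosleep_run(unsigned long *args, unsigned long *ip)
	{
		return __bpf_global_caller_run(args, ip, 2, false, false);
	}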

Signed-off-by: Menglong Dong <dongml2@...natelecom.cn>
---
v2:
- rewrite the global trampoline with C instead of asm
---
 arch/x86/Kconfig            |   4 +
 arch/x86/net/bpf_jit_comp.c | 268 ++++++++++++++++++++++++++++++++++++
 include/linux/bpf_tramp.h   |  72 ++++++++++
 kernel/bpf/trampoline.c     |  23 +---
 4 files changed, 346 insertions(+), 21 deletions(-)
 create mode 100644 include/linux/bpf_tramp.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71019b3b54ea..96962c61419a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -155,6 +155,7 @@ config X86
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+	select ARCH_HAS_BPF_GLOBAL_CALLER	if X86_64
 	select BUILDTIME_TABLE_SORT
 	select CLKEVT_I8253
 	select CLOCKSOURCE_WATCHDOG
@@ -432,6 +433,9 @@ config PGTABLE_LEVELS
 	default 3 if X86_PAE
 	default 2
 
+config ARCH_HAS_BPF_GLOBAL_CALLER
+	bool
+
 menu "Processor type and features"
 
 config SMP
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..8d2fc436a748 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,6 +11,8 @@
 #include <linux/bpf.h>
 #include <linux/memory.h>
 #include <linux/sort.h>
+#include <linux/bpf_tramp.h>
+#include <linux/kfunc_md.h>
 #include <asm/extable.h>
 #include <asm/ftrace.h>
 #include <asm/set_memory.h>
@@ -3413,6 +3415,272 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 	return ret;
 }
 
+#define FUNC_ARGS_0		((2 - 1) * 8)
+#define FUNC_ARGS_1		((2 + 0) * 8)
+#define FUNC_ARGS_2		((2 + 1) * 8)
+#define FUNC_ARGS_3		((2 + 2) * 8)
+#define FUNC_ARGS_4		((2 + 3) * 8)
+#define FUNC_ARGS_5		((2 + 4) * 8)
+#define FUNC_ARGS_6		((2 + 5) * 8)
+
+#define SAVE_ARGS_0
+#define SAVE_ARGS_1						\
+	"movq %rdi, " __stringify(FUNC_ARGS_1) "(%rsp)\n"
+#define SAVE_ARGS_2 SAVE_ARGS_1					\
+	"movq %rsi, " __stringify(FUNC_ARGS_2) "(%rsp)\n"
+#define SAVE_ARGS_3 SAVE_ARGS_2					\
+	"movq %rdx, " __stringify(FUNC_ARGS_3) "(%rsp)\n"
+#define SAVE_ARGS_4 SAVE_ARGS_3					\
+	"movq %rcx, " __stringify(FUNC_ARGS_4) "(%rsp)\n"
+#define SAVE_ARGS_5 SAVE_ARGS_4					\
+	"movq %r8, " __stringify(FUNC_ARGS_5) "(%rsp)\n"
+#define SAVE_ARGS_6 SAVE_ARGS_5					\
+	"movq %r9, " __stringify(FUNC_ARGS_6) "(%rsp)\n"	\
+
+#define RESTORE_ARGS_0
+#define RESTORE_ARGS_1						\
+	"movq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n"
+#define RESTORE_ARGS_2 RESTORE_ARGS_1				\
+	"movq " __stringify(FUNC_ARGS_2) "(%rsp), %rsi\n"
+#define RESTORE_ARGS_3 RESTORE_ARGS_2				\
+	"movq " __stringify(FUNC_ARGS_3) "(%rsp), %rdx\n"
+#define RESTORE_ARGS_4 RESTORE_ARGS_3				\
+	"movq " __stringify(FUNC_ARGS_4) "(%rsp), %rcx\n"
+#define RESTORE_ARGS_5 RESTORE_ARGS_4				\
+	"movq " __stringify(FUNC_ARGS_5) "(%rsp), %r8\n"
+#define RESTORE_ARGS_6 RESTORE_ARGS_5				\
+	"movq " __stringify(FUNC_ARGS_6) "(%rsp), %r9\n"
+
+#define RESTORE_ORIGIN_0
+#define RESTORE_ORIGIN_1						\
+	"movq " __stringify(FUNC_ARGS_1 - FUNC_ARGS_1) "(%[args]), %%rdi\n"
+#define RESTORE_ORIGIN_2 RESTORE_ORIGIN_1				\
+	"movq " __stringify(FUNC_ARGS_2 - FUNC_ARGS_1) "(%[args]), %%rsi\n"
+#define RESTORE_ORIGIN_3 RESTORE_ORIGIN_2				\
+	"movq " __stringify(FUNC_ARGS_3 - FUNC_ARGS_1) "(%[args]), %%rdx\n"
+#define RESTORE_ORIGIN_4 RESTORE_ORIGIN_3				\
+	"movq " __stringify(FUNC_ARGS_4 - FUNC_ARGS_1) "(%[args]), %%rcx\n"
+#define RESTORE_ORIGIN_5 RESTORE_ORIGIN_4				\
+	"movq " __stringify(FUNC_ARGS_5 - FUNC_ARGS_1) "(%[args]), %%r8\n"
+#define RESTORE_ORIGIN_6 RESTORE_ORIGIN_5				\
+	"movq " __stringify(FUNC_ARGS_6 - FUNC_ARGS_1) "(%[args]), %%r9\n"
+
+static __always_inline void
+do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
+{
+	/* The following code is optimized by the compiler: nr_args is a
+	 * compile-time constant, so only the matching branch survives and
+	 * no runtime condition is left here.
+	 */
+	if (nr_args == 0) {
+		asm volatile(
+			RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			:
+		);
+	} else if (nr_args == 1) {
+		asm volatile(
+			RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi"
+		);
+	} else if (nr_args == 2) {
+		asm volatile(
+			RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi"
+		);
+	} else if (nr_args == 3) {
+		asm volatile(
+			RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx"
+		);
+	} else if (nr_args == 4) {
+		asm volatile(
+			RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx", "rcx"
+		);
+	} else if (nr_args == 5) {
+		asm volatile(
+			RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx", "rcx", "r8"
+		);
+	} else if (nr_args == 6) {
+		asm volatile(
+			RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx", "rcx", "r8", "r9"
+		);
+	}
+}
+
+static __always_inline notrace void
+run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
+	       struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
+{
+	struct bpf_prog *prog;
+	u64 start_time;
+
+	while (tramp_prog) {
+		prog = tramp_prog->prog;
+		run_ctx->bpf_cookie = tramp_prog->cookie;
+		start_time = bpf_gtramp_enter(prog, run_ctx);
+
+		if (likely(start_time)) {
+			asm volatile(
+				CALL_NOSPEC "\n"
+				: : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
+			);
+		}
+
+		bpf_gtramp_exit(prog, start_time, run_ctx);
+		tramp_prog = tramp_prog->next;
+	}
+}
+
+static __always_inline notrace int
+bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
+{
+	unsigned long origin_ip = (*ip) & 0xfffffffffffffff0; /* align down to 16 bytes */
+	struct kfunc_md_tramp_prog *tramp_prog;
+	struct bpf_tramp_run_ctx run_ctx;
+	struct kfunc_md *md;
+	bool do_origin;
+
+	rcu_read_lock();
+	md = kfunc_md_get_rcu(origin_ip);
+	do_origin = md->bpf_origin_call;
+	if (do_origin)
+		kfunc_md_enter(md);
+	rcu_read_unlock();
+
+	/* save the origin function ip for bpf_get_func_ip() */
+	*(args - 2) = origin_ip;
+	*(args - 1) = nr_args;
+
+	run_tramp_prog(md->bpf_progs[BPF_TRAMP_FENTRY], &run_ctx, args);
+
+	/* no fexit or modify_return, return directly */
+	if (!do_origin)
+		return 0;
+
+	/* modify return case */
+	tramp_prog = md->bpf_progs[BPF_TRAMP_MODIFY_RETURN];
+	/* initialize return value */
+	args[nr_args] = 0;
+	while (tramp_prog) {
+		struct bpf_prog *prog;
+		u64 start_time, ret;
+
+		prog = tramp_prog->prog;
+		run_ctx.bpf_cookie = tramp_prog->cookie;
+		start_time = bpf_gtramp_enter(prog, &run_ctx);
+
+		if (likely(start_time)) {
+			asm volatile(
+				CALL_NOSPEC "\n"
+				: "=a"(ret), ASM_CALL_CONSTRAINT
+				: [thunk_target]"r"(prog->bpf_func),
+				  [args]"D"(args)
+			);
+			args[nr_args] = ret;
+		} else {
+			ret = 0;
+		}
+
+		bpf_gtramp_exit(prog, start_time, &run_ctx);
+		if (ret)
+			goto do_fexit;
+		tramp_prog = tramp_prog->next;
+	}
+
+	/* restore the function arguments and call the origin function */
+	do_origin_call(args, ip, nr_args);
+do_fexit:
+	run_tramp_prog(md->bpf_progs[BPF_TRAMP_FEXIT], &run_ctx, args);
+	kfunc_md_exit(md);
+	return 1;
+}
+
+/* Layout of the stack frame:
+ *   rip		----> 8 bytes
+ *   return value	----> 8 bytes
+ *   args		----> 8 * 6 bytes
+ *   arg count		----> 8 bytes
+ *   origin ip		----> 8 bytes
+ */
+#define stack_size __stringify(8 + 8 + 6 * 8 + 8)
+
+#define CALLER_DEFINE(name, nr_args)					\
+static __always_used __no_stack_protector notrace int			\
+name##_run(unsigned long *args, unsigned long *ip)			\
+{									\
+	return bpf_global_caller_run(args, ip, nr_args);		\
+}									\
+static __naked void name(void)						\
+{									\
+	asm volatile(							\
+		"subq $" stack_size ", %rsp\n"				\
+		SAVE_ARGS_##nr_args					\
+	);								\
+									\
+	asm volatile(							\
+		"leaq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n"	\
+		"leaq " stack_size "(%rsp), %rsi\n"			\
+		"call " #name "_run\n"					\
+		"test %rax, %rax\n"					\
+		"jne 1f\n"						\
+	);								\
+									\
+	asm volatile(							\
+		RESTORE_ARGS_##nr_args					\
+		"addq $" stack_size ", %rsp\n"				\
+		ASM_RET							\
+	);								\
+									\
+	asm volatile(							\
+		"1:\n"							\
+		"movq " __stringify(FUNC_ARGS_##nr_args + 8)		\
+		"(%rsp), %rax\n"					\
+		"addq $(" stack_size " + 8), %rsp\n"			\
+		ASM_RET);						\
+}									\
+STACK_FRAME_NON_STANDARD(name)
+
+CALLER_DEFINE(bpf_global_caller_0, 0);
+CALLER_DEFINE(bpf_global_caller_1, 1);
+CALLER_DEFINE(bpf_global_caller_2, 2);
+CALLER_DEFINE(bpf_global_caller_3, 3);
+CALLER_DEFINE(bpf_global_caller_4, 4);
+CALLER_DEFINE(bpf_global_caller_5, 5);
+CALLER_DEFINE(bpf_global_caller_6, 6);
+
+void *bpf_global_caller_array[MAX_BPF_FUNC_ARGS + 1] = {
+	bpf_global_caller_0,
+	bpf_global_caller_1,
+	bpf_global_caller_2,
+	bpf_global_caller_3,
+	bpf_global_caller_4,
+	bpf_global_caller_5,
+	bpf_global_caller_6,
+};
+
 static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
 {
 	u8 *jg_reloc, *prog = *pprog;
diff --git a/include/linux/bpf_tramp.h b/include/linux/bpf_tramp.h
new file mode 100644
index 000000000000..32447fcfc017
--- /dev/null
+++ b/include/linux/bpf_tramp.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __LINUX_BPF_TRAMP_H__
+#define __LINUX_BPF_TRAMP_H__
+#ifdef CONFIG_BPF_JIT
+#include <linux/filter.h>
+
+#ifdef CONFIG_ARCH_HAS_BPF_GLOBAL_CALLER
+extern void *bpf_global_caller_array[MAX_BPF_FUNC_ARGS + 1];
+#endif
+
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start);
+
+#define NO_START_TIME 1
+static __always_inline u64 notrace bpf_prog_start_time(void)
+{
+	u64 start = NO_START_TIME;
+
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+		start = sched_clock();
+		if (unlikely(!start))
+			start = NO_START_TIME;
+	}
+	return start;
+}
+
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+						      u64 start)
+{
+	if (static_branch_unlikely(&bpf_stats_enabled_key))
+		__update_prog_stats(prog, start);
+}
+
+static __always_inline u64 notrace
+bpf_gtramp_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+	__acquires(RCU)
+{
+	if (unlikely(prog->sleepable)) {
+		rcu_read_lock_trace();
+		might_fault();
+	} else {
+		rcu_read_lock();
+	}
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+		bpf_prog_inc_misses_counter(prog);
+		if (prog->aux->recursion_detected)
+			prog->aux->recursion_detected(prog);
+		return 0;
+	}
+	return bpf_prog_start_time();
+}
+
+static __always_inline void notrace
+bpf_gtramp_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+	__releases(RCU)
+{
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+	update_prog_stats(prog, start);
+	this_cpu_dec(*(prog->active));
+	migrate_enable();
+	if (unlikely(prog->sleepable))
+		rcu_read_unlock_trace();
+	else
+		rcu_read_unlock();
+}
+
+#endif
+#endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b1e358c16eeb..fa90c225c93b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
 #include <linux/delay.h>
+#include <linux/bpf_tramp.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -868,19 +869,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 	mutex_unlock(&trampoline_mutex);
 }
 
-#define NO_START_TIME 1
-static __always_inline u64 notrace bpf_prog_start_time(void)
-{
-	u64 start = NO_START_TIME;
-
-	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
-		start = sched_clock();
-		if (unlikely(!start))
-			start = NO_START_TIME;
-	}
-	return start;
-}
-
 /* The logic is similar to bpf_prog_run(), but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
@@ -911,7 +899,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
 	return bpf_prog_start_time();
 }
 
-static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
 {
 	struct bpf_prog_stats *stats;
 	unsigned long flags;
@@ -932,13 +920,6 @@ static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
 	u64_stats_update_end_irqrestore(&stats->syncp, flags);
 }
 
-static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
-						      u64 start)
-{
-	if (static_branch_unlikely(&bpf_stats_enabled_key))
-		__update_prog_stats(prog, start);
-}
-
 static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
 					  struct bpf_tramp_run_ctx *run_ctx)
 	__releases(RCU)
-- 
2.39.5

