Message-ID: <5967F93A-235B-447E-9B70-E7768998B718@fb.com>
Date: Thu, 7 Nov 2019 22:37:19 +0000
From: Song Liu <songliubraving@...com>
To: Alexei Starovoitov <ast@...nel.org>
CC: "davem@...emloft.net" <davem@...emloft.net>,
"daniel@...earbox.net" <daniel@...earbox.net>,
"x86@...nel.org" <x86@...nel.org>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"bpf@...r.kernel.org" <bpf@...r.kernel.org>,
Kernel Team <Kernel-team@...com>
Subject: Re: [PATCH v2 bpf-next 03/17] bpf: Introduce BPF trampoline
> On Nov 6, 2019, at 9:46 PM, Alexei Starovoitov <ast@...nel.org> wrote:
>
[...]
>
> Signed-off-by: Alexei Starovoitov <ast@...nel.org>
> Acked-by: Andrii Nakryiko <andriin@...com>
> ---
> arch/x86/net/bpf_jit_comp.c | 227 ++++++++++++++++++++++++++++++--
> include/linux/bpf.h | 98 ++++++++++++++
> include/uapi/linux/bpf.h | 2 +
> kernel/bpf/Makefile | 1 +
> kernel/bpf/btf.c | 77 ++++++++++-
> kernel/bpf/core.c | 1 +
> kernel/bpf/syscall.c | 53 +++++++-
> kernel/bpf/trampoline.c | 252 ++++++++++++++++++++++++++++++++++++
> kernel/bpf/verifier.c | 39 ++++++
> 9 files changed, 732 insertions(+), 18 deletions(-)
> create mode 100644 kernel/bpf/trampoline.c
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 8631d3bd637f..44169e8bffc0 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -98,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
>
> /* Pick a register outside of BPF range for JIT internal work */
> #define AUX_REG (MAX_BPF_JIT_REG + 1)
> +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
>
> /*
> * The following table maps BPF registers to x86-64 registers.
> @@ -123,6 +124,7 @@ static const int reg2hex[] = {
> [BPF_REG_FP] = 5, /* RBP readonly */
> [BPF_REG_AX] = 2, /* R10 temp register */
> [AUX_REG] = 3, /* R11 temp register */
> + [X86_REG_R9] = 1, /* R9 register, 6th function argument */
We should update the comment above this table, which still says:
* Also x86-64 register R9 is unused. ...
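Maybe something along these lines (wording is just a suggestion):

	 * x86-64 register R9 is no longer unused: the trampoline code uses it
	 * to pass the 6th function argument when saving/restoring arguments.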
> };
>
> static const int reg2pt_regs[] = {
> @@ -150,6 +152,7 @@ static bool is_ereg(u32 reg)
> BIT(BPF_REG_7) |
> BIT(BPF_REG_8) |
> BIT(BPF_REG_9) |
> + BIT(X86_REG_R9) |
> BIT(BPF_REG_AX));
> }
>
> @@ -492,8 +495,8 @@ static int emit_call(u8 **pprog, void *func, void *ip)
> int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> void *old_addr, void *new_addr)
> {
> - u8 old_insn[NOP_ATOMIC5] = {};
> - u8 new_insn[NOP_ATOMIC5] = {};
> + u8 old_insn[X86_CALL_SIZE] = {};
> + u8 new_insn[X86_CALL_SIZE] = {};
> u8 *prog;
> int ret;
>
> @@ -507,9 +510,9 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> if (ret)
> return ret;
> }
> - if (old_addr) {
> + if (new_addr) {
> prog = new_insn;
> - ret = emit_call(&prog, old_addr, (void *)ip);
> + ret = emit_call(&prog, new_addr, (void *)ip);
> if (ret)
> return ret;
> }
> @@ -517,19 +520,19 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
> mutex_lock(&text_mutex);
> switch (t) {
> case BPF_MOD_NOP_TO_CALL:
> - if (memcmp(ip, ideal_nops, 5))
> + if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
> goto out;
> - text_poke(ip, new_insn, 5);
> + text_poke(ip, new_insn, X86_CALL_SIZE);
> break;
> case BPF_MOD_CALL_TO_CALL:
> - if (memcmp(ip, old_insn, 5))
> + if (memcmp(ip, old_insn, X86_CALL_SIZE))
> goto out;
> - text_poke(ip, new_insn, 5);
> + text_poke(ip, new_insn, X86_CALL_SIZE);
> break;
> case BPF_MOD_CALL_TO_NOP:
> - if (memcmp(ip, old_insn, 5))
> + if (memcmp(ip, old_insn, X86_CALL_SIZE))
> goto out;
> - text_poke(ip, ideal_nops, 5);
> + text_poke(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE);
> break;
> }
> ret = 0;
> @@ -1234,6 +1237,210 @@ xadd: if (is_imm8(insn->off))
> return proglen;
> }
>
> +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
> + int stack_size)
> +{
> + int i;
> + /* Store function arguments to stack.
> + * For a function that accepts two pointers the sequence will be:
> + * mov QWORD PTR [rbp-0x10],rdi
> + * mov QWORD PTR [rbp-0x8],rsi
> + */
> + for (i = 0; i < min(nr_args, 6); i++)
> + emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
> + BPF_REG_FP,
> + i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
> + -(stack_size - i * 8));
> +}
> +
> +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
> + int stack_size)
> +{
> + int i;
> +
> + /* Restore function arguments from stack.
> + * For a function that accepts two pointers the sequence will be:
> + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
> + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
> + */
> + for (i = 0; i < min(nr_args, 6); i++)
> + emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
> + i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
> + BPF_REG_FP,
> + -(stack_size - i * 8));
> +}
> +
> +static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
> + struct bpf_prog **progs, int prog_cnt, int stack_size)
> +{
> + u8 *prog = *pprog;
> + int cnt = 0, i;
> +
> + for (i = 0; i < prog_cnt; i++) {
> + if (emit_call(&prog, __bpf_prog_enter, prog))
> + return -EINVAL;
> + /* remember prog start time returned by __bpf_prog_enter */
> + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
> +
> + /* arg1: lea rdi, [rbp - stack_size] */
> + EMIT4(0x48, 0x8D, 0x7D, -stack_size);
> + /* arg2: progs[i]->insnsi for interpreter */
> + if (!progs[i]->jited)
> + emit_mov_imm64(&prog, BPF_REG_2,
> + (long) progs[i]->insnsi >> 32,
> + (u32) (long) progs[i]->insnsi);
> + /* call JITed bpf program or interpreter */
> + if (emit_call(&prog, progs[i]->bpf_func, prog))
> + return -EINVAL;
> +
> + /* arg1: mov rdi, progs[i] */
> + emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32,
> + (u32) (long) progs[i]);
> + /* arg2: mov rsi, rbx <- start time in nsec */
> + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
> + if (emit_call(&prog, __bpf_prog_exit, prog))
> + return -EINVAL;
> + }
> + *pprog = prog;
> + return 0;
> +}
> +
> +/* Example:
> + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
> + * its 'struct btf_func_model' will be nr_args=2
> + * The assembly code when eth_type_trans is executing after trampoline:
> + *
> + * push rbp
> + * mov rbp, rsp
> + * sub rsp, 16 // space for skb and dev
> + * push rbx // temp regs to pass start time
> + * mov qword ptr [rbp - 16], rdi // save skb pointer to stack
> + * mov qword ptr [rbp - 8], rsi // save dev pointer to stack
> + * call __bpf_prog_enter // rcu_read_lock and preempt_disable
> + * mov rbx, rax // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 16] // R1==ctx of bpf prog
> + * call addr_of_jited_FENTRY_prog
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
> + * mov rsi, rbx // prog start time
> + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
> + * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack
> + * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack
> + * pop rbx
> + * leave
> + * ret
> + *
> + * eth_type_trans has a 5 byte nop at the beginning. These 5 bytes will be
> + * replaced with 'call generated_bpf_trampoline'. When it returns
> + * eth_type_trans will continue executing with original skb and dev pointers.
> + *
> + * The assembly code when eth_type_trans is called from trampoline:
> + *
> + * push rbp
> + * mov rbp, rsp
> + * sub rsp, 24 // space for skb, dev, return value
> + * push rbx // temp regs to pass start time
> + * mov qword ptr [rbp - 24], rdi // save skb pointer to stack
> + * mov qword ptr [rbp - 16], rsi // save dev pointer to stack
> + * call __bpf_prog_enter // rcu_read_lock and preempt_disable
> + * mov rbx, rax // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 24] // R1==ctx of bpf prog
> + * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
> + * mov rsi, rbx // prog start time
> + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
> + * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack
> + * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack
> + * call eth_type_trans+5 // execute body of eth_type_trans
> + * mov qword ptr [rbp - 8], rax // save return value
> + * call __bpf_prog_enter // rcu_read_lock and preempt_disable
> + * mov rbx, rax // remember start time if bpf stats are enabled
> + * lea rdi, [rbp - 24] // R1==ctx of bpf prog
> + * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value
> + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off
> + * mov rsi, rbx // prog start time
> + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math
> + * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value
> + * pop rbx
> + * leave
> + * add rsp, 8 // skip eth_type_trans's frame
> + * ret // return to its caller
> + */
> +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> + struct bpf_prog **fentry_progs, int fentry_cnt,
> + struct bpf_prog **fexit_progs, int fexit_cnt,
> + void *orig_call)
> +{
> + int cnt = 0, nr_args = m->nr_args;
> + int stack_size = nr_args * 8;
> + u8 *prog;
> +
> + /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
> + if (nr_args > 6)
> + return -ENOTSUPP;
> +
> + if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
> + (flags & BPF_TRAMP_F_SKIP_FRAME))
> + return -EINVAL;
> +
> + if (flags & BPF_TRAMP_F_CALL_ORIG)
> + stack_size += 8; /* room for return value of orig_call */
> +
> + if (flags & BPF_TRAMP_F_SKIP_FRAME)
> + /* skip patched call instruction and point orig_call to actual
> + * body of the kernel function.
> + */
> + orig_call += X86_CALL_SIZE;
> +
> + prog = image;
> +
> + EMIT1(0x55); /* push rbp */
> + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
> + EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
> + EMIT1(0x53); /* push rbx */
> +
> + save_regs(m, &prog, nr_args, stack_size);
> +
> + if (fentry_cnt)
> + if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size))
> + return -EINVAL;
> +
> + if (flags & BPF_TRAMP_F_CALL_ORIG) {
> + if (fentry_cnt)
> + restore_regs(m, &prog, nr_args, stack_size);
> +
> + /* call original function */
> + if (emit_call(&prog, orig_call, prog))
> + return -EINVAL;
> + /* remember return value on the stack for bpf prog to access */
> + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
> + }
> +
> + if (fexit_cnt)
> + if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size))
> + return -EINVAL;
> +
> + if (flags & BPF_TRAMP_F_RESTORE_REGS)
> + restore_regs(m, &prog, nr_args, stack_size);
> +
> + if (flags & BPF_TRAMP_F_CALL_ORIG)
> + /* restore original return value back into RAX */
> + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
> +
> + EMIT1(0x5B); /* pop rbx */
> + EMIT1(0xC9); /* leave */
> + if (flags & BPF_TRAMP_F_SKIP_FRAME)
> + /* skip our return address and return to parent */
> + EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
> + EMIT1(0xC3); /* ret */
> + /* One half of the page holds the active, running trampoline.
> + * The other half is an area for the next trampoline.
> + * Make sure the trampoline generation logic doesn't overflow.
> + */
> + if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
> + return -EFAULT;
Given the max number of args, can we catch this error at compile time?
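Something like this, maybe? (untested sketch; assumes BPF_MAX_TRAMP_PROGS
from trampoline.c is visible here, and the per-program/fixed byte counts
are rough guesses that would have to match the actual emit sizes)

	BUILD_BUG_ON(BPF_MAX_TRAMP_PROGS * 50 /* enter + prog + exit */ +
		     6 * 2 * 8 /* arg save + restore */ +
		     64 /* prologue, epilogue, orig call */ >
		     PAGE_SIZE / 2 - BPF_INSN_SAFETY);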
> + return 0;
> +}
> +
> struct x64_jit_data {
> struct bpf_binary_header *header;
> int *addrs;
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8b90db25348a..9206b7e86dde 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -14,6 +14,7 @@
> #include <linux/numa.h>
> #include <linux/wait.h>
> #include <linux/u64_stats_sync.h>
> +#include <linux/refcount.h>
>
> struct bpf_verifier_env;
> struct bpf_verifier_log;
> @@ -384,6 +385,94 @@ struct bpf_prog_stats {
> struct u64_stats_sync syncp;
> } __aligned(2 * sizeof(u64));
>
> +struct btf_func_model {
> + u8 ret_size;
> + u8 nr_args;
> + u8 arg_size[MAX_BPF_FUNC_ARGS];
> +};
> +
> +/* Restore arguments before returning from trampoline to let original function
> + * continue executing. This flag is used for fentry progs when there are no
> + * fexit progs.
> + */
> +#define BPF_TRAMP_F_RESTORE_REGS BIT(0)
> +/* Call original function after fentry progs, but before fexit progs.
> + * Makes sense for fentry/fexit, normal calls and indirect calls.
> + */
> +#define BPF_TRAMP_F_CALL_ORIG BIT(1)
> +/* Skip current frame and return to parent. Makes sense for fentry/fexit
> + * programs only. Should not be used with normal calls and indirect calls.
> + */
> +#define BPF_TRAMP_F_SKIP_FRAME BIT(2)
> +
> +/* Different use cases for BPF trampoline:
> + * 1. replace nop at the function entry (kprobe equivalent)
> + * flags = BPF_TRAMP_F_RESTORE_REGS
> + * fentry = a set of programs to run before returning from trampoline
> + *
> + * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
> + * flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
> + * orig_call = fentry_ip + MCOUNT_INSN_SIZE
> + * fentry = a set of programs to run before calling the original function
> + * fexit = a set of programs to run after the original function
> + *
> + * 3. replace direct call instruction anywhere in the function body
> + * or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
> + * With flags = 0
> + * fentry = a set of programs to run before returning from trampoline
> + * With flags = BPF_TRAMP_F_CALL_ORIG
> + * orig_call = original callback addr or direct function addr
> + * fentry = a set of programs to run before calling the original function
> + * fexit = a set of programs to run after the original function
> + */
> +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> + struct bpf_prog **fentry_progs, int fentry_cnt,
> + struct bpf_prog **fexit_progs, int fexit_cnt,
> + void *orig_call);
> +/* these two functions are called from generated trampoline */
> +u64 notrace __bpf_prog_enter(void);
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
> +
> +enum bpf_tramp_prog_type {
> + BPF_TRAMP_FENTRY,
> + BPF_TRAMP_FEXIT,
> + BPF_TRAMP_MAX
> +};
> +
> +struct bpf_trampoline {
> + struct hlist_node hlist;
> + refcount_t refcnt;
> + u64 key;
> + struct {
> + struct btf_func_model model;
> + void *addr;
> + } func;
> + struct hlist_head progs_hlist[BPF_TRAMP_MAX];
> + int progs_cnt[BPF_TRAMP_MAX];
> + void *image;
> + u64 selector;
> +};
> +#ifdef CONFIG_BPF_JIT
> +struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
> +int bpf_trampoline_link_prog(struct bpf_prog *prog);
> +int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
> +void bpf_trampoline_put(struct bpf_trampoline *tr);
> +#else
> +static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
> +{
> + return NULL;
> +}
> +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog)
> +{
> + return -ENOTSUPP;
> +}
> +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
> +{
> + return -ENOTSUPP;
> +}
> +static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
> +#endif
> +
> struct bpf_prog_aux {
> atomic_t refcnt;
> u32 used_map_cnt;
> @@ -398,6 +487,9 @@ struct bpf_prog_aux {
> bool verifier_zext; /* Zero extensions has been inserted by verifier. */
> bool offload_requested;
> bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
> + enum bpf_tramp_prog_type trampoline_prog_type;
> + struct bpf_trampoline *trampoline;
> + struct hlist_node tramp_hlist;
> /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
> const struct btf_type *attach_func_proto;
> /* function name for valid attach_btf_id */
> @@ -784,6 +876,12 @@ int btf_struct_access(struct bpf_verifier_log *log,
> u32 *next_btf_id);
> u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
>
> +int btf_distill_func_proto(struct bpf_verifier_log *log,
> + struct btf *btf,
> + const struct btf_type *func_proto,
> + const char *func_name,
> + struct btf_func_model *m);
> +
> #else /* !CONFIG_BPF_SYSCALL */
> static inline struct bpf_prog *bpf_prog_get(u32 ufd)
> {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index df6809a76404..69c200e6e696 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -201,6 +201,8 @@ enum bpf_attach_type {
> BPF_CGROUP_GETSOCKOPT,
> BPF_CGROUP_SETSOCKOPT,
> BPF_TRACE_RAW_TP,
> + BPF_TRACE_FENTRY,
> + BPF_TRACE_FEXIT,
> __MAX_BPF_ATTACH_TYPE
> };
>
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index e1d9adb212f9..3f671bf617e8 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> +obj-$(CONFIG_BPF_JIT) += trampoline.o
> obj-$(CONFIG_BPF_SYSCALL) += btf.o
> ifeq ($(CONFIG_NET),y)
> obj-$(CONFIG_BPF_SYSCALL) += devmap.o
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 128d89601d73..78d9ceaabc00 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -3441,13 +3441,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
> args++;
> nr_args--;
> }
> - if (arg >= nr_args) {
> +
> + if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
> + arg == nr_args) {
> + /* function return type */
> + t = btf_type_by_id(btf_vmlinux, t->type);
> + } else if (arg >= nr_args) {
> bpf_log(log, "func '%s' doesn't have %d-th argument\n",
> - tname, arg);
> + tname, arg + 1);
> return false;
> + } else {
> + t = btf_type_by_id(btf_vmlinux, args[arg].type);
> }
> -
> - t = btf_type_by_id(btf_vmlinux, args[arg].type);
> /* skip modifiers */
> while (btf_type_is_modifier(t))
> t = btf_type_by_id(btf_vmlinux, t->type);
> @@ -3647,6 +3652,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
> return btf_id;
> }
>
> +static int __get_type_size(struct btf *btf, u32 btf_id,
> + const struct btf_type **bad_type)
> +{
> + const struct btf_type *t;
> +
> + if (!btf_id)
> + /* void */
> + return 0;
> + t = btf_type_by_id(btf, btf_id);
> + while (t && btf_type_is_modifier(t))
> + t = btf_type_by_id(btf, t->type);
> + if (!t)
> + return -EINVAL;
> + if (btf_type_is_ptr(t))
> + /* kernel size of pointer. Not BPF's size of pointer */
> + return sizeof(void *);
> + if (btf_type_is_int(t) || btf_type_is_enum(t))
> + return t->size;
> + *bad_type = t;
> + return -EINVAL;
> +}
> +
> +int btf_distill_func_proto(struct bpf_verifier_log *log,
> + struct btf *btf,
> + const struct btf_type *func,
> + const char *tname,
> + struct btf_func_model *m)
> +{
> + const struct btf_param *args;
> + const struct btf_type *t;
> + u32 i, nargs;
> + int ret;
> +
> + args = (const struct btf_param *)(func + 1);
> + nargs = btf_type_vlen(func);
> + if (nargs >= MAX_BPF_FUNC_ARGS) {
> + bpf_log(log,
> + "The function %s has %d arguments. Too many.\n",
> + tname, nargs);
> + return -EINVAL;
> + }
> + ret = __get_type_size(btf, func->type, &t);
> + if (ret < 0) {
> + bpf_log(log,
> + "The function %s return type %s is unsupported.\n",
> + tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
> + return -EINVAL;
> + }
> + m->ret_size = ret;
> +
> + for (i = 0; i < nargs; i++) {
> + ret = __get_type_size(btf, args[i].type, &t);
> + if (ret < 0) {
> + bpf_log(log,
> + "The function %s arg%d type %s is unsupported.\n",
> + tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
> + return -EINVAL;
> + }
> + m->arg_size[i] = ret;
> + }
> + m->nr_args = nargs;
> + return 0;
> +}
> +
> void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
> struct seq_file *m)
> {
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 856564e97943..7352f5514e57 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2011,6 +2011,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
> if (aux->prog->has_callchain_buf)
> put_callchain_buffers();
> #endif
> + bpf_trampoline_put(aux->trampoline);
> for (i = 0; i < aux->func_cnt; i++)
> bpf_jit_free(aux->func[i]);
> if (aux->func_cnt) {
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 6d9ce95e5a8d..e2e37bea86bc 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr)
> attr->file_flags);
> }
>
> +static int bpf_tracing_prog_release(struct inode *inode, struct file *filp)
> +{
> + struct bpf_prog *prog = filp->private_data;
> +
> + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> + bpf_prog_put(prog);
> + return 0;
> +}
> +
> +static const struct file_operations bpf_tracing_prog_fops = {
> + .release = bpf_tracing_prog_release,
> + .read = bpf_dummy_read,
> + .write = bpf_dummy_write,
> +};
> +
> +static int bpf_tracing_prog_attach(struct bpf_prog *prog)
> +{
> + int tr_fd, err;
> +
> + if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
> + prog->expected_attach_type != BPF_TRACE_FEXIT) {
> + err = -EINVAL;
> + goto out_put_prog;
> + }
> +
> + err = bpf_trampoline_link_prog(prog);
> + if (err)
> + goto out_put_prog;
> +
> + tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops,
> + prog, O_CLOEXEC);
> + if (tr_fd < 0) {
> + WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
> + err = tr_fd;
> + goto out_put_prog;
> + }
> + return tr_fd;
> +
> +out_put_prog:
> + bpf_prog_put(prog);
> + return err;
> +}
> +
> struct bpf_raw_tracepoint {
> struct bpf_raw_event_map *btp;
> struct bpf_prog *prog;
> @@ -1850,14 +1893,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
>
> if (prog->type == BPF_PROG_TYPE_TRACING) {
> if (attr->raw_tracepoint.name) {
> - /* raw_tp name should not be specified in raw_tp
> - * programs that were verified via in-kernel BTF info
> + /* The attach point for this category of programs
> + * should be specified via btf_id during program load.
> */
> err = -EINVAL;
> goto out_put_prog;
> }
> - /* raw_tp name is taken from type name instead */
> - tp_name = prog->aux->attach_func_name;
> + if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
> + tp_name = prog->aux->attach_func_name;
> + else
> + return bpf_tracing_prog_attach(prog);
> } else {
> if (strncpy_from_user(buf,
> u64_to_user_ptr(attr->raw_tracepoint.name),
> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> new file mode 100644
> index 000000000000..214c964cfd72
> --- /dev/null
> +++ b/kernel/bpf/trampoline.c
> @@ -0,0 +1,252 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2019 Facebook */
> +#include <linux/hash.h>
> +#include <linux/bpf.h>
> +#include <linux/filter.h>
> +
> +/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
> +#define TRAMPOLINE_HASH_BITS 10
> +#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
> +
> +static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
> +
> +static DEFINE_MUTEX(trampoline_mutex);
> +
> +struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
> +{
> + struct bpf_trampoline *tr;
> + struct hlist_head *head;
> + void *image;
> + int i;
> +
> + mutex_lock(&trampoline_mutex);
> + head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
> + hlist_for_each_entry(tr, head, hlist) {
> + if (tr->key == key) {
> + refcount_inc(&tr->refcnt);
> + goto out;
> + }
> + }
> + tr = kzalloc(sizeof(*tr), GFP_KERNEL);
> + if (!tr)
> + goto out;
> +
> + /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
> + image = bpf_jit_alloc_exec(PAGE_SIZE);
> + if (!image) {
> + kfree(tr);
> + tr = NULL;
> + goto out;
> + }
> +
> + tr->key = key;
> + INIT_HLIST_NODE(&tr->hlist);
> + hlist_add_head(&tr->hlist, head);
> + refcount_set(&tr->refcnt, 1);
> + for (i = 0; i < BPF_TRAMP_MAX; i++)
> + INIT_HLIST_HEAD(&tr->progs_hlist[i]);
> +
> + set_vm_flush_reset_perms(image);
> + /* Keep image as writeable. The alternative is to keep flipping ro/rw
> + * every time a new program is attached or detached.
> + */
> + set_memory_x((long)image, 1);
> + tr->image = image;
> +out:
> + mutex_unlock(&trampoline_mutex);
> + return tr;
> +}
> +
> +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
> + * bytes on x86. Pick a number to fit into PAGE_SIZE / 2
> + */
> +#define BPF_MAX_TRAMP_PROGS 40
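FWIW, 40 * ~50 bytes is ~2000 bytes, which is already close to
PAGE_SIZE / 2 = 2048 on x86-64 before counting the prologue/epilogue and
the argument save/restore, so there isn't much headroom left (related to
my compile-time check question above).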
> +
> +static int bpf_trampoline_update(struct bpf_prog *prog)
Seems "prog" is only used to fetch prog->aux->trampoline?
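If so, maybe it could take the trampoline directly, e.g. (just a sketch):

	static int bpf_trampoline_update(struct bpf_trampoline *tr)

with the callers passing prog->aux->trampoline.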
> +{
> + struct bpf_trampoline *tr = prog->aux->trampoline;
> + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
> + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
> + struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
> + int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
> + int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
> + struct bpf_prog **progs, **fentry, **fexit;
> + u32 flags = BPF_TRAMP_F_RESTORE_REGS;
> + struct bpf_prog_aux *aux;
> + int err;
> +
> + if (fentry_cnt + fexit_cnt == 0) {
> + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP,
> + old_image, NULL);
> + tr->selector = 0;
> + goto out;
> + }
> +
> + /* populate fentry progs */
> + fentry = progs = progs_to_run;
> + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist)
> + *progs++ = aux->prog;
> +
> + /* populate fexit progs */
> + fexit = progs;
> + hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist)
> + *progs++ = aux->prog;
> +
> + if (fexit_cnt)
> + flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
> +
> + err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
> + fentry, fentry_cnt,
> + fexit, fexit_cnt,
> + tr->func.addr);
> + if (err)
> + goto out;
> +
> + if (tr->selector)
> + /* progs already running at this address */
> + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL,
> + old_image, new_image);
> + else
> + /* first time registering */
> + err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL,
> + NULL, new_image);
> +
> + if (err)
> + goto out;
> + tr->selector++;
Shall we do selector-- for unlink?
> +out:
> + return err;
> +}
> +
> +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
> +{
> + switch (t) {
> + case BPF_TRACE_FENTRY:
> + return BPF_TRAMP_FENTRY;
> + default:
> + return BPF_TRAMP_FEXIT;
> + }
> +}
> +
> +int bpf_trampoline_link_prog(struct bpf_prog *prog)
> +{
> + enum bpf_tramp_prog_type kind;
> + struct bpf_trampoline *tr;
> + int err = 0;
> +
> + tr = prog->aux->trampoline;
> + kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
> + mutex_lock(&trampoline_mutex);
> + if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
> + >= BPF_MAX_TRAMP_PROGS) {
> + err = -E2BIG;
> + goto out;
> + }
> + if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
> + /* prog already linked */
> + err = -EBUSY;
> + goto out;
> + }
> + hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
> + tr->progs_cnt[kind]++;
> + err = bpf_trampoline_update(prog);
> + if (err) {
> + hlist_del(&prog->aux->tramp_hlist);
> + tr->progs_cnt[kind]--;
> + }
> +out:
> + mutex_unlock(&trampoline_mutex);
> + return err;
> +}
> +
> +/* bpf_trampoline_unlink_prog() should never fail. */
> +int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
> +{
> + enum bpf_tramp_prog_type kind;
> + struct bpf_trampoline *tr;
> + int err;
> +
> + tr = prog->aux->trampoline;
> + kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
> + mutex_lock(&trampoline_mutex);
> + hlist_del(&prog->aux->tramp_hlist);
> + tr->progs_cnt[kind]--;
> + err = bpf_trampoline_update(prog);
> + mutex_unlock(&trampoline_mutex);
> + return err;
> +}
> +
> +void bpf_trampoline_put(struct bpf_trampoline *tr)
> +{
> + if (!tr)
> + return;
> + mutex_lock(&trampoline_mutex);
> + if (!refcount_dec_and_test(&tr->refcnt))
> + goto out;
> + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
> + goto out;
> + if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
> + goto out;
> + bpf_jit_free_exec(tr->image);
> + hlist_del(&tr->hlist);
> + kfree(tr);
> +out:
> + mutex_unlock(&trampoline_mutex);
> +}
> +
> +/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
> + * are needed for trampoline. The macro is split into
> + * call __bpf_prog_enter
> + * call prog->bpf_func
> + * call __bpf_prog_exit
> + */
> +u64 notrace __bpf_prog_enter(void)
> +{
> + u64 start = 0;
> +
> + rcu_read_lock();
> + preempt_disable();
> + if (static_branch_unlikely(&bpf_stats_enabled_key))
> + start = sched_clock();
> + return start;
> +}
> +
> +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
> +{
> + struct bpf_prog_stats *stats;
> +
> + if (static_branch_unlikely(&bpf_stats_enabled_key) &&
> + /* static_key could be enabled in __bpf_prog_enter
> + * and disabled in __bpf_prog_exit.
> + * And vice versa.
> + * Hence check that 'start' is not zero.
> + */
> + start) {
> + stats = this_cpu_ptr(prog->aux->stats);
> + u64_stats_update_begin(&stats->syncp);
> + stats->cnt++;
> + stats->nsecs += sched_clock() - start;
> + u64_stats_update_end(&stats->syncp);
> + }
> + preempt_enable();
> + rcu_read_unlock();
> +}
> +
> +int __weak
> +arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
> + struct bpf_prog **fentry_progs, int fentry_cnt,
> + struct bpf_prog **fexit_progs, int fexit_cnt,
> + void *orig_call)
> +{
> + return -ENOTSUPP;
> +}
> +
> +static int __init init_trampolines(void)
> +{
> + int i;
> +
> + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
> + INIT_HLIST_HEAD(&trampoline_table[i]);
> + return 0;
> +}
> +late_initcall(init_trampolines);
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2f2374967b36..36542edd4936 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> struct bpf_prog *prog = env->prog;
> u32 btf_id = prog->aux->attach_btf_id;
> const char prefix[] = "btf_trace_";
> + struct bpf_trampoline *tr;
> const struct btf_type *t;
> const char *tname;
> + long addr;
> + int ret;
>
> if (prog->type != BPF_PROG_TYPE_TRACING)
> return 0;
> @@ -9432,6 +9435,42 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
> prog->aux->attach_func_proto = t;
> prog->aux->attach_btf_trace = true;
> return 0;
> + case BPF_TRACE_FENTRY:
> + case BPF_TRACE_FEXIT:
> + if (!btf_type_is_func(t)) {
> + verbose(env, "attach_btf_id %u is not a function\n",
> + btf_id);
> + return -EINVAL;
> + }
> + t = btf_type_by_id(btf_vmlinux, t->type);
> + if (!btf_type_is_func_proto(t))
> + return -EINVAL;
> + tr = bpf_trampoline_lookup(btf_id);
> + if (!tr)
> + return -ENOMEM;
> + prog->aux->attach_func_name = tname;
> + prog->aux->attach_func_proto = t;
> + if (tr->func.addr) {
> + prog->aux->trampoline = tr;
> + return 0;
> + }
> + ret = btf_distill_func_proto(&env->log, btf_vmlinux, t,
> + tname, &tr->func.model);
> + if (ret < 0) {
> + bpf_trampoline_put(tr);
> + return ret;
> + }
> + addr = kallsyms_lookup_name(tname);
> + if (!addr) {
> + verbose(env,
> + "The address of function %s cannot be found\n",
> + tname);
> + bpf_trampoline_put(tr);
> + return -ENOENT;
> + }
> + tr->func.addr = (void *)addr;
> + prog->aux->trampoline = tr;
> + return 0;
> default:
> return -EINVAL;
> }
> --
> 2.23.0
>