Message-ID: <CAEf4BzaQb=_aOL26syfsUWA9ewi6xOC1frzP27cOWz=_5Cz1iA@mail.gmail.com>
Date: Mon, 26 Apr 2021 15:22:36 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Alexei Starovoitov <alexei.starovoitov@...il.com>
Cc: "David S. Miller" <davem@...emloft.net>,
Daniel Borkmann <daniel@...earbox.net>,
Andrii Nakryiko <andrii@...nel.org>,
Networking <netdev@...r.kernel.org>, bpf <bpf@...r.kernel.org>,
Kernel Team <kernel-team@...com>
Subject: Re: [PATCH v2 bpf-next 14/16] libbpf: Generate loader program out of
BPF ELF file.
On Thu, Apr 22, 2021 at 5:27 PM Alexei Starovoitov
<alexei.starovoitov@...il.com> wrote:
>
> From: Alexei Starovoitov <ast@...nel.org>
>
> The BPF program loading process performed by libbpf is quite complex
> and consists of the following steps:
> "open" phase:
> - parse elf file and remember relocations, sections
> - collect externs and ksyms including their btf_ids in prog's BTF
> - patch BTF datasec (since llvm couldn't do it)
> - init maps (old style map_def, BTF based, global data map, kconfig map)
> - collect relocations against progs and maps
> "load" phase:
> - probe kernel features
> - load vmlinux BTF
> - resolve externs (kconfig and ksym)
> - load program BTF
> - init struct_ops
> - create maps
> - apply CO-RE relocations
> - patch ld_imm64 insns with src_reg=PSEUDO_MAP, PSEUDO_MAP_VALUE, PSEUDO_BTF_ID
> - reposition subprograms and adjust call insns
> - sanitize and load progs
>
> During this process libbpf does sys_bpf() calls to load BTF, create maps,
> populate maps and finally load programs.
> Instead of actually doing the syscalls, generate a trace of what libbpf
> would have done and represent it as the "loader program".
> The "loader program" consists of a single map with:
> - union bpf_attr(s)
> - BTF bytes
> - map value bytes
> - insns bytes
> and a single bpf program that passes bpf_attr(s) and data into the bpf_sys_bpf() helper.
> Executing such a "loader program" via the bpf_prog_test_run() command will
> replay the sequence of syscalls that libbpf would have done, which will result
> in the same maps being created and programs loaded as specified in the elf file.
> The "loader program" removes libelf and the majority of the libbpf dependency
> from the program loading process.
>
> kconfig, typeless ksym, struct_ops and CO-RE are not supported yet.
>
> The order of relocate_data and relocate_calls had to change, so that
> bpf_gen__prog_load() can see all relocations for a given program with
> correct insn_idx-es.
>
> Signed-off-by: Alexei Starovoitov <ast@...nel.org>
> ---
> tools/lib/bpf/Build | 2 +-
> tools/lib/bpf/bpf_gen_internal.h | 40 ++
> tools/lib/bpf/gen_loader.c | 615 +++++++++++++++++++++++++++++++
> tools/lib/bpf/libbpf.c | 204 ++++++++--
> tools/lib/bpf/libbpf.h | 12 +
> tools/lib/bpf/libbpf.map | 1 +
> tools/lib/bpf/libbpf_internal.h | 2 +
> tools/lib/bpf/skel_internal.h | 105 ++++++
> 8 files changed, 948 insertions(+), 33 deletions(-)
> create mode 100644 tools/lib/bpf/bpf_gen_internal.h
> create mode 100644 tools/lib/bpf/gen_loader.c
> create mode 100644 tools/lib/bpf/skel_internal.h
>
> diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
> index 9b057cc7650a..430f6874fa41 100644
> --- a/tools/lib/bpf/Build
> +++ b/tools/lib/bpf/Build
> @@ -1,3 +1,3 @@
> libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
> netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \
> - btf_dump.o ringbuf.o strset.o linker.o
> + btf_dump.o ringbuf.o strset.o linker.o gen_loader.o
> diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
> new file mode 100644
> index 000000000000..dc3e2cbf9ce3
> --- /dev/null
> +++ b/tools/lib/bpf/bpf_gen_internal.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
> +/* Copyright (c) 2021 Facebook */
> +#ifndef __BPF_GEN_INTERNAL_H
> +#define __BPF_GEN_INTERNAL_H
> +
> +struct relo_desc {
there is a very similarly named reloc_desc struct in libbpf.c; can you
rename this one to something like gen_btf_relo_desc?
> + const char *name;
> + int kind;
> + int insn_idx;
> +};
> +
[...]
> +
> +static int bpf_gen__realloc_insn_buf(struct bpf_gen *gen, __u32 size)
> +{
> + size_t off = gen->insn_cur - gen->insn_start;
> +
> + if (gen->error)
> + return gen->error;
> + if (size > INT32_MAX || off + size > INT32_MAX) {
> + gen->error = -ERANGE;
> + return -ERANGE;
> + }
> + gen->insn_start = realloc(gen->insn_start, off + size);
leaking memory here: gen->insn_start will be NULL on failure
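Something like this (untested sketch) would keep the old buffer intact
for bpf_gen__free() to release later:

	void *insn_start;

	insn_start = realloc(gen->insn_start, off + size);
	if (!insn_start) {
		gen->error = -ENOMEM;
		return -ENOMEM;
	}
	gen->insn_start = insn_start;
	gen->insn_cur = gen->insn_start + off;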
> + if (!gen->insn_start) {
> + gen->error = -ENOMEM;
> + return -ENOMEM;
> + }
> + gen->insn_cur = gen->insn_start + off;
> + return 0;
> +}
> +
> +static int bpf_gen__realloc_data_buf(struct bpf_gen *gen, __u32 size)
> +{
> + size_t off = gen->data_cur - gen->data_start;
> +
> + if (gen->error)
> + return gen->error;
> + if (size > INT32_MAX || off + size > INT32_MAX) {
> + gen->error = -ERANGE;
> + return -ERANGE;
> + }
> + gen->data_start = realloc(gen->data_start, off + size);
same as above
> + if (!gen->data_start) {
> + gen->error = -ENOMEM;
> + return -ENOMEM;
> + }
> + gen->data_cur = gen->data_start + off;
> + return 0;
> +}
> +
[...]
> +
> +static void bpf_gen__emit_sys_bpf(struct bpf_gen *gen, int cmd, int attr, int attr_size)
> +{
> + bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_1, cmd));
> + bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, attr));
is attr an offset into a blob? If yes, maybe attr_off? Or
attr_base_off; anything with an _off suffix would help.
> + bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_3, attr_size));
> + bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_bpf));
> + /* remember the result in R7 */
> + bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0));
> +}
> +
> +static void bpf_gen__emit_check_err(struct bpf_gen *gen)
> +{
> + bpf_gen__emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 2));
> + bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7));
> + bpf_gen__emit(gen, BPF_EXIT_INSN());
> +}
> +
> +static void __bpf_gen__debug(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, va_list args)
Can you please leave a comment on what reg1 and reg2 are? It's not
very clear, and the code clearly assumes that they can't be R1-R4.
They are probably those special R7 and R9 (or -1, of course), but a
short comment would save readers from jumping around trying to figure
out the possible inputs.
Oh, reading further, it can also be R0.
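Maybe a short comment like this above the function (that's just my
reading of the code, so please double-check):

/* reg1 and reg2 are registers holding the two optional extra
 * arguments for trace_printk(); pass -1 to omit an argument. Since
 * R1-R4 are clobbered by the emitted call itself, only R0, R7, or R9
 * (or -1) make sense here.
 */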
> +{
> + char buf[1024];
> + int addr, len, ret;
> +
> + if (!gen->log_level)
> + return;
> + ret = vsnprintf(buf, sizeof(buf), fmt, args);
> + if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0)
> + /* The special case to accommodate common bpf_gen__debug_ret():
> + * to avoid specifying BPF_REG_7 and adding " r=%%d" to prints explicitly.
> + */
> + strcat(buf, " r=%d");
> + len = strlen(buf) + 1;
> + addr = bpf_gen__add_data(gen, buf, len);
nit: offset, not address, right?
> +
> + bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, addr));
> + bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_2, len));
> + if (reg1 >= 0)
> + bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1));
> + if (reg2 >= 0)
> + bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2));
> + bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk));
> +}
> +
[...]
> +int bpf_gen__finish(struct bpf_gen *gen)
> +{
> + int i;
> +
> + bpf_gen__emit_sys_close_stack(gen, stack_off(btf_fd));
> + for (i = 0; i < gen->nr_progs; i++)
> + bpf_gen__move_stack2ctx(gen,
> + sizeof(struct bpf_loader_ctx) +
> + sizeof(struct bpf_map_desc) * gen->nr_maps +
> + sizeof(struct bpf_prog_desc) * i +
> + offsetof(struct bpf_prog_desc, prog_fd), 4,
> + stack_off(prog_fd[i]));
> + for (i = 0; i < gen->nr_maps; i++)
> + bpf_gen__move_stack2ctx(gen,
> + sizeof(struct bpf_loader_ctx) +
> + sizeof(struct bpf_map_desc) * i +
> + offsetof(struct bpf_map_desc, map_fd), 4,
> + stack_off(map_fd[i]));
> + bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0));
> + bpf_gen__emit(gen, BPF_EXIT_INSN());
> + pr_debug("bpf_gen__finish %d\n", gen->error);
maybe prefix all those pr_debug()s with "gen: " to distinguish them
from the rest of libbpf logging?
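E.g.:

	pr_debug("gen: finish %d\n", gen->error);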
> + if (!gen->error) {
> + struct gen_loader_opts *opts = gen->opts;
> +
> + opts->insns = gen->insn_start;
> + opts->insns_sz = gen->insn_cur - gen->insn_start;
> + opts->data = gen->data_start;
> + opts->data_sz = gen->data_cur - gen->data_start;
> + }
> + return gen->error;
> +}
> +
> +void bpf_gen__free(struct bpf_gen *gen)
> +{
> + if (!gen)
> + return;
> + free(gen->data_start);
> + free(gen->insn_start);
> + gen->data_start = NULL;
> + gen->insn_start = NULL;
What's the point of NULL'ing them out if you don't also clear
gen->data_cur and gen->insn_cur?
Also, shouldn't it free(gen) itself?
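I.e., assuming gen itself is always heap-allocated, something like:

	void bpf_gen__free(struct bpf_gen *gen)
	{
		if (!gen)
			return;
		free(gen->data_start);
		free(gen->insn_start);
		free(gen);
	}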
> +}
> +
> +void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, __u32 btf_raw_size)
> +{
> + union bpf_attr attr = {};
here and below: memset(0)?
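i.e., something like:

	union bpf_attr attr;
	int attr_size = offsetofend(union bpf_attr, btf_log_level);

	memset(&attr, 0, attr_size);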
> + int attr_size = offsetofend(union bpf_attr, btf_log_level);
> + int btf_data, btf_load_attr;
> +
> + pr_debug("btf_load: size %d\n", btf_raw_size);
> + btf_data = bpf_gen__add_data(gen, btf_raw_data, btf_raw_size);
> +
[...]
> + map_create_attr = bpf_gen__add_data(gen, &attr, attr_size);
> + if (attr.btf_value_type_id)
> + /* populate union bpf_attr with btf_fd saved in the stack earlier */
> + bpf_gen__move_stack2blob(gen, map_create_attr + offsetof(union bpf_attr, btf_fd), 4,
> + stack_off(btf_fd));
> + switch (attr.map_type) {
> + case BPF_MAP_TYPE_ARRAY_OF_MAPS:
> + case BPF_MAP_TYPE_HASH_OF_MAPS:
> + bpf_gen__move_stack2blob(gen, map_create_attr + offsetof(union bpf_attr, inner_map_fd),
> + 4, stack_off(inner_map_fd));
> + close_inner_map_fd = true;
> + break;
> + default:;
	default:
		break;
> + }
> + /* emit MAP_CREATE command */
> + bpf_gen__emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
> + bpf_gen__debug_ret(gen, "map_create %s idx %d type %d value_size %d",
> + attr.map_name, map_idx, map_attr->map_type, attr.value_size);
> + bpf_gen__emit_check_err(gen);
what will happen on error with inner_map_fd and all the other fds
created by now?
> + /* remember map_fd in the stack, if successful */
> + if (map_idx < 0) {
> + /* This bpf_gen__map_create() function is called with map_idx >= 0 for all maps
> + * that libbpf loading logic tracks.
> + * It's called with -1 to create an inner map.
> + */
> + bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(inner_map_fd)));
> + } else {
> + if (map_idx != gen->nr_maps) {
Why would that happen? Defensive programming? And even then an `if ()
{} else if () {} else {}` structure would be more appropriate,
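i.e.:

	if (map_idx < 0) {
		/* inner map, remember its fd in the dedicated stack slot */
		bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7,
					       stack_off(inner_map_fd)));
	} else if (map_idx != gen->nr_maps) {
		gen->error = -EDOM; /* internal bug */
		return;
	} else {
		bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7,
					       stack_off(map_fd[map_idx])));
		gen->nr_maps++;
	}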
> + gen->error = -EDOM; /* internal bug */
> + return;
> + }
> + bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(map_fd[map_idx])));
> + gen->nr_maps++;
> + }
> + if (close_inner_map_fd)
> + bpf_gen__emit_sys_close_stack(gen, stack_off(inner_map_fd));
> +}
> +
[...]
> +static void bpf_gen__cleanup_relos(struct bpf_gen *gen, int insns)
> +{
> + int i, insn;
> +
> + for (i = 0; i < gen->relo_cnt; i++) {
> + if (gen->relos[i].kind != BTF_KIND_VAR)
> + continue;
> + /* close fd recorded in insn[insn_idx + 1].imm */
> + insn = insns + sizeof(struct bpf_insn) * (gen->relos[i].insn_idx + 1)
> + + offsetof(struct bpf_insn, imm);
> + bpf_gen__emit_sys_close_blob(gen, insn);
wouldn't this close the same FD used across multiple "relos" multiple times?
> + }
> + if (gen->relo_cnt) {
> + free(gen->relos);
> + gen->relo_cnt = 0;
> + gen->relos = NULL;
> + }
> +}
> +
[...]
> + struct bpf_gen *gen_loader;
> +
> /*
> * Information when doing elf related work. Only valid if fd
> * is valid.
> @@ -2651,7 +2654,15 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj)
> bpf_object__sanitize_btf(obj, kern_btf);
> }
>
> - err = btf__load(kern_btf);
> + if (obj->gen_loader) {
> + __u32 raw_size = 0;
> + const void *raw_data = btf__get_raw_data(kern_btf, &raw_size);
this can return NULL on ENOMEM
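e.g. (or set err and go through the existing error handling, whichever
fits the surrounding code):

		if (!raw_data)
			return -ENOMEM;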
> +
> + bpf_gen__load_btf(obj->gen_loader, raw_data, raw_size);
> + btf__set_fd(kern_btf, 0);
Why set the fd to 0 (stdin)? Does gen depend on this somewhere? The
problem is that it will eventually be closed in btf__free(), which
will close stdin, causing a big surprise. What would happen if you
left it at -1?
> + } else {
> + err = btf__load(kern_btf);
> + }
> if (sanitize) {
> if (!err) {
> /* move fd to libbpf's BTF */
> @@ -4262,6 +4273,12 @@ static bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id f
> struct kern_feature_desc *feat = &feature_probes[feat_id];
> int ret;
>
> + if (obj->gen_loader)
> + /* To generate loader program assume the latest kernel
> + * to avoid doing extra prog_load, map_create syscalls.
> + */
> + return true;
> +
> if (READ_ONCE(feat->res) == FEAT_UNKNOWN) {
> ret = feat->probe();
> if (ret > 0) {
> @@ -4344,6 +4361,13 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
> char *cp, errmsg[STRERR_BUFSIZE];
> int err, zero = 0;
>
> + if (obj->gen_loader) {
> + bpf_gen__map_update_elem(obj->gen_loader, map - obj->maps,
It would be great for bpf_gen__map_update_elem() to reflect that it's
not a generic map_update_elem() call, but rather a special internal
map update (just use bpf_gen__populate_internal_map?). Whether to
freeze or not could be just a flag to the same call; the two always go
together.
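A hypothetical signature, just to illustrate:

	void bpf_gen__populate_internal_map(struct bpf_gen *gen, int map_idx,
					    const void *value, __u32 value_size,
					    bool freeze);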
> + map->mmaped, map->def.value_size);
> + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG)
> + bpf_gen__map_freeze(obj->gen_loader, map - obj->maps);
> + return 0;
> + }
> err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0);
> if (err) {
> err = -errno;
> @@ -4369,7 +4393,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
>
> static void bpf_map__destroy(struct bpf_map *map);
>
> -static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
> +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner)
> {
> struct bpf_create_map_attr create_attr;
> struct bpf_map_def *def = &map->def;
> @@ -4415,9 +4439,9 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
>
> if (bpf_map_type__is_map_in_map(def->type)) {
> if (map->inner_map) {
> - int err;
> + int err = 0;
no need to initialize to zero, you are assigning it right below
>
> - err = bpf_object__create_map(obj, map->inner_map);
> + err = bpf_object__create_map(obj, map->inner_map, true);
> if (err) {
> pr_warn("map '%s': failed to create inner map: %d\n",
> map->name, err);
[...]
> @@ -4469,7 +4498,12 @@ static int init_map_slots(struct bpf_map *map)
>
> targ_map = map->init_slots[i];
> fd = bpf_map__fd(targ_map);
> - err = bpf_map_update_elem(map->fd, &i, &fd, 0);
> + if (obj->gen_loader) {
> + printf("// TODO map_update_elem: idx %ld key %d value==map_idx %ld\n",
> + map - obj->maps, i, targ_map - obj->maps);
return error for now?
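E.g., something like (wording up to you, -ENOTSUP as a stand-in):

		if (obj->gen_loader) {
			pr_warn("map '%s': map-in-map slot initialization is not yet supported by gen_loader\n",
				map->name);
			return -ENOTSUP;
		}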
> + } else {
> + err = bpf_map_update_elem(map->fd, &i, &fd, 0);
> + }
> if (err) {
> err = -errno;
> pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
[...]
> @@ -6082,6 +6119,11 @@ static int bpf_core_apply_relo(struct bpf_program *prog,
> if (str_is_empty(spec_str))
> return -EINVAL;
>
> + if (prog->obj->gen_loader) {
> + printf("// TODO core_relo: prog %ld insn[%d] %s %s kind %d\n",
> + prog - prog->obj->programs, relo->insn_off / 8,
> + local_name, spec_str, relo->kind);
same, return error? Drop printf, maybe leave pr_debug()?
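Something like (again, -ENOTSUP as a stand-in):

	if (prog->obj->gen_loader) {
		pr_debug("prog '%s': CO-RE relocation is not yet supported by gen_loader\n",
			 prog->name);
		return -ENOTSUP;
	}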
> + }
> err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec);
> if (err) {
> pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n",
> @@ -6821,6 +6863,19 @@ bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog)
>
> return 0;
> }
empty line here
> +static void
> +bpf_object__free_relocs(struct bpf_object *obj)
> +{
> + struct bpf_program *prog;
> + int i;
> +
> + /* free up relocation descriptors */
> + for (i = 0; i < obj->nr_programs; i++) {
> + prog = &obj->programs[i];
> + zfree(&prog->reloc_desc);
> + prog->nr_reloc = 0;
> + }
> +}
>
[...]
> +static int bpf_program__record_externs(struct bpf_program *prog)
> +{
> + struct bpf_object *obj = prog->obj;
> + int i;
> +
> + for (i = 0; i < prog->nr_reloc; i++) {
> + struct reloc_desc *relo = &prog->reloc_desc[i];
> + struct extern_desc *ext = &obj->externs[relo->sym_off];
> +
> + switch (relo->type) {
> + case RELO_EXTERN_VAR:
> + if (ext->type != EXT_KSYM)
> + continue;
> + if (!ext->ksym.type_id) /* typeless ksym */
> + continue;
this shouldn't be silently ignored; if typeless ksyms are not
supported, it should return an error
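E.g.:

		if (!ext->ksym.type_id) { /* typeless ksym */
			pr_warn("extern '%s': typeless ksyms are not yet supported by gen_loader\n",
				ext->name);
			return -ENOTSUP;
		}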
> + bpf_gen__record_extern(obj->gen_loader, ext->name, BTF_KIND_VAR,
> + relo->insn_idx);
> + break;
> + case RELO_EXTERN_FUNC:
> + bpf_gen__record_extern(obj->gen_loader, ext->name, BTF_KIND_FUNC,
> + relo->insn_idx);
> + break;
> + default:
> + continue;
> + }
> + }
> + return 0;
> +}
> +
[...]
> @@ -7868,6 +7970,9 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
> err = err ? : bpf_object__relocate(obj, attr->target_btf_path);
> err = err ? : bpf_object__load_progs(obj, attr->log_level);
>
> + if (obj->gen_loader && !err)
> + err = bpf_gen__finish(obj->gen_loader);
> +
> /* clean up module BTFs */
> for (i = 0; i < obj->btf_module_cnt; i++) {
> close(obj->btf_modules[i].fd);
> @@ -8493,6 +8598,7 @@ void bpf_object__close(struct bpf_object *obj)
> if (obj->clear_priv)
> obj->clear_priv(obj, obj->priv);
bpf_object__close() will close all those FD=0 entries in maps/progs; that's not good
>
> + bpf_gen__free(obj->gen_loader);
> bpf_object__elf_finish(obj);
> bpf_object__unload(obj);
> btf__free(obj->btf);
[...]
> @@ -9387,7 +9521,13 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd,
> }
>
> /* kernel/module BTF ID */
> - err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
> + if (prog->obj->gen_loader) {
> + bpf_gen__record_attach_target(prog->obj->gen_loader, attach_name, attach_type);
> + *btf_obj_fd = 0;
this will leak kernel module BTF FDs
> + *btf_type_id = 1;
> + } else {
> + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
> + }
> if (err) {
> pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err);
> return err;
[...]
> +out:
> + close(map_fd);
> + close(prog_fd);
this does close(-1), check >= 0
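i.e.:

	if (map_fd >= 0)
		close(map_fd);
	if (prog_fd >= 0)
		close(prog_fd);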
> + return err;
> +}
> +
> +#endif
> --
> 2.30.2
>