[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEf4Bzb+GsNdi4gOZY3VZ7V8-hfaD-k5DFDeZHEBpB_WS2DDXQ@mail.gmail.com>
Date: Thu, 14 Mar 2019 11:11:36 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Daniel Borkmann <daniel@...earbox.net>
Cc: Alexei Starovoitov <ast@...nel.org>, bpf@...r.kernel.org,
Networking <netdev@...r.kernel.org>,
Joe Stringer <joe@...d.net.nz>,
john fastabend <john.fastabend@...il.com>,
Yonghong Song <yhs@...com>,
Jakub Kicinski <jakub.kicinski@...ronome.com>, tgraf@...g.ch,
lmb@...udflare.com
Subject: Re: [PATCH rfc v3 bpf-next 1/9] bpf: implement lookup-free direct
value access for maps
On Mon, Mar 11, 2019 at 2:51 PM Daniel Borkmann <daniel@...earbox.net> wrote:
>
> This generic extension to BPF maps allows for directly loading an
> address residing inside a BPF map value as a single BPF ldimm64
> instruction.
>
> The idea is similar to what BPF_PSEUDO_MAP_FD does today, which
> is a special src_reg flag for ldimm64 instruction that indicates
> that inside the first part of the double insns's imm field is a
> file descriptor which the verifier then replaces as a full 64bit
> address of the map into both imm parts. For the newly added
> BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following:
> the first part of the double insns's imm field is again a file
> descriptor corresponding to the map, and the second part of the
> imm field is an offset into the value. Both insns's off fields
> build the optional key resp. index to the map if it contains
> more than just one element. The verifier will then replace both
> imm parts with an address that points into the BPF map value
> for maps that support this operation. BPF_PSEUDO_MAP_VALUE is
> a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could
> not differ offset 0 between load of map pointer versus load of
> map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's
> encoding into off by one to differ between regular map pointer
> and map value pointer would add unnecessary complexity and
> increases barrier for debuggability thus less suitable.
>
> This extension allows for efficiently retrieving an address to
> a map value memory area without having to issue a helper call
> which needs to prepare registers according to calling convention,
> etc, without needing the extra NULL test, and without having to
> add the offset in an additional instruction to the value base
> pointer. The verifier then treats the destination register as
> PTR_TO_MAP_VALUE with constant reg->off from the user passed
> offset from the second imm field, and guarantees that this is
> within bounds of the map value. Any subsequent operations are
> normally treated as typical map value handling without anything
> else needed for verification.
>
> The two map operations for direct value access have been added to
> array map for now. In future other types could be supported as
> well depending on the use case. The main use case for this commit
> is to allow for BPF loader support for global variables that
> reside in .data/.rodata/.bss sections such that we can directly
> load the address of them with minimal additional infrastructure
> required. Loader support has been added in subsequent commits for
> libbpf library.
>
> Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
Thanks! It looks really good, I'm just worried about imm -> idx + off
conversion. Please double-check.
> ---
> include/linux/bpf.h | 6 ++
> include/linux/bpf_verifier.h | 4 ++
> include/uapi/linux/bpf.h | 13 +++-
> kernel/bpf/arraymap.c | 29 ++++++++
> kernel/bpf/core.c | 3 +-
> kernel/bpf/disasm.c | 5 +-
> kernel/bpf/syscall.c | 31 +++++++--
> kernel/bpf/verifier.c | 109 ++++++++++++++++++++++--------
> tools/bpf/bpftool/xlated_dumper.c | 6 ++
> 9 files changed, 168 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index a2132e09dc1c..85b6b5dc883f 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -57,6 +57,12 @@ struct bpf_map_ops {
> const struct btf *btf,
> const struct btf_type *key_type,
> const struct btf_type *value_type);
> +
> + /* Direct value access helpers. */
> + int (*map_direct_value_addr)(const struct bpf_map *map,
> + u64 *imm, u32 idx, u32 off);
> + int (*map_direct_value_meta)(const struct bpf_map *map,
> + u64 imm, u32 *idx, u32 *off);
> };
>
> struct bpf_map {
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 69f7a3449eda..6e28f1c24710 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -183,6 +183,10 @@ struct bpf_insn_aux_data {
> unsigned long map_state; /* pointer/poison value for maps */
> s32 call_imm; /* saved imm field of call insn */
> u32 alu_limit; /* limit for add/sub register with pointer */
> + struct {
> + u32 map_index; /* index into used_maps[] */
> + u32 map_off; /* offset from value base address */
> + };
> };
> int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
> int sanitize_stack_off; /* stack slot to be cleared */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 3c38ac9a92a7..d0b80fce0fc9 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -255,8 +255,19 @@ enum bpf_attach_type {
> */
> #define BPF_F_ANY_ALIGNMENT (1U << 1)
>
> -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
> +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
> + * two extensions:
> + *
> + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE
> + * insn[0].imm: map fd map fd
> + * insn[1].imm: 0 offset into value
> + * insn[0].off: 0 32 bit index to the
> + * insn[1].off: 0 map value
It would be good to also document here which off part is lower 16
bits, and which one is higher 16 bits.
> + * ldimm64 rewrite: address of map address of map[index]+offset
> + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE
> + */
> #define BPF_PSEUDO_MAP_FD 1
> +#define BPF_PSEUDO_MAP_VALUE 2
>
> /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
> * offset to another bpf function
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index c72e0d8e1e65..862d20422ad1 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -160,6 +160,33 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
> return array->value + array->elem_size * (index & array->index_mask);
> }
>
> +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
> + u32 idx, u32 off)
> +{
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> +
> + if (idx >= map->max_entries || off >= map->value_size)
> + return -EINVAL;
> + *imm = (unsigned long)(array->value +
Is there anything wrong with using u64 here (and few other places below)?
> + array->elem_size * (idx & array->index_mask));
> + return 0;
> +}
> +
> +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
> + u32 *idx, u32 *off)
> +{
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> + u64 rem, base = (unsigned long)array->value, slot = map->value_size;
Should slot use map->elem_size instead of map->value_size?
> + u64 range = slot * map->max_entries;
> +
> + if (imm < base || imm >= base + range)
> + return -ENOENT;
> + base = imm - base;
> + *idx = div64_u64_rem(base, slot, &rem);
> + *off = rem;
> + return 0;
> +}
> +
> /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
> static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
> {
> @@ -419,6 +446,8 @@ const struct bpf_map_ops array_map_ops = {
> .map_update_elem = array_map_update_elem,
> .map_delete_elem = array_map_delete_elem,
> .map_gen_lookup = array_map_gen_lookup,
> + .map_direct_value_addr = array_map_direct_value_addr,
> + .map_direct_value_meta = array_map_direct_value_meta,
> .map_seq_show_elem = array_map_seq_show_elem,
> .map_check_btf = array_map_check_btf,
> };
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 3f08c257858e..af3dcd8b852b 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
> dst[i] = fp->insnsi[i];
> if (!was_ld_map &&
> dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
> - dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
> + (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
> + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
> was_ld_map = true;
> dst[i].imm = 0;
> } else if (was_ld_map &&
> diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
> index de73f55e42fd..d9ce383c0f9c 100644
> --- a/kernel/bpf/disasm.c
> +++ b/kernel/bpf/disasm.c
> @@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
> * part of the ldimm64 insn is accessible.
> */
> u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
> - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
> + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD ||
> + insn->src_reg == BPF_PSEUDO_MAP_VALUE;
> char tmp[64];
>
> - if (map_ptr && !allow_ptr_leaks)
> + if (is_ptr && !allow_ptr_leaks)
> imm = 0;
>
> verbose(cbs->private_data, "(%02x) r%d = %s\n",
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index bc34cf9fe9ee..b0c7a6485c49 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2061,13 +2061,27 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
> }
>
> static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
> - unsigned long addr)
> + unsigned long addr, u32 *idx,
> + u32 *off, u32 *type)
> {
> + const struct bpf_map *map;
> int i;
>
> - for (i = 0; i < prog->aux->used_map_cnt; i++)
> - if (prog->aux->used_maps[i] == (void *)addr)
> - return prog->aux->used_maps[i];
> + *off = *idx = 0;
> + for (i = 0; i < prog->aux->used_map_cnt; i++) {
> + map = prog->aux->used_maps[i];
> + if (map == (void *)addr) {
> + *type = BPF_PSEUDO_MAP_FD;
> + return map;
> + }
> + if (!map->ops->map_direct_value_meta)
> + continue;
> + if (!map->ops->map_direct_value_meta(map, addr, idx, off)) {
> + *type = BPF_PSEUDO_MAP_VALUE;
> + return map;
> + }
> + }
> +
> return NULL;
> }
>
> @@ -2075,6 +2089,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
> {
> const struct bpf_map *map;
> struct bpf_insn *insns;
> + u32 idx, off, type;
> u64 imm;
> int i;
>
> @@ -2102,11 +2117,13 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
> continue;
>
> imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
> - map = bpf_map_from_imm(prog, imm);
> + map = bpf_map_from_imm(prog, imm, &idx, &off, &type);
> if (map) {
> - insns[i].src_reg = BPF_PSEUDO_MAP_FD;
> + insns[i].src_reg = type;
> insns[i].imm = map->id;
> - insns[i + 1].imm = 0;
> + insns[i].off = (u16)idx;
> + insns[i + 1].imm = off;
> + insns[i + 1].off = (u16)(idx >> 16);
> continue;
> }
> }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ce166a002d16..57678cef9a2c 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -4944,25 +4944,20 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
> return 0;
> }
>
> -/* return the map pointer stored inside BPF_LD_IMM64 instruction */
> -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
> -{
> - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
> -
> - return (struct bpf_map *) (unsigned long) imm64;
> -}
> -
> /* verify BPF_LD_IMM64 instruction */
> static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
> {
> + struct bpf_insn_aux_data *aux = cur_aux(env);
> struct bpf_reg_state *regs = cur_regs(env);
> + struct bpf_map *map;
> int err;
>
> if (BPF_SIZE(insn->code) != BPF_DW) {
> verbose(env, "invalid BPF_LD_IMM insn\n");
> return -EINVAL;
> }
> - if (insn->off != 0) {
> +
> + if (insn->src_reg != BPF_PSEUDO_MAP_VALUE && insn->off != 0) {
> verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
> return -EINVAL;
> }
> @@ -4979,11 +4974,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
> return 0;
> }
>
> - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
> - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
> + map = env->used_maps[aux->map_index];
> + mark_reg_known_zero(env, regs, insn->dst_reg);
> + regs[insn->dst_reg].map_ptr = map;
> +
> + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
> + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
> + regs[insn->dst_reg].off = aux->map_off;
> + if (map_value_has_spin_lock(map))
> + regs[insn->dst_reg].id = ++env->id_gen;
> + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
> + regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
> + } else {
> + verbose(env, "bpf verifier is misconfigured\n");
> + return -EINVAL;
> + }
>
> - regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
> - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
> return 0;
> }
>
> @@ -6664,23 +6670,34 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
> }
>
> if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
> + struct bpf_insn_aux_data *aux;
> struct bpf_map *map;
> struct fd f;
> -
> - if (i == insn_cnt - 1 || insn[1].code != 0 ||
> - insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
> - insn[1].off != 0) {
> + u64 addr;
> +
> + if (i == insn_cnt - 1 ||
> + insn[1].code != 0 ||
> + insn[1].dst_reg != 0 ||
> + insn[1].src_reg != 0 ||
> + (insn[1].off != 0 &&
> + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE)) {
> verbose(env, "invalid bpf_ld_imm64 insn\n");
> return -EINVAL;
> }
>
> - if (insn->src_reg == 0)
> + if (insn[0].src_reg == 0)
> /* valid generic load 64-bit imm */
> goto next_insn;
>
> - if (insn[0].src_reg != BPF_PSEUDO_MAP_FD ||
> - insn[1].imm != 0) {
> - verbose(env, "unrecognized bpf_ld_imm64 insn\n");
> + /* In final convert_pseudo_ld_imm64() step, this is
> + * converted into regular 64-bit imm load insn.
> + */
> + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
> + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
> + (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
> + insn[1].imm != 0)) {
> + verbose(env,
> + "unrecognized bpf_ld_imm64 insn\n");
> return -EINVAL;
> }
>
> @@ -6698,16 +6715,49 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
> return err;
> }
>
> - /* store map pointer inside BPF_LD_IMM64 instruction */
> - insn[0].imm = (u32) (unsigned long) map;
> - insn[1].imm = ((u64) (unsigned long) map) >> 32;
> + aux = &env->insn_aux_data[i];
> + if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
> + addr = (unsigned long)map;
> + } else {
> + u32 idx = ((u32)(u16)insn[0].off) |
> + ((u32)(u16)insn[1].off) << 16;
> + u32 off = insn[1].imm;
> +
> + if (off >= BPF_MAX_VAR_OFF) {
> + verbose(env, "direct value offset of %u is not allowed\n", off);
> + fdput(f);
> + return -EINVAL;
> + }
> +
> + if (!map->ops->map_direct_value_addr) {
> + verbose(env, "no direct value access support for this map type\n");
> + fdput(f);
> + return -EINVAL;
> + }
> +
> + err = map->ops->map_direct_value_addr(map, &addr, idx, off);
> + if (err) {
> + verbose(env, "invalid access to map value pointer, value_size=%u index=%u off=%u\n",
> + map->value_size, idx, off);
> + fdput(f);
> + return err;
> + }
> +
> + aux->map_off = off;
> + addr += off;
> + }
> +
> + insn[0].imm = (u32)addr;
> + insn[1].imm = addr >> 32;
>
> /* check whether we recorded this map already */
> - for (j = 0; j < env->used_map_cnt; j++)
> + for (j = 0; j < env->used_map_cnt; j++) {
> if (env->used_maps[j] == map) {
> + aux->map_index = j;
> fdput(f);
> goto next_insn;
> }
> + }
>
> if (env->used_map_cnt >= MAX_USED_MAPS) {
> fdput(f);
> @@ -6724,6 +6774,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
> fdput(f);
> return PTR_ERR(map);
> }
> +
> + aux->map_index = env->used_map_cnt;
> env->used_maps[env->used_map_cnt++] = map;
>
> if (bpf_map_is_cgroup_storage(map) &&
> @@ -6778,9 +6830,12 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
> int insn_cnt = env->prog->len;
> int i;
>
> - for (i = 0; i < insn_cnt; i++, insn++)
> - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
> + for (i = 0; i < insn_cnt; i++, insn++) {
> + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) {
> insn->src_reg = 0;
> + insn->off = (insn + 1)->off = 0;
> + }
> + }
> }
>
> /* single env->prog->insni[off] instruction was replaced with the range
> diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
> index 7073dbe1ff27..5391a9a70112 100644
> --- a/tools/bpf/bpftool/xlated_dumper.c
> +++ b/tools/bpf/bpftool/xlated_dumper.c
> @@ -195,6 +195,12 @@ static const char *print_imm(void *private_data,
> if (insn->src_reg == BPF_PSEUDO_MAP_FD)
> snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
> "map[id:%u]", insn->imm);
> + else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE)
> + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
> + "map[id:%u][%u]+%u", insn->imm,
> + ((__u32)(__u16)insn[0].off) |
> + ((__u32)(__u16)insn[1].off) << 16,
> + (insn + 1)->imm);
> else
> snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
> "0x%llx", (unsigned long long)full_imm);
> --
> 2.17.1
>
Powered by blists - more mailing lists