netdev - Re: [RFC PATCH bpf-next 2/4] bpf: introduce BPF dispatcher

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEf4BzZw+jX13JuiVueKhpufQ9qHEBc0xYtqKdhhUV00afx0Gw@mail.gmail.com>
Date:   Mon, 18 Nov 2019 11:36:12 -0800
From:   Andrii Nakryiko <andrii.nakryiko@...il.com>
To:     Björn Töpel <bjorn.topel@...il.com>
Cc:     Networking <netdev@...r.kernel.org>,
        Alexei Starovoitov <ast@...nel.org>,
        Daniel Borkmann <daniel@...earbox.net>,
        Björn Töpel <bjorn.topel@...el.com>,
        bpf <bpf@...r.kernel.org>,
        Magnus Karlsson <magnus.karlsson@...il.com>,
        Magnus Karlsson <magnus.karlsson@...el.com>,
        Jonathan Lemon <jonathan.lemon@...il.com>
Subject: Re: [RFC PATCH bpf-next 2/4] bpf: introduce BPF dispatcher

On Wed, Nov 13, 2019 at 12:48 PM Björn Töpel <bjorn.topel@...il.com> wrote:
>
> From: Björn Töpel <bjorn.topel@...el.com>
>
> The BPF dispatcher builds on top of the BPF trampoline ideas;
> Introduce bpf_arch_text_poke() and (re-)use the BPF JIT generate
> code. The dispatcher builds a dispatch table for XDP programs, for
> retpoline avoidance. The table is a simple binary search model, so
> lookup is O(log n). Here, the dispatch table is limited to four
> entries (for laziness reason -- only 1B relative jumps :-P). If the
> dispatch table is full, it will fallback to the retpoline path.
>
> An example: A module/driver allocates a dispatcher. The dispatcher is
> shared for all netdevs. Each netdev allocate a slot in the dispatcher
> and a BPF program. The netdev then uses the dispatcher to call the
> correct program with a direct call (actually a tail-call).
>
> Signed-off-by: Björn Töpel <bjorn.topel@...el.com>
> ---
>  arch/x86/net/bpf_jit_comp.c |  96 ++++++++++++++++++
>  kernel/bpf/Makefile         |   1 +
>  kernel/bpf/dispatcher.c     | 197 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 294 insertions(+)
>  create mode 100644 kernel/bpf/dispatcher.c
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 28782a1c386e..d75aebf508b8 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -10,10 +10,12 @@
>  #include <linux/if_vlan.h>
>  #include <linux/bpf.h>
>  #include <linux/memory.h>
> +#include <linux/sort.h>
>  #include <asm/extable.h>
>  #include <asm/set_memory.h>
>  #include <asm/nospec-branch.h>
>  #include <asm/text-patching.h>
> +#include <asm/asm-prototypes.h>
>

[...]

> +
> +int arch_prepare_bpf_dispatcher(void *image, struct bpf_prog **progs,
> +                               int num_progs)
> +{
> +       u64 ips[BPF_DISPATCHER_MAX] = {};
> +       u8 *fallback, *prog = image;
> +       int i, err, cnt = 0;
> +
> +       if (!num_progs || num_progs > BPF_DISPATCHER_MAX)
> +               return -EINVAL;
> +
> +       for (i = 0; i < num_progs; i++)
> +               ips[i] = (u64)progs[i]->bpf_func;
> +
> +       EMIT2(0xEB, 5); /* jmp rip+5 (skip retpoline) */
> +       fallback = prog;
> +       err = emit_jmp(&prog,   /* jmp retpoline */
> +                      __x86_indirect_thunk_rdx, prog);
> +       if (err)
> +               return err;
> +
> +       sort(&ips[0], num_progs, sizeof(ips[i]), cmp_ips, NULL);

nit: sizeof(ips[i]) looks weird...

> +       return emit_bpf_dispatcher(&prog, 0, num_progs - 1, &ips[0], fallback);
> +}
> +
>  struct x64_jit_data {
>         struct bpf_binary_header *header;
>         int *addrs;

[...]

> +
> +static int bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
> +                                  struct bpf_prog *prog)
> +{
> +       struct bpf_prog **entry = NULL;
> +       int i, err = 0;
> +
> +       if (d->num_progs == BPF_DISPATCHER_MAX)
> +               return err;

err == 0, not what you want, probably

> +
> +       for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
> +               if (!entry && !d->progs[i])
> +                       entry = &d->progs[i];
> +               if (d->progs[i] == prog)
> +                       return err;
> +       }
> +
> +       prog = bpf_prog_inc(prog);
> +       if (IS_ERR(prog))
> +               return err;
> +
> +       *entry = prog;
> +       d->num_progs++;
> +       return err;
> +}
> +
> +static void bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
> +                                      struct bpf_prog *prog)
> +{
> +       int i;
> +
> +       for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
> +               if (d->progs[i] == prog) {
> +                       bpf_prog_put(prog);
> +                       d->progs[i] = NULL;
> +                       d->num_progs--;

instead of allowing holes, why not swap removed prog with the last on
in d->progs?

> +                       break;
> +               }
> +       }
> +}
> +
> +int __weak arch_prepare_bpf_dispatcher(void *image, struct bpf_prog **progs,
> +                                      int num_ids)
> +{
> +       return -ENOTSUPP;
> +}
> +
> +/* NB! bpf_dispatcher_update() might free the dispatcher! */
> +static int bpf_dispatcher_update(struct bpf_dispatcher *d)
> +{
> +       void *old_image = d->image + ((d->selector + 1) & 1) * PAGE_SIZE / 2;
> +       void *new_image = d->image + (d->selector & 1) * PAGE_SIZE / 2;
> +       int err;
> +
> +       if (d->num_progs == 0) {
> +               err = bpf_arch_text_poke(d->func, BPF_MOD_JMP_TO_NOP,
> +                                        old_image, NULL);
> +               bpf_dispatcher_free(d);
> +               goto out;
> +       }
> +
> +       err = arch_prepare_bpf_dispatcher(new_image, &d->progs[0],
> +                                         d->num_progs);
> +       if (err)
> +               goto out;
> +
> +       if (d->selector)
> +               /* progs already running at this address */
> +               err = bpf_arch_text_poke(d->func, BPF_MOD_JMP_TO_JMP,
> +                                        old_image, new_image);
> +       else
> +               /* first time registering */
> +               err = bpf_arch_text_poke(d->func, BPF_MOD_NOP_TO_JMP,
> +                                        NULL, new_image);
> +
> +       if (err)
> +               goto out;
> +       d->selector++;
> +
> +out:
> +       return err;
> +}
> +
> +void bpf_dispatcher_change_prog(void *func, struct bpf_prog *from,
> +                               struct bpf_prog *to)
> +{
> +       struct bpf_dispatcher *d;
> +
> +       if (!from && !to)
> +               return;
> +
> +       mutex_lock(&dispatcher_mutex);
> +       d = bpf_dispatcher_lookup(func);
> +       if (!d)
> +               goto out;
> +
> +       if (from)
> +               bpf_dispatcher_remove_prog(d, from);
> +
> +       if (to)
> +               bpf_dispatcher_add_prog(d, to);

this can fail

> +
> +       WARN_ON(bpf_dispatcher_update(d));

shouldn't dispatcher be removed from the list before freed? It seems
like handling dispatches freeing is better done here explicitly (and
you won't need to leave a NB remark)

> +
> +out:
> +       mutex_unlock(&dispatcher_mutex);
> +}
> +
> +static int __init init_dispatchers(void)
> +{
> +       int i;
> +
> +       for (i = 0; i < DISPATCHER_TABLE_SIZE; i++)
> +               INIT_HLIST_HEAD(&dispatcher_table[i]);
> +       return 0;
> +}
> +late_initcall(init_dispatchers);
> +
> +#endif
> --
> 2.20.1
>