[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAGXu5j+gMepR-euMhBjYqMVHsdJM-Do5yc0rLw5dMK-C9BBieA@mail.gmail.com>
Date: Wed, 23 Jul 2014 12:00:25 -0700
From: Kees Cook <keescook@...omium.org>
To: Alexei Starovoitov <ast@...mgrid.com>
Cc: "David S. Miller" <davem@...emloft.net>,
Ingo Molnar <mingo@...nel.org>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andy Lutomirski <luto@...capital.net>,
Steven Rostedt <rostedt@...dmis.org>,
Daniel Borkmann <dborkman@...hat.com>,
Chema Gonzalez <chema@...gle.com>,
Eric Dumazet <edumazet@...gle.com>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Arnaldo Carvalho de Melo <acme@...radead.org>,
Jiri Olsa <jolsa@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
"H. Peter Anvin" <hpa@...or.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Linux API <linux-api@...r.kernel.org>,
Network Development <netdev@...r.kernel.org>,
LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH RFC v2 net-next 09/16] bpf: expand BPF syscall with
program load/unload
On Thu, Jul 17, 2014 at 9:19 PM, Alexei Starovoitov <ast@...mgrid.com> wrote:
> eBPF programs are safe run-to-completion functions with load/unload
> methods from userspace similar to kernel modules.
>
> User space API:
>
> - load eBPF program
> fd = bpf_prog_load(bpf_prog_type, struct nlattr *prog, int len)
>
> where 'prog' is a sequence of sections (TEXT, LICENSE, MAP_ASSOC)
> TEXT - array of eBPF instructions
> LICENSE - must be GPL compatible to call helper functions marked gpl_only
> MAP_FIXUP - array of {insn idx, map fd} used by kernel to adjust
> imm constants in 'mov' instructions used to access maps
Nit: naming mismatch between MAP_ASSOC vs MAP_FIXUP.
>
> - unload eBPF program
> close(fd)
>
> User space example of syscall(__NR_bpf, BPF_PROG_LOAD, prog_type, ...)
> follows in later patches
>
> Signed-off-by: Alexei Starovoitov <ast@...mgrid.com>
> ---
> include/linux/bpf.h | 33 +++++
> include/linux/filter.h | 9 +-
> include/uapi/linux/bpf.h | 29 +++++
> kernel/bpf/core.c | 5 +-
> kernel/bpf/syscall.c | 309 ++++++++++++++++++++++++++++++++++++++++++++++
> net/core/filter.c | 9 +-
> 6 files changed, 388 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 91e2caf8edf9..4967619595cc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -46,4 +46,37 @@ struct bpf_map_type_list {
> void bpf_register_map_type(struct bpf_map_type_list *tl);
> struct bpf_map *bpf_map_get(u32 map_id);
>
> +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
> + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
> + * instructions after verifying
> + */
> +struct bpf_func_proto {
> + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> + bool gpl_only;
> +};
> +
> +struct bpf_verifier_ops {
> + /* return eBPF function prototype for verification */
> + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
> +};
> +
> +struct bpf_prog_type_list {
> + struct list_head list_node;
> + struct bpf_verifier_ops *ops;
> + enum bpf_prog_type type;
> +};
> +
> +void bpf_register_prog_type(struct bpf_prog_type_list *tl);
> +
> +struct bpf_prog_info {
> + bool is_gpl_compatible;
> + enum bpf_prog_type prog_type;
> + struct bpf_verifier_ops *ops;
> + u32 *used_maps;
> + u32 used_map_cnt;
> +};
> +
> +void free_bpf_prog_info(struct bpf_prog_info *info);
> +struct sk_filter *bpf_prog_get(u32 ufd);
> +
> #endif /* _LINUX_BPF_H */
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index b43ad6a2b3cf..822b310e75e1 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -30,12 +30,17 @@ struct sock_fprog_kern {
> struct sk_buff;
> struct sock;
> struct seccomp_data;
> +struct bpf_prog_info;
>
> struct sk_filter {
> atomic_t refcnt;
> u32 jited:1, /* Is our filter JIT'ed? */
> - len:31; /* Number of filter blocks */
> - struct sock_fprog_kern *orig_prog; /* Original BPF program */
> + ebpf:1, /* Is it eBPF program ? */
> + len:30; /* Number of filter blocks */
> + union {
> + struct sock_fprog_kern *orig_prog; /* Original BPF program */
> + struct bpf_prog_info *info;
> + };
> struct rcu_head rcu;
> unsigned int (*bpf_func)(const struct sk_buff *skb,
> const struct bpf_insn *filter);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 3ea11ba053a8..06ba71b49f64 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -333,6 +333,13 @@ enum bpf_cmd {
> * returns zero and stores next key or negative error
> */
> BPF_MAP_GET_NEXT_KEY,
> +
> + /* verify and load eBPF program
> + * prog_id = bpf_prog_load(bpf_prog_type, struct nlattr *prog, int len)
> + * prog is a sequence of sections
> + * returns fd or negative error
> + */
> + BPF_PROG_LOAD,
> };
>
> enum bpf_map_attributes {
> @@ -350,4 +357,26 @@ enum bpf_map_type {
> BPF_MAP_TYPE_HASH,
> };
>
> +enum bpf_prog_attributes {
> + BPF_PROG_UNSPEC,
> + BPF_PROG_TEXT, /* array of eBPF instructions */
> + BPF_PROG_LICENSE, /* license string */
> + BPF_PROG_MAP_FIXUP, /* array of {insn idx, map fd} to fixup insns */
> + __BPF_PROG_ATTR_MAX,
> +};
> +#define BPF_PROG_ATTR_MAX (__BPF_PROG_ATTR_MAX - 1)
> +#define BPF_PROG_MAX_ATTR_SIZE 65535
> +
> +enum bpf_prog_type {
> + BPF_PROG_TYPE_UNSPEC,
> +};
> +
> +/* integer value in 'imm' field of BPF_CALL instruction selects which helper
> + * function eBPF program intends to call
> + */
> +enum bpf_func_id {
> + BPF_FUNC_unspec,
> + __BPF_FUNC_MAX_ID,
> +};
> +
> #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 265a02cc822d..e65ecdc36358 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -23,6 +23,7 @@
> #include <linux/filter.h>
> #include <linux/skbuff.h>
> #include <asm/unaligned.h>
> +#include <linux/bpf.h>
>
> /* Registers */
> #define BPF_R0 regs[BPF_REG_0]
> @@ -528,9 +529,11 @@ void sk_filter_select_runtime(struct sk_filter *fp)
> }
> EXPORT_SYMBOL_GPL(sk_filter_select_runtime);
>
> -/* free internal BPF program */
> +/* free internal BPF program, called after RCU grace period */
> void sk_filter_free(struct sk_filter *fp)
> {
> + if (fp->ebpf)
> + free_bpf_prog_info(fp->info);
> bpf_jit_free(fp);
> }
> EXPORT_SYMBOL_GPL(sk_filter_free);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index ca2be66845b3..9e45ca6b6937 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -14,6 +14,8 @@
> #include <net/netlink.h>
> #include <linux/anon_inodes.h>
> #include <linux/file.h>
> +#include <linux/license.h>
> +#include <linux/filter.h>
>
> /* mutex to protect insertion/deletion of map_id in IDR */
> static DEFINE_MUTEX(bpf_map_lock);
> @@ -406,6 +408,310 @@ err_unlock:
> return err;
> }
>
> +static LIST_HEAD(bpf_prog_types);
> +
> +static int find_prog_type(enum bpf_prog_type type, struct sk_filter *prog)
> +{
> + struct bpf_prog_type_list *tl;
> +
> + list_for_each_entry(tl, &bpf_prog_types, list_node) {
> + if (tl->type == type) {
> + prog->info->ops = tl->ops;
> + prog->info->prog_type = type;
> + return 0;
> + }
> + }
> + return -EINVAL;
> +}
> +
> +void bpf_register_prog_type(struct bpf_prog_type_list *tl)
> +{
> + list_add(&tl->list_node, &bpf_prog_types);
> +}
> +
> +/* fixup insn->imm field of bpf_call instructions:
> + * if (insn->imm == BPF_FUNC_map_lookup_elem)
> + * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
> + * else if (insn->imm == BPF_FUNC_map_update_elem)
> + * insn->imm = bpf_map_update_elem - __bpf_call_base;
> + * else ...
> + *
> + * this function is called after eBPF program passed verification
> + */
> +static void fixup_bpf_calls(struct sk_filter *prog)
> +{
> + const struct bpf_func_proto *fn;
> + int i;
> +
> + for (i = 0; i < prog->len; i++) {
> + struct bpf_insn *insn = &prog->insnsi[i];
> +
> + if (insn->code == (BPF_JMP | BPF_CALL)) {
> + /* we reach here when program has bpf_call instructions
> + * and it passed bpf_check(), means that
> + * ops->get_func_proto must have been supplied, check it
> + */
> + BUG_ON(!prog->info->ops->get_func_proto);
> +
> + fn = prog->info->ops->get_func_proto(insn->imm);
> + /* all functions that have prototype and verifier allowed
> + * programs to call them, must be real in-kernel functions
> + */
> + BUG_ON(!fn->func);
> + insn->imm = fn->func - __bpf_call_base;
> + }
> + }
> +}
> +
> +/* fixup instructions that are using map_ids:
> + *
> + * BPF_MOV64_IMM(BPF_REG_1, MAP_ID), // r1 = MAP_ID
> + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
> + *
> + * in the 1st insn kernel replaces MAP_ID with global map_id,
> + * since programs are executing out of different contexts and must use
> + * globally visible ids to access maps
> + *
> + * map_fixup is an array of pairs {insn idx, map ufd}
> + *
> + * kernel resolves ufd -> global map_id and adjusts eBPF instructions
> + */
> +static int fixup_bpf_map_id(struct sk_filter *prog, struct nlattr *map_fixup)
> +{
> + struct {
> + u32 insn_idx;
> + u32 ufd;
> + } *fixup = nla_data(map_fixup);
> + int fixup_len = nla_len(map_fixup) / sizeof(*fixup);
> + struct bpf_insn *insn;
> + struct fd f;
> + u32 idx;
> + int i, map_id;
> +
> + if (fixup_len <= 0)
> + return -EINVAL;
> +
> + for (i = 0; i < fixup_len; i++) {
> + idx = fixup[i].insn_idx;
> + if (idx >= prog->len)
> + return -EINVAL;
> +
> + insn = &prog->insnsi[idx];
> + if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_K) &&
> + insn->code != (BPF_ALU | BPF_MOV | BPF_K))
> + return -EINVAL;
> +
> + f = fdget(fixup[i].ufd);
> +
> + map_id = get_map_id(f);
> +
> + if (map_id < 0)
> + return map_id;
> +
> + insn->imm = map_id;
> + fdput(f);
It looks like there a potentially race risk of a map_id changing out
from under a running program? Between the call to fixup_bpf_map_id()
and the bpf_map_get() calls during bpf_prog_load() below...
> + }
> + return 0;
> +}
> +
> +/* free eBPF program auxilary data, called after rcu grace period,
> + * so it's safe to drop refcnt on maps used by this program
> + *
> + * called from sk_filter_release()->sk_filter_release_rcu()->sk_filter_free()
> + */
> +void free_bpf_prog_info(struct bpf_prog_info *info)
> +{
> + bool found;
> + int i;
> +
> + for (i = 0; i < info->used_map_cnt; i++) {
> + found = bpf_map_put(info->used_maps[i]);
> + /* all maps that this program was using should obviously still
> + * be there
> + */
> + BUG_ON(!found);
> + }
> + kfree(info);
> +}
> +
> +static int bpf_prog_release(struct inode *inode, struct file *filp)
> +{
> + struct sk_filter *prog = filp->private_data;
> +
> + sk_unattached_filter_destroy(prog);
> + return 0;
> +}
> +
> +static const struct file_operations bpf_prog_fops = {
> + .release = bpf_prog_release,
> +};
> +
> +static const struct nla_policy prog_policy[BPF_PROG_ATTR_MAX + 1] = {
> + [BPF_PROG_TEXT] = { .type = NLA_BINARY },
> + [BPF_PROG_LICENSE] = { .type = NLA_NUL_STRING },
> + [BPF_PROG_MAP_FIXUP] = { .type = NLA_BINARY },
> +};
> +
> +static int bpf_prog_load(enum bpf_prog_type type, struct nlattr __user *uattr,
> + int len)
> +{
> + struct nlattr *tb[BPF_PROG_ATTR_MAX + 1];
> + struct sk_filter *prog;
> + struct bpf_map *map;
> + struct nlattr *attr;
> + size_t insn_len;
> + int err, i;
> + bool is_gpl;
> +
> + if (len <= 0 || len > BPF_PROG_MAX_ATTR_SIZE)
> + return -EINVAL;
> +
> + attr = kmalloc(len, GFP_USER);
> + if (!attr)
> + return -ENOMEM;
> +
> + /* copy eBPF program from user space */
> + err = -EFAULT;
> + if (copy_from_user(attr, uattr, len) != 0)
> + goto free_attr;
> +
> + /* perform basic validation */
> + err = nla_parse(tb, BPF_PROG_ATTR_MAX, attr, len, prog_policy);
> + if (err < 0)
> + goto free_attr;
> +
> + err = -EINVAL;
> + /* look for mandatory license string */
> + if (!tb[BPF_PROG_LICENSE])
> + goto free_attr;
> +
> + /* eBPF programs must be GPL compatible to use GPL-ed functions */
> + is_gpl = license_is_gpl_compatible(nla_data(tb[BPF_PROG_LICENSE]));
> +
> + /* look for mandatory array of eBPF instructions */
> + if (!tb[BPF_PROG_TEXT])
> + goto free_attr;
> +
> + insn_len = nla_len(tb[BPF_PROG_TEXT]);
> + if (insn_len % sizeof(struct bpf_insn) != 0 || insn_len <= 0)
> + goto free_attr;
> +
> + /* plain sk_filter allocation */
> + err = -ENOMEM;
> + prog = kmalloc(sk_filter_size(insn_len), GFP_USER);
> + if (!prog)
> + goto free_attr;
> +
> + prog->len = insn_len / sizeof(struct bpf_insn);
> + memcpy(prog->insns, nla_data(tb[BPF_PROG_TEXT]), insn_len);
> + prog->orig_prog = NULL;
> + prog->jited = 0;
> + prog->ebpf = 0;
> + atomic_set(&prog->refcnt, 1);
> +
> + if (tb[BPF_PROG_MAP_FIXUP]) {
> + /* if program is using maps, fixup map_ids */
> + err = fixup_bpf_map_id(prog, tb[BPF_PROG_MAP_FIXUP]);
> + if (err < 0)
> + goto free_prog;
> + }
> +
> + /* allocate eBPF related auxilary data */
> + prog->info = kzalloc(sizeof(struct bpf_prog_info), GFP_USER);
> + if (!prog->info)
> + goto free_prog;
> + prog->ebpf = 1;
> + prog->info->is_gpl_compatible = is_gpl;
> +
> + /* find program type: socket_filter vs tracing_filter */
> + err = find_prog_type(type, prog);
> + if (err < 0)
> + goto free_prog;
> +
> + /* lock maps to prevent any changes to maps, since eBPF program may
> + * use them. In such case bpf_check() will populate prog->used_maps
> + */
> + mutex_lock(&bpf_map_lock);
> +
> + /* run eBPF verifier */
> + /* err = bpf_check(prog); */
> +
> + if (err == 0 && prog->info->used_maps) {
> + /* program passed verifier and it's using some maps,
> + * hold them
> + */
> + for (i = 0; i < prog->info->used_map_cnt; i++) {
> + map = bpf_map_get(prog->info->used_maps[i]);
> + BUG_ON(!map);
> + atomic_inc(&map->refcnt);
> + }
> + }
> + mutex_unlock(&bpf_map_lock);
As mentioned above, I think fixup_bpf_map_id needs to be done under
the map_lock mutex, unless I'm misunderstanding something in the
object lifetime.
> +
> + if (err < 0)
> + goto free_prog;
> +
> + /* fixup BPF_CALL->imm field */
> + fixup_bpf_calls(prog);
> +
> + /* eBPF program is ready to be JITed */
> + sk_filter_select_runtime(prog);
> +
> + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
> +
> + if (err < 0)
> + /* failed to allocate fd */
> + goto free_prog;
> +
> + /* user supplied eBPF prog attributes are no longer needed */
> + kfree(attr);
> +
> + return err;
> +free_prog:
> + sk_filter_free(prog);
> +free_attr:
> + kfree(attr);
> + return err;
> +}
> +
> +static struct sk_filter *get_prog(struct fd f)
> +{
> + struct sk_filter *prog;
> +
> + if (!f.file)
> + return ERR_PTR(-EBADF);
> +
> + if (f.file->f_op != &bpf_prog_fops) {
> + fdput(f);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + prog = f.file->private_data;
> +
> + return prog;
> +}
> +
> +/* called from sk_attach_filter_ebpf() or from tracing filter attach
> + * pairs with
> + * sk_detach_filter()->sk_filter_uncharge()->sk_filter_release()
> + * or with
> + * sk_unattached_filter_destroy()->sk_filter_release()
> + */
> +struct sk_filter *bpf_prog_get(u32 ufd)
> +{
> + struct fd f = fdget(ufd);
> + struct sk_filter *prog;
> +
> + prog = get_prog(f);
> +
> + if (IS_ERR(prog))
> + return prog;
> +
> + atomic_inc(&prog->refcnt);
> + fdput(f);
> + return prog;
> +}
> +
> SYSCALL_DEFINE5(bpf, int, cmd, unsigned long, arg2, unsigned long, arg3,
> unsigned long, arg4, unsigned long, arg5)
> {
> @@ -428,6 +734,9 @@ SYSCALL_DEFINE5(bpf, int, cmd, unsigned long, arg2, unsigned long, arg3,
> case BPF_MAP_GET_NEXT_KEY:
> return map_get_next_key((int) arg2, (void __user *) arg3,
> (void __user *) arg4);
> + case BPF_PROG_LOAD:
> + return bpf_prog_load((enum bpf_prog_type) arg2,
> + (struct nlattr __user *) arg3, (int) arg4);
> default:
> return -EINVAL;
> }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index f3b2d5e9fe5f..255dba1bb678 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -835,7 +835,7 @@ static void sk_release_orig_filter(struct sk_filter *fp)
> {
> struct sock_fprog_kern *fprog = fp->orig_prog;
>
> - if (fprog) {
> + if (!fp->ebpf && fprog) {
> kfree(fprog->filter);
> kfree(fprog);
> }
> @@ -867,14 +867,16 @@ static void sk_filter_release(struct sk_filter *fp)
>
> void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
> {
> - atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
> + if (!fp->ebpf)
> + atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
> sk_filter_release(fp);
> }
>
> void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
> {
> atomic_inc(&fp->refcnt);
> - atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
> + if (!fp->ebpf)
> + atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
> }
>
> static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
> @@ -978,6 +980,7 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
>
> fp->bpf_func = NULL;
> fp->jited = 0;
> + fp->ebpf = 0;
>
> err = sk_chk_filter(fp->insns, fp->len);
> if (err) {
> --
> 1.7.9.5
>
-Kees
--
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists