lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Tue, 4 Mar 2014 19:11:49 -0800
From:	Alexei Starovoitov <ast@...mgrid.com>
To:	Daniel Borkmann <dborkman@...hat.com>,
	Kees Cook <keescook@...omium.org>
Cc:	"David S. Miller" <davem@...emloft.net>,
	Ingo Molnar <mingo@...nel.org>, Will Drewry <wad@...omium.org>,
	Steven Rostedt <rostedt@...dmis.org>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	"H. Peter Anvin" <hpa@...or.com>,
	Hagen Paul Pfeifer <hagen@...u.net>,
	Jesse Gross <jesse@...ira.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Masami Hiramatsu <masami.hiramatsu.pt@...achi.com>,
	Tom Zanussi <tom.zanussi@...ux.intel.com>,
	Jovi Zhangwei <jovi.zhangwei@...il.com>,
	Eric Dumazet <edumazet@...gle.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Frederic Weisbecker <fweisbec@...il.com>,
	Arnaldo Carvalho de Melo <acme@...radead.org>,
	Pekka Enberg <penberg@....fi>,
	Arjan van de Ven <arjan@...radead.org>,
	Christoph Hellwig <hch@...radead.org>,
	linux-kernel@...r.kernel.org, netdev@...r.kernel.org
Subject: Re: [PATCH v5 net-next 2/3] [RFC] seccomp: convert seccomp to use
 extended BPF

On Tue, Mar 4, 2014 at 2:17 PM, Alexei Starovoitov <ast@...mgrid.com> wrote:
> use sk_convert_filter() to convert seccomp BPF into extended BPF
>
> 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:
>   seccomp_rule_add_exact(ctx,...
>   seccomp_rule_add_exact(ctx,...
>   rc = seccomp_load(ctx);
>   for (i = 0; i < 10000000; i++)
>      syscall(199, 100);
>
> --x86_64--
> old BPF: 8.6 seconds
>     73.38%    bench  [kernel.kallsyms]  [k] sk_run_filter
>     10.70%    bench  libc-2.15.so       [.] syscall
>      5.09%    bench  [kernel.kallsyms]  [k] seccomp_bpf_load
>      1.97%    bench  [kernel.kallsyms]  [k] system_call
> ext BPF: 5.7 seconds
>     66.20%    bench  [kernel.kallsyms]  [k] sk_run_filter_ext
>     16.75%    bench  libc-2.15.so       [.] syscall
>      3.31%    bench  [kernel.kallsyms]  [k] system_call
>      2.88%    bench  [kernel.kallsyms]  [k] __secure_computing
>
> --i386---
> old BPF: 5.4 sec
> ext BPF: 3.8 sec
>
> BPF filters generated by seccomp are very branchy, so ext BPF
> performance is better than old BPF.
>
> Performance gains will be even higher when extended BPF JIT
> is committed.
>
> Signed-off-by: Alexei Starovoitov <ast@...mgrid.com>
>
> ---
> This patch is an RFC to use extended BPF in seccomp.
> Change it to do it conditionally with bpf_ext_enable knob ?

Kees, Will,

I've played with libseccomp to test this patch and just found
your other testsuite for bpf+seccomp:
https://github.com/redpig/seccomp
It passes as well on x86_64 and i386.

Not sure how realistic all the 'seccomp' and 'libseccomp' bpf filters are.
Some of them are very short, some very long.
If the average number of filtered syscalls is > 10, then the upcoming
'ebpf table' will give another performance boost, since a single table
lookup will be faster than a long sequence of 'if' conditions.

Please review.

Thanks
Alexei

> ---
>  include/linux/seccomp.h |    1 -
>  kernel/seccomp.c        |  112 +++++++++++++++++++++--------------------------
>  net/core/filter.c       |    5 ---
>  3 files changed, 51 insertions(+), 67 deletions(-)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 6f19cfd1840e..4054b0994071 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
>  #ifdef CONFIG_SECCOMP_FILTER
>  extern void put_seccomp_filter(struct task_struct *tsk);
>  extern void get_seccomp_filter(struct task_struct *tsk);
> -extern u32 seccomp_bpf_load(int off);
>  #else  /* CONFIG_SECCOMP_FILTER */
>  static inline void put_seccomp_filter(struct task_struct *tsk)
>  {
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index b7a10048a32c..346c597b44cc 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -55,60 +55,27 @@ struct seccomp_filter {
>         atomic_t usage;
>         struct seccomp_filter *prev;
>         unsigned short len;  /* Instruction count */
> -       struct sock_filter insns[];
> +       struct sock_filter_ext insns[];
>  };
>
>  /* Limit any path through the tree to 256KB worth of instructions. */
>  #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
>
> -/**
> - * get_u32 - returns a u32 offset into data
> - * @data: a unsigned 64 bit value
> - * @index: 0 or 1 to return the first or second 32-bits
> - *
> - * This inline exists to hide the length of unsigned long.  If a 32-bit
> - * unsigned long is passed in, it will be extended and the top 32-bits will be
> - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
> - * properly returned.
> - *
> +/*
>   * Endianness is explicitly ignored and left for BPF program authors to manage
>   * as per the specific architecture.
>   */
> -static inline u32 get_u32(u64 data, int index)
> -{
> -       return ((u32 *)&data)[index];
> -}
> -
> -/* Helper for bpf_load below. */
> -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
> -/**
> - * bpf_load: checks and returns a pointer to the requested offset
> - * @off: offset into struct seccomp_data to load from
> - *
> - * Returns the requested 32-bits of data.
> - * seccomp_check_filter() should assure that @off is 32-bit aligned
> - * and not out of bounds.  Failure to do so is a BUG.
> - */
> -u32 seccomp_bpf_load(int off)
> +static void populate_seccomp_data(struct seccomp_data *sd)
>  {
>         struct pt_regs *regs = task_pt_regs(current);
> -       if (off == BPF_DATA(nr))
> -               return syscall_get_nr(current, regs);
> -       if (off == BPF_DATA(arch))
> -               return syscall_get_arch(current, regs);
> -       if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
> -               unsigned long value;
> -               int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
> -               int index = !!(off % sizeof(u64));
> -               syscall_get_arguments(current, regs, arg, 1, &value);
> -               return get_u32(value, index);
> -       }
> -       if (off == BPF_DATA(instruction_pointer))
> -               return get_u32(KSTK_EIP(current), 0);
> -       if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
> -               return get_u32(KSTK_EIP(current), 1);
> -       /* seccomp_check_filter should make this impossible. */
> -       BUG();
> +       int i;
> +
> +       sd->nr = syscall_get_nr(current, regs);
> +       sd->arch = syscall_get_arch(current, regs);
> +       for (i = 0; i < 6; i++)
> +               syscall_get_arguments(current, regs, i, 1,
> +                                     (unsigned long *)&sd->args[i]);
> +       sd->instruction_pointer = KSTK_EIP(current);
>  }
>
>  /**
> @@ -133,17 +100,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>
>                 switch (code) {
>                 case BPF_S_LD_W_ABS:
> -                       ftest->code = BPF_S_ANC_SECCOMP_LD_W;
> +                       ftest->code = BPF_LDX | BPF_W | BPF_ABS;
>                         /* 32-bit aligned and not out of bounds. */
>                         if (k >= sizeof(struct seccomp_data) || k & 3)
>                                 return -EINVAL;
>                         continue;
>                 case BPF_S_LD_W_LEN:
> -                       ftest->code = BPF_S_LD_IMM;
> +                       ftest->code = BPF_LD | BPF_IMM;
>                         ftest->k = sizeof(struct seccomp_data);
>                         continue;
>                 case BPF_S_LDX_W_LEN:
> -                       ftest->code = BPF_S_LDX_IMM;
> +                       ftest->code = BPF_LDX | BPF_IMM;
>                         ftest->k = sizeof(struct seccomp_data);
>                         continue;
>                 /* Explicitly include allowed calls. */
> @@ -185,6 +152,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>                 case BPF_S_JMP_JGT_X:
>                 case BPF_S_JMP_JSET_K:
>                 case BPF_S_JMP_JSET_X:
> +                       sk_decode_filter(ftest, ftest);
>                         continue;
>                 default:
>                         return -EINVAL;
> @@ -202,18 +170,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>  static u32 seccomp_run_filters(int syscall)
>  {
>         struct seccomp_filter *f;
> +       struct seccomp_data sd;
>         u32 ret = SECCOMP_RET_ALLOW;
>
>         /* Ensure unexpected behavior doesn't result in failing open. */
>         if (WARN_ON(current->seccomp.filter == NULL))
>                 return SECCOMP_RET_KILL;
>
> +       populate_seccomp_data(&sd);
> +
>         /*
>          * All filters in the list are evaluated and the lowest BPF return
>          * value always takes priority (ignoring the DATA).
>          */
>         for (f = current->seccomp.filter; f; f = f->prev) {
> -               u32 cur_ret = sk_run_filter(NULL, f->insns);
> +               u32 cur_ret = sk_run_filter_ext(&sd, f->insns);
>                 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>                         ret = cur_ret;
>         }
> @@ -231,6 +202,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>         struct seccomp_filter *filter;
>         unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
>         unsigned long total_insns = fprog->len;
> +       struct sock_filter *fp;
> +       int new_len;
>         long ret;
>
>         if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
> @@ -252,28 +225,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>                                      CAP_SYS_ADMIN) != 0)
>                 return -EACCES;
>
> -       /* Allocate a new seccomp_filter */
> -       filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
> -                        GFP_KERNEL|__GFP_NOWARN);
> -       if (!filter)
> +       fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
> +       if (!fp)
>                 return -ENOMEM;
> -       atomic_set(&filter->usage, 1);
> -       filter->len = fprog->len;
>
>         /* Copy the instructions from fprog. */
>         ret = -EFAULT;
> -       if (copy_from_user(filter->insns, fprog->filter, fp_size))
> -               goto fail;
> +       if (copy_from_user(fp, fprog->filter, fp_size))
> +               goto free_prog;
>
>         /* Check and rewrite the fprog via the skb checker */
> -       ret = sk_chk_filter(filter->insns, filter->len);
> +       ret = sk_chk_filter(fp, fprog->len);
>         if (ret)
> -               goto fail;
> +               goto free_prog;
>
>         /* Check and rewrite the fprog for seccomp use */
> -       ret = seccomp_check_filter(filter->insns, filter->len);
> +       ret = seccomp_check_filter(fp, fprog->len);
>         if (ret)
> -               goto fail;
> +               goto free_prog;
> +
> +       /* convert 'sock_filter' insns to 'sock_filter_ext' insns */
> +       ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
> +       if (ret)
> +               goto free_prog;
> +
> +       /* Allocate a new seccomp_filter */
> +       filter = kzalloc(sizeof(struct seccomp_filter) +
> +                        sizeof(struct sock_filter_ext) * new_len,
> +                        GFP_KERNEL|__GFP_NOWARN);
> +       if (!filter)
> +               goto free_prog;
> +
> +       ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len);
> +       if (ret)
> +               goto free_filter;
> +       atomic_set(&filter->usage, 1);
> +       filter->len = new_len;
>
>         /*
>          * If there is an existing filter, make it the prev and don't drop its
> @@ -282,8 +269,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
>         filter->prev = current->seccomp.filter;
>         current->seccomp.filter = filter;
>         return 0;
> -fail:
> +
> +free_filter:
>         kfree(filter);
> +free_prog:
> +       kfree(fp);
>         return ret;
>  }
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1494421486b7..f144a6a7d939 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -388,11 +388,6 @@ load_b:
>                                 A = 0;
>                         continue;
>                 }
> -#ifdef CONFIG_SECCOMP_FILTER
> -               case BPF_S_ANC_SECCOMP_LD_W:
> -                       A = seccomp_bpf_load(fentry->k);
> -                       continue;
> -#endif
>                 default:
>                         WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
>                                        fentry->code, fentry->jt,
> --
> 1.7.9.5
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists