[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1379386119-4157-2-git-send-email-ast@plumgrid.com>
Date: Mon, 16 Sep 2013 19:48:38 -0700
From: Alexei Starovoitov <ast@...mgrid.com>
To: "David S. Miller" <davem@...emloft.net>, netdev@...r.kernel.org,
Eric Dumazet <edumazet@...gle.com>,
Alexey Kuznetsov <kuznet@....inr.ac.ru>,
James Morris <jmorris@...ei.org>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
Patrick McHardy <kaber@...sh.net>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>,
"H. Peter Anvin" <hpa@...or.com>,
Daniel Borkmann <dborkman@...hat.com>,
"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
Xi Wang <xi.wang@...il.com>,
David Howells <dhowells@...hat.com>,
Cong Wang <xiyou.wangcong@...il.com>,
Jesse Gross <jesse@...ira.com>,
Pravin B Shelar <pshelar@...ira.com>,
Ben Pfaff <blp@...ira.com>, Thomas Graf <tgraf@...g.ch>,
dev@...nvswitch.org
Subject: [RFC PATCH v2 net-next 1/2] extended BPF
extended BPF program = BPF insns + BPF tables
flexible instruction set:
- from two 32-bit registers (A and X) to ten 64-bit regs
- add conditional jump back, signed compare, bswap
- in addition to old load[1,2,4,8] bytes, add store[1,2,4,8] bytes
- fixed set of function calls via simple ABI:
R0 - return register
R1-R5 - argument passing
R6-R9 - callee saved
R10 - frame pointer
- bpf_table_lookup/bpf_table_update functions to access BPF tables
- generic 'struct bpf_context' = input/output argument to BPF program
BPF table is defined by
- type, id, number of elements, key size, element size
To use generic BPF engine other kernel components will define:
- the body of 'bpf_context' and access permission
- available function calls: their prototypes for BPF checker,
body for BPF interpreter and JIT
BPF programs can be written in restricted C
GCC backend for BPF is available
BPF checker does full program validation before it is JITed or
run in interpreter
Signed-off-by: Alexei Starovoitov <ast@...mgrid.com>
---
arch/x86/net/Makefile | 2 +-
arch/x86/net/bpf2_jit_comp.c | 610 ++++++++++++++++++++++++
arch/x86/net/bpf_jit_comp.c | 41 +-
arch/x86/net/bpf_jit_comp.h | 36 ++
include/linux/filter.h | 79 ++++
include/uapi/linux/filter.h | 125 ++++-
net/core/Makefile | 2 +-
net/core/bpf_check.c | 1043 ++++++++++++++++++++++++++++++++++++++++++
net/core/bpf_run.c | 412 +++++++++++++++++
9 files changed, 2315 insertions(+), 35 deletions(-)
create mode 100644 arch/x86/net/bpf2_jit_comp.c
create mode 100644 arch/x86/net/bpf_jit_comp.h
create mode 100644 net/core/bpf_check.c
create mode 100644 net/core/bpf_run.c
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c3..54f57c9 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,4 +1,4 @@
#
# Arch-specific network modules
#
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o bpf2_jit_comp.o
diff --git a/arch/x86/net/bpf2_jit_comp.c b/arch/x86/net/bpf2_jit_comp.c
new file mode 100644
index 0000000..2558ed7
--- /dev/null
+++ b/arch/x86/net/bpf2_jit_comp.c
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2011-2013 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/moduleloader.h>
+#include "bpf_jit_comp.h"
+
+static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else
+ *(u32 *)ptr = bytes;
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) (prog = emit_code(prog, (bytes), (len)))
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+/* imm32 is sign extended by cpu */
+#define EMIT1_off32(b1, off) \
+ do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+/* mov A, X */
+#define EMIT_mov(A, X) \
+ EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X))
+
+#define X86_JAE 0x73
+#define X86_JE 0x74
+#define X86_JNE 0x75
+#define X86_JA 0x77
+#define X86_JGE 0x7D
+#define X86_JG 0x7F
+
+static inline bool is_imm8(__s32 value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static inline bool is_simm32(__s64 value)
+{
+ return value == (__s64)(__s32)value;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
+
+#define AUX_REG 32
+
+/* avoid x86-64 R12 which if used as base address in memory access
+ * always needs an extra byte for index */
+static const int reg2hex[] = {
+ [R0] = 0, /* rax */
+ [R1] = 7, /* rdi */
+ [R2] = 6, /* rsi */
+ [R3] = 2, /* rdx */
+ [R4] = 1, /* rcx */
+ [R5] = 0, /* r8 */
+ [R6] = 3, /* rbx callee saved */
+ [R7] = 5, /* r13 callee saved */
+ [R8] = 6, /* r14 callee saved */
+ [R9] = 7, /* r15 callee saved */
+ [__fp__] = 5, /* rbp readonly */
+ [AUX_REG] = 1, /* r9 temp register */
+};
+
+/* is_ereg() == true if r8 <= reg <= r15,
+ * rax,rcx,...,rbp don't need extra byte of encoding */
+static inline bool is_ereg(u32 reg)
+{
+ if (reg == R5 || (reg >= R7 && reg <= R9) || reg == AUX_REG)
+ return true;
+ else
+ return false;
+}
+
+static inline u8 add_1mod(u8 byte, u32 reg)
+{
+ if (is_ereg(reg))
+ byte |= 1;
+ return byte;
+}
+static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
+{
+ if (is_ereg(r1))
+ byte |= 1;
+ if (is_ereg(r2))
+ byte |= 4;
+ return byte;
+}
+
+static inline u8 add_1reg(u8 byte, u32 a_reg)
+{
+ return byte + reg2hex[a_reg];
+}
+static inline u8 add_2reg(u8 byte, u32 a_reg, u32 x_reg)
+{
+ return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3);
+}
+
+static u8 *select_bpf_func(struct bpf_program *prog, int id)
+{
+ if (id < 0 || id >= FUNC_bpf_max_id)
+ return NULL;
+ return prog->cb->jit_select_func(id);
+}
+
+static int do_jit(struct bpf_program *bpf_prog, int *addrs, u8 *image,
+ int oldproglen)
+{
+ struct bpf_insn *insn = bpf_prog->insns;
+ int insn_cnt = bpf_prog->insn_cnt;
+ u8 temp[64];
+ int i;
+ int proglen = 0;
+ u8 *prog = temp;
+ int stacksize = 512;
+
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+
+ /* sub rsp, stacksize */
+ EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+ /* mov qword ptr [rbp-X],rbx */
+ EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+ /* mov qword ptr [rbp-X],r13 */
+ EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+ /* mov qword ptr [rbp-X],r14 */
+ EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+ /* mov qword ptr [rbp-X],r15 */
+ EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const __s32 K = insn->imm;
+ __u32 a_reg = insn->a_reg;
+ __u32 x_reg = insn->x_reg;
+ u8 b1 = 0, b2 = 0, b3 = 0;
+ u8 jmp_cond;
+ __s64 jmp_offset;
+ int ilen;
+ u8 *func;
+
+ switch (insn->code) {
+ /* ALU */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ b1 = 0x48;
+ b3 = 0xC0;
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b2 = 0x01; break;
+ case BPF_SUB: b2 = 0x29; break;
+ case BPF_AND: b2 = 0x21; break;
+ case BPF_OR: b2 = 0x09; break;
+ case BPF_XOR: b2 = 0x31; break;
+ }
+ EMIT3(add_2mod(b1, a_reg, x_reg), b2,
+ add_2reg(b3, a_reg, x_reg));
+ break;
+
+ /* mov A, X */
+ case BPF_ALU | BPF_MOV | BPF_X:
+ EMIT_mov(a_reg, x_reg);
+ break;
+
+ /* neg A */
+ case BPF_ALU | BPF_NEG | BPF_X:
+ EMIT3(add_1mod(0x48, a_reg), 0xF7,
+ add_1reg(0xD8, a_reg));
+ break;
+
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ b1 = add_1mod(0x48, a_reg);
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b3 = 0xC0; break;
+ case BPF_SUB: b3 = 0xE8; break;
+ case BPF_AND: b3 = 0xE0; break;
+ case BPF_OR: b3 = 0xC8; break;
+ }
+
+ if (is_imm8(K))
+ EMIT4(b1, 0x83, add_1reg(b3, a_reg), K);
+ else
+ EMIT3_off32(b1, 0x81, add_1reg(b3, a_reg), K);
+ break;
+
+ case BPF_ALU | BPF_MOV | BPF_K:
+ /* 'mov rax, imm32' sign extends imm32.
+ * possible optimization: if imm32 is positive,
+ * use 'mov eax, imm32' (which zero-extends imm32)
+ * to save 2 bytes */
+ b1 = add_1mod(0x48, a_reg);
+ b2 = 0xC7;
+ b3 = 0xC0;
+ EMIT3_off32(b1, b2, add_1reg(b3, a_reg), K);
+ break;
+
+ /* A %= X
+ * A /= X */
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ /* mov r9, X */
+ EMIT_mov(AUX_REG, x_reg);
+
+ /* mov rax, A */
+ EMIT_mov(R0, a_reg);
+
+ /* xor rdx, rdx */
+ EMIT3(0x48, 0x31, 0xd2);
+
+ /* if X==0, skip divide, make A=0 */
+
+ /* cmp r9, 0 */
+ EMIT4(0x49, 0x83, 0xF9, 0x00);
+
+ /* je .+3 */
+ EMIT2(X86_JE, 3);
+
+ /* div r9 */
+ EMIT3(0x49, 0xF7, 0xF1);
+
+ if (BPF_OP(insn->code) == BPF_MOD) {
+ /* mov r9, rdx */
+ EMIT3(0x49, 0x89, 0xD1);
+ } else {
+ /* mov r9, rax */
+ EMIT3(0x49, 0x89, 0xC1);
+ }
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov A, r9 */
+ EMIT_mov(a_reg, AUX_REG);
+ break;
+
+ /* shifts */
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ b1 = add_1mod(0x48, a_reg);
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+ EMIT4(b1, 0xC1, add_1reg(b3, a_reg), K);
+ break;
+
+ case BPF_ALU | BPF_BSWAP32 | BPF_X:
+ /* emit 'bswap eax' to swap lower 4-bytes */
+ if (is_ereg(a_reg))
+ EMIT2(0x41, 0x0F);
+ else
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, a_reg));
+ break;
+
+ case BPF_ALU | BPF_BSWAP64 | BPF_X:
+ /* emit 'bswap rax' to swap 8-bytes */
+ EMIT3(add_1mod(0x48, a_reg), 0x0F, add_1reg(0xC8, a_reg));
+ break;
+
+ /* ST: *(u8*)(a_reg + off) = imm */
+ case BPF_ST | BPF_REL | BPF_B:
+ if (is_ereg(a_reg))
+ EMIT2(0x41, 0xC6);
+ else
+ EMIT1(0xC6);
+ goto st;
+ case BPF_ST | BPF_REL | BPF_H:
+ if (is_ereg(a_reg))
+ EMIT3(0x66, 0x41, 0xC7);
+ else
+ EMIT2(0x66, 0xC7);
+ goto st;
+ case BPF_ST | BPF_REL | BPF_W:
+ if (is_ereg(a_reg))
+ EMIT2(0x41, 0xC7);
+ else
+ EMIT1(0xC7);
+ goto st;
+ case BPF_ST | BPF_REL | BPF_DW:
+ EMIT2(add_1mod(0x48, a_reg), 0xC7);
+
+st: if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, a_reg), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, a_reg), insn->off);
+
+ EMIT(K, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
+ break;
+
+ /* STX: *(u8*)(a_reg + off) = x_reg */
+ case BPF_STX | BPF_REL | BPF_B:
+ /* emit 'mov byte ptr [rax + off], al' */
+ if (is_ereg(a_reg) || is_ereg(x_reg) ||
+ /* have to add extra byte for x86 SIL, DIL regs */
+ x_reg == R1 || x_reg == R2)
+ EMIT2(add_2mod(0x40, a_reg, x_reg), 0x88);
+ else
+ EMIT1(0x88);
+ goto stx;
+ case BPF_STX | BPF_REL | BPF_H:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT3(0x66, add_2mod(0x40, a_reg, x_reg), 0x89);
+ else
+ EMIT2(0x66, 0x89);
+ goto stx;
+ case BPF_STX | BPF_REL | BPF_W:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT2(add_2mod(0x40, a_reg, x_reg), 0x89);
+ else
+ EMIT1(0x89);
+ goto stx;
+ case BPF_STX | BPF_REL | BPF_DW:
+ EMIT2(add_2mod(0x48, a_reg, x_reg), 0x89);
+stx: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, a_reg, x_reg), insn->off);
+ break;
+
+ /* LDX: a_reg = *(u8*)(x_reg + off) */
+ case BPF_LDX | BPF_REL | BPF_B:
+ /* emit 'movzx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB6);
+ goto ldx;
+ case BPF_LDX | BPF_REL | BPF_H:
+ /* emit 'movzx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB7);
+ goto ldx;
+ case BPF_LDX | BPF_REL | BPF_W:
+ /* emit 'mov eax, dword ptr [rax+0x14]' */
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT2(add_2mod(0x40, x_reg, a_reg), 0x8B);
+ else
+ EMIT1(0x8B);
+ goto ldx;
+ case BPF_LDX | BPF_REL | BPF_DW:
+ /* emit 'mov rax, qword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, x_reg, a_reg), 0x8B);
+ldx: /* if insn->off == 0 we can save one extra byte, but
+ * special case of x86 R13 which always needs an offset
+ * is not worth the pain */
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, x_reg, a_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, x_reg, a_reg), insn->off);
+ break;
+
+ /* STX XADD: lock *(u8*)(a_reg + off) += x_reg */
+ case BPF_STX | BPF_XADD | BPF_B:
+ /* emit 'lock add byte ptr [rax + off], al' */
+ if (is_ereg(a_reg) || is_ereg(x_reg) ||
+ /* have to add extra byte for x86 SIL, DIL regs */
+ x_reg == R1 || x_reg == R2)
+ EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x00);
+ else
+ EMIT2(0xF0, 0x00);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_H:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT4(0x66, 0xF0, add_2mod(0x40, a_reg, x_reg), 0x01);
+ else
+ EMIT3(0x66, 0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_W:
+ if (is_ereg(a_reg) || is_ereg(x_reg))
+ EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x01);
+ else
+ EMIT2(0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_DW:
+ EMIT3(0xF0, add_2mod(0x48, a_reg, x_reg), 0x01);
+xadd: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, a_reg, x_reg), insn->off);
+ break;
+
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ func = select_bpf_func(bpf_prog, K);
+ jmp_offset = func - (image + addrs[i]);
+ if (!func || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ K, func, image);
+ return -EINVAL;
+ }
+ EMIT1_off32(0xE8, jmp_offset);
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ /* emit 'cmp a_reg, x_reg' insn */
+ b1 = 0x48;
+ b2 = 0x39;
+ b3 = 0xC0;
+ EMIT3(add_2mod(b1, a_reg, x_reg), b2,
+ add_2reg(b3, a_reg, x_reg));
+ goto emit_jump;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ /* emit 'cmp a_reg, imm8/32' */
+ EMIT1(add_1mod(0x48, a_reg));
+
+ if (is_imm8(K))
+ EMIT3(0x83, add_1reg(0xF8, a_reg), K);
+ else
+ EMIT2_off32(0x81, add_1reg(0xF8, a_reg), K);
+
+emit_jump: /* convert BPF opcode to x86 */
+ switch (BPF_OP(insn->code)) {
+ case BPF_JEQ:
+ jmp_cond = X86_JE;
+ break;
+ case BPF_JNE:
+ jmp_cond = X86_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = X86_JA;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = X86_JAE;
+ break;
+ case BPF_JSGT:
+ /* signed '>', GT in x86 */
+ jmp_cond = X86_JG;
+ break;
+ case BPF_JSGE:
+ /* signed '>=', GE in x86 */
+ jmp_cond = X86_JGE;
+ break;
+ default: /* to silence gcc warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+
+ case BPF_JMP | BPF_JA | BPF_X:
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+
+ case BPF_RET | BPF_K:
+ /* mov rbx, qword ptr [rbp-X] */
+ EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+ /* mov r13, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+ /* mov r14, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+ /* mov r15, qword ptr [rbp-X] */
+ EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ break;
+
+ default:
+ /*pr_debug_bpf_insn(insn, NULL);*/
+ pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
+ return -EINVAL;
+ }
+
+ ilen = prog - temp;
+ if (image) {
+ if (proglen + ilen > oldproglen)
+ return -2;
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+void bpf2_jit_compile(struct bpf_program *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ int proglen, oldproglen = 0;
+ int *addrs;
+ u8 *image = NULL;
+ int pass;
+ int i;
+
+ if (!prog || !prog->cb || !prog->cb->jit_select_func)
+ return;
+
+ addrs = kmalloc(prog->insn_cnt * sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return;
+
+ for (proglen = 0, i = 0; i < prog->insn_cnt; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ for (pass = 0; pass < 10; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen);
+ if (proglen <= 0) {
+ image = NULL;
+ goto out;
+ }
+ if (image) {
+ if (proglen != oldproglen)
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
+ break;
+ }
+ if (proglen == oldproglen) {
+ header = bpf_alloc_binary(proglen, &image);
+ if (!header)
+ goto out;
+ }
+ oldproglen = proglen;
+ }
+
+ if (image) {
+ bpf_flush_icache(header, image + proglen);
+ set_memory_ro((unsigned long)header, header->pages);
+ }
+out:
+ kfree(addrs);
+ prog->jit_image = (void (*)(struct bpf_context *ctx))image;
+ return;
+}
+
+
+void bpf2_jit_free(struct bpf_program *prog)
+{
+ if (prog->jit_image)
+ bpf_free_binary(prog->jit_image);
+}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 79c216a..37ebea8 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -13,6 +13,7 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
+#include "bpf_jit_comp.h"
/*
* Conventions :
@@ -112,16 +113,6 @@ do { \
#define SEEN_XREG 2 /* ebx is used */
#define SEEN_MEM 4 /* use mem[] for temporary storage */
-static inline void bpf_flush_icache(void *start, void *end)
-{
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- smp_wmb();
- flush_icache_range((unsigned long)start, (unsigned long)end);
- set_fs(old_fs);
-}
-
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
@@ -145,16 +136,8 @@ static int pkt_type_offset(void)
return -1;
}
-struct bpf_binary_header {
- unsigned int pages;
- /* Note : for security reasons, bpf code will follow a randomly
- * sized amount of int3 instructions
- */
- u8 image[];
-};
-
-static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
- u8 **image_ptr)
+struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+ u8 **image_ptr)
{
unsigned int sz, hole;
struct bpf_binary_header *header;
@@ -772,13 +755,17 @@ out:
return;
}
-void bpf_jit_free(struct sk_filter *fp)
+void bpf_free_binary(void *bpf_func)
{
- if (fp->bpf_func != sk_run_filter) {
- unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
- struct bpf_binary_header *header = (void *)addr;
+ unsigned long addr = (unsigned long)bpf_func & PAGE_MASK;
+ struct bpf_binary_header *header = (void *)addr;
- set_memory_rw(addr, header->pages);
- module_free(NULL, header);
- }
+ set_memory_rw(addr, header->pages);
+ module_free(NULL, header);
+}
+
+void bpf_jit_free(struct sk_filter *fp)
+{
+ if (fp->bpf_func != sk_run_filter)
+ bpf_free_binary(fp->bpf_func);
}
diff --git a/arch/x86/net/bpf_jit_comp.h b/arch/x86/net/bpf_jit_comp.h
new file mode 100644
index 0000000..7b70de6
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.h
@@ -0,0 +1,36 @@
+/* bpf_jit_comp.h : BPF filter alloc/free routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef __BPF_JIT_COMP_H
+#define __BPF_JIT_COMP_H
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+
+struct bpf_binary_header {
+ unsigned int pages;
+ /* Note : for security reasons, bpf code will follow a randomly
+ * sized amount of int3 instructions
+ */
+ u8 image[];
+};
+
+static inline void bpf_flush_icache(void *start, void *end)
+{
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ smp_wmb();
+ flush_icache_range((unsigned long)start, (unsigned long)end);
+ set_fs(old_fs);
+}
+
+extern struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+ u8 **image_ptr);
+extern void bpf_free_binary(void *image_ptr);
+
+#endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a6ac848..63b3277 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -48,6 +48,77 @@ extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len);
extern void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
+/* type of value stored in a BPF register or
+ * passed into function as an argument or
+ * returned from the function */
+enum bpf_reg_type {
+ INVALID_PTR, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ PTR_TO_TABLE, /* reg points to table element */
+ PTR_TO_TABLE_CONDITIONAL, /* points to table element or NULL */
+ PTR_TO_STACK, /* reg == frame_pointer */
+ PTR_TO_STACK_IMM, /* reg == frame_pointer + imm */
+ RET_INTEGER, /* function returns integer */
+ RET_VOID, /* function returns void */
+ CONST_ARG /* function expects integer constant argument */
+};
+
+/* BPF function prototype */
+struct bpf_func_proto {
+ enum bpf_reg_type ret_type;
+ enum bpf_reg_type arg1_type;
+ enum bpf_reg_type arg2_type;
+ enum bpf_reg_type arg3_type;
+ enum bpf_reg_type arg4_type;
+};
+
+/* struct bpf_context access type */
+enum bpf_access_type {
+ BPF_READ = 1,
+ BPF_WRITE = 2
+};
+
+struct bpf_context_access {
+ int size;
+ enum bpf_access_type type;
+};
+
+struct bpf_callbacks {
+ /* execute BPF func_id with given registers */
+ void (*execute_func)(int id, u64 *regs);
+
+ /* return address of func_id suitable to be called from JITed program */
+ void *(*jit_select_func)(int id);
+
+ /* return BPF function prototype for verification */
+ const struct bpf_func_proto* (*get_func_proto)(int id);
+
+ /* return expected bpf_context access size and permissions
+ * for given byte offset within bpf_context */
+ const struct bpf_context_access *(*get_context_access)(int off);
+};
+
+struct bpf_program {
+ u16 insn_cnt;
+ u16 table_cnt;
+ struct bpf_insn *insns;
+ struct bpf_table *tables;
+ struct bpf_callbacks *cb;
+ void (*jit_image)(struct bpf_context *ctx);
+};
+/* load BPF program from user space, setup callback extensions
+ * and run through verifier */
+extern int bpf_load(struct bpf_image *image, struct bpf_callbacks *cb,
+ struct bpf_program **prog);
+/* free BPF program */
+extern void bpf_free(struct bpf_program *prog);
+/* execture BPF program */
+extern void bpf_run(struct bpf_program *prog, struct bpf_context *ctx);
+/* verify correctness of BPF program */
+extern int bpf_check(struct bpf_program *prog);
+/* pr_debug one insn */
+extern void pr_debug_bpf_insn(struct bpf_insn *insn, u64 *regs);
+
#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
#include <linux/linkage.h>
@@ -55,6 +126,8 @@ extern void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
extern void bpf_jit_compile(struct sk_filter *fp);
extern void bpf_jit_free(struct sk_filter *fp);
+extern void bpf2_jit_compile(struct bpf_program *prog);
+extern void bpf2_jit_free(struct bpf_program *prog);
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
u32 pass, void *image)
@@ -73,6 +146,12 @@ static inline void bpf_jit_compile(struct sk_filter *fp)
static inline void bpf_jit_free(struct sk_filter *fp)
{
}
+static inline void bpf2_jit_compile(struct bpf_program *prog)
+{
+}
+static inline void bpf2_jit_free(struct bpf_program *prog)
+{
+}
#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
#endif
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 8eb9cca..5783769 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -1,3 +1,4 @@
+/* extended BPF is Copyright (c) 2011-2013, PLUMgrid, http://plumgrid.com */
/*
* Linux Socket Filter Data Structures
*/
@@ -19,7 +20,7 @@
* Try and keep these values and structures similar to BSD, especially
* the BPF code definitions which need to match so you can share filters
*/
-
+
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */
__u8 jt; /* Jump true */
@@ -46,11 +47,88 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
#define BPF_RET 0x06
#define BPF_MISC 0x07
+struct bpf_insn {
+ __u8 code; /* opcode */
+ __u8 a_reg:4; /* dest register*/
+ __u8 x_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};
+
+struct bpf_table {
+ __u32 id;
+ __u32 type;
+ __u32 key_size;
+ __u32 elem_size;
+ __u32 max_entries;
+ __u32 param1; /* meaning is table-dependent */
+};
+
+enum bfp_table_type {
+ BPF_TABLE_HASH = 1,
+};
+
+struct bpf_image {
+ /* version > 4096 to be binary compatible with original bpf */
+ __u16 version;
+ __u16 rsvd;
+ __u16 insn_cnt;
+ __u16 table_cnt;
+ struct bpf_insn __user *insns;
+ struct bpf_table __user *tables;
+};
+
+/* pointer to bpf_context is the first and only argument to BPF program
+ * its definition is use-case specific */
+struct bpf_context;
+
+/* bpf_add|sub|...: a += x
+ * bpf_mov: a = x
+ * bpf_bswap: bswap a */
+#define BPF_INSN_ALU(op, a, x) \
+ (struct bpf_insn){BPF_ALU|BPF_OP(op)|BPF_X, a, x, 0, 0}
+
+/* bpf_add|sub|...: a += imm
+ * bpf_mov: a = imm */
+#define BPF_INSN_ALU_IMM(op, a, imm) \
+ (struct bpf_insn){BPF_ALU|BPF_OP(op)|BPF_K, a, 0, 0, imm}
+
+/* a = *(uint *) (x + off) */
+#define BPF_INSN_LD(size, a, x, off) \
+ (struct bpf_insn){BPF_LDX|BPF_SIZE(size)|BPF_REL, a, x, off, 0}
+
+/* *(uint *) (a + off) = x */
+#define BPF_INSN_ST(size, a, off, x) \
+ (struct bpf_insn){BPF_STX|BPF_SIZE(size)|BPF_REL, a, x, off, 0}
+
+/* *(uint *) (a + off) = imm */
+#define BPF_INSN_ST_IMM(size, a, off, imm) \
+ (struct bpf_insn){BPF_ST|BPF_SIZE(size)|BPF_REL, a, 0, off, imm}
+
+/* lock *(uint *) (a + off) += x */
+#define BPF_INSN_XADD(size, a, off, x) \
+ (struct bpf_insn){BPF_STX|BPF_SIZE(size)|BPF_XADD, a, x, off, 0}
+
+/* if (a 'op' x) pc += off else fall through */
+#define BPF_INSN_JUMP(op, a, x, off) \
+ (struct bpf_insn){BPF_JMP|BPF_OP(op)|BPF_X, a, x, off, 0}
+
+/* if (a 'op' imm) pc += off else fall through */
+#define BPF_INSN_JUMP_IMM(op, a, imm, off) \
+ (struct bpf_insn){BPF_JMP|BPF_OP(op)|BPF_K, a, 0, off, imm}
+
+#define BPF_INSN_RET() \
+ (struct bpf_insn){BPF_RET|BPF_K, 0, 0, 0, 0}
+
+#define BPF_INSN_CALL(fn_code) \
+ (struct bpf_insn){BPF_JMP|BPF_CALL, 0, 0, 0, fn_code}
+
/* ld/ldx fields */
#define BPF_SIZE(code) ((code) & 0x18)
#define BPF_W 0x00
#define BPF_H 0x08
#define BPF_B 0x10
+#define BPF_DW 0x18
#define BPF_MODE(code) ((code) & 0xe0)
#define BPF_IMM 0x00
#define BPF_ABS 0x20
@@ -58,6 +136,8 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
#define BPF_MEM 0x60
#define BPF_LEN 0x80
#define BPF_MSH 0xa0
+#define BPF_REL 0xc0
+#define BPF_XADD 0xe0 /* exclusive add */
/* alu/jmp fields */
#define BPF_OP(code) ((code) & 0xf0)
@@ -68,20 +148,54 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
#define BPF_OR 0x40
#define BPF_AND 0x50
#define BPF_LSH 0x60
-#define BPF_RSH 0x70
+#define BPF_RSH 0x70 /* logical shift right */
#define BPF_NEG 0x80
#define BPF_MOD 0x90
#define BPF_XOR 0xa0
+#define BPF_MOV 0xb0 /* mov reg to reg */
+#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */
+#define BPF_BSWAP32 0xd0 /* swap lower 4 bytes of 64-bit register */
+#define BPF_BSWAP64 0xe0 /* swap all 8 bytes of 64-bit register */
#define BPF_JA 0x00
-#define BPF_JEQ 0x10
-#define BPF_JGT 0x20
-#define BPF_JGE 0x30
+#define BPF_JEQ 0x10 /* jump == */
+#define BPF_JGT 0x20 /* GT is unsigned '>', JA in x86 */
+#define BPF_JGE 0x30 /* GE is unsigned '>=', JAE in x86 */
#define BPF_JSET 0x40
+#define BPF_JNE 0x50 /* jump != */
+#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */
+#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */
+#define BPF_CALL 0x80 /* function call */
#define BPF_SRC(code) ((code) & 0x08)
#define BPF_K 0x00
#define BPF_X 0x08
+/* 64-bit registers */
+#define R0 0
+#define R1 1
+#define R2 2
+#define R3 3
+#define R4 4
+#define R5 5
+#define R6 6
+#define R7 7
+#define R8 8
+#define R9 9
+#define __fp__ 10
+
+/* all types of BPF programs support at least two functions:
+ * bpf_table_lookup() and bpf_table_update()
+ * contents of bpf_context are use-case specific
+ * BPF engine can be extended with additional functions */
+enum {
+ FUNC_bpf_table_lookup = 1,
+ FUNC_bpf_table_update = 2,
+ FUNC_bpf_max_id = 1024
+};
+void *bpf_table_lookup(struct bpf_context *ctx, int table_id, const void *key);
+int bpf_table_update(struct bpf_context *ctx, int table_id, const void *key,
+ const void *leaf);
+
/* ret - BPF_K and BPF_X also apply */
#define BPF_RVAL(code) ((code) & 0x18)
#define BPF_A 0x10
@@ -134,5 +248,4 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
#define SKF_NET_OFF (-0x100000)
#define SKF_LL_OFF (-0x200000)
-
#endif /* _UAPI__LINUX_FILTER_H__ */
diff --git a/net/core/Makefile b/net/core/Makefile
index b33b996..f04e016 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o
+ sock_diag.o dev_ioctl.o bpf_run.o bpf_check.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
diff --git a/net/core/bpf_check.c b/net/core/bpf_check.c
new file mode 100644
index 0000000..bf2521e
--- /dev/null
+++ b/net/core/bpf_check.c
@@ -0,0 +1,1043 @@
+/* Copyright (c) 2011-2013 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+
+/* bpf_check() is a static code analyzer that walks the BPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'ret' insn.
+ *
+ * At the first pass depth-first-search verifies that the BPF program is a DAG.
+ * It rejects the following programs:
+ * - larger than 32K insns or 128 tables
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - more than one ret insn
+ * - ret insn is not a last insn
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Conditional branch target insns keep a link list of verifier states.
+ * If the state already visited, this path can be pruned.
+ * If it wasn't a DAG, such state prunning would be incorrect, since it would
+ * skip cycles. Since it's analyzing all pathes through the program,
+ * the length of the analysis is limited to 64k insn, which may be hit even
+ * if insn_cnt < 32k, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 8k
+ *
+ * All registers are 64-bit (even on 32-bit arch)
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * bpf_table_lookup() function returns ether pointer to table value or NULL
+ * which is type PTR_TO_TABLE_CONDITIONAL. Once it passes through !=0 insn
+ * the register holding that pointer in the true branch changes state to
+ * PTR_TO_TABLE and the same register changes state to INVALID_PTR in the false
+ * branch. See check_cond_jmp_op()
+ *
+ * R10 has type PTR_TO_STACK. The sequence 'mov Rx, R10; add Rx, imm' changes
+ * Rx state to PTR_TO_STACK_IMM and immediate constant is saved for further
+ * stack bounds checking
+ *
+ * registers used to pass pointers to function calls are verified against
+ * function prototypes
+ * Ex: before the call to bpf_table_lookup(), R1 must have type PTR_TO_CTX
+ * R2 must contain integer constant and R3 PTR_TO_STACK_IMM
+ * Integer constant in R2 is a table_id. It's checked that 0 <= R2 < table_cnt
+ * and corresponding table_info->key_size fetched to check that
+ * [R3, R3 + table_info->key_size) are within stack limits and all that stack
+ * memory was initiliazed earlier by BPF program.
+ * After bpf_table_lookup() call insn, R0 is set to PTR_TO_TABLE_CONDITIONAL
+ * R1-R5 are cleared and no longer readable (but still writeable).
+ *
+ * load/store alignment is checked
+ * Ex: stx [Rx + 3], (u32)Ry is rejected
+ *
+ * load/store to stack bounds checked and register spill is tracked
+ * Ex: stx [R10 + 0], (u8)Rx is rejected
+ *
+ * load/store to table bounds checked and table_id provides table size
+ * Ex: stx [Rx + 8], (u16)Ry is ok, if Rx is PTR_TO_TABLE and
+ * 8 + sizeof(u16) <= table_info->elem_size
+ *
+ * load/store to bpf_context checked against known fields
+ *
+ * Future improvements:
+ * stack size is hardcoded to 512 bytes maximum per program, relax it
+ */
+#define _(OP) ({ int ret = OP; if (ret < 0) return ret; })
+
+/* JITed code allocates 512 bytes and used bottom 4 slots
+ * to save R6-R9
+ */
+#define MAX_BPF_STACK (512 - 4 * 8)
+
+struct reg_state {
+ enum bpf_reg_type ptr;
+ bool read_ok;
+ int imm;
+};
+
+#define MAX_REG 11
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* 1st byte of register spilled into stack */
+ STACK_SPILL_PART, /* other 7 bytes of register spill */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+struct bpf_stack_slot {
+ enum bpf_stack_slot_type type;
+ enum bpf_reg_type ptr;
+ int imm;
+};
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_REG];
+ struct bpf_stack_slot stack[MAX_BPF_STACK];
+};
+
+/* linked list of verifier states
+ * used to prune search
+ */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack
+ * when branch is encountered
+ */
+struct verifier_stack_elem {
+ struct verifier_state st;
+ int insn_idx; /* at insn 'insn_idx' the program state is 'st' */
+ struct verifier_stack_elem *next;
+};
+
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+ struct bpf_table *tables;
+ int table_cnt;
+ struct verifier_stack_elem *head;
+ int stack_size;
+ struct verifier_state cur_state;
+ struct verifier_state_list **branch_landing;
+ const struct bpf_func_proto* (*get_func_proto)(int id);
+ const struct bpf_context_access *(*get_context_access)(int off);
+};
+
+static int pop_stack(struct verifier_env *env)
+{
+ int insn_idx;
+ struct verifier_stack_elem *elem;
+ if (env->head == NULL)
+ return -1;
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 8192) {
+ pr_err("BPF program is too complex\n");
+ /* pop all elements and return */
+ while (pop_stack(env) >= 0);
+ return NULL;
+ }
+ return &elem->st;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = { R0, R1, R2, R3, R4, R5 };
+
+static void init_reg_state(struct reg_state *regs)
+{
+ struct reg_state *reg;
+ int i;
+ for (i = 0; i < MAX_REG; i++) {
+ regs[i].ptr = INVALID_PTR;
+ regs[i].read_ok = false;
+ regs[i].imm = 0xbadbad;
+ }
+ reg = regs + __fp__;
+ reg->ptr = PTR_TO_STACK;
+ reg->read_ok = true;
+
+ reg = regs + R1; /* 1st arg to a function */
+ reg->ptr = PTR_TO_CTX;
+ reg->read_ok = true;
+}
+
+static void mark_reg_no_ptr(struct reg_state *regs, int regno)
+{
+ regs[regno].ptr = INVALID_PTR;
+ regs[regno].imm = 0xbadbad;
+ regs[regno].read_ok = true;
+}
+
+static int check_reg_arg(struct reg_state *regs, int regno, bool is_src)
+{
+ if (is_src) {
+ if (!regs[regno].read_ok) {
+ pr_err("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ if (regno == __fp__)
+ /* frame pointer is read only */
+ return -EACCES;
+ mark_reg_no_ptr(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EACCES;
+}
+
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ struct bpf_stack_slot *slot;
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].ptr == PTR_TO_TABLE ||
+ state->regs[value_regno].ptr == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != 8) {
+ pr_err("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+ slot->type = STACK_SPILL;
+ /* save register state */
+ slot->ptr = state->regs[value_regno].ptr;
+ slot->imm = state->regs[value_regno].imm;
+ for (i = 1; i < 8; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->type = STACK_SPILL_PART;
+ }
+ } else {
+
+ /* regular write of data into stack */
+ for (i = 0; i < size; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->type = STACK_MISC;
+ }
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ struct bpf_stack_slot *slot;
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+
+ if (slot->type == STACK_SPILL) {
+ if (size != 8) {
+ pr_err("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < 8; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].type !=
+ STACK_SPILL_PART) {
+ pr_err("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ /* restore register state from stack */
+ state->regs[value_regno].ptr = slot->ptr;
+ state->regs[value_regno].imm = slot->imm;
+ state->regs[value_regno].read_ok = true;
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].type !=
+ STACK_MISC) {
+ pr_err("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ /* have read misc data from the stack */
+ mark_reg_no_ptr(state->regs, value_regno);
+ return 0;
+ }
+}
+
+static int get_table_info(struct verifier_env *env, int table_id,
+ struct bpf_table **table)
+{
+ /* if BPF program contains bpf_table_lookup(ctx, 1024, key)
+ * the incorrect table_id will be caught here
+ */
+ if (table_id < 0 || table_id >= env->table_cnt) {
+ pr_err("invalid access to table_id=%d max_tables=%d\n",
+ table_id, env->table_cnt);
+ return -EACCES;
+ }
+ *table = &env->tables[table_id];
+ return 0;
+}
+
+/* check read/write into table element returned by bpf_table_lookup() */
+static int check_table_access(struct verifier_env *env, int regno, int off,
+ int size)
+{
+ struct bpf_table *table;
+ int table_id = env->cur_state.regs[regno].imm;
+
+ _(get_table_info(env, table_id, &table));
+
+ if (off < 0 || off + size > table->elem_size) {
+ pr_err("invalid access to table_id=%d leaf_size=%d off=%d size=%d\n",
+ table_id, table->elem_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ const struct bpf_context_access *access;
+
+ if (off < 0 || off >= 32768/* struct bpf_context shouldn't be huge */)
+ goto error;
+
+ access = env->get_context_access(off);
+ if (!access)
+ goto error;
+
+ if (access->size == size && (access->type & t))
+ return 0;
+error:
+ pr_err("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+static int check_mem_access(struct verifier_env *env, int regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size;
+ _(size = bpf_size_to_bytes(bpf_size));
+
+ if (off % size != 0) {
+ pr_err("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].ptr == PTR_TO_TABLE) {
+ _(check_table_access(env, regno, off, size));
+ if (t == BPF_READ)
+ mark_reg_no_ptr(state->regs, value_regno);
+ } else if (state->regs[regno].ptr == PTR_TO_CTX) {
+ _(check_ctx_access(env, off, size, t));
+ if (t == BPF_READ)
+ mark_reg_no_ptr(state->regs, value_regno);
+ } else if (state->regs[regno].ptr == PTR_TO_STACK) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ pr_err("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ _(check_stack_write(state, off, size, value_regno));
+ else
+ _(check_stack_read(state, off, size, value_regno));
+ } else {
+ pr_err("invalid mem access %d\n", state->regs[regno].ptr);
+ return -EACCES;
+ }
+ return 0;
+}
+
+static const struct bpf_func_proto funcs[] = {
+ [FUNC_bpf_table_lookup] = {PTR_TO_TABLE_CONDITIONAL, PTR_TO_CTX,
+ CONST_ARG, PTR_TO_STACK_IMM},
+ [FUNC_bpf_table_update] = {RET_INTEGER, PTR_TO_CTX, CONST_ARG,
+ PTR_TO_STACK_IMM, PTR_TO_STACK_IMM},
+};
+
+static int check_func_arg(struct reg_state *regs, int regno,
+ enum bpf_reg_type expected_type, int *reg_values)
+{
+ struct reg_state *reg = regs + regno;
+ if (expected_type == INVALID_PTR)
+ return 0;
+
+ if (!reg->read_ok) {
+ pr_err("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (reg->ptr != expected_type) {
+ pr_err("R%d ptr=%d expected=%d\n", regno, reg->ptr,
+ expected_type);
+ return -EACCES;
+ } else if (expected_type == CONST_ARG) {
+ reg_values[regno] = reg->imm;
+ }
+
+ return 0;
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_state *state,
+ struct reg_state *regs, int regno,
+ int access_size)
+{
+ int off, i;
+
+ if (regs[regno].ptr != PTR_TO_STACK_IMM)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ pr_err("invalid stack ptr R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].type != STACK_MISC) {
+ pr_err("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ int reg_values[MAX_REG] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct reg_state *reg;
+ int i;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= FUNC_bpf_max_id) {
+ pr_err("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (func_id == FUNC_bpf_table_lookup ||
+ func_id == FUNC_bpf_table_update) {
+ fn = &funcs[func_id];
+ } else {
+ if (env->get_func_proto)
+ fn = env->get_func_proto(func_id);
+ if (!fn || (fn->ret_type != RET_INTEGER &&
+ fn->ret_type != RET_VOID)) {
+ pr_err("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+ }
+
+ /* check args */
+ _(check_func_arg(regs, R1, fn->arg1_type, reg_values));
+ _(check_func_arg(regs, R2, fn->arg2_type, reg_values));
+ _(check_func_arg(regs, R3, fn->arg3_type, reg_values));
+ _(check_func_arg(regs, R4, fn->arg4_type, reg_values));
+
+ if (func_id == FUNC_bpf_table_lookup) {
+ struct bpf_table *table;
+ int table_id = reg_values[R2];
+
+ _(get_table_info(env, table_id, &table));
+
+ /* bpf_table_lookup(ctx, table_id, key) call: check that
+ * [key, key + table_info->key_size) are within stack limits
+ * and initialized
+ */
+ _(check_stack_boundary(state, regs, R3, table->key_size));
+
+ } else if (func_id == FUNC_bpf_table_update) {
+ struct bpf_table *table;
+ int table_id = reg_values[R2];
+
+ _(get_table_info(env, table_id, &table));
+
+ /* bpf_table_update(ctx, table_id, key, value) check
+ * that key and value are valid
+ */
+ _(check_stack_boundary(state, regs, R3, table->key_size));
+ _(check_stack_boundary(state, regs, R4, table->elem_size));
+
+ } else if (fn->arg1_type == PTR_TO_STACK_IMM) {
+ /* bpf_xxx(buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ */
+ _(check_stack_boundary(state, regs, R1, reg_values[R2]));
+
+ } else if (fn->arg2_type == PTR_TO_STACK_IMM) {
+ /* bpf_yyy(arg1, buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ */
+ _(check_stack_boundary(state, regs, R2, reg_values[R3]));
+
+ } else if (fn->arg3_type == PTR_TO_STACK_IMM) {
+ /* bpf_zzz(arg1, arg2, buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ */
+ _(check_stack_boundary(state, regs, R3, reg_values[R4]));
+ }
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->read_ok = false;
+ reg->ptr = INVALID_PTR;
+ reg->imm = 0xbadbad;
+ }
+
+ /* update return register */
+ reg = regs + R0;
+ if (fn->ret_type == RET_INTEGER) {
+ reg->read_ok = true;
+ reg->ptr = INVALID_PTR;
+ } else if (fn->ret_type != RET_VOID) {
+ reg->read_ok = true;
+ reg->ptr = fn->ret_type;
+ if (func_id == FUNC_bpf_table_lookup)
+ /* when ret_type == PTR_TO_TABLE_CONDITIONAL
+ * remember table_id, so that check_table_access()
+ * can check 'elem_size' boundary of memory access
+ * to table element returned from bpf_table_lookup()
+ */
+ reg->imm = reg_values[R2];
+ }
+ return 0;
+}
+
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u16 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_BSWAP32 || opcode == BPF_BSWAP64 ||
+ opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != BPF_X)
+ return -EINVAL;
+ /* check src operand */
+ _(check_reg_arg(regs, insn->a_reg, 1));
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->a_reg, 0));
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* check src operand */
+ _(check_reg_arg(regs, insn->x_reg, 1));
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->a_reg, 0));
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->a_reg].ptr = regs[insn->x_reg].ptr;
+ regs[insn->a_reg].imm = regs[insn->x_reg].imm;
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->a_reg].ptr = CONST_ARG;
+ regs[insn->a_reg].imm = insn->imm;
+ }
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ int stack_relative = 0;
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->x_reg, 1));
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->a_reg, 1));
+
+ if (opcode == BPF_ADD &&
+ regs[insn->a_reg].ptr == PTR_TO_STACK &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = 1;
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->a_reg, 0));
+
+ if (stack_relative) {
+ regs[insn->a_reg].ptr = PTR_TO_STACK_IMM;
+ regs[insn->a_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn,
+ int insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u16 opcode = BPF_OP(insn->code);
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->x_reg, 1));
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->a_reg, 1));
+
+ other_branch = push_stack(env, insn_idx + insn->off + 1);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from table_lookup() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->a_reg].ptr == PTR_TO_TABLE_CONDITIONAL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->a_reg].ptr = PTR_TO_TABLE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->a_reg].ptr = INVALID_PTR;
+ } else {
+ other_branch->regs[insn->a_reg].ptr = PTR_TO_TABLE;
+ regs[insn->a_reg].ptr = INVALID_PTR;
+ }
+ }
+ return 0;
+}
+
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 1 - discovered
+ * 2 - discovered and 1st branch labelled
+ * 3 - discovered and 1st and 2nd branch labelled
+ * 4 - explored
+ */
+
+#define STATE_END ((struct verifier_state_list *)-1)
+
+#define PUSH_INT(I) \
+ do { \
+ if (cur_stack >= insn_cnt) { \
+ ret = -E2BIG; \
+ goto free_st; \
+ } \
+ stack[cur_stack++] = I; \
+ } while (0)
+
+#define PEAK_INT() \
+ ({ \
+ int _ret; \
+ if (cur_stack == 0) \
+ _ret = -1; \
+ else \
+ _ret = stack[cur_stack - 1]; \
+ _ret; \
+ })
+
+#define POP_INT() \
+ ({ \
+ int _ret; \
+ if (cur_stack == 0) \
+ _ret = -1; \
+ else \
+ _ret = stack[--cur_stack]; \
+ _ret; \
+ })
+
+#define PUSH_INSN(T, W, E) \
+ do { \
+ int w = W; \
+ if (E == 1 && st[T] >= 2) \
+ break; \
+ if (E == 2 && st[T] >= 3) \
+ break; \
+ if (w >= insn_cnt) { \
+ ret = -EACCES; \
+ goto free_st; \
+ } \
+ if (E == 2) \
+ /* mark branch target for state pruning */ \
+ env->branch_landing[w] = STATE_END; \
+ if (st[w] == 0) { \
+ /* tree-edge */ \
+ st[T] = 1 + E; \
+ st[w] = 1; /* discovered */ \
+ PUSH_INT(w); \
+ goto peak_stack; \
+ } else if (st[w] == 1 || st[w] == 2 || st[w] == 3) { \
+ pr_err("back-edge from insn %d to %d\n", t, w); \
+ ret = -EINVAL; \
+ goto free_st; \
+ } else if (st[w] == 4) { \
+ /* forward- or cross-edge */ \
+ st[T] = 1 + E; \
+ } else { \
+ pr_err("insn state internal bug\n"); \
+ ret = -EFAULT; \
+ goto free_st; \
+ } \
+ } while (0)
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env, struct bpf_insn *insns,
+ int insn_cnt)
+{
+ int cur_stack = 0;
+ int *stack;
+ int ret = 0;
+ int *st;
+ int i, t;
+
+ if (insns[insn_cnt - 1].code != (BPF_RET | BPF_K)) {
+ pr_err("last insn is not a 'ret'\n");
+ return -EINVAL;
+ }
+
+ st = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!st)
+ return -ENOMEM;
+
+ stack = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!stack) {
+ kfree(st);
+ return -ENOMEM;
+ }
+
+ st[0] = 1; /* mark 1st insn as discovered */
+ PUSH_INT(0);
+
+peak_stack:
+ while ((t = PEAK_INT()) != -1) {
+ if (t == insn_cnt - 1)
+ goto mark_explored;
+
+ if (BPF_CLASS(insns[t].code) == BPF_RET) {
+ pr_err("extraneous 'ret'\n");
+ ret = -EINVAL;
+ goto free_st;
+ }
+
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u16 opcode = BPF_OP(insns[t].code);
+ if (opcode == BPF_CALL) {
+ PUSH_INSN(t, t + 1, 1);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_X) {
+ ret = -EINVAL;
+ goto free_st;
+ }
+ PUSH_INSN(t, t + insns[t].off + 1, 1);
+ } else {
+ PUSH_INSN(t, t + 1, 1);
+ PUSH_INSN(t, t + insns[t].off + 1, 2);
+ }
+ } else {
+ PUSH_INSN(t, t + 1, 1);
+ }
+
+mark_explored:
+ st[t] = 4; /* explored */
+ if (POP_INT() == -1) {
+ pr_err("pop_int internal bug\n");
+ ret = -EFAULT;
+ goto free_st;
+ }
+ }
+
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (st[i] != 4) {
+ pr_err("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto free_st;
+ }
+ }
+
+free_st:
+ kfree(st);
+ kfree(stack);
+ return ret;
+}
+
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_state_list *sl;
+ struct verifier_state_list *new_sl;
+ sl = env->branch_landing[insn_idx];
+ if (!sl)
+ /* no branch jump to this insn, ignore it */
+ return 0;
+
+ while (sl != STATE_END) {
+ if (memcmp(&sl->state, &env->cur_state,
+ sizeof(env->cur_state)) == 0)
+ /* reached the same register/stack state,
+ * prune the search
+ */
+ return 1;
+ sl = sl->next;
+ }
+ new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_KERNEL);
+
+ if (!new_sl)
+ /* ignore kmalloc error, since it's rare and doesn't affect
+ * correctness of algorithm
+ */
+ return 0;
+ /* add new state to the head of linked list */
+ memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ new_sl->next = env->branch_landing[insn_idx];
+ env->branch_landing[insn_idx] = new_sl;
+ return 0;
+}
+
+static int __bpf_check(struct verifier_env *env, struct bpf_insn *insns,
+ int insn_cnt)
+{
+ int insn_idx;
+ int insn_processed = 0;
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u16 class;
+
+ if (insn_idx >= insn_cnt) {
+ pr_err("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 65536) {
+ pr_err("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ /* pr_debug_bpf_insn(insn, NULL); */
+
+ if (is_state_visited(env, insn_idx))
+ goto process_ret;
+
+ if (class == BPF_ALU) {
+ _(check_alu_op(regs, insn));
+
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_REL)
+ return -EINVAL;
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->x_reg, 1));
+
+ _(check_mem_access(env, insn->x_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->a_reg));
+
+ /* dest reg state will be updated by mem_access */
+
+ } else if (class == BPF_STX) {
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->x_reg, 1));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->a_reg, 1));
+ _(check_mem_access(env, insn->a_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->x_reg));
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_REL)
+ return -EINVAL;
+ /* check src operand */
+ _(check_reg_arg(regs, insn->a_reg, 1));
+ _(check_mem_access(env, insn->a_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1));
+
+ } else if (class == BPF_JMP) {
+ u16 opcode = BPF_OP(insn->code);
+ if (opcode == BPF_CALL) {
+ _(check_call(env, insn->imm));
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_X)
+ return -EINVAL;
+ insn_idx += insn->off + 1;
+ continue;
+ } else {
+ _(check_cond_jmp_op(env, insn, insn_idx));
+ }
+
+ } else if (class == BPF_RET) {
+process_ret:
+ insn_idx = pop_stack(env);
+ if (insn_idx < 0)
+ break;
+ else
+ continue;
+ }
+
+ insn_idx++;
+ }
+
+ /* pr_debug("insn_processed %d\n", insn_processed); */
+ return 0;
+}
+
+static void free_states(struct verifier_env *env, int insn_cnt)
+{
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ struct verifier_state_list *sl = env->branch_landing[i];
+ if (sl)
+ while (sl != STATE_END) {
+ struct verifier_state_list *sln = sl->next;
+ kfree(sl);
+ sl = sln;
+ }
+ }
+
+ kfree(env->branch_landing);
+}
+
+int bpf_check(struct bpf_program *prog)
+{
+ int ret;
+ struct verifier_env *env;
+
+ if (prog->insn_cnt <= 0 || prog->insn_cnt > 32768 ||
+ prog->table_cnt < 0 || prog->table_cnt > 128) {
+ pr_err("BPF program has %d insn and %d tables. Max is 32K/128\n",
+ prog->insn_cnt, prog->table_cnt);
+ return -E2BIG;
+ }
+
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->tables = prog->tables;
+ env->table_cnt = prog->table_cnt;
+ env->get_func_proto = prog->cb->get_func_proto;
+ env->get_context_access = prog->cb->get_context_access;
+ env->branch_landing = kzalloc(sizeof(struct verifier_state_list *) *
+ prog->insn_cnt, GFP_KERNEL);
+
+ if (!env->branch_landing) {
+ kfree(env);
+ return -ENOMEM;
+ }
+
+ ret = check_cfg(env, prog->insns, prog->insn_cnt);
+ if (ret)
+ goto free_env;
+ ret = __bpf_check(env, prog->insns, prog->insn_cnt);
+free_env:
+ free_states(env, prog->insn_cnt);
+ kfree(env);
+ return ret;
+}
diff --git a/net/core/bpf_run.c b/net/core/bpf_run.c
new file mode 100644
index 0000000..919da4e
--- /dev/null
+++ b/net/core/bpf_run.c
@@ -0,0 +1,412 @@
+/* Copyright (c) 2011-2013 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/filter.h>
+
+static const char *const bpf_class_string[] = {
+ "ld", "ldx", "st", "stx", "alu", "jmp", "ret", "misc"
+};
+
+static const char *const bpf_alu_string[] = {
+ "+=", "-=", "*=", "/=", "|=", "&=", "<<=", ">>=", "neg",
+ "%=", "^=", "=", "s>>=", "bswap32", "bswap64", "BUG"
+};
+
+static const char *const bpf_ldst_string[] = {
+ "u32", "u16", "u8", "u64"
+};
+
+static const char *const bpf_jmp_string[] = {
+ "jmp", "==", ">", ">=", "&", "!=", "s>", "s>=", "call"
+};
+
+static const char *debug_reg(int regno, u64 *regs)
+{
+ static char reg_value[16][32];
+ if (!regs)
+ return "";
+ snprintf(reg_value[regno], sizeof(reg_value[regno]), "(0x%llx)",
+ regs[regno]);
+ return reg_value[regno];
+}
+
+#define R(regno) debug_reg(regno, regs)
+
+void pr_debug_bpf_insn(struct bpf_insn *insn, u64 *regs)
+{
+ u16 class = BPF_CLASS(insn->code);
+ if (class == BPF_ALU) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ pr_debug("code_%02x r%d%s %s r%d%s\n",
+ insn->code, insn->a_reg, R(insn->a_reg),
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ insn->x_reg, R(insn->x_reg));
+ else
+ pr_debug("code_%02x r%d%s %s %d\n",
+ insn->code, insn->a_reg, R(insn->a_reg),
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_REL)
+ pr_debug("code_%02x *(%s *)(r%d%s %+d) = r%d%s\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->a_reg, R(insn->a_reg),
+ insn->off, insn->x_reg, R(insn->x_reg));
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ pr_debug("code_%02x lock *(%s *)(r%d%s %+d) += r%d%s\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->a_reg, R(insn->a_reg), insn->off,
+ insn->x_reg, R(insn->x_reg));
+ else
+ pr_debug("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_REL) {
+ pr_debug("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ pr_debug("code_%02x *(%s *)(r%d%s %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->a_reg, R(insn->a_reg),
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_REL) {
+ pr_debug("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ pr_debug("code_%02x r%d = *(%s *)(r%d%s %+d)\n",
+ insn->code, insn->a_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->x_reg, R(insn->x_reg), insn->off);
+ } else if (class == BPF_JMP) {
+ u16 opcode = BPF_OP(insn->code);
+ if (opcode == BPF_CALL) {
+ pr_debug("code_%02x call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) {
+ pr_debug("code_%02x goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ pr_debug("code_%02x if r%d%s %s r%d%s goto pc%+d\n",
+ insn->code, insn->a_reg, R(insn->a_reg),
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->x_reg, R(insn->x_reg), insn->off);
+ } else {
+ pr_debug("code_%02x if r%d%s %s 0x%x goto pc%+d\n",
+ insn->code, insn->a_reg, R(insn->a_reg),
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ pr_debug("code_%02x %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
+void bpf_run(struct bpf_program *prog, struct bpf_context *ctx)
+{
+ struct bpf_insn *insn = prog->insns;
+ u64 stack[64];
+ u64 regs[16] = { };
+ regs[__fp__] = (u64) &stack[64];
+ regs[R1] = (u64) ctx;
+
+ for (;; insn++) {
+ const s32 K = insn->imm;
+ u64 *a_reg = ®s[insn->a_reg];
+ u64 *x_reg = ®s[insn->x_reg];
+#define A (*a_reg)
+#define X (*x_reg)
+ /*pr_debug_bpf_insn(insn, regs);*/
+ switch (insn->code) {
+ /* ALU */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ A += X;
+ continue;
+ case BPF_ALU | BPF_ADD | BPF_K:
+ A += K;
+ continue;
+ case BPF_ALU | BPF_SUB | BPF_X:
+ A -= X;
+ continue;
+ case BPF_ALU | BPF_SUB | BPF_K:
+ A -= K;
+ continue;
+ case BPF_ALU | BPF_AND | BPF_X:
+ A &= X;
+ continue;
+ case BPF_ALU | BPF_AND | BPF_K:
+ A &= K;
+ continue;
+ case BPF_ALU | BPF_OR | BPF_X:
+ A |= X;
+ continue;
+ case BPF_ALU | BPF_OR | BPF_K:
+ A |= K;
+ continue;
+ case BPF_ALU | BPF_LSH | BPF_X:
+ A <<= X;
+ continue;
+ case BPF_ALU | BPF_LSH | BPF_K:
+ A <<= K;
+ continue;
+ case BPF_ALU | BPF_RSH | BPF_X:
+ A >>= X;
+ continue;
+ case BPF_ALU | BPF_RSH | BPF_K:
+ A >>= K;
+ continue;
+ case BPF_ALU | BPF_MOV | BPF_X:
+ A = X;
+ continue;
+ case BPF_ALU | BPF_MOV | BPF_K:
+ A = K;
+ continue;
+ case BPF_ALU | BPF_ARSH | BPF_X:
+ (*(s64 *) &A) >>= X;
+ continue;
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ (*(s64 *) &A) >>= K;
+ continue;
+ case BPF_ALU | BPF_BSWAP32 | BPF_X:
+ A = __builtin_bswap32(A);
+ continue;
+ case BPF_ALU | BPF_BSWAP64 | BPF_X:
+ A = __builtin_bswap64(A);
+ continue;
+ case BPF_ALU | BPF_MOD | BPF_X:
+ A %= X;
+ continue;
+ case BPF_ALU | BPF_MOD | BPF_K:
+ A %= K;
+ continue;
+
+ /* CALL */
+ case BPF_JMP | BPF_CALL:
+ prog->cb->execute_func(K, regs);
+ continue;
+
+ /* JMP */
+ case BPF_JMP | BPF_JA | BPF_X:
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ if (A == X)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ if (A == K)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JNE | BPF_X:
+ if (A != X)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JNE | BPF_K:
+ if (A != K)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JGT | BPF_X:
+ if (A > X)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JGT | BPF_K:
+ if (A > K)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JGE | BPF_X:
+ if (A >= X)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JGE | BPF_K:
+ if (A >= K)
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ if (((s64)A) > ((s64)X))
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ if (((s64)A) > ((s64)K))
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ if (((s64)A) >= ((s64)X))
+ insn += insn->off;
+ continue;
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ if (((s64)A) >= ((s64)K))
+ insn += insn->off;
+ continue;
+
+ /* STX */
+ case BPF_STX | BPF_REL | BPF_B:
+ *(u8 *)(A + insn->off) = X;
+ continue;
+ case BPF_STX | BPF_REL | BPF_H:
+ *(u16 *)(A + insn->off) = X;
+ continue;
+ case BPF_STX | BPF_REL | BPF_W:
+ *(u32 *)(A + insn->off) = X;
+ continue;
+ case BPF_STX | BPF_REL | BPF_DW:
+ *(u64 *)(A + insn->off) = X;
+ continue;
+
+ /* ST */
+ case BPF_ST | BPF_REL | BPF_B:
+ *(u8 *)(A + insn->off) = K;
+ continue;
+ case BPF_ST | BPF_REL | BPF_H:
+ *(u16 *)(A + insn->off) = K;
+ continue;
+ case BPF_ST | BPF_REL | BPF_W:
+ *(u32 *)(A + insn->off) = K;
+ continue;
+ case BPF_ST | BPF_REL | BPF_DW:
+ *(u64 *)(A + insn->off) = K;
+ continue;
+
+ /* LDX */
+ case BPF_LDX | BPF_REL | BPF_B:
+ A = *(u8 *)(X + insn->off);
+ continue;
+ case BPF_LDX | BPF_REL | BPF_H:
+ A = *(u16 *)(X + insn->off);
+ continue;
+ case BPF_LDX | BPF_REL | BPF_W:
+ A = *(u32 *)(X + insn->off);
+ continue;
+ case BPF_LDX | BPF_REL | BPF_DW:
+ A = *(u64 *)(X + insn->off);
+ continue;
+
+ /* STX XADD */
+ case BPF_STX | BPF_XADD | BPF_B:
+ __sync_fetch_and_add((u8 *)(A + insn->off), (u8)X);
+ continue;
+ case BPF_STX | BPF_XADD | BPF_H:
+ __sync_fetch_and_add((u16 *)(A + insn->off), (u16)X);
+ continue;
+ case BPF_STX | BPF_XADD | BPF_W:
+ __sync_fetch_and_add((u32 *)(A + insn->off), (u32)X);
+ continue;
+ case BPF_STX | BPF_XADD | BPF_DW:
+ __sync_fetch_and_add((u64 *)(A + insn->off), (u64)X);
+ continue;
+
+ /* RET */
+ case BPF_RET | BPF_K:
+ return;
+ default:
+ /* bpf_check() will guarantee that
+ * we never reach here
+ */
+ pr_err("unknown opcode %02x\n", insn->code);
+ return;
+ }
+ }
+}
+EXPORT_SYMBOL(bpf_run);
+
+int bpf_load(struct bpf_image *image, struct bpf_callbacks *cb,
+ struct bpf_program **p_prog)
+{
+ struct bpf_program *prog;
+ int ret;
+
+ if (!image || !cb || !cb->execute_func || !cb->get_func_proto ||
+ !cb->get_context_access)
+ return -EINVAL;
+
+ if (image->insn_cnt <= 0 || image->insn_cnt > 32768 ||
+ image->table_cnt < 0 || image->table_cnt > 128) {
+ pr_err("BPF program has %d insn and %d tables. Max is 32K/128\n",
+ image->insn_cnt, image->table_cnt);
+ return -E2BIG;
+ }
+
+ prog = kzalloc(sizeof(struct bpf_program), GFP_KERNEL);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->insn_cnt = image->insn_cnt;
+ prog->table_cnt = image->table_cnt;
+ prog->cb = cb;
+
+ prog->insns = kmalloc(sizeof(struct bpf_insn) * prog->insn_cnt,
+ GFP_KERNEL);
+ if (!prog->insns) {
+ ret = -ENOMEM;
+ goto free_prog;
+ }
+
+ prog->tables = kmalloc(sizeof(struct bpf_table) * prog->table_cnt,
+ GFP_KERNEL);
+ if (!prog->tables) {
+ ret = -ENOMEM;
+ goto free_insns;
+ }
+
+ if (copy_from_user(prog->insns, image->insns,
+ sizeof(struct bpf_insn) * prog->insn_cnt)) {
+ ret = -EFAULT;
+ goto free_tables;
+ }
+
+ if (copy_from_user(prog->tables, image->tables,
+ sizeof(struct bpf_table) * prog->table_cnt)) {
+ ret = -EFAULT;
+ goto free_tables;
+ }
+
+ /* verify BPF program */
+ ret = bpf_check(prog);
+ if (ret)
+ goto free_tables;
+
+ /* JIT it */
+ bpf2_jit_compile(prog);
+
+ *p_prog = prog;
+
+ return 0;
+
+free_tables:
+ kfree(prog->tables);
+free_insns:
+ kfree(prog->insns);
+free_prog:
+ kfree(prog);
+ return ret;
+}
+EXPORT_SYMBOL(bpf_load);
+
+void bpf_free(struct bpf_program *prog)
+{
+ if (!prog)
+ return;
+ bpf2_jit_free(prog);
+ kfree(prog->tables);
+ kfree(prog->insns);
+ kfree(prog);
+}
+EXPORT_SYMBOL(bpf_free);
+
--
1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists