Date:   Tue,  5 Feb 2019 14:51:01 -0800
From:   Rick Edgecombe <rick.p.edgecombe@...el.com>
To:     daniel@...earbox.net, ast@...com
Cc:     netdev@...r.kernel.org, ard.biesheuvel@...aro.org,
        dave.hansen@...el.com, kristen@...ux.intel.com,
        Rick Edgecombe <rick.p.edgecombe@...el.com>
Subject: [RFC PATCH 2/4] bpf, x64: Increase distance for bpf calls

This allows BPF calls to be emitted that reach beyond the relative
call range. When a call cannot be emitted as a relative call, it is
emitted as a full indirect call.
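
For illustration, the two emitted forms differ roughly as follows (a
condensed sketch using the JIT's emit helpers, with the retpoline case
omitted; the real code paths are in the diff below):

	/* In relative range (+/- 2G): 5-byte call rel32 */
	EMIT1_off32(0xE8, jmp_offset);

	/* Out of range: materialize the 64-bit address and call
	 * indirectly, ~14 bytes, which later passes may shrink to 5
	 */
	EMIT1(0x53);				/* push %rbx */
	emit_mov_imm64(&prog, BPF_REG_6,	/* movabs $func, %rbx */
		       func_addr >> 32, (u32)func_addr);
	EMIT2(0xFF, 0xD3);			/* call *%rbx */
	EMIT1(0x5B);				/* pop %rbx */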

The image has to be allocated before the distance of a call can be
computed, and the distance determines which type of call to emit. The
two types of calls have different sizes, so the image must be allowed
to shrink further after it is created. After the image is allocated,
two more passes are needed: one to pick up the new sizes and one to
set the final offsets.

So the algorithm in bpf_int_jit_compile is changed to always do at
least two more passes after the program converges once. The old check
inside the loop, which verified that the program length stayed the
same after the image was created, is replaced by a check after the
loop that the image converged a second time, which also catches the
case where the maximum number of passes was reached first.
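
A minimal sketch of the intended pass logic (hypothetical and
simplified from the loop in the diff below; error handling and the
final dump are omitted):

	converged_pass = 1;
	for (pass = 0; pass < 22 && pass < converged_pass + 2; pass++) {
		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);

		if (proglen != oldproglen)
			converged_pass = pass + 1; /* sizes still changing */
		else if (!image)
			/* sizes stable: allocate, then re-emit into it */
			header = bpf_jit_binary_alloc(proglen, &image, 1,
						      jit_fill_hole);
		oldproglen = proglen;
	}
	/* by here the image either converged (two equal-size passes in
	 * a row) or we ran out of passes
	 */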

In the case of retpoline, the call needs to be made through a retpoline
sequence. This sequence is emitted at the end of the program so that it
can be re-used by multiple calls.
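
For reference, the classic retpoline call thunk for %rbx looks roughly
like this when written with the JIT's emit helpers. The exact bytes of
RETPOLINE_RBX_BPF_JIT_CALL() come from an earlier patch in this
series, so this is only an assumption modeled on the existing
RETPOLINE_RAX_BPF_JIT():

	EMIT1_off32(0xE8, 7);		/* call do_call (skip trap) */
	/* spec_trap: */
	EMIT2(0xF3, 0x90);		/* pause */
	EMIT3(0x0F, 0xAE, 0xE8);	/* lfence */
	EMIT2(0xEB, 0xF9);		/* jmp spec_trap */
	/* do_call: */
	EMIT4(0x48, 0x89, 0x1C, 0x24);	/* mov %rbx,(%rsp) */
	EMIT1(0xC3);			/* retq, "returns" into *%rbx */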

When the module space is used, this change is intended to leave the
actually executed code unchanged. With retpoline, however, the
allocation ends up slightly larger due to the thunk emitted at the end
of the image.

Cc: Daniel Borkmann <daniel@...earbox.net>
Cc: Alexei Starovoitov <ast@...com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@...el.com>
---
 arch/x86/net/bpf_jit_comp.c | 117 +++++++++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 23 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5542303c43d9..c9781d471e31 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -13,6 +13,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
+#include <linux/moduleloader.h>
 
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
@@ -408,13 +409,17 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
 	*pprog = prog;
 }
 
+#define RETPOL_THUNK_SIZE ((IS_ENABLED(CONFIG_RETPOLINE) \
+				* RETPOLINE_RBX_BPF_JIT_CALL_SIZE) + 1)
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
 	struct bpf_insn *insn = bpf_prog->insnsi;
 	int insn_cnt = bpf_prog->len;
 	bool seen_exit = false;
-	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	const int retpol_thunk = oldproglen - RETPOL_THUNK_SIZE;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY + RETPOL_THUNK_SIZE];
 	int i, cnt = 0;
 	int proglen = 0;
 	u8 *prog = temp;
@@ -430,7 +435,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		s64 jmp_offset;
 		u8 jmp_cond;
 		int ilen;
-		u8 *func;
 
 		switch (insn->code) {
 			/* ALU */
@@ -856,16 +860,49 @@ xadd:			if (is_imm8(insn->off))
 
 			/* call */
 		case BPF_JMP | BPF_CALL:
-			func = (u8 *) __bpf_call_base + imm32;
-			jmp_offset = func - (image + addrs[i]);
-			if (!imm32 || !is_simm32(jmp_offset)) {
-				pr_err("unsupported BPF func %d addr %p image %p\n",
-				       imm32, func, image);
-				return -EINVAL;
+		{
+			bool func_addr_fixed;
+			u64 func_addr;
+			int ret = bpf_jit_get_func_addr(bpf_prog, insn,
+				!image, &func_addr, &func_addr_fixed);
+			if (ret < 0)
+				return ret;
+
+			jmp_offset = func_addr - (u64)(image + addrs[i]);
+
+			/*
+			 * The allocation location must be known before we
+			 * can tell whether a relative call is in range.
+			 *
+			 * Until the image exists, always emit the indirect
+			 * version, so that later passes can only shrink.
+			 */
+			if (image && (imm32 && is_simm32(jmp_offset))) {
+				/* Emit relative call */
+
+				EMIT1_off32(0xE8, jmp_offset);
+			} else {
+				/* Emit indirect call */
+
+				EMIT1(0x53); /* push rbx */
+				emit_mov_imm64(&prog, BPF_REG_6,
+					(u32)(((u64)func_addr) >> 32),
+					(u32)(u64)func_addr);
+				/* -1 to account for pop rbx below */
+				jmp_offset = retpol_thunk - (addrs[i] - 1);
+
+				/*
+				 * If retpoline, jump to retpoline thunk, or
+				 * else emit the default indirect call version.
+				 */
+				if (IS_ENABLED(CONFIG_RETPOLINE))
+					EMIT1_off32(0xE8, jmp_offset);
+				else
+					RETPOLINE_RBX_BPF_JIT_CALL();
+				EMIT1(0x5B); /* pop rbx */
 			}
-			EMIT1_off32(0xE8, jmp_offset);
 			break;
-
+		}
 		case BPF_JMP | BPF_TAIL_CALL:
 			emit_bpf_tail_call(&prog);
 			break;
@@ -1049,6 +1086,27 @@ xadd:			if (is_imm8(insn->off))
 		addrs[i] = proglen;
 		prog = temp;
 	}
+
+	/*
+	 * If this allocation is far from the kernel text, we may need
+	 * to make calls through a retpoline. Emit the thunk here so it
+	 * can be re-used by multiple calls.
+	 */
+	if (IS_ENABLED(CONFIG_RETPOLINE)) {
+		int new_len = RETPOLINE_RBX_BPF_JIT_CALL_SIZE + 1;
+
+		RETPOLINE_RBX_BPF_JIT_CALL();
+		EMIT1(0xC3); /* ret */
+
+		if (image) {
+			if (unlikely(proglen + new_len > oldproglen)) {
+				pr_err("bpf_jit: fatal error\n");
+				return -EFAULT;
+			}
+			memcpy(image + proglen, temp, new_len);
+		}
+		proglen += new_len;
+	}
 	return proglen;
 }
 
@@ -1073,6 +1131,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	int *addrs;
 	int pass;
 	int i;
+	int converged_pass;
 
 	if (!prog->jit_requested)
 		return orig_prog;
@@ -1127,10 +1186,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	/*
 	 * JITed image shrinks with every pass and the loop iterates
 	 * until the image stops shrinking. Very large BPF programs
-	 * may converge on the last pass. In such case do one more
-	 * pass to emit the final image.
+	 * may converge on the last pass. In such a case one more
+	 * pass is needed to emit the final image.
+	 *
+	 * After the image memory is allocated, at least two more passes are
+	 * needed, since the emitted calls can shrink once they are within
+	 * relative range. So we always do at least two more passes after
+	 * the image first converges.
 	 */
-	for (pass = 0; pass < 20 || image; pass++) {
+	converged_pass = 1;
+	for (pass = 0; pass < 22 && pass < converged_pass + 2; pass++) {
 		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
 		if (proglen <= 0) {
 out_image:
@@ -1140,26 +1205,32 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 			prog = orig_prog;
 			goto out_addrs;
 		}
-		if (image) {
-			if (proglen != oldproglen) {
-				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
-				       proglen, oldproglen);
-				goto out_image;
-			}
-			break;
-		}
-		if (proglen == oldproglen) {
+
+		if (proglen != oldproglen)
+			converged_pass = pass + 1;
+
+		if (proglen == oldproglen && !image) {
 			header = bpf_jit_binary_alloc(proglen, &image,
-						      1, jit_fill_hole);
+						1, jit_fill_hole);
 			if (!header) {
 				prog = orig_prog;
 				goto out_addrs;
 			}
 		}
+
 		oldproglen = proglen;
 		cond_resched();
 	}
 
+	/* Verify that the image converged on the final pass */
+	if (image) {
+		if (proglen != oldproglen) {
+			pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				proglen, oldproglen);
+			goto out_image;
+		}
+	}
+
 	if (bpf_jit_enable > 1)
 		bpf_jit_dump(prog->len, proglen, pass + 1, image);
 
-- 
2.17.1
