Date:   Mon, 14 May 2018 23:22:32 +0200
From:   Daniel Borkmann <daniel@...earbox.net>
To:     alexei.starovoitov@...il.com
Cc:     netdev@...r.kernel.org, Daniel Borkmann <daniel@...earbox.net>
Subject: [PATCH bpf-next v2 6/8] bpf, arm64: optimize 32/64 immediate emission

Improve the JIT to emit 64 and 32 bit immediates more efficiently:
the current algorithm is not optimal and we often emit more
instructions than actually needed. arm64 has movz, movn and movk
variants, but for 64 bit immediates we currently only use movz
followed by a series of movk when needed.
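
For reference, the move wide instructions involved behave as follows
(Rd is the destination register, s a 16 bit aligned shift):

  * movz: Rd = imm16 << s, all other bits cleared
  * movn: Rd = ~(imm16 << s)
  * movk: bits [s+15:s] of Rd are replaced by imm16, all other bits kept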

For example, loading ffffffffffffabab emits the following 4
instructions in the JIT today:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: ffff, shift: 16, result: 00000000ffffabab
  * movk: ffff, shift: 32, result: 0000ffffffffabab
  * movk: ffff, shift: 48, result: ffffffffffffabab

After the patch, the same load needs only a single
instruction:

  * movn: 5454, shift:  0, result: ffffffffffffabab

Another example where two instructions can be saved; before the patch:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: 1f2f, shift: 16, result: 000000001f2fabab
  * movk: ffff, shift: 32, result: 0000ffff1f2fabab
  * movk: ffff, shift: 48, result: ffffffff1f2fabab

After the patch:

  * movn: e0d0, shift: 16, result: ffffffff1f2fffff
  * movk: abab, shift:  0, result: ffffffff1f2fabab

Another example with movz, before:

  * movz: 0000, shift:  0, result: 0000000000000000
  * movk: fea0, shift: 32, result: 0000fea000000000

After:

  * movz: fea0, shift: 32, result: 0000fea000000000
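
The sequences above can be reproduced with the following stand-alone
user space sketch (not part of the patch; it mirrors the selection
logic added below for the 64 bit path, with the kernel's
fls64()/round_down() replaced by a compiler builtin and masking):

  #include <stdio.h>
  #include <stdint.h>

  /* Count the 16 bit chunks that are not 0x0000 (movz path) resp. not
   * 0xffff (movn path); fewer such chunks means fewer instructions.
   */
  static int i64_i16_blocks(uint64_t val, int inverse)
  {
          uint16_t skip = inverse ? 0xffff : 0x0000;

          return (((val >>  0) & 0xffff) != skip) +
                 (((val >> 16) & 0xffff) != skip) +
                 (((val >> 32) & 0xffff) != skip) +
                 (((val >> 48) & 0xffff) != skip);
  }

  /* Print the movn/movz + movk sequence chosen for a 64 bit immediate
   * that does not fit into 32 bits (32 bit values take a different
   * path in the JIT).
   */
  static void show_mov_sequence(uint64_t val)
  {
          int inverse = i64_i16_blocks(val, 1) < i64_i16_blocks(val, 0);
          uint64_t seed = inverse ? ~val : val;
          int shift = seed ? ((63 - __builtin_clzll(seed)) & ~15) : 0;

          printf("%016llx:\n", (unsigned long long)val);
          printf("  %s: %04llx, shift: %2d\n", inverse ? "movn" : "movz",
                 (unsigned long long)((seed >> shift) & 0xffff), shift);
          for (shift -= 16; shift >= 0; shift -= 16)
                  if (((val >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
                          printf("  movk: %04llx, shift: %2d\n",
                                 (unsigned long long)((val >> shift) & 0xffff),
                                 shift);
  }

  int main(void)
  {
          show_mov_sequence(0xffffffffffffababULL); /* single movn */
          show_mov_sequence(0xffffffff1f2fababULL); /* movn + movk */
          show_mov_sequence(0x0000fea000000000ULL); /* single movz */
          return 0;
  }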

Moreover, reuse emit_a64_mov_i() for the 32 bit immediates that are
loaded via emit_a64_mov_i64(), which is a similar optimization to the
one done in 6fe8b9c1f41d ("bpf, x64: save several bytes by using
mov over movabsq when possible"). On arm64, this allows a 32 bit
immediate to be loaded with a single movn thanks to zero extension,
where otherwise two instructions would be needed (illustrated below).
And last but not least, add a missing optimization to emit_a64_mov_i()
where movn is used but the subsequent movk is not needed. With some of
the Cilium programs in use, this shrinks the number of emitted
instructions by about three percent. Tested on Cavium ThunderX CN8890.
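
To illustrate the 32 bit reuse (fffffffb is a made-up value, not taken
from a real program): emit_a64_mov_i64() previously decomposed it into
two instructions:

  * movz: fffb, shift:  0, result: 000000000000fffb
  * movk: ffff, shift: 16, result: 00000000fffffffb

Going through emit_a64_mov_i() with a 32 bit destination instead, a
single movn is enough since the upper half is cleared by zero
extension:

  * movn: 0004, shift:  0, result: 00000000fffffffb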

Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
---
 arch/arm64/net/bpf_jit_comp.c | 85 +++++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 85113ca..c8a2620 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -79,23 +79,66 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx)
 	ctx->idx++;
 }
 
+static inline void emit_a64_mov_i(const int is64, const int reg,
+				  const s32 val, struct jit_ctx *ctx)
+{
+	u16 hi = val >> 16;
+	u16 lo = val & 0xffff;
+
+	if (hi & 0x8000) {
+		if (hi == 0xffff) {
+			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
+		} else {
+			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
+			if (lo != 0xffff)
+				emit(A64_MOVK(is64, reg, lo, 0), ctx);
+		}
+	} else {
+		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
+		if (hi)
+			emit(A64_MOVK(is64, reg, hi, 16), ctx);
+	}
+}
+
+static int i64_i16_blocks(const u64 val, bool inverse)
+{
+	return (((val >>  0) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 16) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 32) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 48) & 0xffff) != (inverse ? 0xffff : 0x0000));
+}
+
 static inline void emit_a64_mov_i64(const int reg, const u64 val,
 				    struct jit_ctx *ctx)
 {
-	u64 tmp = val;
-	int shift = 0;
-
-	emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx);
-	tmp >>= 16;
-	shift += 16;
-	while (tmp) {
-		if (tmp & 0xffff)
-			emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx);
-		tmp >>= 16;
-		shift += 16;
+	u64 nrm_tmp = val, rev_tmp = ~val;
+	bool inverse;
+	int shift;
+
+	if (!(nrm_tmp >> 32))
+		return emit_a64_mov_i(0, reg, (u32)val, ctx);
+
+	inverse = i64_i16_blocks(nrm_tmp, true) < i64_i16_blocks(nrm_tmp, false);
+	shift = max(round_down((inverse ? (fls64(rev_tmp) - 1) :
+					  (fls64(nrm_tmp) - 1)), 16), 0);
+	if (inverse)
+		emit(A64_MOVN(1, reg, (rev_tmp >> shift) & 0xffff, shift), ctx);
+	else
+		emit(A64_MOVZ(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+	shift -= 16;
+	while (shift >= 0) {
+		if (((nrm_tmp >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
+			emit(A64_MOVK(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+		shift -= 16;
 	}
 }
 
+/*
+ * This is an unoptimized 64 immediate emission used for BPF to BPF call
+ * addresses. It will always do a full 64 bit decomposition as otherwise
+ * more complexity in the last extra pass is required since we previously
+ * reserved 4 instructions for the address.
+ */
 static inline void emit_addr_mov_i64(const int reg, const u64 val,
 				     struct jit_ctx *ctx)
 {
@@ -110,26 +153,6 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
 	}
 }
 
-static inline void emit_a64_mov_i(const int is64, const int reg,
-				  const s32 val, struct jit_ctx *ctx)
-{
-	u16 hi = val >> 16;
-	u16 lo = val & 0xffff;
-
-	if (hi & 0x8000) {
-		if (hi == 0xffff) {
-			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
-		} else {
-			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
-			emit(A64_MOVK(is64, reg, lo, 0), ctx);
-		}
-	} else {
-		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
-		if (hi)
-			emit(A64_MOVK(is64, reg, hi, 16), ctx);
-	}
-}
-
 static inline int bpf2a64_offset(int bpf_to, int bpf_from,
 				 const struct jit_ctx *ctx)
 {
-- 
2.9.5
