Date:   Mon, 14 May 2018 23:22:32 +0200
From:   Daniel Borkmann <daniel@...earbox.net>
To:     alexei.starovoitov@...il.com
Cc:     netdev@...r.kernel.org, Daniel Borkmann <daniel@...earbox.net>
Subject: [PATCH bpf-next v2 6/8] bpf, arm64: optimize 32/64 immediate emission

Improve the JIT to emit 64 and 32 bit immediates more efficiently:
the current algorithm is not optimal and we often emit more
instructions than actually needed. arm64 has movz, movn and movk
variants, but for 64 bit immediates we currently only use movz
followed by a series of movk when needed.
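
For reference, the move wide instructions involved behave as follows
(Rd is the destination register, s a 16 bit aligned shift):

  * movz: Rd = imm16 << s, all other bits cleared
  * movn: Rd = ~(imm16 << s)
  * movk: bits [s+15:s] of Rd are replaced by imm16, all other bits kept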

For example, loading ffffffffffffabab emits the following 4
instructions in the JIT today:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: ffff, shift: 16, result: 00000000ffffabab
  * movk: ffff, shift: 32, result: 0000ffffffffabab
  * movk: ffff, shift: 48, result: ffffffffffffabab

After the patch, the same load needs only a single
instruction:

  * movn: 5454, shift:  0, result: ffffffffffffabab

Another example where two instructions can be saved; before the patch:

  * movz: abab, shift:  0, result: 000000000000abab
  * movk: 1f2f, shift: 16, result: 000000001f2fabab
  * movk: ffff, shift: 32, result: 0000ffff1f2fabab
  * movk: ffff, shift: 48, result: ffffffff1f2fabab

After the patch:

  * movn: e0d0, shift: 16, result: ffffffff1f2fffff
  * movk: abab, shift:  0, result: ffffffff1f2fabab

Another example with movz, before:

  * movz: 0000, shift:  0, result: 0000000000000000
  * movk: fea0, shift: 32, result: 0000fea000000000

After:

  * movz: fea0, shift: 32, result: 0000fea000000000
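
The sequences above can be reproduced with the following stand-alone
user space sketch (not part of the patch; it mirrors the selection
logic added below for the 64 bit path, with the kernel's
fls64()/round_down() replaced by a compiler builtin and masking):

  #include <stdio.h>
  #include <stdint.h>

  /* Count the 16 bit chunks that are not 0x0000 (movz path) resp. not
   * 0xffff (movn path); fewer such chunks means fewer instructions.
   */
  static int i64_i16_blocks(uint64_t val, int inverse)
  {
          uint16_t skip = inverse ? 0xffff : 0x0000;

          return (((val >>  0) & 0xffff) != skip) +
                 (((val >> 16) & 0xffff) != skip) +
                 (((val >> 32) & 0xffff) != skip) +
                 (((val >> 48) & 0xffff) != skip);
  }

  /* Print the movn/movz + movk sequence chosen for a 64 bit immediate
   * that does not fit into 32 bits (32 bit values take a different
   * path in the JIT).
   */
  static void show_mov_sequence(uint64_t val)
  {
          int inverse = i64_i16_blocks(val, 1) < i64_i16_blocks(val, 0);
          uint64_t seed = inverse ? ~val : val;
          int shift = seed ? ((63 - __builtin_clzll(seed)) & ~15) : 0;

          printf("%016llx:\n", (unsigned long long)val);
          printf("  %s: %04llx, shift: %2d\n", inverse ? "movn" : "movz",
                 (unsigned long long)((seed >> shift) & 0xffff), shift);
          for (shift -= 16; shift >= 0; shift -= 16)
                  if (((val >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
                          printf("  movk: %04llx, shift: %2d\n",
                                 (unsigned long long)((val >> shift) & 0xffff),
                                 shift);
  }

  int main(void)
  {
          show_mov_sequence(0xffffffffffffababULL); /* single movn */
          show_mov_sequence(0xffffffff1f2fababULL); /* movn + movk */
          show_mov_sequence(0x0000fea000000000ULL); /* single movz */
          return 0;
  }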

Moreover, reuse emit_a64_mov_i() for the 32 bit immediates that are
loaded via emit_a64_mov_i64(), which is a similar optimization to the
one done in 6fe8b9c1f41d ("bpf, x64: save several bytes by using
mov over movabsq when possible"). On arm64, this allows a 32 bit
immediate to be loaded with a single movn thanks to zero extension,
where otherwise two instructions would be needed (illustrated below).
And last but not least, add a missing optimization to emit_a64_mov_i()
where movn is used but the subsequent movk is not needed. With some of
the Cilium programs in use, this shrinks the number of emitted
instructions by about three percent. Tested on Cavium ThunderX CN8890.
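
To illustrate the 32 bit reuse (fffffffb is a made-up value, not taken
from a real program): emit_a64_mov_i64() previously decomposed it into
two instructions:

  * movz: fffb, shift:  0, result: 000000000000fffb
  * movk: ffff, shift: 16, result: 00000000fffffffb

Going through emit_a64_mov_i() with a 32 bit destination instead, a
single movn is enough since the upper half is cleared by zero
extension:

  * movn: 0004, shift:  0, result: 00000000fffffffb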

Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
---
 arch/arm64/net/bpf_jit_comp.c | 85 +++++++++++++++++++++++++++----------------
 1 file changed, 54 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 85113ca..c8a2620 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -79,23 +79,66 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx)
 	ctx->idx++;
 }
 
+static inline void emit_a64_mov_i(const int is64, const int reg,
+				  const s32 val, struct jit_ctx *ctx)
+{
+	u16 hi = val >> 16;
+	u16 lo = val & 0xffff;
+
+	if (hi & 0x8000) {
+		if (hi == 0xffff) {
+			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
+		} else {
+			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
+			if (lo != 0xffff)
+				emit(A64_MOVK(is64, reg, lo, 0), ctx);
+		}
+	} else {
+		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
+		if (hi)
+			emit(A64_MOVK(is64, reg, hi, 16), ctx);
+	}
+}
+
+static int i64_i16_blocks(const u64 val, bool inverse)
+{
+	return (((val >>  0) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 16) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 32) & 0xffff) != (inverse ? 0xffff : 0x0000)) +
+	       (((val >> 48) & 0xffff) != (inverse ? 0xffff : 0x0000));
+}
+
 static inline void emit_a64_mov_i64(const int reg, const u64 val,
 				    struct jit_ctx *ctx)
 {
-	u64 tmp = val;
-	int shift = 0;
-
-	emit(A64_MOVZ(1, reg, tmp & 0xffff, shift), ctx);
-	tmp >>= 16;
-	shift += 16;
-	while (tmp) {
-		if (tmp & 0xffff)
-			emit(A64_MOVK(1, reg, tmp & 0xffff, shift), ctx);
-		tmp >>= 16;
-		shift += 16;
+	u64 nrm_tmp = val, rev_tmp = ~val;
+	bool inverse;
+	int shift;
+
+	if (!(nrm_tmp >> 32))
+		return emit_a64_mov_i(0, reg, (u32)val, ctx);
+
+	inverse = i64_i16_blocks(nrm_tmp, true) < i64_i16_blocks(nrm_tmp, false);
+	shift = max(round_down((inverse ? (fls64(rev_tmp) - 1) :
+					  (fls64(nrm_tmp) - 1)), 16), 0);
+	if (inverse)
+		emit(A64_MOVN(1, reg, (rev_tmp >> shift) & 0xffff, shift), ctx);
+	else
+		emit(A64_MOVZ(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+	shift -= 16;
+	while (shift >= 0) {
+		if (((nrm_tmp >> shift) & 0xffff) != (inverse ? 0xffff : 0x0000))
+			emit(A64_MOVK(1, reg, (nrm_tmp >> shift) & 0xffff, shift), ctx);
+		shift -= 16;
 	}
 }
 
+/*
+ * This is an unoptimized 64 immediate emission used for BPF to BPF call
+ * addresses. It will always do a full 64 bit decomposition as otherwise
+ * more complexity in the last extra pass is required since we previously
+ * reserved 4 instructions for the address.
+ */
 static inline void emit_addr_mov_i64(const int reg, const u64 val,
 				     struct jit_ctx *ctx)
 {
@@ -110,26 +153,6 @@ static inline void emit_addr_mov_i64(const int reg, const u64 val,
 	}
 }
 
-static inline void emit_a64_mov_i(const int is64, const int reg,
-				  const s32 val, struct jit_ctx *ctx)
-{
-	u16 hi = val >> 16;
-	u16 lo = val & 0xffff;
-
-	if (hi & 0x8000) {
-		if (hi == 0xffff) {
-			emit(A64_MOVN(is64, reg, (u16)~lo, 0), ctx);
-		} else {
-			emit(A64_MOVN(is64, reg, (u16)~hi, 16), ctx);
-			emit(A64_MOVK(is64, reg, lo, 0), ctx);
-		}
-	} else {
-		emit(A64_MOVZ(is64, reg, lo, 0), ctx);
-		if (hi)
-			emit(A64_MOVK(is64, reg, hi, 16), ctx);
-	}
-}
-
 static inline int bpf2a64_offset(int bpf_to, int bpf_from,
 				 const struct jit_ctx *ctx)
 {
-- 
2.9.5
