netdev - [PATCH bpf-next 10/12] nfp: bpf: optimise save/restore for R6~R9 based on register usage

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1538913418-16039-11-git-send-email-quentin.monnet@netronome.com>
Date:   Sun,  7 Oct 2018 12:56:56 +0100
From:   Quentin Monnet <quentin.monnet@...ronome.com>
To:     Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>
Cc:     netdev@...r.kernel.org, oss-drivers@...ronome.com,
        Quentin Monnet <quentin.monnet@...ronome.com>
Subject: [PATCH bpf-next 10/12] nfp: bpf: optimise save/restore for R6~R9 based on register usage

When pre-processing the instructions, it is trivial to detect what
subprograms are using R6, R7, R8 or R9 as destination registers. If a
subprogram uses none of those, then we do not need to jump to the
subroutines dedicated to saving and restoring callee-saved registers in
its prologue and epilogue.

This patch introduces detection of callee-saved registers in subprograms
and prevents the JIT from adding calls to those subroutines whenever we
can: we save some instructions in the translated program, and some time
at runtime on BPF-to-BPF calls and returns.

If no subprogram needs to save those registers, we can avoid appending
the subroutines at the end of the program.

Signed-off-by: Quentin Monnet <quentin.monnet@...ronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@...ronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 85 ++++++++++++++++++-----
 drivers/net/ethernet/netronome/nfp/bpf/main.h     |  2 +
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 14 ++--
 3 files changed, 78 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 74423d3e714d..b393f9dea584 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -3132,7 +3132,9 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 			    NFP_CSR_ACT_LM_ADDR0);
 	}
 
-	/* The following steps are performed:
+	/* Two cases for jumping to the callee:
+	 *
+	 * - If callee uses and needs to save R6~R9 then:
 	 *     1. Put the start offset of the callee into imm_b(). This will
 	 *        require a fixup step, as we do not necessarily know this
 	 *        address yet.
@@ -3140,8 +3142,12 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	 *        register ret_reg().
 	 *     3. (After defer slots are consumed) Jump to the subroutine that
 	 *        pushes the registers to the stack.
-	 * The subroutine acts as a trampoline, and returns to the address in
-	 * imm_b(), i.e. jumps to the callee.
+	 *   The subroutine acts as a trampoline, and returns to the address in
+	 *   imm_b(), i.e. jumps to the callee.
+	 *
+	 * - If callee does not need to save R6~R9 then just load return
+	 *   address to the caller in ret_reg(), and jump to the callee
+	 *   directly.
 	 *
 	 * Using ret_reg() to pass the return address to the callee is set here
 	 * as a convention. The callee can then push this address onto its
@@ -3157,11 +3163,21 @@ bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	 *   execution of the callee, we will not have to push the return
 	 *   address to the stack for leaf functions.
 	 */
-	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
-		     RELO_BR_GO_CALL_PUSH_REGS);
-	offset_br = nfp_prog_current_offset(nfp_prog);
-	wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	if (!meta->jmp_dst) {
+		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
+		return -ELOOP;
+	}
+	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
+		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
+		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
+			     RELO_BR_GO_CALL_PUSH_REGS);
+		offset_br = nfp_prog_current_offset(nfp_prog);
+		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
+	} else {
+		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
+		emit_br(nfp_prog, BR_UNC, meta->n + 1 + meta->insn.imm, 1);
+		offset_br = nfp_prog_current_offset(nfp_prog);
+	}
 	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);
 
 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
@@ -3227,15 +3243,24 @@ static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int
 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-	/* Pop R6~R9 to the stack via related subroutine.
-	 * Pop return address for BPF-to-BPF call from the stack and load it
-	 * into ret_reg() before we jump. This means that the subroutine does
-	 * not come back here, we make it jump back to the subprogram caller
-	 * directly!
-	 */
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
-		     RELO_BR_GO_CALL_POP_REGS);
-	wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
+		/* Pop R6~R9 to the stack via related subroutine.
+		 * We loaded the return address to the caller into ret_reg().
+		 * This means that the subroutine does not come back here, we
+		 * make it jump back to the subprogram caller directly!
+		 */
+		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
+			     RELO_BR_GO_CALL_POP_REGS);
+		/* Pop return address from the stack. */
+		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+	} else {
+		/* Pop return address from the stack. */
+		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
+		/* Jump back to caller if no callee-saved registers were used
+		 * by the subprogram.
+		 */
+		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
+	}
 
 	return 0;
 }
@@ -3410,7 +3435,8 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 			return -ELOOP;
 		}
 
-		if (is_mbpf_pseudo_call(meta)) {
+		if (is_mbpf_pseudo_call(meta) &&
+		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
 			err = nfp_fixup_immed_relo(nfp_prog, meta,
 						   jmp_dst, br_idx);
 			if (err)
@@ -3549,6 +3575,17 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
 	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
+{
+	unsigned int idx;
+
+	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
+		if (nfp_prog->subprog[idx].needs_reg_push)
+			return true;
+
+	return false;
+}
+
 static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
 {
 	u8 reg;
@@ -3612,7 +3649,7 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 		WARN_ON(1);
 	}
 
-	if (nfp_prog->subprog_cnt == 1)
+	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
 		return;
 
 	nfp_push_callee_registers(nfp_prog);
@@ -4354,10 +4391,20 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 				      nfp_prog->tgt_abort + bv->start_off);
 			break;
 		case RELO_BR_GO_CALL_PUSH_REGS:
+			if (!nfp_prog->tgt_call_push_regs) {
+				pr_err("BUG: failed to detect subprogram registers needs\n");
+				err = -EINVAL;
+				goto err_free_prog;
+			}
 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
 			br_set_offset(&prog[i], off);
 			break;
 		case RELO_BR_GO_CALL_POP_REGS:
+			if (!nfp_prog->tgt_call_pop_regs) {
+				pr_err("BUG: failed to detect subprogram registers needs\n");
+				err = -EINVAL;
+				goto err_free_prog;
+			}
 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
 			br_set_offset(&prog[i], off);
 			break;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 1cef5136c198..44b787a0bd4b 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -452,9 +452,11 @@ static inline bool is_mbpf_pseudo_call(const struct nfp_insn_meta *meta)
 /**
  * struct nfp_bpf_subprog_info - nfp BPF sub-program (a.k.a. function) info
  * @stack_depth:	maximum stack depth used by this sub-program
+ * @needs_reg_push:	whether sub-program uses callee-saved registers
  */
 struct nfp_bpf_subprog_info {
 	u16 stack_depth;
+	u8 needs_reg_push : 1;
 };
 
 /**
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 81a463726d55..f31721bd1fac 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -644,7 +644,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 }
 
 static int
-nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
+nfp_assign_subprog_idx_and_regs(struct bpf_verifier_env *env,
+				struct nfp_prog *nfp_prog)
 {
 	struct nfp_insn_meta *meta;
 	int index = 0;
@@ -653,6 +654,10 @@ nfp_assign_subprog_idx(struct bpf_verifier_env *env, struct nfp_prog *nfp_prog)
 		if (nfp_is_subprog_start(meta))
 			index++;
 		meta->subprog_idx = index;
+
+		if (meta->insn.dst_reg >= BPF_REG_6 &&
+		    meta->insn.dst_reg <= BPF_REG_9)
+			nfp_prog->subprog[index].needs_reg_push = 1;
 	}
 
 	if (index + 1 != nfp_prog->subprog_cnt) {
@@ -734,7 +739,7 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	if (!nfp_prog->subprog)
 		return -ENOMEM;
 
-	nfp_assign_subprog_idx(env, nfp_prog);
+	nfp_assign_subprog_idx_and_regs(env, nfp_prog);
 
 	info = env->subprog_info;
 	for (i = 0; i < nfp_prog->subprog_cnt; i++) {
@@ -745,8 +750,9 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 
 		/* Account for size of return address. */
 		nfp_prog->subprog[i].stack_depth += REG_WIDTH;
-		/* Account for size of saved registers. */
-		nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
+		/* Account for size of saved registers, if necessary. */
+		if (nfp_prog->subprog[i].needs_reg_push)
+			nfp_prog->subprog[i].stack_depth += BPF_REG_SIZE * 4;
 	}
 
 	nn = netdev_priv(env->prog->aux->offload->netdev);
-- 
2.7.4