Message-Id: <20230417154737.12740-6-laoar.shao@gmail.com>
Date:   Mon, 17 Apr 2023 15:47:36 +0000
From:   Yafang Shao <laoar.shao@...il.com>
To:     ast@...nel.org, daniel@...earbox.net, andrii@...nel.org,
        kafai@...com, songliubraving@...com, yhs@...com,
        john.fastabend@...il.com, kpsingh@...nel.org, sdf@...gle.com,
        haoluo@...gle.com, jolsa@...nel.org, rostedt@...dmis.org,
        mhiramat@...nel.org
Cc:     bpf@...r.kernel.org, linux-trace-kernel@...r.kernel.org,
        linux-kernel@...r.kernel.org, Yafang Shao <laoar.shao@...il.com>
Subject: [PATCH bpf-next 5/6] bpf: Improve tracing recursion prevention mechanism

Currently we use prog->active to prevent tracing recursion, but it has
some downsides:

- It can't identify different contexts
  That is, if a process context is interrupted by an irq context and the
  irq context runs the same code path, it is wrongly considered as
  recursion. For example (see also the sketch after this list):
    normal:
      this_cpu_inc_return(*(prog->active)) == 1 <- OK

      irq:
        this_cpu_inc_return(*(prog->active)) == 1 <- FAIL!
        [ Wrongly considered as recursion ]

- It has to maintain a percpu area
  A percpu area is allocated for each prog when the prog is loaded and
  freed when the prog is destroyed.
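
For reference, here is a minimal sketch of the old counter-based check,
mirroring the code removed from __bpf_trace_run() below (it is not part
of this patch); the comment marks where an interrupting irq trips over
the already-raised counter:

    if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
            /* An irq that interrupts between the inc above and the
             * dec below sees a counter that is already non-zero, so
             * its prog invocation is rejected as "recursion" even
             * though it runs in a different context.
             */
            bpf_prog_inc_misses_counter(prog);
            goto out;
    }
    /* ... run the prog ... */
out:
    this_cpu_dec(*(prog->active));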

Let's replace it with the generic tracing recursion prevention mechanism,
which tracks recursion per context (normal, softirq, irq and NMI) rather
than per prog. With it, the interrupting contexts in the above example
are no longer treated as recursion:
  normal:
    test_recursion_try_acquire() <- OK

    softirq:
      test_recursion_try_acquire() <- OK

      irq:
        test_recursion_try_acquire() <- OK
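
A minimal sketch of the acquire/release pairing this patch switches to,
following the __bpf_trace_run() hunk below; the function names and
arguments are taken from the diff:

    int bit;

    bit = test_recursion_try_acquire(_THIS_IP_, _RET_IP_);
    if (bit < 0) {
            /* Re-entered from the same context: genuine recursion. */
            bpf_prog_inc_misses_counter(prog);
            return;
    }
    /* Run the prog; an irq arriving here acquires a bit for its own
     * context and is therefore not rejected.
     */
    (void) bpf_prog_run(prog, args);
    test_recursion_release(bit);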

Note that one single recursion in process context is currently allowed
due to the TRACE_CTX_TRANSITION workaround, which can be fixed in the
future. That is, the behavior below is currently expected:
  normal:
    test_recursion_try_acquire() <- OK
    [ recursion happens ]        <- one single recursion is allowed
    test_recursion_try_acquire() <- OK
    [ recursion happens ]
    test_recursion_try_acquire() <- RECURSION!

Signed-off-by: Yafang Shao <laoar.shao@...il.com>
---
 include/linux/bpf.h      |  2 +-
 kernel/bpf/core.c        | 10 ----------
 kernel/bpf/trampoline.c  | 44 +++++++++++++++++++++++++++++++++-----------
 kernel/trace/bpf_trace.c | 12 +++++++-----
 4 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 18b592f..c42ff90 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1467,7 +1467,6 @@ struct bpf_prog {
 	u32			jited_len;	/* Size of jited insns in bytes */
 	u8			tag[BPF_TAG_SIZE];
 	struct bpf_prog_stats __percpu *stats;
-	int __percpu		*active;
 	unsigned int		(*bpf_func)(const void *ctx,
 					    const struct bpf_insn *insn);
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
@@ -1813,6 +1812,7 @@ struct bpf_tramp_run_ctx {
 	struct bpf_run_ctx run_ctx;
 	u64 bpf_cookie;
 	struct bpf_run_ctx *saved_run_ctx;
+	int recursion_bit;
 };
 
 static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7421487..0942ab2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -103,12 +103,6 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 		vfree(fp);
 		return NULL;
 	}
-	fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
-	if (!fp->active) {
-		vfree(fp);
-		kfree(aux);
-		return NULL;
-	}
 
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
@@ -138,7 +132,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 
 	prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
 	if (!prog->stats) {
-		free_percpu(prog->active);
 		kfree(prog->aux);
 		vfree(prog);
 		return NULL;
@@ -256,7 +249,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 		 */
 		fp_old->aux = NULL;
 		fp_old->stats = NULL;
-		fp_old->active = NULL;
 		__bpf_prog_free(fp_old);
 	}
 
@@ -272,7 +264,6 @@ void __bpf_prog_free(struct bpf_prog *fp)
 		kfree(fp->aux);
 	}
 	free_percpu(fp->stats);
-	free_percpu(fp->active);
 	vfree(fp);
 }
 
@@ -1385,7 +1376,6 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)
 	 */
 	fp->aux = NULL;
 	fp->stats = NULL;
-	fp->active = NULL;
 	__bpf_prog_free(fp);
 }
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index f61d513..3df39a5 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -842,15 +842,21 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
 static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
 	__acquires(RCU)
 {
-	rcu_read_lock();
-	migrate_disable();
-
-	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+	int bit;
 
-	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+	rcu_read_lock();
+	bit = test_recursion_try_acquire(_THIS_IP_, _RET_IP_);
+	run_ctx->recursion_bit = bit;
+	if (bit < 0) {
+		preempt_disable_notrace();
 		bpf_prog_inc_misses_counter(prog);
+		preempt_enable_notrace();
 		return 0;
 	}
+
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
 	return bpf_prog_start_time();
 }
 
@@ -880,11 +886,16 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
 					  struct bpf_tramp_run_ctx *run_ctx)
 	__releases(RCU)
 {
+	if (run_ctx->recursion_bit < 0)
+		goto out;
+
 	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
 
 	update_prog_stats(prog, start);
-	this_cpu_dec(*(prog->active));
 	migrate_enable();
+	test_recursion_release(run_ctx->recursion_bit);
+
+out:
 	rcu_read_unlock();
 }
 
@@ -916,15 +927,21 @@ static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
 u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 					     struct bpf_tramp_run_ctx *run_ctx)
 {
-	rcu_read_lock_trace();
-	migrate_disable();
-	might_fault();
+	int bit;
 
-	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+	rcu_read_lock_trace();
+	bit = test_recursion_try_acquire(_THIS_IP_, _RET_IP_);
+	run_ctx->recursion_bit = bit;
+	if (bit < 0) {
+		preempt_disable_notrace();
 		bpf_prog_inc_misses_counter(prog);
+		preempt_enable_notrace();
 		return 0;
 	}
 
+	migrate_disable();
+	might_fault();
+
 	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
 
 	return bpf_prog_start_time();
@@ -933,11 +950,16 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
 					     struct bpf_tramp_run_ctx *run_ctx)
 {
+	if (run_ctx->recursion_bit < 0)
+		goto out;
+
 	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
 
 	update_prog_stats(prog, start);
-	this_cpu_dec(*(prog->active));
 	migrate_enable();
+	test_recursion_release(run_ctx->recursion_bit);
+
+out:
 	rcu_read_unlock_trace();
 }
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index bcf91bc..bb9a4c9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2250,16 +2250,18 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 static __always_inline
 void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
-	cant_sleep();
-	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+	int bit;
+
+	bit = test_recursion_try_acquire(_THIS_IP_, _RET_IP_);
+	if (bit < 0) {
 		bpf_prog_inc_misses_counter(prog);
-		goto out;
+		return;
 	}
+	cant_sleep();
 	rcu_read_lock();
 	(void) bpf_prog_run(prog, args);
 	rcu_read_unlock();
-out:
-	this_cpu_dec(*(prog->active));
+	test_recursion_release(bit);
 }
 
 #define UNPACK(...)			__VA_ARGS__
-- 
1.8.3.1
