[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241025194010.515253-1-mathieu.desnoyers@efficios.com>
Date: Fri, 25 Oct 2024 15:40:10 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Steven Rostedt <rostedt@...dmis.org>
Cc: linux-kernel@...r.kernel.org,
	Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
	syzbot+b390c8062d8387b6272a@...kaller.appspotmail.com,
	Michael Jeanson <mjeanson@...icios.com>,
	Masami Hiramatsu <mhiramat@...nel.org>,
	Peter Zijlstra <peterz@...radead.org>,
	Alexei Starovoitov <ast@...nel.org>,
	Yonghong Song <yhs@...com>,
	"Paul E . McKenney" <paulmck@...nel.org>,
	Ingo Molnar <mingo@...hat.com>,
	Arnaldo Carvalho de Melo <acme@...nel.org>,
	Mark Rutland <mark.rutland@....com>,
	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Namhyung Kim <namhyung@...nel.org>,
	Andrii Nakryiko <andrii.nakryiko@...il.com>,
	bpf@...r.kernel.org,
	Joel Fernandes <joel@...lfernandes.org>,
	Jordan Rife <jrife@...gle.com>
Subject: [RFC PATCH v2] tracing: Fix syscall tracepoint use-after-free
release_probes() in kernel/tracepoint.c uses call_rcu() to batch the
wait for quiescence of old probe arrays, rather than calling
tracepoint_synchronize_unregister(), which blocks while waiting for
quiescence.
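With the introduction of faultable syscall tracepoints, whose probes
are protected by Tasks Trace RCU rather than regular RCU, a call_rcu()
grace period alone no longer guarantees that all probe callers are done
with the old probe array. This causes use-after-free issues, which were
reproduced with syzkaller.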
Fix this by introducing tracepoint_call_rcu(), which uses the
appropriate call_rcu() or call_rcu_tasks_trace() before invoking the
rcu_free_old_probes callback.
Use tracepoint_call_rcu() in bpf_link_free() for raw tracepoint links as
well, because bpf_link_free() has the same problem for syscall
tracepoints. Likewise, defer the bpf_prog_put() of a raw tracepoint
link's program through tracepoint_call_rcu().
Reported-by: syzbot+b390c8062d8387b6272a@...kaller.appspotmail.com
Fixes: a363d27cdbc2 ("tracing: Allow system call tracepoints to handle page faults")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: Michael Jeanson <mjeanson@...icios.com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Masami Hiramatsu <mhiramat@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Alexei Starovoitov <ast@...nel.org>
Cc: Yonghong Song <yhs@...com>
Cc: Paul E. McKenney <paulmck@...nel.org>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Arnaldo Carvalho de Melo <acme@...nel.org>
Cc: Mark Rutland <mark.rutland@....com>
Cc: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc: Namhyung Kim <namhyung@...nel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@...il.com>
Cc: bpf@...r.kernel.org
Cc: Joel Fernandes <joel@...lfernandes.org>
Cc: Jordan Rife <jrife@...gle.com>
---
Changes since v0:
- Introduce tracepoint_call_rcu().
- Fix bpf_link_free() use of call_rcu() as well.
Changes since v1:
- Use tracepoint_call_rcu() for bpf_prog_put as well.
---
 include/linux/tracepoint.h |  9 +++++++++
 kernel/bpf/syscall.c       | 36 +++++++++++++++++++++++++++---------
 kernel/tracepoint.c        | 22 ++++++++++++++++++----
 3 files changed, 54 insertions(+), 13 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 0dc67fad706c..45025d6b2dd6 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -104,6 +104,8 @@ void for_each_tracepoint_in_module(struct module *mod,
  * tracepoint_synchronize_unregister must be called between the last tracepoint
  * probe unregistration and the end of module exit to make sure there is no
  * caller executing a probe when it is freed.
+ * An alternative to tracepoint_synchronize_unregister() is to use
+ * tracepoint_call_rcu() for batched reclaim.
  */
 #ifdef CONFIG_TRACEPOINTS
 static inline void tracepoint_synchronize_unregister(void)
@@ -111,9 +113,16 @@ static inline void tracepoint_synchronize_unregister(void)
 	synchronize_rcu_tasks_trace();
 	synchronize_rcu();
 }
+
+void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+			 void (*callback)(struct rcu_head *head));
+
 #else
 static inline void tracepoint_synchronize_unregister(void)
 { }
+static inline void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+				       void (*callback)(struct rcu_head *head))
+{ }
 #endif
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 59de664e580d..f21000f33a61 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2992,28 +2992,46 @@ static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
 }
 
+static void bpf_link_defer_bpf_prog_put(struct rcu_head *rcu)
+{
+	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+	bpf_prog_put(aux->prog);
+}
+
 /* bpf_link_free is guaranteed to be called from process context */
 static void bpf_link_free(struct bpf_link *link)
 {
 	const struct bpf_link_ops *ops = link->ops;
+	struct bpf_raw_tp_link *raw_tp = NULL;
 	bool sleepable = false;
 
+	if (link->type == BPF_LINK_TYPE_RAW_TRACEPOINT)
+		raw_tp = container_of(link, struct bpf_raw_tp_link, link);
 	bpf_link_free_id(link->id);
 	if (link->prog) {
 		sleepable = link->prog->sleepable;
 		/* detach BPF program, clean up used resources */
 		ops->release(link);
-		bpf_prog_put(link->prog);
+		if (raw_tp)
+			tracepoint_call_rcu(raw_tp->btp->tp, &link->prog->aux->rcu,
+					    bpf_link_defer_bpf_prog_put);
+		else
+			bpf_prog_put(link->prog);
 	}
 	if (ops->dealloc_deferred) {
-		/* schedule BPF link deallocation; if underlying BPF program
-		 * is sleepable, we need to first wait for RCU tasks trace
-		 * sync, then go through "classic" RCU grace period
-		 */
-		if (sleepable)
-			call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
-		else
-			call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+		if (raw_tp) {
+			tracepoint_call_rcu(raw_tp->btp->tp, &link->rcu, bpf_link_defer_dealloc_rcu_gp);
+		} else {
+			/* schedule BPF link deallocation; if underlying BPF program
+			 * is sleepable, we need to first wait for RCU tasks trace
+			 * sync, then go through "classic" RCU grace period
+			 */
+			if (sleepable)
+				call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+			else
+				call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+		}
 	} else if (ops->dealloc)
 		ops->dealloc(link);
 }
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6474e2cf22c9..ef60c5484eda 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -106,13 +106,27 @@ static void rcu_free_old_probes(struct rcu_head *head)
 	kfree(container_of(head, struct tp_probes, rcu));
 }
 
-static inline void release_probes(struct tracepoint_func *old)
+static bool tracepoint_is_syscall(struct tracepoint *tp)
+{
+	return !strcmp(tp->name, "sys_enter") || !strcmp(tp->name, "sys_exit");
+}
+
+void tracepoint_call_rcu(struct tracepoint *tp, struct rcu_head *head,
+			 void (*callback)(struct rcu_head *head))
+{
+	if (tracepoint_is_syscall(tp))
+		call_rcu_tasks_trace(head, callback);
+	else
+		call_rcu(head, callback);
+}
+
+static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old)
 {
 	if (old) {
 		struct tp_probes *tp_probes = container_of(old,
 			struct tp_probes, probes[0]);
 
-		call_rcu(&tp_probes->rcu, rcu_free_old_probes);
+		tracepoint_call_rcu(tp, &tp_probes->rcu, rcu_free_old_probes);
 	}
 }
 
@@ -334,7 +348,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 		break;
 	}
 
-	release_probes(old);
+	release_probes(tp, old);
 	return 0;
 }
 
@@ -406,7 +420,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 		WARN_ON_ONCE(1);
 		break;
 	}
-	release_probes(old);
+	release_probes(tp, old);
 	return 0;
 }
 
-- 
2.39.5
Powered by blists - more mailing lists
 
