[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250917072242.674528-2-zhaofuyu@vivo.com>
Date: Wed, 17 Sep 2025 15:22:40 +0800
From: Fuyu Zhao <zhaofuyu@...o.com>
To: ast@...nel.org,
daniel@...earbox.net,
andrii@...nel.org,
martin.lau@...ux.dev,
song@...nel.org,
yonghong.song@...ux.dev,
haoluo@...gle.com,
jolsa@...nel.org,
eddyz87@...il.com,
kpsingh@...nel.org,
sdf@...ichev.me,
rostedt@...dmis.org,
mhiramat@...nel.org,
mathieu.desnoyers@...icios.com,
shuah@...nel.org,
willemb@...gle.com,
kerneljasonxing@...il.com,
paul.chaignon@...il.com,
chen.dylane@...ux.dev,
memxor@...il.com,
martin.kelly@...wdstrike.com,
zhaofuyu@...o.com,
ameryhung@...il.com,
linux-kernel@...r.kernel.org,
bpf@...r.kernel.org,
linux-trace-kernel@...r.kernel.org,
linux-kselftest@...r.kernel.org
Cc: yikai.lin@...o.com
Subject: [RFC PATCH bpf-next v1 1/3] bpf: Introduce BPF_PROG_TYPE_RAW_TRACEPOINT_OVERRIDE
This patch introduces a new program type, BPF_PROG_TYPE_RAW_TRACEPOINT_OVERRIDE.
Programs of this type require an additional parameter, probe_name, which is used
to locate the target tracepoint probe function registered by register_trace_* in
the kernel.
This type reuses the existing RAW_TRACEPOINT infrastructure and differs
only when probe_name is specified. In that case, the newly attached
RAW_TRACEPOINT_OVERRIDE program and the target probe function are paired
and stored in a snapshot.
When the BPF program is detached, snapshots are consulted to determine
whether restoration of the original probe function is required.
Signed-off-by: Fuyu Zhao <zhaofuyu@...o.com>
---
include/linux/bpf_types.h | 2 +
include/linux/trace_events.h | 9 ++
include/linux/tracepoint-defs.h | 6 +
include/linux/tracepoint.h | 3 +
include/uapi/linux/bpf.h | 2 +
kernel/bpf/syscall.c | 35 ++++--
kernel/trace/bpf_trace.c | 31 ++++++
kernel/tracepoint.c | 190 +++++++++++++++++++++++++++++++-
8 files changed, 269 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa78f49d4a9a..e5cf8a1af6cd 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -48,6 +48,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable,
struct bpf_raw_tracepoint_args, u64)
BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing,
void *, void *)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_OVERRIDE, raw_tracepoint_override,
+ struct bpf_raw_tracepoint_args, u64)
#endif
#ifdef CONFIG_CGROUP_BPF
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev,
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 04307a19cde3..fcb2d62d0c9f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -768,6 +768,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
struct bpf_raw_tp_link;
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
+int bpf_probe_override(struct bpf_raw_event_map *btp,
+ struct bpf_raw_tp_link *link,
+ const char *probe_name);
struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
@@ -805,6 +808,12 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf
{
return -EOPNOTSUPP;
}
+/*
+ * Fallback stub for bpf_probe_override(): performs no work and always
+ * reports -EOPNOTSUPP, like the neighbouring bpf_probe_unregister() stub.
+ * NOTE(review): the preprocessor condition selecting this stub lies outside
+ * this hunk -- confirm it matches the one guarding the real implementation.
+ */
+static inline int bpf_probe_override(struct bpf_raw_event_map *btp,
+ struct bpf_raw_tp_link *link,
+ const char *probe_name)
+{
+ return -EOPNOTSUPP;
+}
static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
return NULL;
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index aebf0571c736..9d7b1710c0aa 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -29,6 +29,11 @@ struct tracepoint_func {
int prio;
};
+/*
+ * Records one probe replacement on a tracepoint: the probe that was
+ * replaced and the probe installed in its place. Snapshots are kept in an
+ * array hanging off struct tracepoint; an entry whose override.func is NULL
+ * ends the array.
+ */
+struct tracepoint_func_snapshot {
+ struct tracepoint_func orig; /* probe that was replaced */
+ struct tracepoint_func override; /* probe installed instead of orig */
+};
+
struct tracepoint_ext {
int (*regfunc)(void);
void (*unregfunc)(void);
@@ -45,6 +50,7 @@ struct tracepoint {
void *probestub;
struct tracepoint_func __rcu *funcs;
struct tracepoint_ext *ext;
+ struct tracepoint_func_snapshot *snapshot;
};
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 826ce3f8e1f8..399001e2afca 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -50,6 +50,9 @@ tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe,
return tracepoint_probe_register_prio_may_exist(tp, probe, data,
TRACEPOINT_DEFAULT_PRIO);
}
+extern int
+tracepoint_probe_override(struct tracepoint *tp, void *probe, void *data,
+ const char *func_replaced);
extern void
for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
void *priv);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 233de8677382..cd3d889fe634 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1071,6 +1071,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_RAW_TRACEPOINT_OVERRIDE,
__MAX_BPF_PROG_TYPE
};
@@ -1707,6 +1708,7 @@ union bpf_attr {
__u32 prog_fd;
__u32 :32;
__aligned_u64 cookie;
+ __aligned_u64 probe_name;
} raw_tracepoint;
struct { /* anonymous struct for BPF_BTF_LOAD */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3f178a0f8eb1..e360062db34e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4092,14 +4092,16 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
#endif /* CONFIG_PERF_EVENTS */
static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
- const char __user *user_tp_name, u64 cookie,
+ const char __user *user_tp_name,
+ const char __user *user_probe_name,
+ u64 cookie,
enum bpf_attach_type attach_type)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
struct bpf_raw_event_map *btp;
- const char *tp_name;
- char buf[128];
+ const char *tp_name, *probe_name;
+ char buf[128], probe[128];
int err;
switch (prog->type) {
@@ -4124,6 +4126,17 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
buf[sizeof(buf) - 1] = 0;
tp_name = buf;
break;
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_OVERRIDE:
+ if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
+ return -EFAULT;
+ buf[sizeof(buf) - 1] = 0;
+ tp_name = buf;
+
+ if (strncpy_from_user(probe, user_probe_name, sizeof(probe) - 1) < 0)
+ return -EFAULT;
+ probe[sizeof(probe) - 1] = 0;
+ probe_name = probe;
+ break;
default:
return -EINVAL;
}
@@ -4149,7 +4162,10 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
goto out_put_btp;
}
- err = bpf_probe_register(link->btp, link);
+ if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT_OVERRIDE)
+ err = bpf_probe_override(link->btp, link, probe_name);
+ else
+ err = bpf_probe_register(link->btp, link);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_btp;
@@ -4162,12 +4178,12 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
return err;
}
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.probe_name
static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
{
struct bpf_prog *prog;
- void __user *tp_name;
+ void __user *tp_name, *probe_name;
__u64 cookie;
int fd;
@@ -4180,7 +4196,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
cookie = attr->raw_tracepoint.cookie;
- fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type);
+ probe_name = u64_to_user_ptr(attr->raw_tracepoint.probe_name);
+ fd = bpf_raw_tp_link_attach(prog, tp_name, probe_name,
+ cookie, prog->expected_attach_type);
if (fd < 0)
bpf_prog_put(prog);
return fd;
@@ -5565,7 +5583,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
}
if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
- ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie,
+ ret = bpf_raw_tp_link_attach(prog, NULL, NULL,
+ attr->link_create.tracing.cookie,
attr->link_create.attach_type);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 606007c387c5..1e965517ba05 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1998,6 +1998,14 @@ const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
};
+/*
+ * Verifier ops for the raw_tracepoint_override program type. Both callbacks
+ * are functions that are not introduced by this patch; the access checker
+ * is the one named for writable raw tracepoints.
+ */
+const struct bpf_verifier_ops raw_tracepoint_override_verifier_ops = {
+ .get_func_proto = raw_tp_prog_func_proto,
+ .is_valid_access = raw_tp_writable_prog_is_valid_access,
+};
+
+/* Program ops for the raw_tracepoint_override program type; no callbacks are set. */
+const struct bpf_prog_ops raw_tracepoint_override_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -2307,6 +2315,29 @@ BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
+/*
+ * bpf_probe_override - replace a registered tracepoint probe with a BPF program
+ * @btp: raw tracepoint map entry the program attaches to
+ * @link: raw tracepoint link; handed to the tracepoint core as probe data
+ * @probe_name: symbol name of the probe function to be replaced
+ *
+ * Returns -EINVAL when @probe_name is NULL or when the program touches more
+ * context or writable bytes than the tracepoint offers; otherwise returns
+ * whatever tracepoint_probe_override() returns.
+ */
+int bpf_probe_override(struct bpf_raw_event_map *btp,
+		       struct bpf_raw_tp_link *link,
+		       const char *probe_name)
+{
+	const struct bpf_prog_aux *aux;
+
+	if (!probe_name)
+		return -EINVAL;
+
+	aux = link->link.prog->aux;
+
+	/*
+	 * Reject programs that read arguments past those the tracepoint
+	 * passes, or that access more of the writable buffer than exists.
+	 */
+	if (aux->max_ctx_offset > btp->num_args * sizeof(u64) ||
+	    aux->max_tp_access > btp->writable_size)
+		return -EINVAL;
+
+	return tracepoint_probe_override(btp->tp, (void *)btp->bpf_func, link,
+					 probe_name);
+}
+
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
struct tracepoint *tp = btp->tp;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 62719d2941c9..3b8317306edc 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -14,6 +14,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/static_key.h>
+#include <linux/kallsyms.h>
enum tp_func_state {
TP_FUNC_0,
@@ -130,6 +131,121 @@ static void debug_print_probes(struct tracepoint_func *funcs)
printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func);
}
+/*
+ * Scan the probe array @funcs (terminated by an entry with a NULL func) for
+ * the entry whose function address equals @probe_addr.
+ *
+ * Returns a pointer to the matching entry inside @funcs, or NULL when @funcs
+ * is NULL or nothing matches.
+ */
+static struct tracepoint_func *
+find_func_to_override(struct tracepoint_func *funcs,
+		      unsigned long probe_addr)
+{
+	struct tracepoint_func *cur;
+
+	if (!funcs)
+		return NULL;
+
+	for (cur = funcs; cur->func; cur++) {
+		if ((unsigned long)cur->func == probe_addr)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Find the snapshot in *@ss that involves @func, comparing both the function
+ * pointer and the data pointer.
+ *
+ * On a hit, *@is_override is set to true when @func is the overriding probe
+ * of the snapshot and to false when it is the original probe; the overriding
+ * side is tested first. *@is_override is left untouched when nothing is
+ * found.
+ *
+ * Returns the matching snapshot, or NULL when *@ss is NULL or has no match.
+ */
+static struct tracepoint_func_snapshot *
+find_func_snapshot(struct tracepoint_func_snapshot **ss,
+		   struct tracepoint_func *func,
+		   bool *is_override)
+{
+	struct tracepoint_func_snapshot *cur = *ss;
+
+	if (!cur)
+		return NULL;
+
+	while (cur->override.func) {
+		bool hit_override = cur->override.func == func->func &&
+				    cur->override.data == func->data;
+		bool hit_orig = cur->orig.func == func->func &&
+				cur->orig.data == func->data;
+
+		if (hit_override || hit_orig) {
+			*is_override = hit_override;
+			return cur;
+		}
+		cur++;
+	}
+
+	return NULL;
+}
+
+/*
+ * drop_func_snapshot - remove one entry from a snapshot array
+ * @ss: address of the array pointer; the array ends at the first entry
+ *      whose override.func is NULL
+ * @drop: entry to remove; expected to point into *@ss
+ *
+ * The array is compacted in place: entries after @drop move down one slot
+ * and the vacated tail slot is zeroed so it becomes the new terminator.
+ * No memory is allocated, so removal cannot fail and the terminator is
+ * always well defined. When no entries remain the array is freed and *@ss
+ * is set to NULL. If @drop is not an entry of the array, nothing is removed.
+ *
+ * The callers in this file hold tracepoints_mutex around this call.
+ */
+static void drop_func_snapshot(struct tracepoint_func_snapshot **ss,
+			       struct tracepoint_func_snapshot *drop)
+{
+	struct tracepoint_func_snapshot *shots = *ss;
+	int nr_shots;		/* number of live snapshots */
+	int idx = -1;		/* index of @drop, -1 while not found */
+
+	if (!shots)
+		return;
+
+	for (nr_shots = 0; shots[nr_shots].override.func; nr_shots++) {
+		if (&shots[nr_shots] == drop)
+			idx = nr_shots;
+	}
+
+	/* An allocated but empty array carries no information: release it. */
+	if (nr_shots == 0)
+		goto free_all;
+
+	/* Never remove an unrelated entry when @drop is not in the array. */
+	if (idx < 0)
+		return;
+
+	/* Removing the only entry leaves nothing worth keeping around. */
+	if (nr_shots == 1)
+		goto free_all;
+
+	memmove(&shots[idx], &shots[idx + 1],
+		(nr_shots - idx - 1) * sizeof(*shots));
+	/* The old last slot is now a duplicate; turn it into the terminator. */
+	memset(&shots[nr_shots - 1], 0, sizeof(*shots));
+	return;
+
+free_all:
+	kfree(shots);
+	*ss = NULL;
+}
+
+/*
+ * save_func_snapshot - append an (original, override) pair to a snapshot array
+ * @ss: address of the array pointer; *@ss may be NULL when no snapshot
+ *      exists yet
+ * @new_func: overriding probe; copied into the new entry
+ * @old_func: probe being replaced; copied into the new entry
+ *
+ * A larger array is allocated, existing entries are copied over, the new
+ * pair is appended and the previous array is freed. The array is allocated
+ * zeroed so the trailing terminator entry is fully initialised.
+ *
+ * Returns 0 on success or -ENOMEM, in which case *@ss is unchanged.
+ */
+static int save_func_snapshot(struct tracepoint_func_snapshot **ss,
+			      struct tracepoint_func *new_func,
+			      struct tracepoint_func *old_func)
+{
+	struct tracepoint_func_snapshot *old = *ss;
+	struct tracepoint_func_snapshot *new;
+	int nr_shots = 0;	/* number of existing snapshots */
+
+	if (old)
+		while (old[nr_shots].override.func)
+			nr_shots++;
+
+	/* + 2: one slot for the new snapshot, one for the terminator */
+	new = kcalloc(nr_shots + 2, sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* @old is NULL on first use; do not hand a NULL source to memcpy(). */
+	if (nr_shots)
+		memcpy(new, old, nr_shots * sizeof(*new));
+	new[nr_shots].orig = *old_func;
+	new[nr_shots].override = *new_func;
+	/* new[nr_shots + 1] stays zeroed and terminates the array. */
+
+	*ss = new;
+	kfree(old);
+	return 0;
+}
+
static struct tracepoint_func *
func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
int prio)
@@ -412,6 +528,52 @@ static int tracepoint_remove_func(struct tracepoint *tp,
return 0;
}
+/*
+ * Swap @func for @func_override on @tp: @func is removed first, then
+ * @func_override is added with its own priority.
+ *
+ * The two steps are not atomic, and @func is not put back if the add step
+ * fails. Returns 0 on success or the error of whichever step failed.
+ */
+static int tracepoint_override_func(struct tracepoint *tp,
+				    struct tracepoint_func *func,
+				    struct tracepoint_func *func_override)
+{
+	int err;
+
+	err = tracepoint_remove_func(tp, func);
+	if (err)
+		return err;
+
+	return tracepoint_add_func(tp, func_override, func_override->prio,
+				   false);
+}
+
+/*
+ * Put a previously replaced probe back on @tp: @func is removed first, then
+ * @func_restore is added with its own priority.
+ *
+ * The two steps are not atomic, and @func is not re-added if the add step
+ * fails. Returns 0 on success or the error of whichever step failed.
+ */
+static int tracepoint_restore_func(struct tracepoint *tp,
+				   struct tracepoint_func *func,
+				   struct tracepoint_func *func_restore)
+{
+	int err;
+
+	err = tracepoint_remove_func(tp, func);
+	if (err)
+		return err;
+
+	return tracepoint_add_func(tp, func_restore, func_restore->prio,
+				   false);
+}
+
+/**
+ * tracepoint_probe_override - replace a registered probe with another one
+ * @tp: tracepoint
+ * @probe: probe handler to install
+ * @data: data passed to @probe
+ * @probe_name: symbol name of the currently registered probe to replace
+ *
+ * The probe named @probe_name is looked up among the probes registered on
+ * @tp. If found, the (original, override) pair is recorded in @tp's
+ * snapshot array and the original is replaced by @probe, which inherits the
+ * original's priority.
+ *
+ * Returns 0 on success, -ESRCH if @probe_name does not resolve to a symbol
+ * or no such probe is registered on @tp, -ENOMEM if the snapshot cannot be
+ * stored, or the error returned by the replacement step.
+ */
+int tracepoint_probe_override(struct tracepoint *tp, void *probe,
+			      void *data, const char *probe_name)
+{
+	struct tracepoint_func_snapshot *shot;
+	struct tracepoint_func *found;
+	struct tracepoint_func tp_func;
+	struct tracepoint_func target;
+	unsigned long probe_addr;
+	bool is_override;
+	int ret;
+
+	/* An unresolved symbol can never match; fail before taking the lock. */
+	probe_addr = kallsyms_lookup_name(probe_name);
+	if (!probe_addr)
+		return -ESRCH;
+
+	mutex_lock(&tracepoints_mutex);
+	found = find_func_to_override(
+			rcu_dereference_protected(tp->funcs,
+				lockdep_is_held(&tracepoints_mutex)),
+			probe_addr);
+	if (!found) {
+		/* Leave through the unlock path; the mutex is held here. */
+		ret = -ESRCH;
+		goto unlock;
+	}
+
+	/*
+	 * Work on a private copy so the steps below do not depend on the
+	 * entry inside the tp->funcs array staying in place.
+	 */
+	target = *found;
+
+	tp_func.func = probe;
+	tp_func.data = data;
+	tp_func.prio = target.prio;
+	ret = save_func_snapshot(&tp->snapshot, &tp_func, &target);
+	if (ret)
+		goto unlock;
+
+	ret = tracepoint_override_func(tp, &target, &tp_func);
+	if (ret) {
+		/* The override is not installed; drop its snapshot again. */
+		shot = find_func_snapshot(&tp->snapshot, &tp_func,
+					  &is_override);
+		if (shot)
+			drop_func_snapshot(&tp->snapshot, shot);
+	}
+unlock:
+	mutex_unlock(&tracepoints_mutex);
+	return ret;
+}
+
/**
* tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority
* @tp: tracepoint
@@ -496,12 +658,38 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
{
struct tracepoint_func tp_func;
+ struct tracepoint_func_snapshot *shot;
int ret;
+ bool is_override; /* whether probe is an overriding func */
mutex_lock(&tracepoints_mutex);
tp_func.func = probe;
tp_func.data = data;
- ret = tracepoint_remove_func(tp, &tp_func);
+
+ /*
+ * Check whether this probe takes part in an override, either as the
+ * overriding probe or as the original one. is_override is written only
+ * when a snapshot is found and is read only in that case below.
+ */
+ shot = find_func_snapshot(&(tp->snapshot), &tp_func, &is_override);
+ if (!shot) {
+ /* No override involved: plain removal. */
+ ret = tracepoint_remove_func(tp, &tp_func);
+ } else {
+ /* Unregistering the overriding probe (the one registered by
+ * raw_tracepoint_open): restore the original tp_func.
+ *
+ * 1. restore orig func from snapshot.
+ * 2. remove snapshot.
+ */
+ if (is_override)
+ ret = tracepoint_restore_func(tp, &tp_func, &(shot->orig));
+ /* Unregistering the original probe (registered by register_trace_*)
+ * while it is overridden.
+ *
+ * 1. remove the current probe func (registered by raw_tracepoint_open)
+ * from tp->funcs.
+ * 2. remove snapshot.
+ *
+ * NOTE(review): the overriding probe is removed here without its
+ * owner being told -- confirm that the owner's later unregister
+ * call, which will then find no snapshot, is handled.
+ */
+ else
+ ret = tracepoint_remove_func(tp, &(shot->override));
+ /* Keep the snapshot when the removal/restore failed. */
+ if (!ret)
+ drop_func_snapshot(&(tp->snapshot), shot);
+ }
+
mutex_unlock(&tracepoints_mutex);
return ret;
}
--
2.43.0
Powered by blists - more mailing lists