Message-ID: <20180524050738.fgay6jcagx4exrnr@kafai-mbp>
Date: Wed, 23 May 2018 22:07:40 -0700
From: Martin KaFai Lau <kafai@...com>
To: Yonghong Song <yhs@...com>
CC: <peterz@...radead.org>, <ast@...com>, <daniel@...earbox.net>,
<netdev@...r.kernel.org>, <kernel-team@...com>
Subject: Re: [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand
BPF_TASK_FD_QUERY
On Wed, May 23, 2018 at 05:18:42PM -0700, Yonghong Song wrote:
> Suppose a userspace application has loaded a bpf program
> and attached it to a tracepoint/kprobe/uprobe, and a bpf
> introspection tool, e.g., bpftool, wants to show which bpf program
> is attached to which tracepoint/kprobe/uprobe. Such attachment
> information is really useful for understanding the overall bpf
> deployment in the system.
>
> There is a name field (16 bytes) for each program, which could
> be used to encode the attachment point. There are some drawbacks
> to this approach. First, a bpftool user (e.g., an admin) may not
> really understand the association between the name and the
> attachment point. Second, if one program is attached to multiple
> places, encoding a proper name that covers all these
> attachments becomes difficult.
>
> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
> Given a pid and fd, if the <pid, fd> is associated with a
> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
> . prog_id
> . tracepoint name, or
> . k[ret]probe funcname + offset or kernel addr, or
> . u[ret]probe filename + offset
> to the userspace.
> The user can then use "bpftool prog" with the prog_id to find more
> information about the bpf program itself.
>
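A side note for the log: the userspace side of this ends up being nice
and small.  Roughly something like the sketch below (not part of the
patch; it assumes uapi headers that already carry the new
BPF_TASK_FD_QUERY bits, a target <pid, fd> already in hand, and it
trims error handling):

/* Sketch only (not part of the patch): query a <pid, fd> that is
 * expected to hold a perf event or raw_tp file.
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int show_task_fd(pid_t pid, int fd)
{
	union bpf_attr attr;
	char buf[256] = { 0 };	/* zero-init so %s below is always safe */

	memset(&attr, 0, sizeof(attr));
	attr.task_fd_query.pid = pid;
	attr.task_fd_query.fd = fd;
	attr.task_fd_query.buf = (__u64)(unsigned long)buf;
	attr.task_fd_query.buf_len = sizeof(buf);

	if (syscall(__NR_bpf, BPF_TASK_FD_QUERY, &attr, sizeof(attr)))
		return -1;

	printf("prog_id %u fd_type %u name %s offset 0x%llx addr 0x%llx\n",
	       attr.task_fd_query.prog_id, attr.task_fd_query.fd_type, buf,
	       (unsigned long long)attr.task_fd_query.probe_offset,
	       (unsigned long long)attr.task_fd_query.probe_addr);
	return 0;
}

i.e. one bpf(2) call per fd, with the name coming back through the
caller-supplied buf and everything else through the attr fields.
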
> Signed-off-by: Yonghong Song <yhs@...com>
> ---
> include/linux/trace_events.h | 17 +++++++
> include/uapi/linux/bpf.h | 26 ++++++++++
> kernel/bpf/syscall.c | 115 +++++++++++++++++++++++++++++++++++++++++++
> kernel/trace/bpf_trace.c | 48 ++++++++++++++++++
> kernel/trace/trace_kprobe.c | 29 +++++++++++
> kernel/trace/trace_uprobe.c | 22 +++++++++
> 6 files changed, 257 insertions(+)
>
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 2bde3ef..d34144a 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> + u32 *fd_type, const char **buf,
> + u64 *probe_offset, u64 *probe_addr);
> #else
> static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
> {
> @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
> {
> return NULL;
> }
> +static inline int bpf_get_perf_event_info(const struct perf_event *event,
> + u32 *prog_id, u32 *fd_type,
> + const char **buf, u64 *probe_offset,
> + u64 *probe_addr)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif
>
> enum {
> @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
> #ifdef CONFIG_KPROBE_EVENTS
> extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
> extern void perf_kprobe_destroy(struct perf_event *event);
> +extern int bpf_get_kprobe_info(const struct perf_event *event,
> + u32 *fd_type, const char **symbol,
> + u64 *probe_offset, u64 *probe_addr,
> + bool perf_type_tracepoint);
> #endif
> #ifdef CONFIG_UPROBE_EVENTS
> extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
> extern void perf_uprobe_destroy(struct perf_event *event);
> +extern int bpf_get_uprobe_info(const struct perf_event *event,
> + u32 *fd_type, const char **filename,
> + u64 *probe_offset, bool perf_type_tracepoint);
> #endif
> extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> char *filter_str);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c3e502d..0d51946 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -97,6 +97,7 @@ enum bpf_cmd {
> BPF_RAW_TRACEPOINT_OPEN,
> BPF_BTF_LOAD,
> BPF_BTF_GET_FD_BY_ID,
> + BPF_TASK_FD_QUERY,
> };
>
> enum bpf_map_type {
> @@ -379,6 +380,22 @@ union bpf_attr {
> __u32 btf_log_size;
> __u32 btf_log_level;
> };
> +
> + struct {
> + __u32 pid; /* input: pid */
> + __u32 fd; /* input: fd */
> + __u32 flags; /* input: flags */
> + __u32 buf_len; /* input/output: buf len */
> + __aligned_u64 buf; /* input/output:
> + * tp_name for tracepoint
> + * symbol for kprobe
> + * filename for uprobe
> + */
> + __u32 prog_id; /* output: prog_id */
> + __u32 fd_type; /* output: BPF_FD_TYPE_* */
> + __u64 probe_offset; /* output: probe_offset */
> + __u64 probe_addr; /* output: probe_addr */
> + } task_fd_query;
> } __attribute__((aligned(8)));
>
> /* The description below is an attempt at providing documentation to eBPF
> @@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
> __u8 dmac[6]; /* ETH_ALEN */
> };
>
> +enum bpf_task_fd_type {
> + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
> + BPF_FD_TYPE_TRACEPOINT, /* tp name */
> + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
> + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */
> + BPF_FD_TYPE_UPROBE, /* filename + offset */
> + BPF_FD_TYPE_URETPROBE, /* filename + offset */
> +};
> +
> #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0b4c945..7dd8c86 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -18,7 +18,9 @@
> #include <linux/vmalloc.h>
> #include <linux/mmzone.h>
> #include <linux/anon_inodes.h>
> +#include <linux/fdtable.h>
> #include <linux/file.h>
> +#include <linux/fs.h>
> #include <linux/license.h>
> #include <linux/filter.h>
> #include <linux/version.h>
> @@ -2102,6 +2104,116 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
> return btf_get_fd_by_id(attr->btf_id);
> }
>
> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
> + union bpf_attr __user *uattr,
> + u32 prog_id, u32 fd_type,
> + const char *buf, u64 probe_offset,
> + u64 probe_addr)
> +{
> + void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
> + u32 len = buf ? strlen(buf) + 1 : 0, input_len;
> + int err = 0;
> +
> + if (put_user(len, &uattr->task_fd_query.buf_len))
> + return -EFAULT;
> + input_len = attr->task_fd_query.buf_len;
> + if (input_len && len && ubuf) {

When len is 0 and input_len > 0, ubuf will not be touched (and so
not null terminated).

It may be helpful to note in uapi bpf.h that !output_buf_len has to be
checked on top of checking the syscall return value.  It is reasonable
for the userspace to assume that ubuf can be directly used with
strlen()/printf()... as long as the syscall does not return -1/ENOSPC.
I think this comment change could be done in a follow-up patch.

Alternatively, always null terminate ubuf as long as input_len > 0, and
make output_buf_len strlen(buf) instead of strlen(buf) + 1 (i.e. exclude
the null char from output_buf_len), such that the !buf case will have
output_buf_len == 0.  The user can then depend on ENOSPC or
input_buf_len <= output_buf_len to decide the truncated condition.
This convention would be closer to the snprintf() situation.
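
Either way, to spell out what the caller has to do with the current
semantics, something like this (a sketch only; queried_name() is a
made-up helper that gets the attr, the user buffer and the return value
of the BPF_TASK_FD_QUERY call, set up as in the sketch earlier in this
mail):

/* Sketch only: how the caller has to treat buf under the current
 * semantics.  sys_ret is the return value of the BPF_TASK_FD_QUERY
 * bpf(2) call, attr the union bpf_attr it was made with, and buf the
 * user buffer attr.task_fd_query.buf points to.
 */
#include <stddef.h>
#include <linux/bpf.h>

static const char *queried_name(const union bpf_attr *attr,
				const char *buf, int sys_ret)
{
	if (sys_ret)
		return NULL;	/* -1/ENOSPC: buf may be untouched or
				 * not null terminated */
	if (!attr->task_fd_query.buf_len)
		return "";	/* nothing was copied, e.g. kprobe on a
				 * raw kernel address: buf untouched */
	return buf;		/* safe for strlen()/printf() */
}
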
Other than that,
Acked-by: Martin KaFai Lau <kafai@...com>
> + if (input_len < len) {
> + err = -ENOSPC;
> + len = input_len;
> + }
> + if (copy_to_user(ubuf, buf, len))
> + return -EFAULT;
> + }
> +
> + if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
> + put_user(fd_type, &uattr->task_fd_query.fd_type) ||
> + put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
> + put_user(probe_addr, &uattr->task_fd_query.probe_addr))
> + return -EFAULT;
> +
> + return err;
> +}
> +
> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
> +
> +static int bpf_task_fd_query(const union bpf_attr *attr,
> + union bpf_attr __user *uattr)
> +{
> + pid_t pid = attr->task_fd_query.pid;
> + u32 fd = attr->task_fd_query.fd;
> + const struct perf_event *event;
> + struct files_struct *files;
> + struct task_struct *task;
> + struct file *file;
> + int err;
> +
> + if (CHECK_ATTR(BPF_TASK_FD_QUERY))
> + return -EINVAL;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + if (attr->task_fd_query.flags != 0)
> + return -EINVAL;
> +
> + task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
> + if (!task)
> + return -ENOENT;
> +
> + files = get_files_struct(task);
> + put_task_struct(task);
> + if (!files)
> + return -ENOENT;
> +
> + err = 0;
> + spin_lock(&files->file_lock);
> + file = fcheck_files(files, fd);
> + if (!file)
> + err = -EBADF;
> + else
> + get_file(file);
> + spin_unlock(&files->file_lock);
> + put_files_struct(files);
> +
> + if (err)
> + goto out;
> +
> + if (file->f_op == &bpf_raw_tp_fops) {
> + struct bpf_raw_tracepoint *raw_tp = file->private_data;
> + struct bpf_raw_event_map *btp = raw_tp->btp;
> +
> + err = bpf_task_fd_query_copy(attr, uattr,
> + raw_tp->prog->aux->id,
> + BPF_FD_TYPE_RAW_TRACEPOINT,
> + btp->tp->name, 0, 0);
> + goto put_file;
> + }
> +
> + event = perf_get_event(file);
> + if (!IS_ERR(event)) {
> + u64 probe_offset, probe_addr;
> + u32 prog_id, fd_type;
> + const char *buf;
> +
> + err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
> + &buf, &probe_offset,
> + &probe_addr);
> + if (!err)
> + err = bpf_task_fd_query_copy(attr, uattr, prog_id,
> + fd_type, buf,
> + probe_offset,
> + probe_addr);
> + goto put_file;
> + }
> +
> + err = -ENOTSUPP;
> +put_file:
> + fput(file);
> +out:
> + return err;
> +}
> +
> SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
> {
> union bpf_attr attr = {};
> @@ -2188,6 +2300,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> case BPF_BTF_GET_FD_BY_ID:
> err = bpf_btf_get_fd_by_id(&attr);
> break;
> + case BPF_TASK_FD_QUERY:
> + err = bpf_task_fd_query(&attr, uattr);
> + break;
> default:
> err = -EINVAL;
> break;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ce2cbbf..81fdf2f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -14,6 +14,7 @@
> #include <linux/uaccess.h>
> #include <linux/ctype.h>
> #include <linux/kprobes.h>
> +#include <linux/syscalls.h>
> #include <linux/error-injection.h>
>
> #include "trace_probe.h"
> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
> mutex_unlock(&bpf_event_mutex);
> return err;
> }
> +
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> + u32 *fd_type, const char **buf,
> + u64 *probe_offset, u64 *probe_addr)
> +{
> + bool is_tracepoint, is_syscall_tp;
> + struct bpf_prog *prog;
> + int flags, err = 0;
> +
> + prog = event->prog;
> + if (!prog)
> + return -ENOENT;
> +
> + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
> + if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
> + return -EOPNOTSUPP;
> +
> + *prog_id = prog->aux->id;
> + flags = event->tp_event->flags;
> + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
> + is_syscall_tp = is_syscall_trace_event(event->tp_event);
> +
> + if (is_tracepoint || is_syscall_tp) {
> + *buf = is_tracepoint ? event->tp_event->tp->name
> + : event->tp_event->name;
> + *fd_type = BPF_FD_TYPE_TRACEPOINT;
> + *probe_offset = 0x0;
> + *probe_addr = 0x0;
> + } else {
> + /* kprobe/uprobe */
> + err = -EOPNOTSUPP;
> +#ifdef CONFIG_KPROBE_EVENTS
> + if (flags & TRACE_EVENT_FL_KPROBE)
> + err = bpf_get_kprobe_info(event, fd_type, buf,
> + probe_offset, probe_addr,
> + event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +#ifdef CONFIG_UPROBE_EVENTS
> + if (flags & TRACE_EVENT_FL_UPROBE)
> + err = bpf_get_uprobe_info(event, fd_type, buf,
> + probe_offset,
> + event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> + }
> +
> + return err;
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 02aed76..daa8157 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
> head, NULL);
> }
> NOKPROBE_SYMBOL(kretprobe_perf_func);
> +
> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
> + const char **symbol, u64 *probe_offset,
> + u64 *probe_addr, bool perf_type_tracepoint)
> +{
> + const char *pevent = trace_event_name(event->tp_event);
> + const char *group = event->tp_event->class->system;
> + struct trace_kprobe *tk;
> +
> + if (perf_type_tracepoint)
> + tk = find_trace_kprobe(pevent, group);
> + else
> + tk = event->tp_event->data;
> + if (!tk)
> + return -EINVAL;
> +
> + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
> + : BPF_FD_TYPE_KPROBE;
> + if (tk->symbol) {
> + *symbol = tk->symbol;
> + *probe_offset = tk->rp.kp.offset;
> + *probe_addr = 0;
> + } else {
> + *symbol = NULL;
> + *probe_offset = 0;
> + *probe_addr = (unsigned long)tk->rp.kp.addr;
> + }
> + return 0;
> +}
> #endif /* CONFIG_PERF_EVENTS */
>
> /*
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index ac89287..bf89a51 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
> {
> __uprobe_perf_func(tu, func, regs, ucb, dsize);
> }
> +
> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
> + const char **filename, u64 *probe_offset,
> + bool perf_type_tracepoint)
> +{
> + const char *pevent = trace_event_name(event->tp_event);
> + const char *group = event->tp_event->class->system;
> + struct trace_uprobe *tu;
> +
> + if (perf_type_tracepoint)
> + tu = find_probe_event(pevent, group);
> + else
> + tu = event->tp_event->data;
> + if (!tu)
> + return -EINVAL;
> +
> + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
> + : BPF_FD_TYPE_UPROBE;
> + *filename = tu->filename;
> + *probe_offset = tu->offset;
> + return 0;
> +}
> #endif /* CONFIG_PERF_EVENTS */
>
> static int
> --
> 2.9.5
>