[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180523171320.ziswsvpsyelxb6fz@kafai-mbp>
Date: Wed, 23 May 2018 10:13:22 -0700
From: Martin KaFai Lau <kafai@...com>
To: Yonghong Song <yhs@...com>
CC: <peterz@...radead.org>, <ast@...com>, <daniel@...earbox.net>,
<netdev@...r.kernel.org>, <kernel-team@...com>
Subject: Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand
BPF_TASK_FD_QUERY
On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:
> Currently, suppose a userspace application has loaded a bpf program
> and attached it to a tracepoint/kprobe/uprobe, and a bpf
> introspection tool, e.g., bpftool, wants to show which bpf program
> is attached to which tracepoint/kprobe/uprobe. Such attachment
> information will be really useful to understand the overall bpf
> deployment in the system.
>
> There is a name field (16 bytes) for each program, which could
> be used to encode the attachment point. There are some drawbacks
> for this approaches. First, bpftool user (e.g., an admin) may not
> really understand the association between the name and the
> attachment point. Second, if one program is attached to multiple
> places, encoding a proper name which can imply all these
> attachments becomes difficult.
>
> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
> Given a pid and fd, if the <pid, fd> is associated with a
> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
> . prog_id
> . tracepoint name, or
> . k[ret]probe funcname + offset or kernel addr, or
> . u[ret]probe filename + offset
> to the userspace.
> The user can use "bpftool prog" to find more information about
> bpf program itself with prog_id.
LGTM, some comments inline.
>
> Signed-off-by: Yonghong Song <yhs@...com>
> ---
> include/linux/trace_events.h | 16 ++++++
> include/uapi/linux/bpf.h | 27 ++++++++++
> kernel/bpf/syscall.c | 124 +++++++++++++++++++++++++++++++++++++++++++
> kernel/trace/bpf_trace.c | 48 +++++++++++++++++
> kernel/trace/trace_kprobe.c | 29 ++++++++++
> kernel/trace/trace_uprobe.c | 22 ++++++++
> 6 files changed, 266 insertions(+)
>
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 2bde3ef..eab806d 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
> struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> + u32 *attach_info, const char **buf,
> + u64 *probe_offset, u64 *probe_addr);
The first arg is 'const struct perf_event *event' while...
> #else
> static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
> {
> @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
> {
> return NULL;
> }
> +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,
this one has 'const struct file *file'?
> + u32 *attach_info, const char **buf,
> + u64 *probe_offset, u64 *probe_addr)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif
>
> enum {
> @@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
> #ifdef CONFIG_KPROBE_EVENTS
> extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
> extern void perf_kprobe_destroy(struct perf_event *event);
> +extern int bpf_get_kprobe_info(const struct perf_event *event,
> + u32 *attach_info, const char **symbol,
> + u64 *probe_offset, u64 *probe_addr,
> + bool perf_type_tracepoint);
> #endif
> #ifdef CONFIG_UPROBE_EVENTS
> extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
> extern void perf_uprobe_destroy(struct perf_event *event);
> +extern int bpf_get_uprobe_info(const struct perf_event *event,
> + u32 *attach_info, const char **filename,
> + u64 *probe_offset, bool perf_type_tracepoint);
> #endif
> extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> char *filter_str);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97446bb..a602150 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -97,6 +97,7 @@ enum bpf_cmd {
> BPF_RAW_TRACEPOINT_OPEN,
> BPF_BTF_LOAD,
> BPF_BTF_GET_FD_BY_ID,
> + BPF_TASK_FD_QUERY,
> };
>
> enum bpf_map_type {
> @@ -379,6 +380,22 @@ union bpf_attr {
> __u32 btf_log_size;
> __u32 btf_log_level;
> };
> +
> + struct {
> + int pid; /* input: pid */
> + int fd; /* input: fd */
Should fd and pid be always positive?
The current fd (like map_fd) in bpf_attr is using __u32.
> + __u32 flags; /* input: flags */
> + __u32 buf_len; /* input: buf len */
> + __aligned_u64 buf; /* input/output:
> + * tp_name for tracepoint
> + * symbol for kprobe
> + * filename for uprobe
> + */
> + __u32 prog_id; /* output: prod_id */
> + __u32 attach_info; /* output: BPF_ATTACH_* */
> + __u64 probe_offset; /* output: probe_offset */
> + __u64 probe_addr; /* output: probe_addr */
> + } task_fd_query;
> } __attribute__((aligned(8)));
>
> /* The description below is an attempt at providing documentation to eBPF
> @@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
> __u8 dmac[6]; /* ETH_ALEN */
> };
>
> +/* used by <task, fd> based query */
> +enum {
Nit. Instead of a comment, is it better to give this
enum a descriptive name?
> + BPF_ATTACH_RAW_TRACEPOINT, /* tp name */
> + BPF_ATTACH_TRACEPOINT, /* tp name */
> + BPF_ATTACH_KPROBE, /* (symbol + offset) or addr */
> + BPF_ATTACH_KRETPROBE, /* (symbol + offset) or addr */
> + BPF_ATTACH_UPROBE, /* filename + offset */
> + BPF_ATTACH_URETPROBE, /* filename + offset */
> +};
> +
> #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index bfcde94..9356f0e 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -18,7 +18,9 @@
> #include <linux/vmalloc.h>
> #include <linux/mmzone.h>
> #include <linux/anon_inodes.h>
> +#include <linux/fdtable.h>
> #include <linux/file.h>
> +#include <linux/fs.h>
> #include <linux/license.h>
> #include <linux/filter.h>
> #include <linux/version.h>
> @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
> return btf_get_fd_by_id(attr->btf_id);
> }
>
> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
> + union bpf_attr __user *uattr,
> + u32 prog_id, u32 attach_info,
> + const char *buf, u64 probe_offset,
> + u64 probe_addr)
> +{
> + __u64 __user *ubuf;
Nit. ubuf is a string instead of an array of __u64?
> + int len;
> +
> + ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
> + if (buf) {
> + len = strlen(buf);
> + if (attr->task_fd_query.buf_len < len + 1)
I think the current convention is to take the min,
copy whatever it can to buf and return the real
len/size in buf_len. F.e., the prog_ids and prog_cnt in
__cgroup_bpf_query().
Should the same be done here or it does not make sense to
truncate the string? The user may/may not need the tailing
char though if its pretty print has limited width anyway.
The user still needs to know what the buf_len should be to
retry also but I guess any reasonable buf_len should
work?
> + return -ENOSPC;
> + if (copy_to_user(ubuf, buf, len + 1))
> + return -EFAULT;
> + } else if (attr->task_fd_query.buf_len) {
> + /* copy '\0' to ubuf */
> + __u8 zero = 0;
> +
> + if (copy_to_user(ubuf, &zero, 1))
> + return -EFAULT;
> + }
> +
> + if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id,
> + sizeof(prog_id)) ||
> + copy_to_user(&uattr->task_fd_query.attach_info, &attach_info,
> + sizeof(attach_info)) ||
> + copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset,
> + sizeof(probe_offset)) ||
> + copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr,
> + sizeof(probe_addr)))
Nit. put_user() may be able to shorten them.
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
> +
> +static int bpf_task_fd_query(const union bpf_attr *attr,
> + union bpf_attr __user *uattr)
> +{
> + pid_t pid = attr->task_fd_query.pid;
> + int fd = attr->task_fd_query.fd;
> + const struct perf_event *event;
> + struct files_struct *files;
> + struct task_struct *task;
> + struct file *file;
> + int err;
> +
> + if (CHECK_ATTR(BPF_TASK_FD_QUERY))
> + return -EINVAL;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + if (attr->task_fd_query.flags != 0)
How flags is used?
> + return -EINVAL;
> +
> + task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
> + if (!task)
> + return -ENOENT;
> +
> + files = get_files_struct(task);
> + put_task_struct(task);
> + if (!files)
> + return -ENOENT;
> +
> + err = 0;
> + spin_lock(&files->file_lock);
> + file = fcheck_files(files, fd);
> + if (!file)
> + err = -EBADF;
> + else
> + get_file(file);
> + spin_unlock(&files->file_lock);
> + put_files_struct(files);
> +
> + if (err)
> + goto out;
> +
> + if (file->f_op == &bpf_raw_tp_fops) {
> + struct bpf_raw_tracepoint *raw_tp = file->private_data;
> + struct bpf_raw_event_map *btp = raw_tp->btp;
> +
> + if (!raw_tp->prog)
> + err = -ENOENT;
> + else
> + err = bpf_task_fd_query_copy(attr, uattr,
> + raw_tp->prog->aux->id,
> + BPF_ATTACH_RAW_TRACEPOINT,
> + btp->tp->name, 0, 0);
> + goto put_file;
> + }
> +
> + event = perf_get_event(file);
> + if (!IS_ERR(event)) {
> + u64 probe_offset, probe_addr;
> + u32 prog_id, attach_info;
> + const char *buf;
> +
> + err = bpf_get_perf_event_info(event, &prog_id, &attach_info,
> + &buf, &probe_offset,
> + &probe_addr);
> + if (!err)
> + err = bpf_task_fd_query_copy(attr, uattr, prog_id,
> + attach_info, buf,
> + probe_offset,
> + probe_addr);
> + goto put_file;
> + }
> +
> + err = -ENOTSUPP;
> +put_file:
> + fput(file);
> +out:
> + return err;
> +}
> +
> SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
> {
> union bpf_attr attr = {};
> @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> case BPF_BTF_GET_FD_BY_ID:
> err = bpf_btf_get_fd_by_id(&attr);
> break;
> + case BPF_TASK_FD_QUERY:
> + err = bpf_task_fd_query(&attr, uattr);
> + break;
> default:
> err = -EINVAL;
> break;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ce2cbbf..323c80e 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -14,6 +14,7 @@
> #include <linux/uaccess.h>
> #include <linux/ctype.h>
> #include <linux/kprobes.h>
> +#include <linux/syscalls.h>
> #include <linux/error-injection.h>
>
> #include "trace_probe.h"
> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
> mutex_unlock(&bpf_event_mutex);
> return err;
> }
> +
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> + u32 *attach_info, const char **buf,
> + u64 *probe_offset, u64 *probe_addr)
> +{
> + bool is_tracepoint, is_syscall_tp;
> + struct bpf_prog *prog;
> + int flags, err = 0;
> +
> + prog = event->prog;
> + if (!prog)
> + return -ENOENT;
> +
> + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
> + if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
> + return -EOPNOTSUPP;
> +
> + *prog_id = prog->aux->id;
> + flags = event->tp_event->flags;
> + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
> + is_syscall_tp = is_syscall_trace_event(event->tp_event);
> +
> + if (is_tracepoint || is_syscall_tp) {
> + *buf = is_tracepoint ? event->tp_event->tp->name
> + : event->tp_event->name;
> + *attach_info = BPF_ATTACH_TRACEPOINT;
> + *probe_offset = 0x0;
> + *probe_addr = 0x0;
> + } else {
> + /* kprobe/uprobe */
> + err = -EOPNOTSUPP;
> +#ifdef CONFIG_KPROBE_EVENTS
> + if (flags & TRACE_EVENT_FL_KPROBE)
> + err = bpf_get_kprobe_info(event, attach_info, buf,
> + probe_offset, probe_addr,
> + event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +#ifdef CONFIG_UPROBE_EVENTS
> + if (flags & TRACE_EVENT_FL_UPROBE)
> + err = bpf_get_uprobe_info(event, attach_info, buf,
> + probe_offset,
> + event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> + }
> +
> + return err;
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 02aed76..32e9190 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
> head, NULL);
> }
> NOKPROBE_SYMBOL(kretprobe_perf_func);
> +
> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info,
> + const char **symbol, u64 *probe_offset,
> + u64 *probe_addr, bool perf_type_tracepoint)
> +{
> + const char *pevent = trace_event_name(event->tp_event);
> + const char *group = event->tp_event->class->system;
> + struct trace_kprobe *tk;
> +
> + if (perf_type_tracepoint)
> + tk = find_trace_kprobe(pevent, group);
> + else
> + tk = event->tp_event->data;
> + if (!tk)
> + return -EINVAL;
> +
> + *attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE
> + : BPF_ATTACH_KPROBE;
> + if (tk->symbol) {
> + *symbol = tk->symbol;
> + *probe_offset = tk->rp.kp.offset;
> + *probe_addr = 0;
> + } else {
> + *symbol = NULL;
> + *probe_offset = 0;
> + *probe_addr = (unsigned long)tk->rp.kp.addr;
> + }
> + return 0;
> +}
> #endif /* CONFIG_PERF_EVENTS */
>
> /*
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index ac89287..12a3667 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
> {
> __uprobe_perf_func(tu, func, regs, ucb, dsize);
> }
> +
> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info,
> + const char **filename, u64 *probe_offset,
> + bool perf_type_tracepoint)
> +{
> + const char *pevent = trace_event_name(event->tp_event);
> + const char *group = event->tp_event->class->system;
> + struct trace_uprobe *tu;
> +
> + if (perf_type_tracepoint)
> + tu = find_probe_event(pevent, group);
> + else
> + tu = event->tp_event->data;
> + if (!tu)
> + return -EINVAL;
> +
> + *attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE
> + : BPF_ATTACH_UPROBE;
> + *filename = tu->filename;
> + *probe_offset = tu->offset;
> + return 0;
> +}
> #endif /* CONFIG_PERF_EVENTS */
>
> static int
> --
> 2.9.5
>
Powered by blists - more mailing lists