Message-ID: <0937648b-89b5-729c-0089-f0df2ed5e5b4@fb.com>
Date: Wed, 23 May 2018 14:25:34 -0700
From: Yonghong Song <yhs@...com>
To: Martin KaFai Lau <kafai@...com>
CC: <peterz@...radead.org>, <ast@...com>, <daniel@...earbox.net>,
<netdev@...r.kernel.org>, <kernel-team@...com>
Subject: Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand
BPF_TASK_FD_QUERY
On 5/23/18 10:13 AM, Martin KaFai Lau wrote:
> On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:
>> Currently, suppose a userspace application has loaded a bpf program
>> and attached it to a tracepoint/kprobe/uprobe, and a bpf
>> introspection tool, e.g., bpftool, wants to show which bpf program
>> is attached to which tracepoint/kprobe/uprobe. Such attachment
>> information is really useful for understanding the overall bpf
>> deployment in the system.
>>
>> There is a name field (16 bytes) for each program, which could
>> be used to encode the attachment point. There are some drawbacks
>> to this approach. First, a bpftool user (e.g., an admin) may not
>> really understand the association between the name and the
>> attachment point. Second, if one program is attached to multiple
>> places, encoding a proper name which can imply all these
>> attachments becomes difficult.
>>
>> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
>> Given a pid and fd, if the <pid, fd> is associated with a
>> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
>> . prog_id
>> . tracepoint name, or
>> . k[ret]probe funcname + offset or kernel addr, or
>> . u[ret]probe filename + offset
>> to the userspace.
>> The user can use "bpftool prog" to find more information about
>> the bpf program itself using the prog_id.
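For reference, a rough userspace sketch of how an introspection tool
could invoke the new subcommand (assumes the uapi header as updated by
this series; the small wrapper below is just an illustration, untested):

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  /* query which bpf program, if any, sits behind <pid, fd> */
  static int task_fd_query(int pid, int fd, char *buf, __u32 buf_len,
                           union bpf_attr *attr)
  {
          memset(attr, 0, sizeof(*attr));
          attr->task_fd_query.pid = pid;
          attr->task_fd_query.fd = fd;
          attr->task_fd_query.buf = (__u64)(unsigned long)buf;
          attr->task_fd_query.buf_len = buf_len;

          return syscall(__NR_bpf, BPF_TASK_FD_QUERY, attr, sizeof(*attr));
  }

  int main(int argc, char **argv)
  {
          union bpf_attr attr;
          char buf[256];

          if (argc != 3) {
                  fprintf(stderr, "usage: %s <pid> <fd>\n", argv[0]);
                  return 1;
          }

          if (task_fd_query(atoi(argv[1]), atoi(argv[2]), buf,
                            sizeof(buf), &attr) < 0) {
                  perror("BPF_TASK_FD_QUERY");
                  return 1;
          }

          printf("prog_id %u attach_info %u name %s offset 0x%llx addr 0x%llx\n",
                 attr.task_fd_query.prog_id, attr.task_fd_query.attach_info,
                 buf, (unsigned long long)attr.task_fd_query.probe_offset,
                 (unsigned long long)attr.task_fd_query.probe_addr);
          return 0;
  }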
> LGTM, some comments inline.
>
>>
>> Signed-off-by: Yonghong Song <yhs@...com>
>> ---
>> include/linux/trace_events.h | 16 ++++++
>> include/uapi/linux/bpf.h | 27 ++++++++++
>> kernel/bpf/syscall.c | 124 +++++++++++++++++++++++++++++++++++++++++++
>> kernel/trace/bpf_trace.c | 48 +++++++++++++++++
>> kernel/trace/trace_kprobe.c | 29 ++++++++++
>> kernel/trace/trace_uprobe.c | 22 ++++++++
>> 6 files changed, 266 insertions(+)
>>
>> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
>> index 2bde3ef..eab806d 100644
>> --- a/include/linux/trace_events.h
>> +++ b/include/linux/trace_events.h
>> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>> int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>> int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>> struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
>> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>> + u32 *attach_info, const char **buf,
>> + u64 *probe_offset, u64 *probe_addr);
> The first arg is 'const struct perf_event *event' while...
>
>> #else
>> static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>> {
>> @@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
>> {
>> return NULL;
>> }
>> +static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,
> this one has 'const struct file *file'?
Thanks for catching this. Will correct this in the next revision.
>
>> + u32 *attach_info, const char **buf,
>> + u64 *probe_offset, u64 *probe_addr)
>> +{
>> + return -EOPNOTSUPP;
>> +}
>> #endif
>>
>> enum {
>> @@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
>> #ifdef CONFIG_KPROBE_EVENTS
>> extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
>> extern void perf_kprobe_destroy(struct perf_event *event);
>> +extern int bpf_get_kprobe_info(const struct perf_event *event,
>> + u32 *attach_info, const char **symbol,
>> + u64 *probe_offset, u64 *probe_addr,
>> + bool perf_type_tracepoint);
>> #endif
>> #ifdef CONFIG_UPROBE_EVENTS
>> extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
>> extern void perf_uprobe_destroy(struct perf_event *event);
>> +extern int bpf_get_uprobe_info(const struct perf_event *event,
>> + u32 *attach_info, const char **filename,
>> + u64 *probe_offset, bool perf_type_tracepoint);
>> #endif
>> extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
>> char *filter_str);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 97446bb..a602150 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -97,6 +97,7 @@ enum bpf_cmd {
>> BPF_RAW_TRACEPOINT_OPEN,
>> BPF_BTF_LOAD,
>> BPF_BTF_GET_FD_BY_ID,
>> + BPF_TASK_FD_QUERY,
>> };
>>
>> enum bpf_map_type {
>> @@ -379,6 +380,22 @@ union bpf_attr {
>> __u32 btf_log_size;
>> __u32 btf_log_level;
>> };
>> +
>> + struct {
>> + int pid; /* input: pid */
>> + int fd; /* input: fd */
> Should fd and pid be always positive?
> The current fd (like map_fd) in bpf_attr is using __u32.
Will change both pid and fd to __u32. In the kernel, fd is unsigned,
but pid_t is actually an int; a negative pid usually refers to the
process group that the pid belongs to. Since we only care about an
actual process pid here, __u32 should be okay.
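i.e., tentatively:

          __u32           pid;            /* input: pid */
          __u32           fd;             /* input: fd */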
>
>> + __u32 flags; /* input: flags */
>> + __u32 buf_len; /* input: buf len */
>> + __aligned_u64 buf; /* input/output:
>> + * tp_name for tracepoint
>> + * symbol for kprobe
>> + * filename for uprobe
>> + */
>> + __u32 prog_id; /* output: prog_id */
>> + __u32 attach_info; /* output: BPF_ATTACH_* */
>> + __u64 probe_offset; /* output: probe_offset */
>> + __u64 probe_addr; /* output: probe_addr */
>> + } task_fd_query;
>> } __attribute__((aligned(8)));
>>
>> /* The description below is an attempt at providing documentation to eBPF
>> @@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
>> __u8 dmac[6]; /* ETH_ALEN */
>> };
>>
>> +/* used by <task, fd> based query */
>> +enum {
> Nit. Instead of a comment, is it better to give this
> enum a descriptive name?
Yes, will give the enum a name, bpf_task_fd_info, to make it easy to
correlate with task_fd_query.
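Something like (tentative):

  enum bpf_task_fd_info {
          BPF_ATTACH_RAW_TRACEPOINT,      /* tp name */
          BPF_ATTACH_TRACEPOINT,          /* tp name */
          BPF_ATTACH_KPROBE,              /* (symbol + offset) or addr */
          BPF_ATTACH_KRETPROBE,           /* (symbol + offset) or addr */
          BPF_ATTACH_UPROBE,              /* filename + offset */
          BPF_ATTACH_URETPROBE,           /* filename + offset */
  };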
>
>> + BPF_ATTACH_RAW_TRACEPOINT, /* tp name */
>> + BPF_ATTACH_TRACEPOINT, /* tp name */
>> + BPF_ATTACH_KPROBE, /* (symbol + offset) or addr */
>> + BPF_ATTACH_KRETPROBE, /* (symbol + offset) or addr */
>> + BPF_ATTACH_UPROBE, /* filename + offset */
>> + BPF_ATTACH_URETPROBE, /* filename + offset */
>> +};
>> +
>> #endif /* _UAPI__LINUX_BPF_H__ */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index bfcde94..9356f0e 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -18,7 +18,9 @@
>> #include <linux/vmalloc.h>
>> #include <linux/mmzone.h>
>> #include <linux/anon_inodes.h>
>> +#include <linux/fdtable.h>
>> #include <linux/file.h>
>> +#include <linux/fs.h>
>> #include <linux/license.h>
>> #include <linux/filter.h>
>> #include <linux/version.h>
>> @@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
>> return btf_get_fd_by_id(attr->btf_id);
>> }
>>
>> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
>> + union bpf_attr __user *uattr,
>> + u32 prog_id, u32 attach_info,
>> + const char *buf, u64 probe_offset,
>> + u64 probe_addr)
>> +{
>> + __u64 __user *ubuf;
> Nit. ubuf is a string instead of an array of __u64?
will change it to void *.
>
>> + int len;
>> +
>> + ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
>> + if (buf) {
>> + len = strlen(buf);
>> + if (attr->task_fd_query.buf_len < len + 1)
> I think the current convention is to take the min,
> copy whatever it can to buf and return the real
> len/size in buf_len. F.e., the prog_ids and prog_cnt in
> __cgroup_bpf_query().
>
> Should the same be done here or it does not make sense to
> truncate the string? The user may/may not need the tailing
> char though if its pretty print has limited width anyway.
> The user still needs to know what the buf_len should be to
> retry also but I guess any reasonable buf_len should
> work?
Makes sense. Will make buf_len input/output: copy whatever fits into
buf and report the actually needed length back to the user in buf_len.
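A rough sketch of one way to do it in the copy helper (untested,
details may differ in the actual revision):

          u32 input_len = attr->task_fd_query.buf_len;
          u32 len, copy_len;
          char zero = '\0';

          if (buf) {
                  len = strlen(buf);
                  /* always report the size actually needed, incl. '\0' */
                  if (put_user(len + 1, &uattr->task_fd_query.buf_len))
                          return -EFAULT;
                  if (input_len) {
                          /* copy what fits and keep it NUL terminated */
                          copy_len = min_t(u32, input_len - 1, len);
                          if (copy_to_user(ubuf, buf, copy_len) ||
                              put_user(zero, (char __user *)ubuf + copy_len))
                                  return -EFAULT;
                  }
                  if (input_len < len + 1)
                          return -ENOSPC;
          }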
>
>> + return -ENOSPC;
>> + if (copy_to_user(ubuf, buf, len + 1))
>> + return -EFAULT;
>> + } else if (attr->task_fd_query.buf_len) {
>> + /* copy '\0' to ubuf */
>> + __u8 zero = 0;
>> +
>> + if (copy_to_user(ubuf, &zero, 1))
>> + return -EFAULT;
>> + }
>> +
>> + if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id,
>> + sizeof(prog_id)) ||
>> + copy_to_user(&uattr->task_fd_query.attach_info, &attach_info,
>> + sizeof(attach_info)) ||
>> + copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset,
>> + sizeof(probe_offset)) ||
>> + copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr,
>> + sizeof(probe_addr)))
> Nit. put_user() may be able to shorten them.
Indeed, thanks for the suggestion.
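e.g. (sketch):

          if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
              put_user(attach_info, &uattr->task_fd_query.attach_info) ||
              put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
              put_user(probe_addr, &uattr->task_fd_query.probe_addr))
                  return -EFAULT;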
>
>> + return -EFAULT;
>> +
>> + return 0;
>> +}
>> +
>> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
>> +
>> +static int bpf_task_fd_query(const union bpf_attr *attr,
>> + union bpf_attr __user *uattr)
>> +{
>> + pid_t pid = attr->task_fd_query.pid;
>> + int fd = attr->task_fd_query.fd;
>> + const struct perf_event *event;
>> + struct files_struct *files;
>> + struct task_struct *task;
>> + struct file *file;
>> + int err;
>> +
>> + if (CHECK_ATTR(BPF_TASK_FD_QUERY))
>> + return -EINVAL;
>> +
>> + if (!capable(CAP_SYS_ADMIN))
>> + return -EPERM;
>> +
>> + if (attr->task_fd_query.flags != 0)
> How flags is used?
flags is not used yet; it is reserved for future extension. For
example, it could be used to query a specific type of file
(raw_tracepoint vs. perf-event based, etc.).
>
>> + return -EINVAL;
>> +
>> + task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
>> + if (!task)
>> + return -ENOENT;
>> +
>> + files = get_files_struct(task);
>> + put_task_struct(task);
>> + if (!files)
>> + return -ENOENT;
>> +
>> + err = 0;
>> + spin_lock(&files->file_lock);
>> + file = fcheck_files(files, fd);
>> + if (!file)
>> + err = -EBADF;
>> + else
>> + get_file(file);
>> + spin_unlock(&files->file_lock);
>> + put_files_struct(files);
>> +
>> + if (err)
>> + goto out;
>> +
>> + if (file->f_op == &bpf_raw_tp_fops) {
>> + struct bpf_raw_tracepoint *raw_tp = file->private_data;
>> + struct bpf_raw_event_map *btp = raw_tp->btp;
>> +
>> + if (!raw_tp->prog)
>> + err = -ENOENT;
>> + else
>> + err = bpf_task_fd_query_copy(attr, uattr,
>> + raw_tp->prog->aux->id,
>> + BPF_ATTACH_RAW_TRACEPOINT,
>> + btp->tp->name, 0, 0);
>> + goto put_file;
>> + }
>> +
>> + event = perf_get_event(file);
>> + if (!IS_ERR(event)) {
>> + u64 probe_offset, probe_addr;
>> + u32 prog_id, attach_info;
>> + const char *buf;
>> +
>> + err = bpf_get_perf_event_info(event, &prog_id, &attach_info,
>> + &buf, &probe_offset,
>> + &probe_addr);
>> + if (!err)
>> + err = bpf_task_fd_query_copy(attr, uattr, prog_id,
>> + attach_info, buf,
>> + probe_offset,
>> + probe_addr);
>> + goto put_file;
>> + }
>> +
>> + err = -ENOTSUPP;
>> +put_file:
>> + fput(file);
>> +out:
>> + return err;
>> +}
>> +
>> SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>> {
>> union bpf_attr attr = {};
>> @@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>> case BPF_BTF_GET_FD_BY_ID:
>> err = bpf_btf_get_fd_by_id(&attr);
>> break;
>> + case BPF_TASK_FD_QUERY:
>> + err = bpf_task_fd_query(&attr, uattr);
>> + break;
>> default:
>> err = -EINVAL;
>> break;
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index ce2cbbf..323c80e 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -14,6 +14,7 @@
>> #include <linux/uaccess.h>
>> #include <linux/ctype.h>
>> #include <linux/kprobes.h>
>> +#include <linux/syscalls.h>
>> #include <linux/error-injection.h>
>>
>> #include "trace_probe.h"
>> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>> mutex_unlock(&bpf_event_mutex);
>> return err;
>> }
>> +
>> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
>> + u32 *attach_info, const char **buf,
>> + u64 *probe_offset, u64 *probe_addr)
>> +{
>> + bool is_tracepoint, is_syscall_tp;
>> + struct bpf_prog *prog;
>> + int flags, err = 0;
>> +
>> + prog = event->prog;
>> + if (!prog)
>> + return -ENOENT;
>> +
>> + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
>> + if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
>> + return -EOPNOTSUPP;
>> +
>> + *prog_id = prog->aux->id;
>> + flags = event->tp_event->flags;
>> + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
>> + is_syscall_tp = is_syscall_trace_event(event->tp_event);
>> +
>> + if (is_tracepoint || is_syscall_tp) {
>> + *buf = is_tracepoint ? event->tp_event->tp->name
>> + : event->tp_event->name;
>> + *attach_info = BPF_ATTACH_TRACEPOINT;
>> + *probe_offset = 0x0;
>> + *probe_addr = 0x0;
>> + } else {
>> + /* kprobe/uprobe */
>> + err = -EOPNOTSUPP;
>> +#ifdef CONFIG_KPROBE_EVENTS
>> + if (flags & TRACE_EVENT_FL_KPROBE)
>> + err = bpf_get_kprobe_info(event, attach_info, buf,
>> + probe_offset, probe_addr,
>> + event->attr.type == PERF_TYPE_TRACEPOINT);
>> +#endif
>> +#ifdef CONFIG_UPROBE_EVENTS
>> + if (flags & TRACE_EVENT_FL_UPROBE)
>> + err = bpf_get_uprobe_info(event, attach_info, buf,
>> + probe_offset,
>> + event->attr.type == PERF_TYPE_TRACEPOINT);
>> +#endif
>> + }
>> +
>> + return err;
>> +}
>> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
>> index 02aed76..32e9190 100644
>> --- a/kernel/trace/trace_kprobe.c
>> +++ b/kernel/trace/trace_kprobe.c
>> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
>> head, NULL);
>> }
>> NOKPROBE_SYMBOL(kretprobe_perf_func);
>> +
>> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info,
>> + const char **symbol, u64 *probe_offset,
>> + u64 *probe_addr, bool perf_type_tracepoint)
>> +{
>> + const char *pevent = trace_event_name(event->tp_event);
>> + const char *group = event->tp_event->class->system;
>> + struct trace_kprobe *tk;
>> +
>> + if (perf_type_tracepoint)
>> + tk = find_trace_kprobe(pevent, group);
>> + else
>> + tk = event->tp_event->data;
>> + if (!tk)
>> + return -EINVAL;
>> +
>> + *attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE
>> + : BPF_ATTACH_KPROBE;
>> + if (tk->symbol) {
>> + *symbol = tk->symbol;
>> + *probe_offset = tk->rp.kp.offset;
>> + *probe_addr = 0;
>> + } else {
>> + *symbol = NULL;
>> + *probe_offset = 0;
>> + *probe_addr = (unsigned long)tk->rp.kp.addr;
>> + }
>> + return 0;
>> +}
>> #endif /* CONFIG_PERF_EVENTS */
>>
>> /*
>> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
>> index ac89287..12a3667 100644
>> --- a/kernel/trace/trace_uprobe.c
>> +++ b/kernel/trace/trace_uprobe.c
>> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
>> {
>> __uprobe_perf_func(tu, func, regs, ucb, dsize);
>> }
>> +
>> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info,
>> + const char **filename, u64 *probe_offset,
>> + bool perf_type_tracepoint)
>> +{
>> + const char *pevent = trace_event_name(event->tp_event);
>> + const char *group = event->tp_event->class->system;
>> + struct trace_uprobe *tu;
>> +
>> + if (perf_type_tracepoint)
>> + tu = find_probe_event(pevent, group);
>> + else
>> + tu = event->tp_event->data;
>> + if (!tu)
>> + return -EINVAL;
>> +
>> + *attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE
>> + : BPF_ATTACH_UPROBE;
>> + *filename = tu->filename;
>> + *probe_offset = tu->offset;
>> + return 0;
>> +}
>> #endif /* CONFIG_PERF_EVENTS */
>>
>> static int
>> --
>> 2.9.5
>>