[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160126045947.GA40151@ast-mbp.thefacebook.com>
Date: Mon, 25 Jan 2016 20:59:49 -0800
From: Alexei Starovoitov <alexei.starovoitov@...il.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
vince@...ter.net, eranian@...gle.com,
Arnaldo Carvalho de Melo <acme@...radead.org>,
Jiri Olsa <jolsa@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Wang Nan <wangnan0@...wei.com>
Subject: Re: [PATCH v2] perf: Synchronously cleanup child events
On Mon, Jan 25, 2016 at 10:04:10PM +0100, Peter Zijlstra wrote:
> On Mon, Jan 25, 2016 at 03:54:14PM +0100, Peter Zijlstra wrote:
> > Alexander, Alexei,
> >
> > How about the below? That uses event->state == PERF_EVENT_STATE_EXIT to
> > indicate the event has been given up by its 'owner' and decouples us
> > from the actual event->owner logic.
> >
> > This retains the event->owner and event->owner_list thing purely for the
> > prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE) calls, but does give
> > us strict 'owner' semantics in that:
> >
> > struct perf_event *my_event = perf_event_create_kernel_counter();
> >
> > /* ... */
> >
> > perf_event_release_kernel(my_event);
> >
> > Or
> >
> > int fd = sys_perf_event_open(...);
> >
> > close(fd); /* last, calls fops::release */
> >
> > Will destroy the event dead. event::refcount will 'retain' the object
> > but it will become non functional and is strictly meant as a temporal
> > existence guarantee (for when RCU isn't good enough).
> >
> > So this should restore the scm_rights case, which preserves the fd but
> > could result in not having event->owner (and therefore being removed
> > from its owner_list), which is fine.
> >
> > BPF still needs to get fixed to use filedesc references instead.
>
> Still no BPF, but this one actually 'works', as in it doesn't have the
> blatant exit races and has survived a few hours of runtime.
>
> ---
> include/linux/perf_event.h | 3
> kernel/events/core.c | 304 ++++++++++++++++++++++-----------------------
> 2 files changed, 150 insertions(+), 157 deletions(-)
I think I understand what you're trying to do and
the patch looks good to me.
As for the BPF side, I did the following...
does it match the model you outlined above?
I did basic testing and it looks fine.
Subject: [PATCH ] perf,bpf: convert perf_event_array to use struct file
Signed-off-by: Alexei Starovoitov <ast@...nel.org>
---
include/linux/perf_event.h | 4 ++--
kernel/bpf/arraymap.c | 21 +++++++++++----------
kernel/events/core.c | 20 ++++++++------------
kernel/trace/bpf_trace.c | 14 ++++++++++----
4 files changed, 31 insertions(+), 28 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f9828a48f16a..df275020fde9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -729,7 +729,7 @@ extern int perf_event_init_task(struct task_struct *child);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
-extern struct perf_event *perf_event_get(unsigned int fd);
+extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
@@ -1070,7 +1070,7 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
-static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
+static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
return ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..89ebbc4d1164 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
{
struct perf_event *event;
const struct perf_event_attr *attr;
+ struct file *file;
- event = perf_event_get(fd);
- if (IS_ERR(event))
- return event;
+ file = perf_event_get(fd);
+ if (IS_ERR(file))
+ return file;
+
+ event = file->private_data;
attr = perf_event_attrs(event);
if (IS_ERR(attr))
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
goto err;
if (attr->type == PERF_TYPE_RAW)
- return event;
+ return file;
if (attr->type == PERF_TYPE_HARDWARE)
- return event;
+ return file;
if (attr->type == PERF_TYPE_SOFTWARE &&
attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return event;
+ return file;
err:
- perf_event_release_kernel(event);
+ fput(file);
return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- struct perf_event *event = ptr;
-
- perf_event_release_kernel(event);
+ fput((struct file *)ptr);
}
static const struct bpf_map_ops perf_event_array_ops = {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 06ae52e99ac2..2a95e0d2370f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8896,21 +8896,25 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
-struct perf_event *perf_event_get(unsigned int fd)
+/*
+ * Resolve @fd to its struct file, taking a reference that the caller must
+ * drop with fput(). Returns ERR_PTR(-EBADF) if @fd is not an open
+ * descriptor or does not refer to a perf event file.
+ */
+struct file *perf_event_get(unsigned int fd)
{
- int err;
- struct fd f;
- struct perf_event *event;
-
- err = perf_fget_light(fd, &f);
- if (err)
- return ERR_PTR(err);
+ struct file *file;
- event = f.file->private_data;
- atomic_long_inc(&event->refcount);
- fdput(f);
+ file = fget_raw(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- return event;
+ return file;
}
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 45dd798bcd37..326a75e884db 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
+ struct file *file;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (!event)
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
/* make sure event is local and doesn't have pmu::count */
if (event->oncpu != smp_processor_id() ||
event->pmu->count)
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
struct perf_event *event;
+ struct file *file;
struct perf_raw_record raw = {
.size = size,
.data = data,
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (unlikely(!event))
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
--
2.4.6
Powered by blists - more mailing lists