[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <qtivtftbdvarukcxdr4yfwstzvnh4z7eipukwxymi4e2x76y54@dxqn3y22u2pw>
Date: Fri, 18 Oct 2024 14:46:31 -0500
From: Lucas De Marchi <lucas.demarchi@...el.com>
To: Peter Zijlstra <peterz@...radead.org>
CC: <linux-kernel@...r.kernel.org>, <dri-devel@...ts.freedesktop.org>, "Ingo
Molnar" <mingo@...hat.com>, Arnaldo Carvalho de Melo <acme@...nel.org>, Umesh
Nerlige Ramappa <umesh.nerlige.ramappa@...el.com>, Ian Rogers
<irogers@...gle.com>, Tvrtko Ursulin <tvrtko.ursulin@...lia.com>
Subject: Re: [PATCH 3/5] perf: Add pmu get/put
On Wed, Oct 16, 2024 at 02:03:02PM +0200, Peter Zijlstra wrote:
>On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
>
>> Let me ponder that a little bit.
>
>So I did the thing on top of the get/put thing that would allow you to
>get rid of the ->closed thing, and before I was finished I already hated
>all of it :-(
>
>The thing is, if you're going to the effort of adding get/put and
>putting the responsibility on the implementation, then the ->closed
>thing is only a little extra ask.
>
>So... I wondered, how hard it would be for perf_pmu_unregister() to do
>what you want.
>
>Time passed, hacks were done...
>
>and while what I have is still riddled with holes (more work is
>definitely required), it does pass your dummy_pmu test scipt.
>
>I'll poke at this a little longer. Afaict it should be possible to make
>this good enough for what you want. Just needs more holes filled.
>
>---
> include/linux/perf_event.h | 13 ++-
> kernel/events/core.c | 260 ++++++++++++++++++++++++++++++++++++++-------
> 2 files changed, 228 insertions(+), 45 deletions(-)
>
>diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>index fb908843f209..20995d33d27e 100644
>--- a/include/linux/perf_event.h
>+++ b/include/linux/perf_event.h
>@@ -318,6 +318,9 @@ struct perf_output_handle;
> struct pmu {
> struct list_head entry;
>
>+ spinlock_t events_lock;
>+ struct list_head events;
oh... I looked at several lists and was wondering which one would
contain the events of our pmu. Now I see we didn't have that :)
>+
> struct module *module;
> struct device *dev;
> struct device *parent;
>@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
> * enum perf_event_state - the states of an event:
> */
> enum perf_event_state {
>- PERF_EVENT_STATE_DEAD = -4,
>- PERF_EVENT_STATE_EXIT = -3,
>- PERF_EVENT_STATE_ERROR = -2,
>+ PERF_EVENT_STATE_DEAD = -5,
>+ PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
>+ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
>+ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
> PERF_EVENT_STATE_OFF = -1,
> PERF_EVENT_STATE_INACTIVE = 0,
> PERF_EVENT_STATE_ACTIVE = 1,
>@@ -853,6 +857,7 @@ struct perf_event {
> void *security;
> #endif
> struct list_head sb_list;
>+ struct list_head pmu_list;
>
> /*
> * Certain events gets forwarded to another pmu internally by over-
>@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
> extern void perf_event_itrace_started(struct perf_event *event);
>
> extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
>-extern void perf_pmu_unregister(struct pmu *pmu);
>+extern int perf_pmu_unregister(struct pmu *pmu);
>
> extern void __perf_event_task_sched_in(struct task_struct *prev,
> struct task_struct *task);
>diff --git a/kernel/events/core.c b/kernel/events/core.c
>index cdd09769e6c5..e66827367a97 100644
>--- a/kernel/events/core.c
>+++ b/kernel/events/core.c
>@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
>
> #define DETACH_GROUP 0x01UL
> #define DETACH_CHILD 0x02UL
>-#define DETACH_DEAD 0x04UL
>+#define DETACH_EXIT 0x04UL
>+#define DETACH_REVOKE 0x08UL
>+#define DETACH_DEAD 0x10UL
>
> /*
> * Cross CPU call to remove a performance event
>@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
> void *info)
> {
> struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
>+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
> unsigned long flags = (unsigned long)info;
>
> ctx_time_update(cpuctx, ctx);
>@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
> * Ensure event_sched_out() switches to OFF, at the very least
> * this avoids raising perf_pending_task() at this time.
> */
>- if (flags & DETACH_DEAD)
>+ if (flags & DETACH_EXIT)
>+ state = PERF_EVENT_STATE_EXIT;
>+ if (flags & DETACH_REVOKE)
>+ state = PERF_EVENT_STATE_REVOKED;
>+ if (flags & DETACH_DEAD) {
> event->pending_disable = 1;
>+ state = PERF_EVENT_STATE_DEAD;
>+ }
> event_sched_out(event, ctx);
> if (flags & DETACH_GROUP)
> perf_group_detach(event);
> if (flags & DETACH_CHILD)
> perf_child_detach(event);
> list_del_event(event, ctx);
>- if (flags & DETACH_DEAD)
>- event->state = PERF_EVENT_STATE_DEAD;
>+
>+ event->state = state;
>
> if (!pmu_ctx->nr_events) {
> pmu_ctx->rotate_necessary = 0;
>@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
>
> static void perf_remove_from_owner(struct perf_event *event);
> static void perf_event_exit_event(struct perf_event *event,
>- struct perf_event_context *ctx);
>+ struct perf_event_context *ctx,
>+ bool revoke);
>
> /*
> * Removes all events from the current task that have been marked
>@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
>
> modified = true;
>
>- perf_event_exit_event(event, ctx);
>+ perf_event_exit_event(event, ctx, false);
> }
>
> raw_spin_lock_irqsave(&ctx->lock, flags);
>@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
> attr->context_switch || attr->text_poke ||
> attr->bpf_event)
> return true;
>+
> return false;
> }
>
>@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
>
> static void _free_event(struct perf_event *event)
> {
>+ struct pmu *pmu = event->pmu;
>+
> irq_work_sync(&event->pending_irq);
> irq_work_sync(&event->pending_disable_irq);
> perf_pending_task_sync(event);
>@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
> security_perf_event_free(event);
>
> if (event->rb) {
>+ WARN_ON_ONCE(!pmu);
> /*
> * Can happen when we close an event with re-directed output.
> *
>@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
> put_callchain_buffers();
> }
>
>- perf_event_free_bpf_prog(event);
>- perf_addr_filters_splice(event, NULL);
>- kfree(event->addr_filter_ranges);
>+ if (pmu) {
>+ perf_event_free_bpf_prog(event);
>+ perf_addr_filters_splice(event, NULL);
>+ kfree(event->addr_filter_ranges);
>+ }
>
>- if (event->destroy)
>+ if (event->destroy) {
>+ WARN_ON_ONCE(!pmu);
> event->destroy(event);
>+ }
>
> /*
> * Must be after ->destroy(), due to uprobe_perf_close() using
>@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
> if (event->hw.target)
> put_task_struct(event->hw.target);
>
>- if (event->pmu_ctx)
>+ if (event->pmu_ctx) {
>+ WARN_ON_ONCE(!pmu);
> put_pmu_ctx(event->pmu_ctx);
>+ }
>
> /*
> * perf_event_free_task() relies on put_ctx() being 'last', in particular
>@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
> if (event->ctx)
> put_ctx(event->ctx);
>
>- exclusive_event_destroy(event);
>- module_put(event->pmu->module);
>+ if (pmu) {
>+ exclusive_event_destroy(event);
>+ module_put(pmu->module);
>+ scoped_guard(spinlock, &pmu->events_lock) {
>+ list_del(&event->pmu_list);
>+ wake_up_var(pmu);
>+ }
>+ }
>
> call_rcu(&event->rcu_head, free_event_rcu);
> }
>@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
> * Thus this guarantees that we will in fact observe and kill _ALL_
> * child events.
> */
>- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
>+ if (event->state > PERF_EVENT_STATE_REVOKED) {
>+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
>+ } else {
>+ event->state = PERF_EVENT_STATE_DEAD;
>+ }
>
> perf_event_ctx_unlock(event, ctx);
>
>@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
> * error state (i.e. because it was pinned but it couldn't be
> * scheduled on to the CPU at some point).
> */
>- if (event->state == PERF_EVENT_STATE_ERROR)
>+ if (event->state <= PERF_EVENT_STATE_ERROR)
> return 0;
>
> if (count < event->read_size)
>@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
> struct perf_buffer *rb;
> __poll_t events = EPOLLHUP;
>
>+ if (event->state <= PERF_EVENT_STATE_REVOKED)
>+ return EPOLLERR;
>+
> poll_wait(file, &event->waitq, wait);
>
>+ if (event->state <= PERF_EVENT_STATE_REVOKED)
>+ return EPOLLERR;
>+
> if (is_event_hup(event))
> return events;
>
>@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
> }
>
> static int perf_event_set_output(struct perf_event *event,
>- struct perf_event *output_event);
>+ struct perf_event *output_event, bool force);
> static int perf_event_set_filter(struct perf_event *event, void __user *arg);
> static int perf_copy_attr(struct perf_event_attr __user *uattr,
> struct perf_event_attr *attr);
>@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> void (*func)(struct perf_event *);
> u32 flags = arg;
>
>+ if (event->state <= PERF_EVENT_STATE_REVOKED)
>+ return -ENODEV;
>+
> switch (cmd) {
> case PERF_EVENT_IOC_ENABLE:
> func = _perf_event_enable;
>@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
> if (ret)
> return ret;
> output_event = fd_file(output)->private_data;
>- ret = perf_event_set_output(event, output_event);
>+ ret = perf_event_set_output(event, output_event, false);
> fdput(output);
> } else {
>- ret = perf_event_set_output(event, NULL);
>+ ret = perf_event_set_output(event, NULL, false);
> }
> return ret;
> }
>@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> unsigned long size = perf_data_size(rb);
> bool detach_rest = false;
>
>+ /* FIXIES vs perf_pmu_unregister() */
> if (event->pmu->event_unmapped)
> event->pmu->event_unmapped(event, vma->vm_mm);
>
>@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> unsigned long vma_size;
> unsigned long nr_pages;
> long user_extra = 0, extra = 0;
>- int ret = 0, flags = 0;
>+ int ret, flags = 0;
>+
>+ ret = security_perf_event_read(event);
>+ if (ret)
>+ return ret;
>+
>+ /* XXX needs event->mmap_mutex? */
>+ if (event->state <= PERF_EVENT_STATE_REVOKED)
>+ return -ENODEV;
>
> /*
> * Don't allow mmap() of inherited per-task counters. This would
>@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> if (!(vma->vm_flags & VM_SHARED))
> return -EINVAL;
>
>- ret = security_perf_event_read(event);
>- if (ret)
>- return ret;
>-
> vma_size = vma->vm_end - vma->vm_start;
>
> if (vma->vm_pgoff == 0) {
>@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
> struct perf_event *event = filp->private_data;
> int retval;
>
>+ if (event->state <= PERF_EVENT_STATE_REVOKED)
>+ return -ENODEV;
>+
> inode_lock(inode);
> retval = fasync_helper(fd, filp, on, &event->fasync);
> inode_unlock(inode);
>@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
>
> static void free_pmu_context(struct pmu *pmu)
> {
>+ free_percpu(pmu->pmu_disable_count);
> free_percpu(pmu->cpu_pmu_context);
> }
>
>@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> if (type >= 0)
> max = type;
>
>+ // XXX broken vs perf_init_event(), this publishes before @pmu is finalized
> ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
> if (ret < 0)
> goto free_pdc;
>@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> if (!pmu->event_idx)
> pmu->event_idx = perf_event_idx_default;
>
>- list_add_rcu(&pmu->entry, &pmus);
> atomic_set(&pmu->exclusive_cnt, 0);
>+ INIT_LIST_HEAD(&pmu->events);
>+ spin_lock_init(&pmu->events_lock);
>+
>+ /*
>+ * Publish the pmu after it is fully initialized.
>+ */
>+ list_add_rcu(&pmu->entry, &pmus);
> ret = 0;
> unlock:
> mutex_unlock(&pmus_lock);
>@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
> }
> EXPORT_SYMBOL_GPL(perf_pmu_register);
>
>-void perf_pmu_unregister(struct pmu *pmu)
>+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
>+ struct perf_event_context *ctx)
> {
>+ /*
>+ * De-schedule the event and mark it EXIT_PMU.
>+ */
>+ perf_event_exit_event(event, ctx, true);
>+
>+ /*
>+ * All _free_event() bits that rely on event->pmu:
>+ */
>+ perf_event_set_output(event, NULL, true);
>+
>+ perf_event_free_bpf_prog(event);
>+ perf_addr_filters_splice(event, NULL);
>+ kfree(event->addr_filter_ranges);
>+
>+ if (event->destroy) {
>+ event->destroy(event);
>+ event->destroy = NULL;
>+ }
>+
>+ if (event->pmu_ctx) {
>+ /*
>+ * Make sure pmu->cpu_pmu_context is unused. An alternative is
>+ * to have it be pointers and make put_pmu_ctx()'s
>+ * epc->embedded case be another RCU free case.
>+ */
>+ put_pmu_ctx(event->pmu_ctx);
>+ event->pmu_ctx = NULL;
>+ }
>+
>+ exclusive_event_destroy(event);
>+ module_put(pmu->module);
>+
>+ event->pmu = NULL; /* force fault instead of UAF */
>+}
>+
>+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
>+{
>+ struct perf_event_context *ctx;
>+
>+ ctx = perf_event_ctx_lock(event);
>+ __pmu_detach_event(pmu, event, ctx);
>+ perf_event_ctx_unlock(event, ctx);
>+
>+ scoped_guard(spinlock, &pmu->events_lock)
>+ list_del(&event->pmu_list);
>+}
>+
>+static struct perf_event *pmu_get_event(struct pmu *pmu)
>+{
>+ struct perf_event *event;
>+
>+ guard(spinlock)(&pmu->events_lock);
>+ list_for_each_entry(event, &pmu->events, pmu_list) {
>+ if (atomic_long_inc_not_zero(&event->refcount))
>+ return event;
>+ }
>+
>+ return NULL;
>+}
>+
>+static bool pmu_empty(struct pmu *pmu)
>+{
>+ guard(spinlock)(&pmu->events_lock);
>+ return list_empty(&pmu->events);
>+}
>+
>+static void pmu_detach_events(struct pmu *pmu)
>+{
>+ struct perf_event *event;
>+
>+ for (;;) {
>+ event = pmu_get_event(pmu);
>+ if (!event)
>+ break;
>+
>+ pmu_detach_event(pmu, event);
>+ put_event(event);
>+ }
>+
>+ /*
>+ * wait for pending _free_event()s
>+ */
>+ wait_var_event(pmu, pmu_empty(pmu));
ok... so now we are synchronosly moving the events to a revoked state
during unregister, so we wouldn't need the refcount on the driver side
anymore and can just free the objects right after return.
I will give this a try with i915 and/or xe.
thanks
Lucas De Marchi
>+}
>+
>+int perf_pmu_unregister(struct pmu *pmu)
>+{
>+ /*
>+ * FIXME!
>+ *
>+ * perf_mmap_close(); event->pmu->event_unmapped()
>+ *
>+ * XXX this check is racy vs perf_event_alloc()
>+ */
>+ if (pmu->event_unmapped && !pmu_empty(pmu))
>+ return -EBUSY;
>+
> mutex_lock(&pmus_lock);
> list_del_rcu(&pmu->entry);
>+ idr_remove(&pmu_idr, pmu->type);
>+ mutex_unlock(&pmus_lock);
>
> /*
> * We dereference the pmu list under both SRCU and regular RCU, so
>@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
> synchronize_srcu(&pmus_srcu);
> synchronize_rcu();
>
>- free_percpu(pmu->pmu_disable_count);
>- idr_remove(&pmu_idr, pmu->type);
>+ /*
>+ * PMU is removed from the pmus list, so no new events will
>+ * be created, now take care of the existing ones.
>+ */
>+ pmu_detach_events(pmu);
>+
>+ /*
>+ * PMU is unused, make it go away.
>+ */
> if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
> if (pmu->nr_addr_filters)
> device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
> device_del(pmu->dev);
> put_device(pmu->dev);
> }
>+
>+ /*
>+ * XXX -- broken? can still contain SW events at this point?
>+ * audit more, make sure DETACH_GROUP DTRT
>+ */
> free_pmu_context(pmu);
>- mutex_unlock(&pmus_lock);
>+
>+ return 0;
> }
> EXPORT_SYMBOL_GPL(perf_pmu_unregister);
>
>@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
>+ scoped_guard(spinlock, &pmu->events_lock)
>+ list_add(&event->pmu_list, &pmu->events);
>+
> return event;
>
> err_callchain_buffer:
>@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
> }
>
> static int
>-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
>+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
> {
> struct perf_buffer *rb = NULL;
> int ret = -EINVAL;
>@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
> mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
> set:
> /* Can't redirect output if we've got an active mmap() */
>- if (atomic_read(&event->mmap_count))
>- goto unlock;
>+ if (atomic_read(&event->mmap_count)) {
>+ if (!force)
>+ goto unlock;
>+
>+ WARN_ON_ONCE(event->pmu->event_unmapped);
>+ }
>
> if (output_event) {
> /* get the rb we want to redirect to */
>@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
> if (err)
> goto err_fd;
> group_leader = fd_file(group)->private_data;
>+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
>+ err = -ENODEV;
>+ goto err_group_fd;
>+ }
> if (flags & PERF_FLAG_FD_OUTPUT)
> output_event = group_leader;
> if (flags & PERF_FLAG_FD_NO_GROUP)
>@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
> event->pmu_ctx = pmu_ctx;
>
> if (output_event) {
>- err = perf_event_set_output(event, output_event);
>+ err = perf_event_set_output(event, output_event, false);
> if (err)
> goto err_context;
> }
>@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
> }
>
> static void
>-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
>+perf_event_exit_event(struct perf_event *event,
>+ struct perf_event_context *ctx, bool revoke)
> {
> struct perf_event *parent_event = event->parent;
>- unsigned long detach_flags = 0;
>+ unsigned long detach_flags = DETACH_EXIT;
>
> if (parent_event) {
> /*
>@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
> * Do destroy all inherited groups, we don't care about those
> * and being thorough is better.
> */
>- detach_flags = DETACH_GROUP | DETACH_CHILD;
>+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
> mutex_lock(&parent_event->child_mutex);
> }
>
>- perf_remove_from_context(event, detach_flags);
>+ if (revoke)
>+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
>
>- raw_spin_lock_irq(&ctx->lock);
>- if (event->state > PERF_EVENT_STATE_EXIT)
>- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
>- raw_spin_unlock_irq(&ctx->lock);
>+ perf_remove_from_context(event, detach_flags);
>
> /*
> * Child events can be freed.
>@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
> perf_event_task(child, child_ctx, 0);
>
> list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
>- perf_event_exit_event(child_event, child_ctx);
>+ perf_event_exit_event(child_event, child_ctx, false);
>
> mutex_unlock(&child_ctx->mutex);
>
Powered by blists - more mailing lists