[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20241104135519.715883982@infradead.org>
Date: Mon, 04 Nov 2024 14:39:28 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: mingo@...nel.org,
lucas.demarchi@...el.com
Cc: linux-kernel@...r.kernel.org,
peterz@...radead.org,
willy@...radead.org,
acme@...nel.org,
namhyung@...nel.org,
mark.rutland@....com,
alexander.shishkin@...ux.intel.com,
jolsa@...nel.org,
irogers@...gle.com,
adrian.hunter@...el.com,
kan.liang@...ux.intel.com
Subject: [PATCH 19/19] perf: Make perf_pmu_unregister() useable
Previously it was only safe to call perf_pmu_unregister() if there
were no active events of that pmu around -- which was impossible to
guarantee since it races all sorts against perf_init_event().
Rework the whole thing by:
- keeping track of all events for a given pmu
- 'hiding' the pmu from perf_init_event()
- waiting for the appropriate (s)rcu grace periods such that all
prior references to the PMU will be completed
- detaching all still existing events of that pmu (see first point)
and moving them to a new REVOKED state.
- actually freeing the pmu data.
Where notably the new REVOKED state must inhibit all event actions
from reaching code that wants to use event->pmu.
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
include/linux/perf_event.h | 13 +-
kernel/events/core.c | 222 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 210 insertions(+), 25 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
struct pmu {
struct list_head entry;
+ spinlock_t events_lock;
+ struct list_head events;
+
struct module *module;
struct device *dev;
struct device *parent;
@@ -611,9 +614,10 @@ struct perf_addr_filter_range {
* enum perf_event_state - the states of an event:
*/
enum perf_event_state {
- PERF_EVENT_STATE_DEAD = -4,
- PERF_EVENT_STATE_EXIT = -3,
- PERF_EVENT_STATE_ERROR = -2,
+ PERF_EVENT_STATE_DEAD = -5,
+ PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
+ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
+ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
PERF_EVENT_STATE_ACTIVE = 1,
@@ -854,6 +858,7 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+ struct list_head pmu_list;
/*
* Certain events gets forwarded to another pmu internally by over-
@@ -1105,7 +1110,7 @@ extern void perf_aux_output_flag(struct
extern void perf_event_itrace_started(struct perf_event *event);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
extern void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2412,7 +2412,9 @@ ctx_time_update_event(struct perf_event_
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2427,6 +2429,7 @@ __perf_remove_from_context(struct perf_e
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2435,16 +2438,22 @@ __perf_remove_from_context(struct perf_e
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
+
+ event->state = state;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -4511,7 +4520,8 @@ static void perf_event_enable_on_exec(st
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4538,7 +4548,7 @@ static void perf_event_remove_on_exec(st
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5138,6 +5148,7 @@ static bool is_sb_event(struct perf_even
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5339,6 +5350,8 @@ static void perf_pending_task_sync(struc
/* vs perf_event_alloc() error */
static void __free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
@@ -5365,6 +5378,7 @@ static void __free_event(struct perf_eve
* put_pmu_ctx() needs an event->ctx reference, because of
* epc->ctx.
*/
+ WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
@@ -5377,8 +5391,13 @@ static void __free_event(struct perf_eve
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu)
- module_put(event->pmu->module);
+ if (pmu) {
+ module_put(pmu->module);
+ scoped_guard (spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5397,6 +5416,7 @@ static void _free_event(struct perf_even
security_perf_event_free(event);
if (event->rb) {
+ WARN_ON_ONCE(!event->pmu);
/*
* Can happen when we close an event with re-directed output.
*
@@ -5527,7 +5547,11 @@ int perf_event_release_kernel(struct per
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
@@ -5838,7 +5862,7 @@ __perf_read(struct perf_event *event, ch
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state <= PERF_EVENT_STATE_ERROR)
return 0;
if (count < event->read_size)
@@ -5877,8 +5901,14 @@ static __poll_t perf_poll(struct file *f
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
@@ -6058,6 +6088,9 @@ static long _perf_ioctl(struct perf_even
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6507,6 +6540,7 @@ static void perf_mmap_close(struct vm_ar
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
+ /* FIXIES vs perf_pmu_unregister() */
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -6657,6 +6691,16 @@ static int perf_mmap(struct file *file,
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
+ /*
+ * This relies on __pmu_detach_event() taking mmap_mutex after marking
+ * the event REVOKED. Either we observe the state, or __pmu_detach_event()
+ * will detach the rb created here.
+ */
+ if (event->state <= PERF_EVENT_STATE_REVOKED) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
@@ -6840,6 +6884,9 @@ static int perf_fasync(int fd, struct fi
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -11892,6 +11939,9 @@ int perf_pmu_register(struct pmu *_pmu,
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
/*
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
@@ -11905,11 +11955,100 @@ int perf_pmu_register(struct pmu *_pmu,
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ *
+ * Notably, perf_mmap() relies on the ordering here.
+ */
+ scoped_guard (mutex, &event->mmap_mutex) {
+ WARN_ON_ONCE(pmu->event_unmapped);
+ ring_buffer_attach(event, NULL);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) {
+ if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL))
+ return -EINVAL;
+
list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
}
/*
@@ -11919,7 +12058,31 @@ void perf_pmu_unregister(struct pmu *pmu
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
+ if (pmu->event_unmapped && !pmu_empty(pmu)) {
+ /*
+ * Can't force remove events when pmu::event_unmapped()
+ * is used in perf_mmap_close().
+ */
+ guard(mutex)(&pmus_lock);
+ idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
+ list_add_rcu(&pmu->entry, &pmus);
+ return -EBUSY;
+ }
+
+ scoped_guard (mutex, &pmus_lock)
+ idr_remove(&pmu_idr, pmu->type);
+
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
perf_pmu_free(pmu);
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12226,6 +12389,7 @@ perf_event_alloc(struct perf_event_attr
INIT_LIST_HEAD(&event->active_entry);
INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
+ INIT_LIST_HEAD(&event->pmu_list);
init_waitqueue_head(&event->waitq);
@@ -12294,6 +12458,13 @@ perf_event_alloc(struct perf_event_attr
perf_event__state_init(event);
+ /*
+ * Hold SRCU critical section around perf_init_event(), until returning
+ * the fully formed event put on pmu->events_list. This ensures that
+ * perf_pmu_unregister() will see any in-progress event creation that
+ * races.
+ */
+ guard(srcu)(&pmus_srcu);
pmu = NULL;
hwc = &event->hw;
@@ -12383,6 +12554,9 @@ perf_event_alloc(struct perf_event_attr
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ scoped_guard (spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return_ptr(event);
}
@@ -12769,6 +12943,10 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
goto err_fd;
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_group_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -13316,10 +13494,11 @@ static void sync_child_event(struct perf
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
if (parent_event) {
/*
@@ -13334,16 +13513,14 @@ perf_event_exit_event(struct perf_event
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
}
- perf_remove_from_context(event, detach_flags);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
@@ -13419,7 +13596,7 @@ static void perf_event_exit_task_context
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ perf_event_exit_event(child_event, child_ctx, false);
mutex_unlock(&child_ctx->mutex);
@@ -13609,6 +13786,9 @@ inherit_event(struct perf_event *parent_
if (parent_event->parent)
parent_event = parent_event->parent;
+ if (parent_event->state <= PERF_EVENT_STATE_REVOKED)
+ return NULL;
+
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
Powered by blists - more mailing lists