From 4467a6520c8f59065220a651167c4cd8da7a6e9b Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Fri, 23 Sep 2022 16:43:19 +0200
Subject: [PATCH 2/2] perf: Fix missing SIGTRAPs due to pending_disable abuse

Due to the implementation of how SIGTRAPs are delivered if
perf_event_attr::sigtrap is set, we've noticed 3 issues:

  1. Missing SIGTRAP due to a race with event_sched_out() (more
     details below).

  2. Hardware PMU events being disabled due to returning 1 from
     perf_event_overflow(). The only way to re-enable the event is
     for user space to first "properly" disable the event and then
     re-enable it.

  3. The inability to automatically disable an event after a
     specified number of overflows via PERF_EVENT_IOC_REFRESH.

The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed
as follows:

		CPU0			|	CPU1
	--------------------------------+---------------------------
	__perf_event_overflow()		|
	 perf_event_disable_inatomic()	|
	  pending_disable = CPU0	| ...
					| _perf_event_enable()
					|  event_function_call()
					|   task_function_call()
					|    /* sends IPI to CPU0 */
	<IPI>				| ...
	 __perf_event_enable()		+---------------------------
	  ctx_resched()
	   task_ctx_sched_out()
	    ctx_sched_out()
	     group_sched_out()
	      event_sched_out()
	       pending_disable = -1
	</IPI>
	<IRQ-work>
	 perf_pending_event()
	  perf_pending_event_disable()
	   /* Fails to send SIGTRAP because no pending_disable! */
	</IRQ-work>

In the above case, not only is that particular SIGTRAP missed, but
also all future SIGTRAPs, because 'event_limit' is not reset back
to 1.

To fix, rework pending delivery of SIGTRAP via IRQ-work by
introducing a separate 'pending_sigtrap', no longer using
'event_limit' and 'pending_disable' for its delivery. During testing,
this also revealed several more possible races between reschedules
and pending IRQ work; see code comments for details.

Doing so makes it possible to use 'event_limit' normally (thereby
enabling use of PERF_EVENT_IOC_REFRESH), perf_event_overflow() no
longer returns 1 on SIGTRAP (which caused hardware PMUs to be
disabled), and the race is no longer possible because
event_sched_out() does not consume 'pending_disable'.

Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov
Debugged-by: Dmitry Vyukov
Signed-off-by: Marco Elver
---
v2:
* Use irq_work_raw_sync().
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 64 +++++++++++++++++++++++++++++++-------
 2 files changed, 53 insertions(+), 12 deletions(-)
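Note (illustration only, not part of the patch): with 'event_limit'
behaving normally again, user space can arm a sigtrap event for an
exact number of overflows via PERF_EVENT_IOC_REFRESH. Below is a
minimal, hypothetical sketch of that usage; the hardware-breakpoint
event type, the 8-byte watchpoint length, and all values are
illustrative assumptions (64-bit system, breakpoint-capable hardware,
recent kernel/libc headers providing TRAP_PERF), not dictated by this
patch.

/*
 * Sketch: arm a sigtrap event for exactly one more overflow via
 * PERF_EVENT_IOC_REFRESH; the overflow delivers a synchronous SIGTRAP.
 */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <signal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile long watched;

static void on_sigtrap(int sig, siginfo_t *info, void *uc)
{
	(void)sig;
	(void)uc;
	/* Delivered synchronously to the monitored task; si_code is TRAP_PERF. */
	if (info->si_code == TRAP_PERF)
		write(2, "SIGTRAP\n", 8);	/* async-signal-safe */
}

int main(void)
{
	struct perf_event_attr attr;
	struct sigaction sa;
	int fd;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = on_sigtrap;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGTRAP, &sa, NULL);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;	/* watchpoint on 'watched' */
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)&watched;
	attr.bp_len = HW_BREAKPOINT_LEN_8;	/* assumes 64-bit 'long' */
	attr.sample_period = 1;
	attr.disabled = 1;			/* enabled below via REFRESH */
	attr.sigtrap = 1;
	attr.remove_on_exec = 1;		/* required when sigtrap is set */

	fd = syscall(SYS_perf_event_open, &attr, 0 /* this task */, -1,
		     -1, PERF_FLAG_FD_CLOEXEC);
	if (fd < 0)
		return 1;

	/* Enable the event and allow exactly one more overflow. */
	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);

	watched = 42;	/* fires the watchpoint -> synchronous SIGTRAP */

	close(fd);
	return 0;
}

After the single allowed overflow, the event auto-disables (issue (3)
above), and another PERF_EVENT_IOC_REFRESH can re-arm it.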
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 907b0e3f1318..c119fa7b70d6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -740,6 +740,7 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
+	int				pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c37ba0909078..6ba02a1b5c5d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2527,6 +2527,15 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	if (event->pending_sigtrap) {
+		/*
+		 * The task and event might have been moved to another CPU:
+		 * queue another IRQ work. See perf_pending_event_sigtrap().
+		 */
+		irq_work_raw_sync(&event->pending); /* Syncs if pending on other CPU. */
+		irq_work_queue(&event->pending);
+	}
+
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -6446,6 +6455,37 @@ static void perf_sigtrap(struct perf_event *event)
 		       event->attr.type, event->attr.sig_data);
 }
 
+static void perf_pending_event_sigtrap(struct perf_event *event)
+{
+	if (!event->pending_sigtrap)
+		return;
+
+	/*
+	 * If we're racing with disabling of the event, consume pending_sigtrap
+	 * and don't send the SIGTRAP. This avoids potentially delaying a signal
+	 * indefinitely (oncpu mismatch) until the event is enabled again, which
+	 * could happen after already returning to user space; in that case the
+	 * signal would erroneously become asynchronous.
+	 */
+	if (event->state == PERF_EVENT_STATE_OFF) {
+		event->pending_sigtrap = 0;
+		return;
+	}
+
+	/*
+	 * Only process this pending SIGTRAP if this IRQ work is running on the
+	 * right CPU: the scheduler is able to run before the IRQ work, which
+	 * moved the task to another CPU. In event_sched_in() another IRQ work
+	 * is scheduled, so that the signal is not lost; given the kernel has
+	 * not yet returned to user space, the signal remains synchronous.
+	 */
+	if (READ_ONCE(event->oncpu) != smp_processor_id())
+		return;
+
+	event->pending_sigtrap = 0;
+	perf_sigtrap(event);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->pending_disable);
@@ -6455,13 +6495,6 @@ static void perf_pending_event_disable(struct perf_event *event)
 
 	if (cpu == smp_processor_id()) {
 		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
-			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
-		}
-
 		perf_event_disable_local(event);
 		return;
 	}
@@ -6500,6 +6533,7 @@ static void perf_pending_event(struct irq_work *entry)
 	 * and we won't recurse 'further'.
 	 */
 
+	perf_pending_event_sigtrap(event);
 	perf_pending_event_disable(event);
 
 	if (event->pending_wakeup) {
@@ -9209,11 +9243,20 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		event->pending_sigtrap = 1;
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11557,9 +11600,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
-- 
2.37.3.998.g577e59143f-goog