Message-ID: <20220927121322.1236730-1-elver@google.com>
Date: Tue, 27 Sep 2022 14:13:22 +0200
From: Marco Elver <elver@...gle.com>
To: elver@...gle.com, Peter Zijlstra <peterz@...radead.org>
Cc: Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org,
kasan-dev@...glegroups.com, Dmitry Vyukov <dvyukov@...gle.com>
Subject: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse

Due to the way SIGTRAPs are delivered when perf_event_attr::sigtrap is
set, we've noticed 3 issues:

  1. Missing SIGTRAP due to a race with event_sched_out() (more
     details below).

  2. Hardware PMU events being disabled due to returning 1 from
     perf_event_overflow(). The only way to re-enable the event is
     for user space to first "properly" disable the event and then
     re-enable it.

  3. The inability to automatically disable an event after a
     specified number of overflows via PERF_EVENT_IOC_REFRESH (see
     the usage sketch after this list).
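
For context, a minimal user-space sketch of the affected API, as it
might appear inside main() (illustrative only; error handling is
omitted, and the sample period and sig_data values are arbitrary):

  #include <linux/perf_event.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  struct perf_event_attr attr = {
          .type           = PERF_TYPE_HARDWARE,
          .size           = sizeof(attr),
          .config         = PERF_COUNT_HW_INSTRUCTIONS,
          .sample_period  = 100000,
          .remove_on_exec = 1,      /* required when sigtrap is set */
          .sigtrap        = 1,      /* SIGTRAP on overflow */
          .sig_data       = 0x1234, /* arrives in si_perf_data */
  };
  int fd = syscall(SYS_perf_event_open, &attr, 0 /* this task */,
                   -1 /* any CPU */, -1 /* no group */, 0);

  /* Issue (3): request auto-disable after one more overflow. With
   * 'event_limit' hijacked for SIGTRAP delivery, this could not work. */
  ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);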
The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed as
follows:

      CPU0                        |       CPU1
  --------------------------------+---------------------------
  __perf_event_overflow()         |
   perf_event_disable_inatomic()  |
    pending_disable = CPU0        | ...
                                  | _perf_event_enable()
                                  |  event_function_call()
                                  |   task_function_call()
                                  |    /* sends IPI to CPU0 */
  <IPI>                           | ...
   __perf_event_enable()          +---------------------------
    ctx_resched()
     task_ctx_sched_out()
      ctx_sched_out()
       group_sched_out()
        event_sched_out()
         pending_disable = -1
  </IPI>
  <IRQ-work>
   perf_pending_event()
    perf_pending_event_disable()
     /* Fails to send SIGTRAP because no pending_disable! */
  </IRQ-work>

In the above case, not only is that particular SIGTRAP missed, but also
all future SIGTRAPs, because 'event_limit' is not reset back to 1: the
re-arm only happens when the IRQ work consumes 'pending_disable', which
the racing event_sched_out() has already reset to -1.
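
For reference, the pre-patch logic in condensed form (simplified from
the hunks below, not verbatim kernel code):

  /* perf_event_alloc(): sigtrap events get an event_limit of 1. */
  if (event->attr.sigtrap)
          atomic_set(&event->event_limit, 1);

  /* __perf_event_overflow(): the 1 -> 0 transition queues the IRQ work
   * via perf_event_disable_inatomic(); further overflows do nothing. */
  if (events && atomic_dec_and_test(&event->event_limit)) {
          ret = 1;
          perf_event_disable_inatomic(event);
  }

  /* perf_pending_event_disable(): only if 'pending_disable' still holds
   * this CPU is the SIGTRAP sent and the limit re-armed; a racing
   * event_sched_out() clears 'pending_disable' first, so neither
   * happens and 'event_limit' stays 0 forever. */
  if (event->attr.sigtrap) {
          perf_sigtrap(event);
          atomic_set_release(&event->event_limit, 1); /* rearm event */
  }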

To fix this, rework the pending delivery of SIGTRAP via IRQ work by
introducing a separate 'pending_sigtrap'; 'event_limit' and
'pending_disable' are no longer involved in its delivery.
During testing, this also revealed several more possible races between
reschedules and pending IRQ work; see code comments for details.

Doing so makes it possible to use 'event_limit' normally again (thereby
enabling use of PERF_EVENT_IOC_REFRESH), perf_event_overflow() no
longer returns 1 on SIGTRAP (which caused hardware PMUs to be
disabled), and the race is no longer possible because event_sched_out()
does not consume 'pending_disable'. A condensed view of the new
delivery path follows.
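
Condensed from the hunks below (not verbatim):

  /* __perf_event_overflow(): signal delivery is now decoupled from
   * 'event_limit' and 'pending_disable'. */
  if (event->attr.sigtrap) {
          event->pending_sigtrap = 1;
          event->pending_addr = data->addr;
          irq_work_queue(&event->pending);
  }

  /* perf_pending_event_sigtrap(): deliver only on the right CPU; if the
   * task was rescheduled first, event_sched_in() queues
   * 'pending_resched' on the new CPU, so the signal still arrives
   * before the return to user space. */
  if (READ_ONCE(event->oncpu) == smp_processor_id()) {
          event->pending_sigtrap = 0;
          perf_sigtrap(event);
  }
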
Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <dvyukov@...gle.com>
Debugged-by: Dmitry Vyukov <dvyukov@...gle.com>
Signed-off-by: Marco Elver <elver@...gle.com>
---
 include/linux/perf_event.h |  2 +
 kernel/events/core.c       | 85 ++++++++++++++++++++++++++++++++------
 2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 907b0e3f1318..dff3430844a2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -740,8 +740,10 @@ struct perf_event {
         int                             pending_wakeup;
         int                             pending_kill;
         int                             pending_disable;
+        int                             pending_sigtrap;
         unsigned long                   pending_addr;   /* SIGTRAP */
         struct irq_work                 pending;
+        struct irq_work                 pending_resched;
 
         atomic_t                        event_limit;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 75f5705b6892..df90777262bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2527,6 +2527,14 @@ event_sched_in(struct perf_event *event,
         if (event->attr.exclusive)
                 cpuctx->exclusive = 1;
 
+        if (event->pending_sigtrap) {
+                /*
+                 * The task and event might have been moved to another CPU:
+                 * queue another IRQ work. See perf_pending_event_sigtrap().
+                 */
+                WARN_ON_ONCE(!irq_work_queue(&event->pending_resched));
+        }
+
 out:
         perf_pmu_enable(event->pmu);
 
@@ -4938,6 +4946,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
 static void _free_event(struct perf_event *event)
 {
         irq_work_sync(&event->pending);
+        irq_work_sync(&event->pending_resched);
 
         unaccount_event(event);
 
@@ -6446,6 +6455,37 @@ static void perf_sigtrap(struct perf_event *event)
                       event->attr.type, event->attr.sig_data);
 }
 
+static void perf_pending_event_sigtrap(struct perf_event *event)
+{
+        if (!event->pending_sigtrap)
+                return;
+
+        /*
+         * If we're racing with disabling of the event, consume pending_sigtrap
+         * and don't send the SIGTRAP. This avoids potentially delaying a signal
+         * indefinitely (oncpu mismatch) until the event is enabled again, which
+         * could happen after already returning to user space; in that case the
+         * signal would erroneously become asynchronous.
+         */
+        if (event->state == PERF_EVENT_STATE_OFF) {
+                event->pending_sigtrap = 0;
+                return;
+        }
+
+        /*
+         * Only process this pending SIGTRAP if this IRQ work is running on the
+         * right CPU: the scheduler is able to run before the IRQ work, which
+         * moved the task to another CPU. In event_sched_in() another IRQ work
+         * is scheduled, so that the signal is not lost; given the kernel has
+         * not yet returned to user space, the signal remains synchronous.
+         */
+        if (READ_ONCE(event->oncpu) != smp_processor_id())
+                return;
+
+        event->pending_sigtrap = 0;
+        perf_sigtrap(event);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
         int cpu = READ_ONCE(event->pending_disable);
@@ -6455,13 +6495,6 @@ static void perf_pending_event_disable(struct perf_event *event)
 
         if (cpu == smp_processor_id()) {
                 WRITE_ONCE(event->pending_disable, -1);
-
-                if (event->attr.sigtrap) {
-                        perf_sigtrap(event);
-                        atomic_set_release(&event->event_limit, 1); /* rearm event */
-                        return;
-                }
-
                 perf_event_disable_local(event);
                 return;
         }
@@ -6500,6 +6533,7 @@ static void perf_pending_event(struct irq_work *entry)
          * and we won't recurse 'further'.
          */
 
+        perf_pending_event_sigtrap(event);
         perf_pending_event_disable(event);
 
         if (event->pending_wakeup) {
@@ -6511,6 +6545,26 @@ static void perf_pending_event(struct irq_work *entry)
                 perf_swevent_put_recursion_context(rctx);
 }
 
+/*
+ * If handling of a pending action must occur before returning to user space,
+ * and it is possible to reschedule an event (to another CPU) with pending
+ * actions, where the moved-from CPU may not yet have run event->pending (and
+ * irq_work_queue() would fail on reuse), we'll use a separate IRQ work that
+ * runs perf_pending_event_resched().
+ */
+static void perf_pending_event_resched(struct irq_work *entry)
+{
+        struct perf_event *event = container_of(entry, struct perf_event, pending_resched);
+        int rctx;
+
+        rctx = perf_swevent_get_recursion_context();
+
+        perf_pending_event_sigtrap(event);
+
+        if (rctx >= 0)
+                perf_swevent_put_recursion_context(rctx);
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
@@ -9209,11 +9263,20 @@ static int __perf_event_overflow(struct perf_event *event,
         if (events && atomic_dec_and_test(&event->event_limit)) {
                 ret = 1;
                 event->pending_kill = POLL_HUP;
-                event->pending_addr = data->addr;
-
                 perf_event_disable_inatomic(event);
         }
 
+        if (event->attr.sigtrap) {
+                /*
+                 * Should not be able to return to user space without processing
+                 * pending_sigtrap (kernel events can overflow multiple times).
+                 */
+                WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+                event->pending_sigtrap = 1;
+                event->pending_addr = data->addr;
+                irq_work_queue(&event->pending);
+        }
+
         READ_ONCE(event->overflow_handler)(event, data, regs);
 
         if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11536,6 +11599,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         init_waitqueue_head(&event->waitq);
         event->pending_disable = -1;
         init_irq_work(&event->pending, perf_pending_event);
+        init_irq_work(&event->pending_resched, perf_pending_event_resched);
 
         mutex_init(&event->mmap_mutex);
         raw_spin_lock_init(&event->addr_filters.lock);
@@ -11557,9 +11621,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         if (parent_event)
                 event->event_caps = parent_event->event_caps;
 
-        if (event->attr.sigtrap)
-                atomic_set(&event->event_limit, 1);
-
         if (task) {
                 event->attach_state = PERF_ATTACH_TASK;
                 /*
--
2.37.3.998.g577e59143f-goog