linux-kernel - [PATCH] perf: Fix missing SIGTRAPs due to pending

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220927121322.1236730-1-elver@google.com>
Date:   Tue, 27 Sep 2022 14:13:22 +0200
From:   Marco Elver <elver@...gle.com>
To:     elver@...gle.com, Peter Zijlstra <peterz@...radead.org>
Cc:     Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Mark Rutland <mark.rutland@....com>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Jiri Olsa <jolsa@...nel.org>,
        Namhyung Kim <namhyung@...nel.org>,
        linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org,
        kasan-dev@...glegroups.com, Dmitry Vyukov <dvyukov@...gle.com>
Subject: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse

Due to the implementation of how SIGTRAP are delivered if
perf_event_attr::sigtrap is set, we've noticed 3 issues:

	1. Missing SIGTRAP due to a race with event_sched_out() (more
	   details below).

	2. Hardware PMU events being disabled due to returning 1 from
	   perf_event_overflow(). The only way to re-enable the event is
	   for user space to first "properly" disable the event and then
	   re-enable it.

	3. The inability to automatically disable an event after a
	   specified number of overflows via PERF_EVENT_IOC_REFRESH.

The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed as
follows:

		CPU0			| 	CPU1
	--------------------------------+---------------------------
	__perf_event_overflow()		|
	 perf_event_disable_inatomic()	|
	  pending_disable = CPU0	| ...
	  				| _perf_event_enable()
					|  event_function_call()
					|   task_function_call()
					|    /* sends IPI to CPU0 */
	<IPI>				| ...
	 __perf_event_enable()		+---------------------------
	  ctx_resched()
	   task_ctx_sched_out()
	    ctx_sched_out()
	     group_sched_out()
	      event_sched_out()
	       pending_disable = -1
	</IPI>
	<IRQ-work>
	 perf_pending_event()
	  perf_pending_event_disable()
	   /* Fails to send SIGTRAP because no pending_disable! */
	</IRQ-work>

In the above case, not only is that particular SIGTRAP missed, but also
all future SIGTRAPs because 'event_limit' is not reset back to 1.

To fix, rework pending delivery of SIGTRAP via IRQ-work by introduction
of a separate 'pending_sigtrap', no longer using 'event_limit' and
'pending_disable' for its delivery.

During testing, this also revealed several more possible races between
reschedules and pending IRQ work; see code comments for details.

Doing so makes it possible to use 'event_limit' normally (thereby
enabling use of PERF_EVENT_IOC_REFRESH), perf_event_overflow() no longer
returns 1 on SIGTRAP causing disabling of hardware PMUs, and finally the
race is no longer possible due to event_sched_out() not consuming
'pending_disable'.

Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <dvyukov@...gle.com>
Debugged-by: Dmitry Vyukov <dvyukov@...gle.com>
Signed-off-by: Marco Elver <elver@...gle.com>
---
 include/linux/perf_event.h |  2 +
 kernel/events/core.c       | 85 ++++++++++++++++++++++++++++++++------
 2 files changed, 75 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 907b0e3f1318..dff3430844a2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -740,8 +740,10 @@ struct perf_event {
 	int				pending_wakeup;
 	int				pending_kill;
 	int				pending_disable;
+	int				pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;
+	struct irq_work			pending_resched;
 
 	atomic_t			event_limit;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 75f5705b6892..df90777262bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2527,6 +2527,14 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	if (event->pending_sigtrap) {
+		/*
+		 * The task and event might have been moved to another CPU:
+		 * queue another IRQ work. See perf_pending_event_sigtrap().
+		 */
+		WARN_ON_ONCE(!irq_work_queue(&event->pending_resched));
+	}
+
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -4938,6 +4946,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
 static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending);
+	irq_work_sync(&event->pending_resched);
 
 	unaccount_event(event);
 
@@ -6446,6 +6455,37 @@ static void perf_sigtrap(struct perf_event *event)
 		      event->attr.type, event->attr.sig_data);
 }
 
+static void perf_pending_event_sigtrap(struct perf_event *event)
+{
+	if (!event->pending_sigtrap)
+		return;
+
+	/*
+	 * If we're racing with disabling of the event, consume pending_sigtrap
+	 * and don't send the SIGTRAP. This avoids potentially delaying a signal
+	 * indefinitely (oncpu mismatch) until the event is enabled again, which
+	 * could happen after already returning to user space; in that case the
+	 * signal would erroneously become asynchronous.
+	 */
+	if (event->state == PERF_EVENT_STATE_OFF) {
+		event->pending_sigtrap = 0;
+		return;
+	}
+
+	/*
+	 * Only process this pending SIGTRAP if this IRQ work is running on the
+	 * right CPU: the scheduler is able to run before the IRQ work, which
+	 * moved the task to another CPU. In event_sched_in() another IRQ work
+	 * is scheduled, so that the signal is not lost; given the kernel has
+	 * not yet returned to user space, the signal remains synchronous.
+	 */
+	if (READ_ONCE(event->oncpu) != smp_processor_id())
+		return;
+
+	event->pending_sigtrap = 0;
+	perf_sigtrap(event);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
 	int cpu = READ_ONCE(event->pending_disable);
@@ -6455,13 +6495,6 @@ static void perf_pending_event_disable(struct perf_event *event)
 
 	if (cpu == smp_processor_id()) {
 		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
-			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
-		}
-
 		perf_event_disable_local(event);
 		return;
 	}
@@ -6500,6 +6533,7 @@ static void perf_pending_event(struct irq_work *entry)
 	 * and we won't recurse 'further'.
 	 */
 
+	perf_pending_event_sigtrap(event);
 	perf_pending_event_disable(event);
 
 	if (event->pending_wakeup) {
@@ -6511,6 +6545,26 @@ static void perf_pending_event(struct irq_work *entry)
 		perf_swevent_put_recursion_context(rctx);
 }
 
+/*
+ * If handling of a pending action must occur before returning to user space,
+ * and it is possible to reschedule an event (to another CPU) with pending
+ * actions, where the moved-from CPU may not yet have run event->pending (and
+ * irq_work_queue() would fail on reuse), we'll use a separate IRQ work that
+ * runs perf_pending_event_resched().
+ */
+static void perf_pending_event_resched(struct irq_work *entry)
+{
+	struct perf_event *event = container_of(entry, struct perf_event, pending_resched);
+	int rctx;
+
+	rctx = perf_swevent_get_recursion_context();
+
+	perf_pending_event_sigtrap(event);
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
@@ -9209,11 +9263,20 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		event->pending_sigtrap = 1;
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11536,6 +11599,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	init_waitqueue_head(&event->waitq);
 	event->pending_disable = -1;
 	init_irq_work(&event->pending, perf_pending_event);
+	init_irq_work(&event->pending_resched, perf_pending_event_resched);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
@@ -11557,9 +11621,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
-- 
2.37.3.998.g577e59143f-goog