Message-ID: <Yzxou9HB/1XjMXWI@hirez.programming.kicks-ass.net>
Date:   Tue, 4 Oct 2022 19:09:15 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Marco Elver <elver@...gle.com>
Cc:     Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Mark Rutland <mark.rutland@....com>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Jiri Olsa <jolsa@...nel.org>,
        Namhyung Kim <namhyung@...nel.org>,
        linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org,
        kasan-dev@...glegroups.com, Dmitry Vyukov <dvyukov@...gle.com>
Subject: Re: [PATCH] perf: Fix missing SIGTRAPs due to pending_disable abuse

On Wed, Sep 28, 2022 at 04:55:46PM +0200, Marco Elver wrote:
> On Wed, Sep 28, 2022 at 12:06PM +0200, Marco Elver wrote:
> 
> > Regarding my second idea of introducing something like irq_work_raw_sync():
> > maybe it's not that crazy if it is actually safe. I expect the case where
> > we need irq_work_raw_sync() to be very, very rare.
> 
> The previous irq_work_raw_sync() forgot about irq_work_queue_on(). Alas,
> I might still be missing something obvious, because "it's never that
> easy". ;-)
> 
> And for completeness, here is the full perf patch showing what it would look
> like together with irq_work_raw_sync() (consider it v1.5). It has already
> survived some shorter stress tests and fuzzing.
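
For reference, the primitive being discussed might look roughly like the
sketch below; this is only an illustration, assuming it merely has to spin
until the work has finished, and it glosses over the irq_work_queue_on()
case mentioned above -- it is not the actual proposal:

#include <linux/irq_work.h>
#include <asm/processor.h>	/* cpu_relax() */

/*
 * Sketch of a "raw" irq_work_sync(): busy-wait until the work is neither
 * queued nor executing, without the context requirements of the regular
 * irq_work_sync().
 */
static void irq_work_raw_sync(struct irq_work *work)
{
	while (irq_work_is_busy(work))
		cpu_relax();
}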

So.... I don't like it. But I cooked up the below, which _almost_ works :-/

For some raisin it sometimes fails with 14999 out of 15000 events
delivered and I've not yet figured out where it goes sideways. I'm
currently thinking it's that sigtrap clear on OFF.

Still, what do you think of the approach?
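
Condensed, the idea is: pending_disable and pending_sigtrap become plain
flags, __perf_event_overflow() sets pending_sigtrap and queues the irq_work,
and the irq_work handler only acts once it runs on the CPU the event is
currently scheduled on, otherwise it chases the event there (with
event_sched_in() re-arming the irq_work when the event comes back in).
Roughly, simplified from the real hunks below:

/* condensed sketch of the new handler; the patch below is authoritative */
static void __perf_pending_event(struct perf_event *event)
{
	int cpu = READ_ONCE(event->oncpu);

	if (cpu < 0)
		return;		/* not scheduled in; event_sched_in() requeues */

	if (cpu == smp_processor_id()) {
		/* we are in the event's context; deliver the pending work */
		if (event->pending_sigtrap) {
			event->pending_sigtrap = 0;
			perf_sigtrap(event);
		}
		if (event->pending_disable) {
			event->pending_disable = 0;
			perf_event_disable_local(event);
		}
		return;
	}

	/* the event moved; follow it to its current CPU */
	irq_work_queue_on(&event->pending, cpu);
}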

---
 include/linux/perf_event.h |  8 ++--
 kernel/events/core.c       | 92 +++++++++++++++++++++++++---------------------
 2 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ee8b9ecdc03b..c54161719d37 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -736,9 +736,11 @@ struct perf_event {
 	struct fasync_struct		*fasync;
 
 	/* delayed work for NMIs and such */
-	int				pending_wakeup;
-	int				pending_kill;
-	int				pending_disable;
+	unsigned int			pending_wakeup	:1;
+	unsigned int			pending_disable	:1;
+	unsigned int			pending_sigtrap	:1;
+	unsigned int			pending_kill	:3;
+
 	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2621fd24ad26..8e5dbe971d9e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2268,11 +2268,15 @@ event_sched_out(struct perf_event *event,
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
-	if (READ_ONCE(event->pending_disable) >= 0) {
-		WRITE_ONCE(event->pending_disable, -1);
+	if (event->pending_disable) {
+		event->pending_disable = 0;
 		perf_cgroup_event_disable(event, ctx);
 		state = PERF_EVENT_STATE_OFF;
 	}
+
+	if (event->pending_sigtrap && state == PERF_EVENT_STATE_OFF)
+		event->pending_sigtrap = 0;
+
 	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
@@ -2463,8 +2467,7 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 
 void perf_event_disable_inatomic(struct perf_event *event)
 {
-	WRITE_ONCE(event->pending_disable, smp_processor_id());
-	/* can fail, see perf_pending_event_disable() */
+	event->pending_disable = 1;
 	irq_work_queue(&event->pending);
 }
 
@@ -2527,6 +2530,9 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	if (event->pending_disable || event->pending_sigtrap)
+		irq_work_queue(&event->pending);
+
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -6440,47 +6446,40 @@ static void perf_sigtrap(struct perf_event *event)
 		      event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in the event's context, or follow the event there.
+ */
+static void __perf_pending_event(struct perf_event *event)
 {
-	int cpu = READ_ONCE(event->pending_disable);
+	int cpu = READ_ONCE(event->oncpu);
 
+	/*
+	 * If the event isn't running, we're done. event_sched_in() will restart
+	 * the irq_work when needed.
+	 */
 	if (cpu < 0)
 		return;
 
+	/*
+	 * Yay, we hit home and are in the context of the event.
+	 */
 	if (cpu == smp_processor_id()) {
-		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
+		if (event->pending_sigtrap) {
+			event->pending_sigtrap = 0;
 			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
 		}
-
-		perf_event_disable_local(event);
-		return;
+		if (event->pending_disable) {
+			event->pending_disable = 0;
+			perf_event_disable_local(event);
+		}
 	}
 
 	/*
-	 *  CPU-A			CPU-B
-	 *
-	 *  perf_event_disable_inatomic()
-	 *    @pending_disable = CPU-A;
-	 *    irq_work_queue();
-	 *
-	 *  sched-out
-	 *    @pending_disable = -1;
-	 *
-	 *				sched-in
-	 *				perf_event_disable_inatomic()
-	 *				  @pending_disable = CPU-B;
-	 *				  irq_work_queue(); // FAILS
-	 *
-	 *  irq_work_run()
-	 *    perf_pending_event()
-	 *
-	 * But the event runs on CPU-B and wants disabling there.
+	 * Requeue if there's still any pending work left; make sure to follow
+	 * the event to wherever it went.
 	 */
-	irq_work_queue_on(&event->pending, cpu);
+	if (event->pending_disable || event->pending_sigtrap)
+		irq_work_queue_on(&event->pending, cpu);
 }
 
 static void perf_pending_event(struct irq_work *entry)
@@ -6488,19 +6487,23 @@ static void perf_pending_event(struct irq_work *entry)
 	struct perf_event *event = container_of(entry, struct perf_event, pending);
 	int rctx;
 
-	rctx = perf_swevent_get_recursion_context();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
 	 */
+	rctx = perf_swevent_get_recursion_context();
 
-	perf_pending_event_disable(event);
-
+	/*
+	 * The wakeup isn't bound to the context of the event -- it can happen
+	 * irrespective of where the event is.
+	 */
 	if (event->pending_wakeup) {
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
 
+	__perf_pending_event(event);
+
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
 }
@@ -9203,11 +9206,20 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		event->pending_sigtrap = 1;
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
@@ -11528,7 +11540,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 
 	init_waitqueue_head(&event->waitq);
-	event->pending_disable = -1;
 	init_irq_work(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
@@ -11551,9 +11562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
