Message-ID: <20260206103349.GG1395416@noisy.programming.kicks-ass.net>
Date: Fri, 6 Feb 2026 11:33:49 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Dmitry Vyukov <dvyukov@...gle.com>
Cc: syzbot <syzbot+5334e6bdc43f6d1dcb7d@...kaller.appspotmail.com>,
	acme@...nel.org, adrian.hunter@...el.com,
	alexander.shishkin@...ux.intel.com, irogers@...gle.com,
	james.clark@...aro.org, jolsa@...nel.org,
	linux-kernel@...r.kernel.org, linux-perf-users@...r.kernel.org,
	mark.rutland@....com, mingo@...hat.com, namhyung@...nel.org,
	syzkaller-bugs@...glegroups.com
Subject: Re: [syzbot] [perf?] KCSAN: data-race in perf_event_set_state /
 perf_mmap_rb

On Fri, Feb 06, 2026 at 10:18:38AM +0100, Peter Zijlstra wrote:

> > Because `perf_mmap_rb()` does not hold the `perf_event_context` lock
> > (`ctx->lock`), which is the intended protection for these timing
> > fields, it races with the `event_sched_out()` path (which does hold
> > `ctx->lock`).
> > 
> > The race on `total_time_enabled` and `total_time_running` involves
> > non-atomic read-modify-write operations. If both threads read the same
> > old value of `total_time_enabled` before either writes back the
> > updated value, one of the updates (representing a chunk of time the
> > event was enabled) will be lost. Additionally, the race on
> > `event->tstamp` can lead to inconsistent state where the timestamp and
> > the total time counters are out of sync, causing further errors in
> > subsequent time calculations.
> 
> Yeah, fair enough. Let me go stare at that.
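
To spell out the lost update: both paths do a plain load/add/store on
total_time_enabled, so two racing updaters can read the same old value
and one delta simply vanishes. A minimal userspace sketch of the same
pattern -- hypothetical names, two pthreads standing in for the mmap
and sched-out paths, not the kernel code itself:

	#include <pthread.h>
	#include <stdio.h>

	/*
	 * Shared counter, deliberately unprotected -- models
	 * event->total_time_enabled updated without ctx->lock held.
	 */
	static unsigned long long total_time_enabled;

	static void *updater(void *arg)
	{
		(void)arg;
		/* Non-atomic read-modify-write: load, add, store. */
		for (int i = 0; i < 1000000; i++)
			total_time_enabled += 1;
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, updater, NULL);
		pthread_create(&b, NULL, updater, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);

		/* Almost always < 2000000: interleaved RMWs lose updates. */
		printf("%llu\n", total_time_enabled);
		return 0;
	}

Holding ctx->lock on both sides serializes the RMW, which is what the
change below does for the mmap path.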

I ended up with the below. It boots and passes 'perf test' with lockdep
on. No further testing was done.

Can you throw this at the robot?

---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..a5b724cb6b42 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1356,7 +1356,9 @@ static void put_ctx(struct perf_event_context *ctx)
  *	      perf_event_context::lock
  *	    mmap_lock
  *	      perf_event::mmap_mutex
+ *	        perf_buffer::event_lock
  *	        perf_buffer::aux_mutex
+ *	        perf_event_context::lock
  *	      perf_addr_filters_head::lock
  *
  *    cpu_hotplug_lock
@@ -1582,6 +1584,8 @@ static u64 perf_event_time(struct perf_event *event)
 	if (unlikely(!ctx))
 		return 0;
 
+	lockdep_assert_held(&ctx->lock);
+
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time(event);
 
@@ -6157,9 +6161,15 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
 
 static void _perf_event_reset(struct perf_event *event)
 {
+	/*
+	 * Must disable PMU to stop the event from triggering during
+	 * perf_event_update_userpage().
+	 */
+	perf_pmu_disable(event->pmu);
 	(void)perf_event_read(event, false);
 	local64_set(&event->count, 0);
 	perf_event_update_userpage(event);
+	perf_pmu_enable(event->pmu);
 }
 
 /* Assume it's not an event with inherit set. */
@@ -6504,15 +6514,9 @@ static int perf_event_index(struct perf_event *event)
 	return event->pmu->event_idx(event);
 }
 
-static void perf_event_init_userpage(struct perf_event *event)
+static void perf_event_init_userpage(struct perf_event *event, struct perf_buffer *rb)
 {
 	struct perf_event_mmap_page *userpg;
-	struct perf_buffer *rb;
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
 
 	userpg = rb->user_page;
 
@@ -6521,9 +6525,6 @@ static void perf_event_init_userpage(struct perf_event *event)
 	userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
 	userpg->data_offset = PAGE_SIZE;
 	userpg->data_size = perf_data_size(rb);
-
-unlock:
-	rcu_read_unlock();
 }
 
 void __weak arch_perf_update_userpage(
@@ -6536,17 +6537,11 @@ void __weak arch_perf_update_userpage(
  * the seqlock logic goes bad. We can not serialize this because the arch
  * code calls this from NMI context.
  */
-void perf_event_update_userpage(struct perf_event *event)
+static void __perf_event_update_userpage(struct perf_event *event, struct perf_buffer *rb)
 {
 	struct perf_event_mmap_page *userpg;
-	struct perf_buffer *rb;
 	u64 enabled, running, now;
 
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
 	/*
 	 * compute total_time_enabled, total_time_running
 	 * based on snapshot values taken when the event
@@ -6582,7 +6577,16 @@ void perf_event_update_userpage(struct perf_event *event)
 	barrier();
 	++userpg->lock;
 	preempt_enable();
-unlock:
+}
+
+void perf_event_update_userpage(struct perf_event *event)
+{
+	struct perf_buffer *rb;
+
+	rcu_read_lock();
+	rb = rcu_dereference(event->rb);
+	if (rb)
+		__perf_event_update_userpage(event, rb);
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
@@ -6978,6 +6982,7 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
 static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
 			unsigned long nr_pages)
 {
+	struct perf_event_context *ctx = event->ctx;
 	long extra = 0, user_extra = nr_pages;
 	struct perf_buffer *rb;
 	int rb_flags = 0;
@@ -7032,11 +7037,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
 	rb->mmap_user = get_current_user();
 	rb->mmap_locked = extra;
 
-	ring_buffer_attach(event, rb);
+	scoped_guard (raw_spinlock_irq, &ctx->lock) {
+		ctx_time_update_event(ctx, event);
+		perf_event_update_time(event);
+	}
 
-	perf_event_update_time(event);
-	perf_event_init_userpage(event);
-	perf_event_update_userpage(event);
+	/*
+	 * Initialize before setting event->rb to ensure it cannot nest
+	 * if the event is already active.
+	 */
+	perf_event_init_userpage(event, rb);
+	__perf_event_update_userpage(event, rb);
+
+	ring_buffer_attach(event, rb);
 
 	perf_mmap_account(vma, user_extra, extra);
 	refcount_set(&event->mmap_count, 1);

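A note on the guard, for anyone who has not run into it yet: the
scoped_guard() in perf_mmap_rb() comes from <linux/cleanup.h> and is,
for this purpose, equivalent to the open-coded locking below (sketch
only; same two calls as in the hunk):

	raw_spin_lock_irq(&ctx->lock);
	ctx_time_update_event(ctx, event);
	perf_event_update_time(event);
	raw_spin_unlock_irq(&ctx->lock);

The guard form releases the lock when the scope is left, so it cannot
leak ctx->lock on an early return.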