Message-ID: <20111116145320.GA20146@quad>
Date: Wed, 16 Nov 2011 15:53:20 +0100
From: Stephane Eranian <eranian@...gle.com>
To: linux-kernel@...r.kernel.org
Cc: peterz@...radead.org, mingo@...e.hu
Subject: [PATCH] perf_event: fix loss of notification with multi-event sampling
When you do:
$ perf record -e cycles,cycles,cycles noploop 10
You expect about 10,000 samples for each event, i.e., 10s at
1000 samples/sec. However, this is not what happens: you get
far fewer samples, roughly 3700 per event:
$ perf report -D | tail -15
Aggregated stats:
TOTAL events: 10998
MMAP events: 66
COMM events: 2
SAMPLE events: 10930
cycles stats:
TOTAL events: 3644
SAMPLE events: 3644
cycles stats:
TOTAL events: 3642
SAMPLE events: 3642
cycles stats:
TOTAL events: 3644
SAMPLE events: 3644
On an Intel Nehalem, or even on AMD64, there are 4 counters capable
of measuring cycles, so there is plenty of room to measure those
events without multiplexing (even with the NMI watchdog active).
And even with multiplexing, we'd expect roughly the same number
of samples per event.
The root of the problem was that when the event that caused the buffer
to become full was not the first event passed on the cmdline, the user
notification would get lost. The notification was sent to the file
descriptor of the overflowed event but the perf tool was not polling
on it. The perf tool aggregates all samples into a single buffer, i.e.,
the buffer of the first event. Consequently, it assumes notifications
for any event will come via that descriptor.
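To make the failure mode concrete, here is a minimal user-space sketch
of that tool-side behavior (illustrative only, modeled on what perf
does rather than copied from its source; the helper name is made up):

#include <poll.h>

/*
 * fds[0] is the event whose mmap holds the aggregated buffer;
 * fds[1..n-1] were redirected into it. Only fds[0] is polled, so,
 * before this patch, a wakeup issued on a sibling event's private
 * waitq never wakes this poll().
 */
static int wait_for_samples(int *fds, int n)
{
	struct pollfd pfd = { .fd = fds[0], .events = POLLIN };

	(void)n;	/* siblings intentionally not polled */
	return poll(&pfd, 1, -1);
}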
One can discuss the ABI for this particular problem. It seems natural
that if a tool aggregates all samples into a single buffer, it should
expect notifications on the file descriptor to which that buffer is
bound. The alternative approach would be to keep polling all the
descriptors despite the aggregation. Perf is written assuming the
former, which seems reasonable. Therefore, let's fix the kernel.
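For context, the aggregation itself is set up roughly like this (a
sketch using the perf_event_open() syscall and the existing
PERF_EVENT_IOC_SET_OUTPUT ioctl; the helper names are made up and
error handling is omitted, so this is not perf's actual code):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* Three cycles events at 1000 samples/sec, aggregated into one buffer. */
static void open_aggregated(int fds[3], void **buf)
{
	struct perf_event_attr attr;
	int i;

	memset(&attr, 0, sizeof(attr));
	attr.type	 = PERF_TYPE_HARDWARE;
	attr.size	 = sizeof(attr);
	attr.config	 = PERF_COUNT_HW_CPU_CYCLES;
	attr.freq	 = 1;
	attr.sample_freq = 1000;
	attr.sample_type = PERF_SAMPLE_IP;

	for (i = 0; i < 3; i++)
		fds[i] = sys_perf_event_open(&attr, 0, -1, -1, 0);

	/* one sampling buffer (1 + 2^n pages), bound to the first event... */
	*buf = mmap(NULL, (1 + 8) * 4096, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fds[0], 0);

	/* ...and the other events redirect their samples into it */
	ioctl(fds[1], PERF_EVENT_IOC_SET_OUTPUT, fds[0]);
	ioctl(fds[2], PERF_EVENT_IOC_SET_OUTPUT, fds[0]);
}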
The patch fixes this problem by moving the waitq into the ring_buffer
so it can be shared between the 'aggregated' events. The buffer is
destroyed only when the last event using it is destroyed, so no extra
reference counting is needed; the notification path only needs the
waitq.
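A minimal sketch of the resulting ownership model (illustrative C, not
the kernel code: the names mirror the kernel's for readability, but
the types are simplified stand-ins):

#include <stdatomic.h>
#include <stdlib.h>

struct waitq { int unused; };	/* stand-in for wait_queue_head_t */

struct ring_buffer {
	atomic_int	refcount;
	struct waitq	waitq;	/* shared by every event bound to this buffer */
};

struct perf_event_sketch {
	struct ring_buffer	*rb;
	struct waitq		*waitq;	/* = &rb->waitq; NULL once rb is gone */
};

static void ring_buffer_put(struct ring_buffer *rb)
{
	/*
	 * The waitq needs no refcount of its own: it is freed together
	 * with the buffer, and the buffer only goes away when the last
	 * event using it is destroyed.
	 */
	if (atomic_fetch_sub(&rb->refcount, 1) == 1)
		free(rb);
}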
With the patch, we get:
$ perf report -D | tail -20
Aggregated stats:
TOTAL events: 30100
MMAP events: 80
COMM events: 2
EXIT events: 2
SAMPLE events: 30016
cycles stats:
TOTAL events: 10014
MMAP events: 5
COMM events: 1
EXIT events: 2
SAMPLE events: 10006
cycles stats:
TOTAL events: 10005
SAMPLE events: 10005
cycles stats:
TOTAL events: 10005
SAMPLE events: 10005
Signed-off-by: Stephane Eranian <eranian@...gle.com>
---
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1e9ebe5..c9e2108 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -823,10 +823,11 @@ struct perf_event {
 	struct user_struct		*mmap_user;
 	struct ring_buffer		*rb;
 
-	/* poll related */
-	wait_queue_head_t		waitq;
+	/* ring_buffer waitq pointer */
+	wait_queue_head_t		*waitq;
 	struct fasync_struct		*fasync;
+
 
 	/* delayed work for NMIs and such */
 	int				pending_wakeup;
 	int				pending_kill;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2e41c8e..e1dd7a0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2754,6 +2754,7 @@ static void free_event(struct perf_event *event)
 	if (event->rb) {
 		ring_buffer_put(event->rb);
 		event->rb = NULL;
+		event->waitq = NULL;
 	}
 
 	if (is_cgroup_event(event))
@@ -2990,7 +2991,9 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	events = atomic_xchg(&rb->poll, 0);
 	rcu_read_unlock();
 
-	poll_wait(file, &event->waitq, wait);
+	/* rb==NULL => waitq=NULL */
+	if (event->waitq)
+		poll_wait(file, event->waitq, wait);
 
 	return events;
 }
@@ -3440,6 +3443,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		ret = -ENOMEM;
 		goto unlock;
 	}
+	event->waitq = &rb->waitq;
 
 	rcu_assign_pointer(event->rb, rb);
 	atomic_long_add(user_extra, &user->locked_vm);
@@ -3494,7 +3498,8 @@ static const struct file_operations perf_fops = {
 
 void perf_event_wakeup(struct perf_event *event)
 {
-	wake_up_all(&event->waitq);
+	if (event->waitq)
+		wake_up_all(event->waitq);
 
 	if (event->pending_kill) {
 		kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -5621,7 +5626,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->group_entry);
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
-	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
@@ -5826,6 +5830,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	}
 
 	old_rb = event->rb;
+	event->waitq = output_event->waitq;
 	rcu_assign_pointer(event->rb, rb);
 	ret = 0;
 unlock:
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index be4a43f..2bf9546 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -19,6 +19,9 @@ struct ring_buffer {
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 
+	/* poll related */
+	wait_queue_head_t		waitq;
+
 	local_t				head;		/* write position */
 	local_t				nest;		/* nested writers */
 	local_t				events;		/* event limit */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a2920..ef4ea75 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -209,6 +209,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 		rb->writable = 1;
 
 	atomic_set(&rb->refcount, 1);
+	init_waitqueue_head(&rb->waitq);
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
--