Message-ID: <20160829100309.GS10121@twins.programming.kicks-ass.net>
Date: Mon, 29 Aug 2016 12:03:09 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Jiri Olsa <jolsa@...hat.com>
Cc: Vegard Nossum <vegard.nossum@...il.com>,
Thomas Gleixner <tglx@...utronix.de>,
Stephane Eranian <eranian@...gle.com>,
Vince Weaver <vincent.weaver@...ne.edu>,
Ingo Molnar <mingo@...nel.org>,
David Carrillo-Cisneros <davidcc@...gle.com>,
"H. Peter Anvin" <hpa@...or.com>, Kan Liang <kan.liang@...el.com>,
Arnaldo Carvalho de Melo <acme@...hat.com>,
Paul Turner <pjt@...gle.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
LKML <linux-kernel@...r.kernel.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
linux-tip-commits@...r.kernel.org
Subject: Re: [tip:perf/core] perf/core: Check return value of the
perf_event_read() IPI
On Mon, Aug 22, 2016 at 12:38:23PM +0200, Jiri Olsa wrote:
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 3f07e6cfc1b6..375274b6f3b4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1802,8 +1802,9 @@ event_sched_out(struct perf_event *event,
>
> event->tstamp_stopped = tstamp;
> event->pmu->del(event, 0);
> - event->oncpu = -1;
> - event->state = PERF_EVENT_STATE_INACTIVE;
> + WRITE_ONCE(event->state, PERF_EVENT_STATE_INACTIVE);
> + smp_wmb();
> + WRITE_ONCE(event->oncpu, -1);
> if (event->pending_disable) {
> event->pending_disable = 0;
> event->state = PERF_EVENT_STATE_OFF;
> @@ -3561,13 +3561,17 @@ u64 perf_event_read_local(struct perf_event *event)
>
> static int perf_event_read(struct perf_event *event, bool group)
> {
> - int ret = 0, cpu_to_read, local_cpu;
> + int ret = 0, cpu_to_read, local_cpu, state;
> +
> + state = READ_ONCE(event->state);
> + smp_rmb();
> + cpu_to_read = event->oncpu;
>
This cannot be right; this doesn't provide any guarantees. You need to
cross the variables to cancel out the timing.
    X = 1          r1 = X
    wmb            rmb
    Y = 1          r2 = Y
is a no-op; you can still get all 4 possible outcomes:
  r1==0 && r2==0:

                   r1 = X
                   rmb
                   r2 = Y
    X = 1
    wmb
    Y = 1
  r1==0 && r2==1:

                   r1 = X
    X = 1
    wmb            rmb
    Y = 1
                   r2 = Y
  r1==1 && r2==0:

    X = 1
                   r1 = X
    wmb            rmb
                   r2 = Y
    Y = 1
  r1==1 && r2==1:

    X = 1
    wmb
    Y = 1
                   r1 = X
                   rmb
                   r2 = Y
But only once you cross them, like:

    X = 1          r2 = Y
    wmb            rmb
    Y = 1          r1 = X
do you get a guarantee, namely: r1==0 && r2==1 becomes impossible, since
if we observe Y==1, we must then also observe X==1.
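
FWIW, the same pairing is easy to play with from userspace. A quick
litmus harness (illustrative sketch only, not kernel code) using C11
fences, where the release/acquire fences are at least as strong as our
smp_wmb()/smp_rmb():

/* mp_litmus.c -- build with: cc -O2 -pthread mp_litmus.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int X, Y;
static int r1, r2;

static void *writer(void *arg)
{
	atomic_store_explicit(&X, 1, memory_order_relaxed);	/* X = 1  */
	atomic_thread_fence(memory_order_release);		/* ~wmb   */
	atomic_store_explicit(&Y, 1, memory_order_relaxed);	/* Y = 1  */
	return NULL;
}

static void *reader(void *arg)
{
	r2 = atomic_load_explicit(&Y, memory_order_relaxed);	/* r2 = Y */
	atomic_thread_fence(memory_order_acquire);		/* ~rmb   */
	r1 = atomic_load_explicit(&X, memory_order_relaxed);	/* r1 = X */
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 100000; i++) {
		pthread_t tw, tr;

		atomic_store(&X, 0);
		atomic_store(&Y, 0);

		pthread_create(&tr, NULL, reader, NULL);
		pthread_create(&tw, NULL, writer, NULL);
		pthread_join(tw, NULL);
		pthread_join(tr, NULL);

		/* The crossed pairing forbids r2==1 && r1==0. */
		if (r2 == 1 && r1 == 0) {
			printf("violation at iteration %d\n", i);
			return 1;
		}
	}
	printf("r2==1 && r1==0 never observed\n");
	return 0;
}

Swap the reader's two loads back (r1 = X first) and you're in the
uncrossed case again, where all four outcomes are allowed.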
That said, you're on the right track, and this mirrors event_sched_in()
nicely. But even if we do cross things, we're still not good, because
even if ->oncpu was valid at the time of the load, nothing guarantees it
still is: the CPU could have been unplugged in the meantime.
Luckily that's not too hard to fix; we just need to disable preemption
over the lot. And while looking at that, the put_cpu() is too early
anyway: with the current code we could get migrated between put_cpu()
and smp_call_function_single(), which would defeat the whole point of
doing the find_cpu_to_read() thing.
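
To make that window concrete, here is the current sequence again, with
the race spelled out (the statements are the existing code, the comment
is the problem):

	local_cpu = get_cpu();
	cpu_to_read = find_cpu_to_read(event, local_cpu);
	put_cpu();	/* preemption is enabled again from here on */

	/*
	 * Nothing stops us from getting migrated right here, nor stops
	 * cpu_to_read from going offline, so the same-package CPU that
	 * find_cpu_to_read() picked can be stale, or not even online,
	 * by the time we send the IPI below.
	 */

	ret = smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);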
So, how about something like this:
---
kernel/events/core.c | 53 ++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 39 insertions(+), 14 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eed96b85503f..c672c5eb2c44 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1802,8 +1802,18 @@ event_sched_out(struct perf_event *event,
event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
- event->oncpu = -1;
- event->state = PERF_EVENT_STATE_INACTIVE;
+
+ WRITE_ONCE(event->state, PERF_EVENT_STATE_INACTIVE);
+ /*
+ * pmu::del() will have updated the event count. Now mark it inactive,
+ * but take care to clear ->oncpu after the INACTIVE store, such that
+ * while ->state == ACTIVE, ->oncpu must be valid.
+ *
+ * See event_sched_in(), perf_event_restart() and perf_event_read().
+ */
+ smp_wmb();
+ WRITE_ONCE(event->oncpu, -1);
+
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
@@ -2015,8 +2025,10 @@ event_sched_in(struct perf_event *event,
WRITE_ONCE(event->oncpu, smp_processor_id());
/*
- * Order event::oncpu write to happen before the ACTIVE state
- * is visible.
+ * Order event::oncpu write to happen before the ACTIVE state is
+ * visible, such that when we observe ACTIVE, oncpu must be correct.
+ *
+ * Matches the smp_rmb() in perf_event_restart().
*/
smp_wmb();
WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
@@ -2509,7 +2521,11 @@ static int perf_event_restart(struct perf_event *event)
if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
return 0;
- /* matches smp_wmb() in event_sched_in() */
+ /*
+ * Matches the smp_wmb() from event_sched_in(), such that if
+ * we observe ACTIVE above, we know the ->oncpu load below
+ * must be a valid CPU.
+ */
smp_rmb();
/*
@@ -3424,9 +3440,8 @@ struct perf_read_data {
int ret;
};
-static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+static int find_cpu_to_read(struct perf_event *event, int event_cpu, int local_cpu)
{
- int event_cpu = event->oncpu;
u16 local_pkg, event_pkg;
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
@@ -3561,28 +3576,36 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0, cpu_to_read, local_cpu;
+ int ret = 0, cpu_to_read, local_cpu, state;
+
+ local_cpu = get_cpu(); /* disable preemption to hold off hotplug */
+ cpu_to_read = READ_ONCE(event->oncpu);
+ /*
+ * Matches smp_wmb() from event_sched_out(), ->oncpu must be valid
+ * IFF we observe ACTIVE.
+ */
+ smp_rmb();
+ state = READ_ONCE(event->state);
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (state == PERF_EVENT_STATE_ACTIVE) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
- local_cpu = get_cpu();
- cpu_to_read = find_cpu_to_read(event, local_cpu);
- put_cpu();
-
+ cpu_to_read = find_cpu_to_read(event, cpu_to_read, local_cpu);
ret = smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
+
/* The event must have been read from an online CPU: */
WARN_ON_ONCE(ret);
ret = ret ? : data.ret;
- } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+ } else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
@@ -3603,6 +3626,8 @@ static int perf_event_read(struct perf_event *event, bool group)
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
+ put_cpu();
+
return ret;
}