Date:   Tue, 21 Apr 2020 01:04:29 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Song Liu <songliubraving@...com>
Cc:     linux-kernel@...r.kernel.org, kernel-team@...com,
        Arnaldo Carvalho de Melo <acme@...hat.com>,
        Jiri Olsa <jolsa@...nel.org>,
        Alexey Budankov <alexey.budankov@...ux.intel.com>,
        Namhyung Kim <namhyung@...nel.org>, Tejun Heo <tj@...nel.org>,
        kernel test robot <rong.a.chen@...el.com>
Subject: Re: [PATCH v12] perf: Sharing PMU counters across compatible events

On Tue, Mar 31, 2020 at 12:55:33AM -0700, Song Liu wrote:
> @@ -1778,6 +1775,246 @@ perf_event_groups_next(struct perf_event *event)
>  		event = rb_entry_safe(rb_next(&event->group_node),	\
>  				typeof(*event), group_node))
>  
> +static inline bool perf_event_can_share(struct perf_event *event)
> +{
> +	/* only share hardware counting events */
> +	return !is_sampling_event(event);
> +	return !is_software_event(event) && !is_sampling_event(event);
> +}

ISTR pointing out before that one of those returns is superfluous.
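
Presumably the first return is a stray leftover; going by the comment you
want only the combined check, i.e. (my guess):

static inline bool perf_event_can_share(struct perf_event *event)
{
	/* only share hardware counting events */
	return !is_software_event(event) && !is_sampling_event(event);
}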

> +/*
> + * Returns whether the two events can share a PMU counter.
> + *
> + * Note: This function does NOT check perf_event_can_share() for
> + * the two events, they should be checked before this function
> + */
> +static inline bool perf_event_compatible(struct perf_event *event_a,
> +					 struct perf_event *event_b)
> +{
> +	return memcmp(&event_a->attr, &event_b->attr, event_a->attr.size) == 0;
> +}
> +
> +static void perf_event_init_dup_master(struct perf_event *event)
> +{
> +	bool is_active = event->state == PERF_EVENT_STATE_ACTIVE;
> +	s64 count;
> +
> +	WARN_ON_ONCE(event->dup_active != 0);
> +	/*
> +	 * The event sharing scheme allows for duplicate events to be ACTIVE
> +	 * while the master is not. In order to facilitate this, the master
> +	 * will be put in the ENABLED state whenever it has active duplicates
> +	 * but is itself *not* ACTIVE.
> +	 *
> +	 * When ENABLED the master event is scheduled, but its counter must
> +	 * appear stalled. Since the PMU driver updates event->count, the
> +	 * master must keep a shadow counter for itself, this is
> +	 * event->master_count.
> +	 */
> +	count = local64_read(&event->count);
> +	local64_set(&event->master_count, count);
> +
> +	if (is_active) {
> +		local64_set(&event->dup_count, count);
> +		event->dup_active = 1;
> +	}
> +
> +	barrier();
> +
> +	WRITE_ONCE(event->dup_master, event);
> +}

OK, set up master_count, dup_count, barrier, set dup_master.

> +/* tear down dup_master, no more sharing for this event */
> +static void perf_event_exit_dup_master(struct perf_event *event)
> +{
> +	WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
> +		     event->state > PERF_EVENT_STATE_INACTIVE);
> +
> +	/* restore event->count */
> +	local64_set(&event->count, local64_read(&event->master_count));
> +
> +	event->dup_active = 0;
> +	WRITE_ONCE(event->dup_master, NULL);
> +
> +	barrier();
> +}

XXX ordering doesn't make sense, I think you want the barrier() between
setting count and dup_master, that way you'll get the correct result out
of perf_event_count() no matter the timing.
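
IOW, something like this (untested):

static void perf_event_exit_dup_master(struct perf_event *event)
{
	WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
		     event->state > PERF_EVENT_STATE_INACTIVE);

	event->dup_active = 0;

	/* restore event->count */
	local64_set(&event->count, local64_read(&event->master_count));

	barrier();

	WRITE_ONCE(event->dup_master, NULL);
}

such that (at least on the local CPU, NMI included) a reader that
observes ->dup_master == NULL also finds the restored value in ->count.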

> +#define EVENT_TOMBSTONE ((void *)-1L)
> +
> +/*
> + * sync data count from dup_master to event, called on event_pmu_read()
> + * and event_pmu_del()
> + */
> +static void
> +event_pmu_read_dup(struct perf_event *event, struct perf_event *master)
> +{
> +	u64 prev_count, new_count;
> +
> +	if (master == EVENT_TOMBSTONE)
> +		return;
> +
> +again:
> +	prev_count = local64_read(&event->dup_count);
> +	if (master->state > PERF_EVENT_STATE_INACTIVE)
> +		master->pmu->read(master);
> +	new_count = local64_read(&master->count);
> +	if (local64_cmpxchg(&event->dup_count, prev_count, new_count) != prev_count)
> +		goto again;
> +
> +	if (event == master)
> +		local64_add(new_count - prev_count, &event->master_count);
> +	else
> +		local64_add(new_count - prev_count, &event->count);
> +}
> +
> +/* After adding an event to the ctx, try to find compatible event(s). */
> +static void
> +perf_event_setup_dup(struct perf_event *event, struct perf_event_context *ctx)
> +{
> +	struct perf_event *tmp;
> +
> +	if (event->state < PERF_EVENT_STATE_OFF ||
> +	    !perf_event_can_share(event))
> +		return;
> +
> +	/* look for dup with other events */
> +	list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> +		if (tmp == event ||
> +		    !perf_event_can_share(tmp) ||
> +		    !perf_event_compatible(event, tmp))
> +			continue;
> +		if (tmp->state < PERF_EVENT_STATE_INACTIVE)
> +			continue;
> +
> +		/* first dup, pick tmp as the master */
> +		if (!tmp->dup_master) {
> +			if (tmp->state == PERF_EVENT_STATE_ACTIVE)
> +				tmp->pmu->read(tmp);
> +			perf_event_init_dup_master(tmp);
> +		}
> +
> +		event->dup_master = tmp->dup_master;
> +
> +		break;
> +	}
> +}
> +
> +/* Remove dup_master for the event */
> +static void
> +perf_event_remove_dup(struct perf_event *event, struct perf_event_context *ctx)
> +{
> +	struct perf_event *tmp, *new_master;
> +	int dup_count, active_count;
> +	int ret;
> +
> +	/* no sharing */
> +	if (!event->dup_master)
> +		return;
> +
> +	WARN_ON_ONCE(event->state < PERF_EVENT_STATE_OFF ||
> +		     event->state > PERF_EVENT_STATE_ENABLED);
> +
> +	/* this event is not the master */
> +	if (event->dup_master != event) {
> +		event->dup_master = NULL;
> +		return;
> +	}
> +
> +	active_count = event->dup_active;
> +	if (active_count) {
> +		perf_pmu_disable(event->pmu);
> +		event->pmu->del(event, 0);
> +	}
> +
> +	/* this event is the master */
> +	dup_count = 0;
> +	new_master = NULL;
> +	list_for_each_entry(tmp, &ctx->event_list, event_entry) {
> +		u64 count;
> +
> +		if (tmp->dup_master != event || tmp == event)
> +			continue;
> +		if (!new_master) {
> +			new_master = tmp;
> +			list_del_init(&new_master->dup_active_list);
> +		}
> +
> +		if (tmp->state == PERF_EVENT_STATE_ACTIVE) {
> +			/* sync read from old master */
> +			event_pmu_read_dup(tmp, event);
> +		}
> +		/*
> +		 * Flip an active event to a new master; this is tricky
> +		 * because for an active event event_pmu_read() can be
> +		 * called at any time from NMI context.
> +		 *
> +		 * This means we need to have ->dup_master and ->dup_count
> +		 * consistent at all times. Of course we cannot do two
> +		 * writes at once :/
> +		 *
> +		 * Instead, flip ->dup_master to EVENT_TOMBSTONE, this will
> +		 * make event_pmu_read_dup() NOP. Then we can set
> +		 * ->dup_count and finally set ->dup_master to the
> +		 * new_master to let event_pmu_read_dup() rip.
> +		 */
> +		WRITE_ONCE(tmp->dup_master, EVENT_TOMBSTONE);
> +		barrier();
> +
> +		count = local64_read(&new_master->count);
> +		local64_set(&tmp->dup_count, count);
> +
> +		if (tmp == new_master)
> +			local64_set(&tmp->master_count, count);
> +		else if (tmp->state == PERF_EVENT_STATE_ACTIVE)
> +			list_move_tail(&tmp->dup_active_list,
> +				       &new_master->dup_active_list);
> +
> +		barrier();
> +		WRITE_ONCE(tmp->dup_master, new_master);
> +		dup_count++;
> +	}
> +
> +	perf_event_exit_dup_master(event);
> +
> +	if (!dup_count)
> +		return;
> +
> +	if (active_count) {
> +		/* copy hardware configuration to switch faster */
> +		if (event->pmu->copy_hw_config)
> +			event->pmu->copy_hw_config(event, new_master);
> +
> +		ret = new_master->pmu->add(new_master, PERF_EF_START);
> +		/*
> +		 * Since we just removed the old master (@event), it should be
> +		 * impossible to fail to schedule the new master, an identical
> +		 * event.
> +		 */
> +		WARN_ON_ONCE(ret);
> +		if (new_master->state == PERF_EVENT_STATE_INACTIVE) {
> +			/*
> +			 * We don't need to update time, so don't call
> +			 * perf_event_set_state().
> +			 */
> +			new_master->state = PERF_EVENT_STATE_ENABLED;
> +		}
> +		perf_pmu_enable(new_master->pmu);
> +	}
> +
> +	if (dup_count == 1) {
> +		/*
> +		 * We set up as a master, but there aren't any more duplicates.
> +		 * Simply clear ->dup_master, as ->master_count == ->count per
> +		 * the above.
> +		 */
> +		WRITE_ONCE(new_master->dup_master, NULL);
> +	} else {
> +		new_master->dup_active = active_count;
> +	}
> +}
> +
>  /*
>   * Add an event from the lists for its context.
>   * Must be called with ctx->mutex and ctx->lock held.

> @@ -4223,7 +4534,14 @@ static void __perf_event_read(void *info)
>  
>  static inline u64 perf_event_count(struct perf_event *event)
>  {
> -	return local64_read(&event->count) + atomic64_read(&event->child_count);
> +	u64 count;
> +
> +	if (likely(event->dup_master != event))
> +		count = local64_read(&event->count);
> +	else
> +		count = local64_read(&event->master_count);
> +
> +	return count + atomic64_read(&event->child_count);
>  }

So last time I said something about SMP ordering here. Where did that
go?

Specifically, without ordering it is possible to observe dup_master and
dup_count out of order. So while we might then see dup_master, we might
then also see an old dup_count, which would give 'interesting' effects.

Granted, adding memory barriers all over this will suck :/
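
The minimal variant is probably release/acquire on ->dup_master; a
sketch (untested, and it puts a barrier in every perf_event_count() on
weakly ordered architectures):

static inline u64 perf_event_count(struct perf_event *event)
{
	u64 count;

	/* pairs with smp_store_release() publishing ->dup_master */
	if (likely(smp_load_acquire(&event->dup_master) != event))
		count = local64_read(&event->count);
	else
		count = local64_read(&event->master_count);

	return count + atomic64_read(&event->child_count);
}

with perf_event_init_dup_master() and friends using
smp_store_release(&event->dup_master, ...) instead of the barrier() +
WRITE_ONCE() pairs.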
