Date:   Fri, 1 Dec 2017 06:19:50 -0800
From:   Tejun Heo <tj@...nel.org>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Jiri Olsa <jolsa@...hat.com>,
        Namhyung Kim <namhyung@...nel.org>
Cc:     linux-kernel@...r.kernel.org, kernel-team@...com
Subject: [RFC] Sharing PMU counters across compatible events

Hello,

There are cases where a single PMU event, let's say instructions, is
interesting for different subsets of the system.  For example, it
could be interesting to monitor instructions system-wide, per cgroup,
and per thread.
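
For concreteness, here is roughly what that looks like from userspace.
This is just an illustrative sketch around the perf_event_open()
syscall, not part of the patch; tid and cgroup_fd are placeholder
values.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Open a hardware "instructions" counter for the given target. */
static int open_instructions(pid_t pid, int cpu, unsigned long flags)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;

	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, flags);
}

/*
 * Each of these monitors instructions for a different target, and each
 * currently occupies its own hardware counter while scheduled in:
 *
 *	open_instructions(-1, 0, 0);                             system-wide, CPU 0
 *	open_instructions(tid, -1, 0);                           one thread
 *	open_instructions(cgroup_fd, 0, PERF_FLAG_PID_CGROUP);   a cgroup, CPU 0
 */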

This could easily be me not knowing better, but I can't think of a way
to do the above without consuming multiple PMU counters to monitor
instructions, one per target, which quickly gets out of hand as the
number of domains and events multiplies.  Getting broken-down numbers
and adding them up doesn't work at the cgroup level (the numbers
aren't mutually exclusive, e.g. a child cgroup's count is already
included in its parent's) or at the thread level.

If there's a way to achieve this using the existing features, I'd be
glad to learn about it.  If not, the patch at the bottom is a crude
half-working proof-of-concept to share PMU counters across compatible
events.

In a nutshell, when adding an event, it checks whether there already
is a compatible active event.  If so, instead of enabling the new
event on the PMU, it gets linked to the already enabled one (the
master event), and the count of the dup event is updated by adding the
delta of the master event's count.
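
Schematically, the per-dup bookkeeping boils down to the following.
This is a standalone sketch; the real thing is event_sync_dup() in the
patch below.

/* Sketch only: accumulate the master's delta into the dup's count. */
struct dup_state {
	unsigned long long count;	/* value reported for the dup event */
	unsigned long long base;	/* master's count at the last sync  */
};

static void sync_dup(struct dup_state *dup, unsigned long long master_count)
{
	dup->count += master_count - dup->base;	/* add the delta since the last sync */
	dup->base = master_count;		/* remember where we left off */
}

Because the base is initialized to the master's count at the time the
dup is linked, a dup only accumulates deltas taken while it is active
and doesn't inherit counts from before it was added.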

The patch is just a proof of concept.  It's missing counter
propagation somewhere, so the dup counts end up somewhat lower than
they should be.  Master selection and switching are half-broken, event
compatibility testing is barely implemented, and so on.  However, it
does allow gathering counts for different targets without consuming
multiple PMU counters.

What do you think?  Would this be something worth pursuing?

Thank you very much.

---
 include/linux/perf_event.h |    7 ++
 kernel/events/core.c       |  129 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 126 insertions(+), 10 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -583,6 +583,13 @@ struct perf_event {
 	local64_t			count;
 	atomic64_t			child_count;
 
+	struct list_head		active_event_entry;
+
+	struct perf_event		*dup_master;
+	struct list_head		dup_list;
+	u64				dup_base_count;
+	u64				dup_base_child_count;
+
 	/*
 	 * These are the total time in nanoseconds that the event
 	 * has been enabled (i.e. eligible to run, and the task has
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1119,6 +1119,7 @@ void perf_pmu_enable(struct pmu *pmu)
 }
 
 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
+static DEFINE_PER_CPU(struct list_head, active_event_list);
 
 /*
  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
@@ -1788,12 +1789,58 @@ event_filter_match(struct perf_event *ev
 	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
+static void event_sync_dup(struct perf_event *dup)
+{
+	struct perf_event *master = dup->dup_master;
+	u64 new_count = local64_read(&master->count);
+	u64 new_child_count = atomic64_read(&master->child_count);
+
+	local64_add(new_count - dup->dup_base_count, &dup->count);
+	atomic64_add(new_child_count - dup->dup_base_child_count,
+		     &dup->child_count);
+
+	dup->dup_base_count = new_count;
+	dup->dup_base_child_count = new_child_count;
+}
+
+static bool event_cleanup_dup(struct perf_event *event,
+			      struct perf_event **first_dupp)
+{
+	struct perf_event *first_dup = NULL;
+	struct perf_event *dup, *tmp;
+
+	if (event->dup_master) {
+		event_sync_dup(event);
+		list_del_init(&event->dup_list);
+		event->dup_master = NULL;
+		return true;
+	}
+
+	list_for_each_entry_safe(dup, tmp, &event->dup_list, dup_list) {
+		event_sync_dup(dup);
+
+		if (!first_dup) {
+			*first_dupp = first_dup = dup;
+			dup->dup_master = NULL;
+			list_del_init(&dup->dup_list);
+		} else {
+			dup->dup_master = first_dup;
+			list_move_tail(&dup->dup_list, &first_dup->dup_list);
+			dup->dup_base_count = local64_read(&first_dup->count);
+			dup->dup_base_child_count =
+				atomic64_read(&first_dup->child_count);
+		}
+	}
+	return false;
+}
+
 static void
 event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
 	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+	struct perf_event *first_dup = NULL;
 
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
@@ -1803,7 +1850,17 @@ event_sched_out(struct perf_event *event
 
 	perf_pmu_disable(event->pmu);
 
-	event->pmu->del(event, 0);
+	list_del_init(&event->active_event_entry);
+
+	if (!event_cleanup_dup(event, &first_dup)) {
+		event->pmu->del(event, 0);
+		/*
+		 * XXX: Should probably use a virtual master and avoid hot
+		 * swapping masters.
+		 */
+		if (first_dup)
+			WARN_ON_ONCE(event->pmu->add(first_dup, PERF_EF_START));
+	}
 	event->oncpu = -1;
 
 	if (event->pending_disable) {
@@ -2038,6 +2095,45 @@ static void perf_set_shadow_time(struct
 static void perf_log_throttle(struct perf_event *event, int enable);
 static void perf_log_itrace_start(struct perf_event *event);
 
+static bool event_setup_dup(struct perf_event *event,
+			    struct perf_cpu_context *cpuctx,
+			    struct perf_event_context *ctx)
+{
+	struct list_head *head = this_cpu_ptr(&active_event_list);
+	struct perf_event_attr *attr = &event->attr;
+	struct perf_event *tevent;
+
+	/* XXX: let's just do instructions for now */
+	if (!(attr->type == PERF_TYPE_HARDWARE &&
+	      attr->config == PERF_COUNT_HW_INSTRUCTIONS))
+		return false;
+
+	/* XXX: no group support yet either */
+	if (attr->read_format & PERF_FORMAT_GROUP)
+		return false;
+
+	list_for_each_entry(tevent, head, active_event_entry) {
+		struct perf_event_attr *tattr = &tevent->attr;
+
+		if (tevent->dup_master)
+			continue;
+
+		/* XXX: this definitely isn't enough */
+		if (attr->type == tattr->type && attr->config == tattr->config &&
+		    attr->freq == tattr->freq &&
+		    attr->sample_freq == tattr->sample_freq) {
+			event->dup_master = tevent;
+			list_add_tail(&event->dup_list, &tevent->dup_list);
+			event->dup_base_count = local64_read(&tevent->count);
+			event->dup_base_child_count =
+				atomic64_read(&tevent->child_count);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -2075,7 +2171,8 @@ event_sched_in(struct perf_event *event,
 
 	perf_log_itrace_start(event);
 
-	if (event->pmu->add(event, PERF_EF_START)) {
+	if (!event_setup_dup(event, cpuctx, ctx) &&
+	    event->pmu->add(event, PERF_EF_START)) {
 		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 		event->oncpu = -1;
 		ret = -EAGAIN;
@@ -2092,6 +2189,7 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
+	list_add_tail(&event->active_event_entry, this_cpu_ptr(&active_event_list));
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -2745,6 +2843,14 @@ static int context_equiv(struct perf_eve
 	return 0;
 }
 
+static void event_pmu_read(struct perf_event *event)
+{
+	if (!event->dup_master)
+		event->pmu->read(event);
+	else
+		event_sync_dup(event);
+}
+
 static void __perf_event_sync_stat(struct perf_event *event,
 				     struct perf_event *next_event)
 {
@@ -2761,7 +2867,7 @@ static void __perf_event_sync_stat(struc
 	 * don't need to use it.
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		event->pmu->read(event);
+		event_pmu_read(event);
 
 	perf_event_update_time(event);
 
@@ -3528,14 +3634,14 @@ static void __perf_event_read(void *info
 		goto unlock;
 
 	if (!data->group) {
-		pmu->read(event);
+		event_pmu_read(event);
 		data->ret = 0;
 		goto unlock;
 	}
 
 	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 
-	pmu->read(event);
+	event_pmu_read(event);
 
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
@@ -3543,7 +3649,7 @@ static void __perf_event_read(void *info
 			 * Use sibling's PMU rather than @event's since
 			 * sibling could be on different (eg: software) PMU.
 			 */
-			sub->pmu->read(sub);
+			event_pmu_read(sub);
 		}
 	}
 
@@ -3607,7 +3713,7 @@ int perf_event_read_local(struct perf_ev
 	 * oncpu == -1).
 	 */
 	if (event->oncpu == smp_processor_id())
-		event->pmu->read(event);
+		event_pmu_read(event);
 
 	*value = local64_read(&event->count);
 	if (enabled || running) {
@@ -5718,7 +5824,7 @@ static void perf_output_read_group(struc
 		values[n++] = running;
 
 	if (leader != event)
-		leader->pmu->read(leader);
+		event_pmu_read(leader);
 
 	values[n++] = perf_event_count(leader);
 	if (read_format & PERF_FORMAT_ID)
@@ -5731,7 +5837,7 @@ static void perf_output_read_group(struc
 
 		if ((sub != event) &&
 		    (sub->state == PERF_EVENT_STATE_ACTIVE))
-			sub->pmu->read(sub);
+			event_pmu_read(sub);
 
 		values[n++] = perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
@@ -8555,7 +8661,7 @@ static enum hrtimer_restart perf_swevent
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return HRTIMER_NORESTART;
 
-	event->pmu->read(event);
+	event_pmu_read(event);
 
 	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs = get_irq_regs();
@@ -9383,6 +9489,8 @@ perf_event_alloc(struct perf_event_attr
 	INIT_LIST_HEAD(&event->sibling_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
+	INIT_LIST_HEAD(&event->active_event_entry);
+	INIT_LIST_HEAD(&event->dup_list);
 	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
@@ -10981,6 +11089,7 @@ static void __init perf_event_init_all_c
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+		INIT_LIST_HEAD(&per_cpu(active_event_list, cpu));
 
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
