Date:   Fri, 4 May 2018 16:11:02 -0700
From:   Song Liu <songliubraving@...com>
To:     <linux-kernel@...r.kernel.org>
CC:     Song Liu <songliubraving@...com>, <kernel-team@...com>,
        <tj@...nel.org>, <peterz@...radead.org>, <jolsa@...nel.org>
Subject: [RFC 2/2] perf: Sharing PMU counters across compatible events

This patch enables sharing of PMU counters across compatible events. To
keep perf event scheduling fast, the sharing state is tracked with
dedicated data structures.

An array of "struct perf_event_dup" is added to the cpuctx to track all
the duplicated events under this cpuctx. Each event under this cpuctx
has a "dup_id" pointing to its perf_event_dup, and compatible events
under the same cpuctx share the same perf_event_dup. The following
figure shows a simplified version of the data structure.

   cpuctx ->  perf_event_dup -> master
                     ^       -> active_dup (list)
                     |
         perf_event /|
                     |
         perf_event /
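
In code, this indirection is a single array lookup (a sketch using the
fields this patch adds; "event" and "cpuctx" stand for the event and
cpu context being scheduled):

    struct perf_event_dup *pdup = &cpuctx->dup_event_list[event->dup_id];
    struct perf_event *master = pdup->master; /* NULL if none active */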

Connections between perf_events and perf_event_dups are built by
rebuild_event_dup_list(cpuctx). This function is only called when
events are added/removed or when a task is scheduled in/out, so it is
not on the critical path of perf_rotate_context().

On the critical paths, perf_events are added to/removed from the
active_dup list of their perf_event_dup. The first event added to the
list becomes the master event and is the only event that calls
pmu->add(). Later events all refer to this master for read().

   cpuctx ->  perf_event_dup -> master
                     ^       -> active_dup (list)
                     |             ^  ^
         perf_event /|  ----------/   |
                     |                |
         perf_event /   -------------/

Similarly, on the event_sched_out() path, only the last active event
calls pmu->del().
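
For intuition, the add()/del() elision amounts to reference counting on
the active_dup list. A user-space toy model (illustrative only, not
kernel code; the printf()s stand in for pmu->add()/pmu->del(), and the
real code additionally promotes a new master in event_dup_sched_out()):

    #include <stdio.h>

    static int nr_active;      /* models list_empty(&pdup->active_dup) */

    static void toy_sched_in(const char *ev)
    {
            if (nr_active++ == 0)
                    printf("%s: pmu->add(), becomes master\n", ev);
            else
                    printf("%s: joins as follower, no pmu->add()\n", ev);
    }

    static void toy_sched_out(const char *ev)
    {
            if (--nr_active == 0)
                    printf("%s: last active event, pmu->del()\n", ev);
            else
                    printf("%s: leaves, counter stays programmed\n", ev);
    }

    int main(void)
    {
            toy_sched_in("A");    /* A programs the hardware counter */
            toy_sched_in("B");    /* B shares A's counter            */
            toy_sched_out("A");   /* B keeps the counter running     */
            toy_sched_out("B");   /* counter is finally released     */
            return 0;
    }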

The following functions are introduced to add/remove/move events
in the active_dup list and to update the master accordingly:

    event_dup_try_add_follower();
    event_dup_setup_master();
    event_dup_sched_out();
    pdup_switch_master().

Given the data structure above, all these functions have O(1) execution
time. Therefore, this change does not increase the complexity of perf
event scheduling.
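
The count_offset/child_count_offset fields deserve a note: when the
master switches, pdup_switch_master() folds the difference between the
old and new master's raw counts into the offset, so values read via
pdup_read_count() stay continuous. A minimal user-space illustration
(not kernel code; names mirror the patch, the numbers are made up, and
the ->master reassignment is folded in here although the patch does it
in the caller):

    #include <stdio.h>

    struct toy_event { long long count; };

    struct toy_dup {
            struct toy_event *master;
            long long count_offset;
    };

    /* what everyone reads: master's raw count plus the offset */
    static long long pdup_read_count(struct toy_dup *pdup)
    {
            return pdup->master->count + pdup->count_offset;
    }

    static void pdup_switch_master(struct toy_dup *pdup,
                                   struct toy_event *old_m,
                                   struct toy_event *new_m)
    {
            pdup->count_offset += old_m->count - new_m->count;
            pdup->master = new_m;
    }

    int main(void)
    {
            struct toy_event a = { .count = 1000 };  /* current master */
            struct toy_event b = { .count =  400 };  /* started later  */
            struct toy_dup pdup = { .master = &a, .count_offset = 0 };

            printf("before: %lld\n", pdup_read_count(&pdup));  /* 1000 */
            pdup_switch_master(&pdup, &a, &b);
            printf("after:  %lld\n", pdup_read_count(&pdup));  /* 1000 */
            return 0;
    }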

Cc: Tejun Heo <tj@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Jiri Olsa <jolsa@...nel.org>
---
 include/linux/perf_event.h |  50 +++++++++
 kernel/events/core.c       | 252 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 284 insertions(+), 18 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4c84549..af9f269 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -702,6 +702,12 @@ struct perf_event {
 #endif
 
 	struct list_head		sb_list;
+
+	/* for PMU sharing */
+	int				dup_id;
+	struct list_head		dup_sibling_entry;
+	u64				dup_base_count;
+	u64				dup_base_child_count;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -774,6 +780,47 @@ struct perf_event_context {
  */
 #define PERF_NR_CONTEXTS	4
 
+struct perf_event_dup {
+	struct perf_event	*first;		/* first event of this type */
+	struct perf_event	*master;	/* active event */
+	struct list_head	active_dup;	/* list of active events */
+
+	/*
+	 * When we switch the active master, the new master will not have
+	 * the same count/child_count as the previous master, so future
+	 * reads of these values would not be accurate. To fix this, we add
+	 * an offset to all readings from the master. The offset is adjusted
+	 * accordingly when we switch master.
+	 */
+	s64			count_offset;
+	s64			child_count_offset;
+};
+
+static inline u64 pdup_read_count(struct perf_event_dup *pdup)
+{
+	return local64_read(&pdup->master->count) + pdup->count_offset;
+}
+
+static inline u64 pdup_read_child_count(struct perf_event_dup *pdup)
+{
+	return atomic64_read(&pdup->master->child_count) +
+		pdup->child_count_offset;
+}
+
+static inline void pdup_switch_master(struct perf_event_dup *pdup,
+				      struct perf_event *old_master,
+				      struct perf_event *new_master)
+{
+	/* adjust offset */
+	pdup->count_offset += local64_read(&old_master->count) -
+		local64_read(&new_master->count);
+	pdup->child_count_offset += atomic64_read(&old_master->child_count) -
+		atomic64_read(&new_master->child_count);
+	/* move data */
+	if (old_master->pmu->move_dup)
+		old_master->pmu->move_dup(old_master, new_master);
+}
+
 /**
  * struct perf_event_cpu_context - per cpu event context structure
  */
@@ -797,6 +844,9 @@ struct perf_cpu_context {
 	int				sched_cb_usage;
 
 	int				online;
+
+	struct perf_event_dup		*dup_event_list;
+	int				dup_event_count;
 };
 
 struct perf_output_handle {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bec1840..79de462 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1982,6 +1982,94 @@ event_filter_match(struct perf_event *event)
 	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
+static void event_dup_sync(struct perf_event *event,
+			   struct perf_cpu_context *cpuctx);
+
+/*
+ * If a dup event is already active, add this event as a follower and
+ * return 0; otherwise, return -EAGAIN.
+ *
+ * RFC NOTE: this is an O(1) operation
+ */
+static int event_dup_try_add_follower(struct perf_event *event,
+				      struct perf_cpu_context *cpuctx)
+{
+	struct perf_event_dup *pdup;
+
+	if (event->dup_id >= cpuctx->dup_event_count)
+		return -EAGAIN;
+
+	pdup = &cpuctx->dup_event_list[event->dup_id];
+	if (list_empty(&pdup->active_dup))
+		return -EAGAIN;
+
+	list_add_tail(&event->dup_sibling_entry, &pdup->active_dup);
+	pdup->master->pmu->read(pdup->master);
+	event->dup_base_count = pdup_read_count(pdup);
+	event->dup_base_child_count = pdup_read_child_count(pdup);
+	return 0;
+}
+
+/*
+ * make the (just pmu->add()ed) event the active master of this dup
+ *
+ * RFC NOTE: this is an O(1) operation
+ */
+static void event_dup_setup_master(struct perf_event *event,
+				   struct perf_cpu_context *cpuctx)
+{
+	struct perf_event_dup *pdup;
+
+	if (event->dup_id >= cpuctx->dup_event_count)
+		return;
+
+	pdup = &cpuctx->dup_event_list[event->dup_id];
+	WARN_ON(pdup->master != NULL);
+	WARN_ON(!list_empty(&pdup->active_dup));
+
+	pdup->master = event;
+	list_add_tail(&event->dup_sibling_entry, &pdup->active_dup);
+	pdup->count_offset = 0;
+	pdup->child_count_offset = 0;
+}
+
+/*
+ * remove the event from the dup list; if it is the master and there
+ * are other active events, promote another event to be the new master.
+ *
+ * return 0 if there are more active events in this dup;
+ * return -EAGAIN if it is the last active event.
+ *
+ * RFC NOTE: this is an O(1) operation
+ */
+static int event_dup_sched_out(struct perf_event *event,
+			       struct perf_cpu_context *cpuctx)
+{
+	struct perf_event_dup *pdup;
+
+	if (event->dup_id >= cpuctx->dup_event_count)
+		return -EAGAIN;
+
+	pdup = &cpuctx->dup_event_list[event->dup_id];
+	list_del_init(&event->dup_sibling_entry);
+	if (event == pdup->master) {
+		if (list_empty(&pdup->active_dup)) {
+			pdup->master = NULL;
+			return -EAGAIN;
+		} else {
+			struct perf_event *new_master;
+
+			new_master = list_first_entry(&pdup->active_dup,
+				struct perf_event,
+				dup_sibling_entry);
+			event_dup_sync(new_master, cpuctx);
+			pdup_switch_master(pdup, event, new_master);
+			pdup->master = new_master;
+		}
+	}
+	return 0;
+}
+
 static void
 event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
@@ -2004,7 +2092,8 @@ event_sched_out(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
-	event->pmu->del(event, 0);
+	if (event_dup_sched_out(event, cpuctx))
+		event->pmu->del(event, 0);
 	event->oncpu = -1;
 
 	if (event->pending_disable) {
@@ -2051,6 +2140,8 @@ group_sched_out(struct perf_event *group_event,
 		cpuctx->exclusive = 0;
 }
 
+static void rebuild_event_dup_list(struct perf_cpu_context *cpuctx);
+
 #define DETACH_GROUP	0x01UL
 
 /*
@@ -2084,6 +2175,7 @@ __perf_remove_from_context(struct perf_event *event,
 			cpuctx->task_ctx = NULL;
 		}
 	}
+	rebuild_event_dup_list(cpuctx);
 }
 
 /*
@@ -2276,11 +2368,14 @@ event_sched_in(struct perf_event *event,
 
 	perf_log_itrace_start(event);
 
-	if (event->pmu->add(event, PERF_EF_START)) {
-		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
-		event->oncpu = -1;
-		ret = -EAGAIN;
-		goto out;
+	if (event_dup_try_add_follower(event, cpuctx)) {
+		if (event->pmu->add(event, PERF_EF_START)) {
+			perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+			event->oncpu = -1;
+			ret = -EAGAIN;
+			goto out;
+		}
+		event_dup_setup_master(event, cpuctx);
 	}
 
 	if (!is_software_event(event))
@@ -2536,7 +2631,7 @@ static int  __perf_install_in_context(void *info)
 
 unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
-
+	rebuild_event_dup_list(cpuctx);
 	return ret;
 }
 
@@ -2919,8 +3014,10 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
-		if (!ctx->is_active)
+		if (!ctx->is_active) {
 			cpuctx->task_ctx = NULL;
+			rebuild_event_dup_list(cpuctx);
+		}
 	}
 
 	/*
@@ -2995,6 +3092,49 @@ static int context_equiv(struct perf_event_context *ctx1,
 	return 0;
 }
 
+/*
+ * sync data read from dup master
+ *
+ * RFC NOTE: this is an O(1) operation
+ */
+static void event_dup_sync(struct perf_event *event,
+			   struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *master;
+	struct perf_event_dup *pdup;
+	u64 new_count, new_child_count;
+
+	pdup = &cpuctx->dup_event_list[event->dup_id];
+	master = pdup->master;
+	WARN_ON(master == event);
+
+	master->pmu->read(master);
+	new_count = pdup_read_count(pdup);
+	new_child_count = pdup_read_child_count(pdup);
+
+	local64_add(new_count - event->dup_base_count, &event->count);
+	atomic64_add(new_child_count - event->dup_base_child_count,
+		     &event->child_count);
+
+	event->dup_base_count = new_count;
+	event->dup_base_child_count = new_child_count;
+}
+
+static void event_pmu_read(struct perf_event *event)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (list_empty(&event->dup_sibling_entry)) {
+		event->pmu->read(event);
+	} else {
+		cpuctx = __get_cpu_context(event->ctx);
+		if (event == cpuctx->dup_event_list[event->dup_id].master)
+			event->pmu->read(event);
+		else
+			event_dup_sync(event, cpuctx);
+	}
+}
+
 static void __perf_event_sync_stat(struct perf_event *event,
 				     struct perf_event *next_event)
 {
@@ -3011,7 +3151,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 * don't need to use it.
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		event->pmu->read(event);
+		event_pmu_read(event);
 
 	perf_event_update_time(event);
 
@@ -3366,9 +3506,10 @@ ctx_sched_in(struct perf_event_context *ctx,
 
 	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
-		if (!is_active)
+		if (!is_active) {
 			cpuctx->task_ctx = ctx;
-		else
+			rebuild_event_dup_list(cpuctx);
+		} else
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
@@ -3402,6 +3543,71 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 	ctx_sched_in(ctx, cpuctx, event_type, task);
 }
 
+static void add_event_to_dup_event_list(struct perf_event *event,
+					struct perf_cpu_context *cpuctx)
+{
+	int i;
+
+	for (i = 0; i < cpuctx->dup_event_count; ++i)
+		if (memcmp(&event->attr,
+			   &cpuctx->dup_event_list[i].first->attr,
+			   sizeof(event->attr)) == 0) {
+			event->dup_id = i;
+			return;
+		}
+	i = cpuctx->dup_event_count++;
+	cpuctx->dup_event_list[i].first = event;
+	cpuctx->dup_event_list[i].master = NULL;
+	INIT_LIST_HEAD(&cpuctx->dup_event_list[i].active_dup);
+	event->dup_id = i;
+	INIT_LIST_HEAD(&event->dup_sibling_entry);
+}
+
+static int add_group_to_dup_event_list(struct perf_event *event, void *data)
+{
+	struct sched_in_data *sid = data;
+	struct perf_event *sibling;
+
+	add_event_to_dup_event_list(event, sid->cpuctx);
+	for_each_sibling_event(sibling, event)
+		add_event_to_dup_event_list(sibling, sid->cpuctx);
+
+	return 0;
+}
+
+static void rebuild_event_dup_list(struct perf_cpu_context *cpuctx)
+{
+	int dup_count = cpuctx->ctx.nr_events;
+	struct perf_event_context *ctx = cpuctx->task_ctx;
+	struct sched_in_data sid = {
+		.ctx = ctx,
+		.cpuctx = cpuctx,
+		.can_add_hw = 1,
+	};
+
+	if (ctx)
+		dup_count += ctx->nr_events;
+
+	kfree(cpuctx->dup_event_list);
+	cpuctx->dup_event_count = 0;
+
+	cpuctx->dup_event_list =
+		kzalloc(sizeof(struct perf_event_dup) * dup_count, GFP_ATOMIC);
+	if (!cpuctx->dup_event_list)
+		return;
+
+	visit_groups_merge(&cpuctx->ctx.pinned_groups, smp_processor_id(),
+			   add_group_to_dup_event_list, &sid);
+	visit_groups_merge(&cpuctx->ctx.flexible_groups, smp_processor_id(),
+			   add_group_to_dup_event_list, &sid);
+	if (ctx) {
+		visit_groups_merge(&ctx->pinned_groups, smp_processor_id(),
+				   add_group_to_dup_event_list, &sid);
+		visit_groups_merge(&ctx->flexible_groups, smp_processor_id(),
+				   add_group_to_dup_event_list, &sid);
+	}
+}
+
 static void perf_event_context_sched_in(struct perf_event_context *ctx,
 					struct task_struct *task)
 {
@@ -3867,14 +4073,14 @@ static void __perf_event_read(void *info)
 		goto unlock;
 
 	if (!data->group) {
-		pmu->read(event);
+		event_pmu_read(event);
 		data->ret = 0;
 		goto unlock;
 	}
 
 	pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 
-	pmu->read(event);
+	event_pmu_read(event);
 
 	for_each_sibling_event(sub, event) {
 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
@@ -3882,7 +4088,7 @@ static void __perf_event_read(void *info)
 			 * Use sibling's PMU rather than @event's since
 			 * sibling could be on different (eg: software) PMU.
 			 */
-			sub->pmu->read(sub);
+			event_pmu_read(sub);
 		}
 	}
 
@@ -3946,7 +4152,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 	 * oncpu == -1).
 	 */
 	if (event->oncpu == smp_processor_id())
-		event->pmu->read(event);
+		event_pmu_read(event);
 
 	*value = local64_read(&event->count);
 	if (enabled || running) {
@@ -6085,7 +6291,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 
 	if ((leader != event) &&
 	    (leader->state == PERF_EVENT_STATE_ACTIVE))
-		leader->pmu->read(leader);
+		event_pmu_read(leader);
 
 	values[n++] = perf_event_count(leader);
 	if (read_format & PERF_FORMAT_ID)
@@ -6098,7 +6304,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 
 		if ((sub != event) &&
 		    (sub->state == PERF_EVENT_STATE_ACTIVE))
-			sub->pmu->read(sub);
+			event_pmu_read(sub);
 
 		values[n++] = perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
@@ -9107,7 +9313,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return HRTIMER_NORESTART;
 
-	event->pmu->read(event);
+	event_pmu_read(event);
 
 	perf_sample_data_init(&data, 0, event->hw.last_period);
 	regs = get_irq_regs();
@@ -9416,6 +9622,8 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
 
 static void free_pmu_context(struct pmu *pmu)
 {
+	int cpu;
+
 	/*
 	 * Static contexts such as perf_sw_context have a global lifetime
 	 * and may be shared between different PMUs. Avoid freeing them
@@ -9425,6 +9633,12 @@ static void free_pmu_context(struct pmu *pmu)
 		return;
 
 	mutex_lock(&pmus_lock);
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		kfree(cpuctx->dup_event_list);
+	}
 	free_percpu(pmu->pmu_cpu_context);
 	mutex_unlock(&pmus_lock);
 }
@@ -9629,6 +9843,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
 
+		rebuild_event_dup_list(cpuctx);
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
 
@@ -9945,6 +10160,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
+	INIT_LIST_HEAD(&event->dup_sibling_entry);
 
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
-- 
2.9.5
