Date:   Fri, 30 Jun 2017 13:20:54 +0300
From:   Alexey Budankov <alexey.budankov@...ux.intel.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc:     Andi Kleen <ak@...ux.intel.com>, Kan Liang <kan.liang@...el.com>,
        Dmitri Prokhorov <Dmitry.Prohorov@...el.com>,
        Valery Cherepennikov <valery.cherepennikov@...el.com>,
        Mark Rutland <mark.rutland@....com>,
        David Carrillo-Cisneros <davidcc@...gle.com>,
        Stephane Eranian <eranian@...gle.com>,
        linux-kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH v5 4/4] perf/core: addressing 4x slowdown during per-process
 profiling of STREAM benchmark on Intel Xeon Phi

perf/core: completely replace lists with rb trees for pinned and
           flexible groups in perf_event_context

By default, the userspace perf tool opens per-cpu task-bound events
when sampling, so for N logical events requested by the user, the tool
will open N * NR_CPUS events.
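
For illustration, the open pattern looks roughly like the following (a
simplified userspace sketch, not the tool's actual code; the function
name open_task_events() and the error handling are ours):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: open one task-bound event per logical cpu, so a single
 * requested event becomes NR_CPUS kernel events. */
static int open_task_events(struct perf_event_attr *attr, pid_t pid,
			    int nr_cpus, int *fds)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		fds[cpu] = syscall(__NR_perf_event_open, attr, pid,
				   cpu, -1, 0);
		if (fds[cpu] < 0)
			return -1;
	}

	return 0;
}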

In the kernel, we mux events with a hrtimer, periodically rotating the
flexible group list and trying to schedule each group in turn. We skip
groups whose cpu filter doesn't match. So when we get unlucky, we can
walk N * (NR_CPUS - 1) groups pointlessly for each hrtimer invocation.
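
In condensed form the pre-patch walk looked like this (a paraphrase of
the flexible scheduling path, not verbatim kernel code; the real filter
is event_filter_match()):

/* Sketch: every hrtimer tick visits the whole flexible list; with
 * per-cpu task-bound events, groups bound to other cpus are visited
 * only to be skipped. */
list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
	if (event->cpu != -1 && event->cpu != smp_processor_id())
		continue; /* up to N * (NR_CPUS - 1) skips per tick */
	group_sched_in(event, cpuctx, ctx);
}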

This has been observed to result in significant overhead (a 4x slowdown
of per-process profiling) when running the STREAM benchmark on 272-core
Xeon Phi systems.

One way to avoid this is to place our events into an rb tree sorted by
CPU filter, so that our hrtimer can skip to the current CPU's
list and ignore everything else.
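
The lookup then becomes an ordinary rb tree descent keyed on
event->cpu, as in this sketch (the same idea the
perf_event_groups_iterate_cpu() hunk below implements):

/* Sketch: binary-search the tree for the sub-list of groups whose
 * cpu key matches; subtrees for other cpus are never touched. */
node = groups->rb_node;
while (node) {
	node_event = container_of(node, struct perf_event, group_node);
	if (cpu < node_event->cpu)
		node = node->rb_left;
	else if (cpu > node_event->cpu)
		node = node->rb_right;
	else
		break; /* found this cpu's group list */
}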

This patch implements the complete replacement of lists with rb trees
for the pinned and flexible groups.

Signed-off-by: Alexey Budankov <alexey.budankov@...ux.intel.com>
---
 include/linux/perf_event.h | 20 +---------
 kernel/events/core.c       | 92 ++++++++++++++++------------------------------
 2 files changed, 34 insertions(+), 78 deletions(-)

1. Removed group_list_entry from the perf_event type, reusing
   group_entry instead.

2. Removed the perf_event_groups type, replacing it with an rb_root for
   the pinned and flexible groups in perf_event_context and throughout
   the code.

3. Removed perf_event_groups_init() and perf_event_groups_empty(),
   using the native rb tree API (RB_ROOT, RB_EMPTY_ROOT()) instead.

4. Updated the perf_event_groups_iterate() implementation to traverse
   the tree instead of the list.

The patch set was tested on Xeon Phi using perf_fuzzer and tests 
from here: https://github.com/deater/perf_event_tests

The full patch set (v1-4) is attached for convenience. 

Branch revision:
* perf/core 007b811b4041989ec2dc91b9614aa2c41332723e 
  Merge tag 'perf-core-for-mingo-4.13-20170719' of 
  git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7b2cddf..8e1967f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -603,13 +603,6 @@ struct perf_event {
 	 */
 	struct list_head		group_list;
 	/*
-	 * Entry into the group_list list above;
-	 * the entry may be attached to the self group_list list above
-	 * in case the event is directly attached to the tree;
-	 */
-	struct list_head		group_list_entry;
-
-	/*
 	 * We need storage to track the entries in perf_pmu_migrate_context; we
 	 * cannot use the event_entry because of RCU and we want to keep the
 	 * group in tact which avoids us using the other two entries.
@@ -749,15 +742,6 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
-/*
- * event groups keep group leader events arranged as an rb tree with
- * event->cpu key and as a list for the whole tree iterations;
- */
-struct perf_event_groups {
-	struct list_head list;
-	struct rb_root	 tree;
-};
-
 /**
  * struct perf_event_context - event context structure
  *
@@ -778,8 +762,8 @@ struct perf_event_context {
 	struct mutex			mutex;
 
 	struct list_head		active_ctx_list;
-	struct perf_event_groups	pinned_groups;
-	struct perf_event_groups	flexible_groups;
+	struct rb_root			pinned_groups;
+	struct rb_root			flexible_groups;
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 39d3fb8..2d02f75 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1466,7 +1466,7 @@ static enum event_type_t get_event_type(struct perf_event *event)
  * Extract pinned or flexible groups from the context
  * based on event attrs bits;
  */
-static struct perf_event_groups *
+static struct rb_root *
 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
 {
 	if (event->attr.pinned)
@@ -1476,11 +1476,11 @@ get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
 }
 
 static void
-perf_event_groups_insert(struct perf_event_groups *groups,
+perf_event_groups_insert(struct rb_root *groups,
 		struct perf_event *event);
 
 static void
-perf_event_groups_delete(struct perf_event_groups *groups,
+perf_event_groups_delete(struct rb_root *groups,
 		struct perf_event *event);
 
 /*
@@ -1490,7 +1490,7 @@ perf_event_groups_delete(struct perf_event_groups *groups,
 static void
 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_event_groups *groups;
+	struct rb_root *groups;
 
 	groups = get_event_groups(event, ctx);
 	perf_event_groups_insert(groups, event);
@@ -1502,39 +1502,19 @@ add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 static void
 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_event_groups *groups;
+	struct rb_root *groups;
 
 	groups = get_event_groups(event, ctx);
 	perf_event_groups_delete(groups, event);
 }
 
 /*
- * Helper function to test if event groups are empty;
- */
-static int
-perf_event_groups_empty(struct perf_event_groups *groups)
-{
-	return list_empty(&groups->list);
-}
-
-/*
- * Helper function to Initialize event groups object;
- */
-static void
-perf_event_groups_init(struct perf_event_groups *groups)
-{
-	INIT_LIST_HEAD(&groups->list);
-	groups->tree = RB_ROOT;
-}
-
-/*
  * Insert a group into a tree using event->cpu as a key. If event->cpu node
  * is already attached to the tree then the event is added to the attached
  * group's group_list list.
  */
 static void
-perf_event_groups_insert(struct perf_event_groups *groups,
-		struct perf_event *event)
+perf_event_groups_insert(struct rb_root *groups, struct perf_event *event)
 {
 	struct rb_node **node;
 	struct rb_node *parent;
@@ -1543,7 +1523,7 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 	WARN_ON_ONCE(!groups || !event);
 	WARN_ON_ONCE(!list_empty(&event->group_entry));
 
-	node = &groups->tree.rb_node;
+	node = &groups->rb_node;
 	parent = *node;
 
 	while (*node) {
@@ -1565,7 +1545,7 @@ perf_event_groups_insert(struct perf_event_groups *groups,
 	list_add_tail(&event->group_entry, &event->group_list);
 
 	rb_link_node(&event->group_node, parent, node);
-	rb_insert_color(&event->group_node, &groups->tree);
+	rb_insert_color(&event->group_node, groups);
 }
 
 /*
@@ -1573,8 +1553,7 @@ perf_event_groups_insert(struct perf_event_groups *groups,
  * it also detaches all groups on the group's group_list list.
  */
 static void
-perf_event_groups_delete(struct perf_event_groups *groups,
-		struct perf_event *event)
+perf_event_groups_delete(struct rb_root *groups, struct perf_event *event)
 {
 	struct perf_event *next;
 
@@ -1585,17 +1564,20 @@ perf_event_groups_delete(struct perf_event_groups *groups,
 
 	if (!RB_EMPTY_NODE(&event->group_node)) {
 		WARN_ON_ONCE(!groups);
-		WARN_ON_ONCE(!groups->tree.rb_node);
-		if (list_empty(&event->group_list)) {
-			rb_erase(&event->group_node, &groups->tree);
-		} else {
-			next = list_first_entry(&event->group_list,
-					struct perf_event, group_entry);
-			list_replace_init(&event->group_list,
-					&next->group_list);
-			rb_replace_node(&event->group_node,
-					&next->group_node, &groups->tree);
+		if (!RB_EMPTY_ROOT(groups)) {
+			if (list_empty(&event->group_list)) {
+				rb_erase(&event->group_node, groups);
+			} else {
+				next = list_first_entry(&event->group_list,
+						struct perf_event, group_entry);
+				list_replace_init(&event->group_list,
+						&next->group_list);
+				rb_replace_node(&event->group_node,
+						&next->group_node, groups);
+
+			}
 		}
+		RB_CLEAR_NODE(&event->group_node);
 	}
 }
 
@@ -1603,14 +1585,14 @@ perf_event_groups_delete(struct perf_event_groups *groups,
  * Find group list by a cpu key and rotate it.
  */
 static void
-perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
+perf_event_groups_rotate(struct rb_root *groups, int cpu)
 {
 	struct rb_node *node;
 	struct perf_event *node_event;
 
 	WARN_ON_ONCE(!groups);
 
-	node = groups->tree.rb_node;
+	node = groups->rb_node;
 
 	while (node) {
 		node_event = container_of(node,
@@ -1635,7 +1617,7 @@ perf_event_groups_rotate(struct perf_event_groups *groups, int cpu)
 typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *);
 
 static void
-perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
+perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu,
 		perf_event_groups_iterate_f callback, void *data)
 {
 	struct rb_node *node;
@@ -1643,7 +1625,7 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
 
 	WARN_ON_ONCE(!groups);
 
-	node = groups->tree.rb_node;
+	node = groups->rb_node;
 
 	while (node) {
 		node_event = container_of(node,
@@ -1667,22 +1649,15 @@ perf_event_groups_iterate_cpu(struct perf_event_groups *groups, int cpu,
  * Iteration stops if the callback returns non zero.
  */
 static int
-perf_event_groups_iterate(struct perf_event_groups *groups,
+perf_event_groups_iterate(struct rb_root *groups,
 		perf_event_groups_iterate_f callback, void *data)
 {
 	int ret = 0;
-	struct perf_event *event;
+	struct rb_node *node;
+	struct perf_event *event, *node_event;
 
 	WARN_ON_ONCE(!groups);
 
-	list_for_each_entry(event, &groups->list, group_entry) {
-		ret = callback(event, data);
-		if (ret)
-			break;
-	}
-
-	/* will replace itration above in patch v5 4/4
-
 	for (node = rb_first(groups); node; node = rb_next(node)) {
 		node_event = container_of(node,	struct perf_event, group_node);
 		list_for_each_entry(event, &node_event->group_list,
@@ -1695,8 +1670,6 @@ perf_event_groups_iterate(struct perf_event_groups *groups,
 		}
 	}
 
-	*/
-
 	return ret;
 }
 
@@ -3489,7 +3462,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 * However, if task's ctx is not carrying any pinned
 	 * events, no need to flip the cpuctx's events around.
 	 */
-	if (!perf_event_groups_empty(&ctx->pinned_groups))
+	if (!RB_EMPTY_ROOT(&ctx->pinned_groups))
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux);
 	perf_event_sched_in(cpuctx, ctx, task, mux);
 	perf_pmu_enable(ctx->pmu);
@@ -4057,8 +4030,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->active_ctx_list);
-	perf_event_groups_init(&ctx->pinned_groups);
-	perf_event_groups_init(&ctx->flexible_groups);
+	ctx->pinned_groups = RB_ROOT;
+	ctx->flexible_groups = RB_ROOT;
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 }
@@ -9695,7 +9668,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->sibling_list);
 	RB_CLEAR_NODE(&event->group_node);
 	INIT_LIST_HEAD(&event->group_list);
-	INIT_LIST_HEAD(&event->group_list_entry);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
 	INIT_LIST_HEAD(&event->addr_filters.list);


