Message-ID: <09226446-39b9-9bd2-d60f-b9bb947987c5@linux.intel.com>
Date: Thu, 15 Jun 2017 20:41:42 +0300
From: Alexey Budankov <alexey.budankov@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc: Andi Kleen <ak@...ux.intel.com>, Kan Liang <kan.liang@...el.com>,
Dmitri Prokhorov <Dmitry.Prohorov@...el.com>,
Valery Cherepennikov <valery.cherepennikov@...el.com>,
David Carrillo-Cisneros <davidcc@...gle.com>,
Stephane Eranian <eranian@...gle.com>,
Mark Rutland <mark.rutland@....com>,
linux-kernel@...r.kernel.org
Subject: [PATCH v3 1/n] perf/core: addressing 4x slowdown during per-process
profiling of STREAM benchmark on Intel Xeon Phi
This patch series continues v2 and addresses the review comments received so far.
Specifically, this patch replaces the pinned_groups and flexible_groups lists
of perf_event_context with CPU-indexed red-black trees, avoiding duplication
of data structures and making it possible to iterate the event groups of a
specific CPU only.
Signed-off-by: Alexey Budankov <alexey.budankov@...ux.intel.com>
---
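Note (illustration only, not part of the change itself): below is a minimal
user-space sketch of the data structure this patch introduces, in case a
standalone example helps review. A plain unbalanced binary search tree stands
in for the kernel rb-tree, and the names used here (struct cpu_node,
cpu_tree_insert, cpu_tree_iterate_cpu, ...) are made up for the sketch; they
only mirror the group_node/group_list/perf_cpu_tree_*() machinery added by
the patch. Groups are keyed by event cpu, groups sharing a cpu hang off one
tree node, and scheduling in on a given CPU only needs to visit the cpu == -1
node and the current CPU's node instead of walking every group in the context.

#include <stdio.h>
#include <stdlib.h>

struct group {
    int cpu;                    /* key: event->cpu, -1 means "any CPU" */
    const char *name;
    struct group *next;         /* chain of same-cpu groups, like group_list */
};

struct cpu_node {
    int cpu;
    struct group *groups;       /* head of the same-cpu chain */
    struct cpu_node *left, *right;
};

/* Insert a group, keyed by cpu; groups with the same cpu share one node. */
static struct cpu_node *cpu_tree_insert(struct cpu_node *root, struct group *g)
{
    if (!root) {
        struct cpu_node *n = calloc(1, sizeof(*n));
        if (!n)
            abort();
        n->cpu = g->cpu;
        n->groups = g;
        return n;
    }
    if (g->cpu < root->cpu)
        root->left = cpu_tree_insert(root->left, g);
    else if (g->cpu > root->cpu)
        root->right = cpu_tree_insert(root->right, g);
    else {
        g->next = root->groups; /* chained at the head for brevity */
        root->groups = g;
    }
    return root;
}

/* Visit only the groups filed under one cpu key: tree lookup + list walk. */
static void cpu_tree_iterate_cpu(struct cpu_node *root, int cpu)
{
    while (root) {
        if (cpu < root->cpu)
            root = root->left;
        else if (cpu > root->cpu)
            root = root->right;
        else {
            for (struct group *g = root->groups; g; g = g->next)
                printf("cpu %d: %s\n", cpu, g->name);
            return;
        }
    }
}

int main(void)
{
    struct group a = { .cpu = -1, .name = "task-wide group" };
    struct group b = { .cpu = 3,  .name = "cpu3 group A" };
    struct group c = { .cpu = 3,  .name = "cpu3 group B" };
    struct cpu_node *root = NULL;

    root = cpu_tree_insert(root, &a);
    root = cpu_tree_insert(root, &b);
    root = cpu_tree_insert(root, &c);

    /* Scheduling in on CPU 3 visits only the cpu == -1 and cpu == 3 nodes. */
    cpu_tree_iterate_cpu(root, -1);
    cpu_tree_iterate_cpu(root, 3);
    return 0;   /* tree nodes are deliberately not freed in this short demo */
}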
include/linux/perf_event.h | 34 +++-
kernel/events/core.c | 386 +++++++++++++++++++++++++++++++++------------
2 files changed, 315 insertions(+), 105 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a6358..2c1dcf1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -574,6 +574,27 @@ struct perf_event {
struct list_head sibling_list;
/*
+ * Node in the pinned or flexible tree of the event context;
+ * the node may be empty if the event is not attached to the tree
+ * directly but to the group_list of an event that is directly
+ * attached to the tree;
+ */
+ struct rb_node group_node;
+ /*
+ * List of groups allocated for the same cpu;
+ * the list may be empty if the event is not attached to the tree
+ * directly but to the group_list of an event that is directly
+ * attached to the tree;
+ */
+ struct list_head group_list;
+ /*
+ * Entry into the group_list above;
+ * the entry is linked into the event's own group_list when
+ * the event is directly attached to the pinned or flexible tree;
+ */
+ struct list_head group_list_entry;
+ /*
* We need storage to track the entries in perf_pmu_migrate_context; we
* cannot use the event_entry because of RCU and we want to keep the
* group in tact which avoids us using the other two entries.
@@ -741,8 +762,17 @@ struct perf_event_context {
struct mutex mutex;
struct list_head active_ctx_list;
- struct list_head pinned_groups;
- struct list_head flexible_groups;
+ /*
+ * RB tree for pinned groups; keeps event's group_node
+ * nodes of attached pinned groups;
+ */
+ struct rb_root pinned_tree;
+ /*
+ * RB tree for flexible groups; keeps event's group_node
+ * nodes of attached flexible groups;
+ */
+ struct rb_root flexible_tree;
+
struct list_head event_list;
int nr_events;
int nr_active;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bc63f8d..a3531f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1458,13 +1458,142 @@ static enum event_type_t get_event_type(struct perf_event *event)
return event_type;
}
-static struct list_head *
-ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+static struct rb_root *
+ctx_group_cpu_tree(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
- return &ctx->pinned_groups;
+ return &ctx->pinned_tree;
else
- return &ctx->flexible_groups;
+ return &ctx->flexible_tree;
+}
+
+/*
+ * Insert a group into a tree using event->cpu as a key. If event->cpu node
+ * is already attached to the tree then the event is added to the attached
+ * group's group_list list.
+ */
+static void
+perf_cpu_tree_insert(struct rb_root *tree, struct perf_event *event)
+{
+ struct rb_node **node;
+ struct rb_node *parent;
+
+ WARN_ON_ONCE(!tree || !event);
+
+ node = &tree->rb_node;
+ parent = *node;
+
+ while (*node) {
+ struct perf_event *node_event = container_of(*node,
+ struct perf_event, group_node);
+
+ parent = *node;
+
+ if (event->cpu < node_event->cpu) {
+ node = &parent->rb_left;
+ } else if (event->cpu > node_event->cpu) {
+ node = &parent->rb_right;
+ } else {
+ list_add_tail(&event->group_list_entry,
+ &node_event->group_list);
+ return;
+ }
+ }
+
+ list_add_tail(&event->group_list_entry, &event->group_list);
+
+ rb_link_node(&event->group_node, parent, node);
+ rb_insert_color(&event->group_node, tree);
+}
+
+/*
+ * Delete a group from a tree. If the group is directly attached to the
+ * tree, all groups on its group_list are detached as well.
+ */
+static void
+perf_cpu_tree_delete(struct rb_root *tree, struct perf_event *event)
+{
+ WARN_ON_ONCE(!tree || !event);
+
+ if (RB_EMPTY_NODE(&event->group_node)) {
+ list_del_init(&event->group_list_entry);
+ } else {
+ struct perf_event *list_event, *list_next;
+
+ rb_erase(&event->group_node, tree);
+ list_for_each_entry_safe(list_event, list_next,
+ &event->group_list, group_list_entry)
+ list_del_init(&list_event->group_list_entry);
+ }
+}
+
+/*
+ * Find the group_list for a cpu key and call the provided callback for
+ * every group on the list. Iteration stops if the callback returns non-zero.
+ */
+
+typedef int (*perf_cpu_tree_callback_t)(struct perf_event *, void *);
+
+static int
+perf_cpu_tree_iterate_cpu(struct rb_root *tree, int cpu,
+ perf_cpu_tree_callback_t callback, void *data)
+{
+ int ret = 0;
+ struct rb_node *node;
+ struct perf_event *event;
+
+ WARN_ON_ONCE(!tree);
+
+ node = tree->rb_node;
+
+ while (node) {
+ struct perf_event *node_event = container_of(node,
+ struct perf_event, group_node);
+
+ if (cpu < node_event->cpu) {
+ node = node->rb_left;
+ } else if (cpu > node_event->cpu) {
+ node = node->rb_right;
+ } else {
+ list_for_each_entry(event, &node_event->group_list,
+ group_list_entry) {
+ ret = callback(event, data);
+ if (ret)
+ return ret;
+ }
+ /* Matching cpu key found and all its groups visited; stop. */
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Iterate a tree and call the provided callback for every group in the tree.
+ * Iteration stops if the callback returns non-zero.
+ */
+static int
+perf_cpu_tree_iterate(struct rb_root *tree,
+ perf_cpu_tree_callback_t callback, void *data)
+{
+ int ret = 0;
+ struct rb_node *node;
+ struct perf_event *event;
+
+ WARN_ON_ONCE(!tree);
+
+ for (node = rb_first(tree); node; node = rb_next(node)) {
+ struct perf_event *node_event = container_of(node,
+ struct perf_event, group_node);
+
+ list_for_each_entry(event, &node_event->group_list,
+ group_list_entry) {
+ ret = callback(event, data);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
}
/*
@@ -1485,12 +1614,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
- struct list_head *list;
+ struct rb_root *tree;
event->group_caps = event->event_caps;
- list = ctx_group_list(event, ctx);
- list_add_tail(&event->group_entry, list);
+ tree = ctx_group_cpu_tree(event, ctx);
+ perf_cpu_tree_insert(tree, event);
}
list_update_cgroup_event(event, ctx, true);
@@ -1680,8 +1809,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
list_del_rcu(&event->event_entry);
- if (event->group_leader == event)
- list_del_init(&event->group_entry);
+ if (event->group_leader == event) {
+ struct rb_root *tree;
+
+ tree = ctx_group_cpu_tree(event, ctx);
+ perf_cpu_tree_delete(tree, event);
+ }
update_group_times(event);
@@ -2699,12 +2832,80 @@ int perf_event_refresh(struct perf_event *event, int refresh)
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
+static void
+sched_in_group(struct perf_event *event,
+ struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ int *can_add_hw)
+{
+ /*
+ * Ignore events in OFF or ERROR state and listen to the 'cpu'
+ * scheduling filter constraint of events:
+ */
+ if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
+ return;
+
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+ if (group_sched_in(event, cpuctx, ctx))
+ *can_add_hw = 0;
+ }
+}
+
+struct group_sched_params {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ int can_add_hw;
+};
+
+static int
+group_sched_in_pinned_callback(struct perf_event *event, void *data)
+{
+ struct group_sched_params *params = data;
+ int can_add_hw = 1;
+
+ sched_in_group(event, params->ctx, params->cpuctx, &can_add_hw);
+
+ if (!can_add_hw) {
+ update_group_times(event);
+ event->state = PERF_EVENT_STATE_ERROR;
+ }
+
+ return 0;
+}
+
+static int
+group_sched_in_flexible_callback(struct perf_event *event, void *data)
+{
+ struct group_sched_params *params = data;
+
+ sched_in_group(event, params->ctx, params->cpuctx,
+ &params->can_add_hw);
+
+ return 0;
+}
+
+static int group_sched_out_callback(struct perf_event *event, void *data)
+{
+ struct group_sched_params *params = data;
+
+ group_sched_out(event, params->cpuctx, params->ctx);
+
+ return 0;
+}
+
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
int is_active = ctx->is_active;
- struct perf_event *event;
+ struct group_sched_params params = {
+ .cpuctx = cpuctx,
+ .ctx = ctx
+ };
lockdep_assert_held(&ctx->lock);
@@ -2750,15 +2951,15 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry(event, &ctx->pinned_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
- }
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry(event, &ctx->flexible_groups, group_entry)
- group_sched_out(event, cpuctx, ctx);
- }
+ if (is_active & EVENT_PINNED)
+ perf_cpu_tree_iterate(&ctx->pinned_tree,
+ group_sched_out_callback, &params);
+
+ if (is_active & EVENT_FLEXIBLE)
+ perf_cpu_tree_iterate(&ctx->flexible_tree,
+ group_sched_out_callback, &params);
+
perf_pmu_enable(ctx->pmu);
}
@@ -3052,72 +3253,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
}
static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
-{
- struct perf_event *event;
-
- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- if (!event_filter_match(event))
- continue;
-
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
- if (group_can_go_on(event, cpuctx, 1))
- group_sched_in(event, cpuctx, ctx);
-
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
- }
-}
-
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
-{
- struct perf_event *event;
- int can_add_hw = 1;
-
- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
- /* Ignore events in OFF or ERROR state */
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
- /*
- * Listen to the 'cpu' scheduling filter constraint
- * of events:
- */
- if (!event_filter_match(event))
- continue;
-
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
- if (group_can_go_on(event, cpuctx, can_add_hw)) {
- if (group_sched_in(event, cpuctx, ctx))
- can_add_hw = 0;
- }
- }
-}
-
-static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task)
{
int is_active = ctx->is_active;
- u64 now;
+ struct group_sched_params params = {
+ .cpuctx = cpuctx,
+ .ctx = ctx,
+ .can_add_hw = 1
+ };
lockdep_assert_held(&ctx->lock);
@@ -3136,8 +3283,7 @@ ctx_sched_in(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
+ ctx->timestamp = perf_clock();
perf_cgroup_set_timestamp(task, ctx);
}
@@ -3146,11 +3292,13 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ perf_cpu_tree_iterate(&ctx->pinned_tree,
+ group_sched_in_pinned_callback, &params);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ perf_cpu_tree_iterate(&ctx->flexible_tree,
+ group_sched_in_flexible_callback, &params);
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -3181,7 +3329,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!list_empty(&ctx->pinned_groups))
+ if (!RB_EMPTY_ROOT(&ctx->pinned_tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
@@ -3410,14 +3558,25 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
/*
* Round-robin a context's events:
*/
+static int rotate_ctx_callback(struct perf_event *event, void *data)
+{
+ list_rotate_left(&event->group_list);
+
+ return 1;
+}
+
static void rotate_ctx(struct perf_event_context *ctx)
{
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
- if (!ctx->rotate_disable)
- list_rotate_left(&ctx->flexible_groups);
+ if (!ctx->rotate_disable) {
+ perf_cpu_tree_iterate_cpu(&ctx->flexible_tree,
+ -1, rotate_ctx_callback, NULL);
+ perf_cpu_tree_iterate_cpu(&ctx->flexible_tree,
+ smp_processor_id(), rotate_ctx_callback, NULL);
+ }
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx)
@@ -3743,8 +3902,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
- INIT_LIST_HEAD(&ctx->pinned_groups);
- INIT_LIST_HEAD(&ctx->flexible_groups);
+ ctx->pinned_tree = RB_ROOT;
+ ctx->flexible_tree = RB_ROOT;
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
}
@@ -9377,6 +9536,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->child_list);
INIT_LIST_HEAD(&event->group_entry);
+ RB_CLEAR_NODE(&event->group_node);
+ INIT_LIST_HEAD(&event->group_list);
+ INIT_LIST_HEAD(&event->group_list_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
@@ -10816,6 +10978,23 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return ret;
}
+struct inherit_task_group_params {
+ struct task_struct *parent;
+ struct perf_event_context *parent_ctx;
+ struct task_struct *child;
+ int ctxn;
+ int inherited_all;
+};
+
+static int
+inherit_task_group_callback(struct perf_event *event, void *data)
+{
+ struct inherit_task_group_params *params = data;
+
+ return inherit_task_group(event, params->parent, params->parent_ctx,
+ params->child, params->ctxn, &params->inherited_all);
+}
+
/*
* Initialize the perf_event context in task_struct
*/
@@ -10823,11 +11002,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
- struct perf_event *event;
struct task_struct *parent = current;
- int inherited_all = 1;
unsigned long flags;
int ret = 0;
+ struct inherit_task_group_params params = {
+ .parent = parent,
+ .child = child,
+ .ctxn = ctxn,
+ .inherited_all = 1
+ };
if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
@@ -10839,7 +11022,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx = perf_pin_task_context(parent, ctxn);
if (!parent_ctx)
return 0;
-
+ params.parent_ctx = parent_ctx;
/*
* No need to check if parent_ctx != NULL here; since we saw
* it non-NULL earlier, the only reason for it to become NULL
@@ -10857,12 +11041,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
- list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
- ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
- if (ret)
- goto out_unlock;
- }
+ ret = perf_cpu_tree_iterate(&parent_ctx->pinned_tree,
+ inherit_task_group_callback, &params);
+ if (ret)
+ goto out_unlock;
/*
* We can't hold ctx->lock when iterating the ->flexible_group list due
@@ -10873,19 +11055,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
- list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
- ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, &inherited_all);
- if (ret)
- goto out_unlock;
- }
+ ret = perf_cpu_tree_iterate(&parent_ctx->flexible_tree,
+ inherit_task_group_callback, &params);
+ if (ret)
+ goto out_unlock;
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
child_ctx = child->perf_event_ctxp[ctxn];
- if (child_ctx && inherited_all) {
+ if (child_ctx && params.inherited_all) {
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.