include/linux/perf_event.h | 55 +++-- kernel/events/core.c | 604 +++++++++++++++++++++++++++++++++------------ 2 files changed, 491 insertions(+), 168 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a6358..8e1967f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -550,6 +550,22 @@ struct pmu_event_list { struct list_head list; }; +struct perf_event_tstamp { + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the event is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * enabled: the notional time when the event was enabled + * running: the notional time when the event was scheduled on + * stopped: in INACTIVE state, the notional time when the + * event was scheduled off. + */ + u64 enabled; + u64 running; + u64 stopped; +}; + /** * struct perf_event - performance event kernel representation: */ @@ -572,7 +588,20 @@ struct perf_event { */ struct list_head group_entry; struct list_head sibling_list; - + /* + * Node on the pinned or flexible tree located at the event context; + * the node may be empty in case its event is not directly attached + * to the tree but to group_list list of the event directly + * attached to the tree; + */ + struct rb_node group_node; + /* + * List keeps groups allocated for the same cpu; + * the list may be empty in case its event is not directly + * attached to the tree but to group_list list of the event directly + * attached to the tree; + */ + struct list_head group_list; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the @@ -611,19 +640,11 @@ struct perf_event { u64 total_time_running; /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the event is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the event was enabled - * tstamp_running: the notional time when the event was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * event was scheduled off. + * tstamp points to the tstamp_data object below or to the object + * located at the event context; */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; - + struct perf_event_tstamp *tstamp; + struct perf_event_tstamp tstamp_data; /* * timestamp shadows the actual context timing but it can * be safely used in NMI interrupt context. 
It reflects the @@ -741,8 +762,8 @@ struct perf_event_context { struct mutex mutex; struct list_head active_ctx_list; - struct list_head pinned_groups; - struct list_head flexible_groups; + struct rb_root pinned_groups; + struct rb_root flexible_groups; struct list_head event_list; int nr_events; int nr_active; @@ -758,6 +779,10 @@ struct perf_event_context { */ u64 time; u64 timestamp; + /* + * Context cache for filtered out events; + */ + struct perf_event_tstamp tstamp_data; /* * These fields let us detect when two contexts have both diff --git a/kernel/events/core.c b/kernel/events/core.c index bc63f8d..2d02f75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -555,11 +555,11 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, int mux); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task); + struct task_struct *task, int mux); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -701,6 +701,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) struct perf_cpu_context *cpuctx; struct list_head *list; unsigned long flags; + int mux = 0; /* * Disable interrupts and preemption to avoid this CPU's @@ -716,7 +717,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) perf_pmu_disable(cpuctx->ctx.pmu); if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + cpu_ctx_sched_out(cpuctx, EVENT_ALL, mux); /* * must not be done before ctxswout due * to event_filter_match() in event_sched_out() @@ -735,7 +736,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) */ cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task, mux); } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -865,10 +866,10 @@ perf_cgroup_mark_enabled(struct perf_event *event, event->cgrp_defer_enabled = 0; - event->tstamp_enabled = tstamp - event->total_time_enabled; + event->tstamp->enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->tstamp->enabled = tstamp - sub->total_time_enabled; sub->cgrp_defer_enabled = 0; } } @@ -1378,6 +1379,9 @@ static void update_context_time(struct perf_event_context *ctx) ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + ctx->tstamp_data.running += ctx->time - ctx->tstamp_data.stopped; + ctx->tstamp_data.stopped = ctx->time; } static u64 perf_event_time(struct perf_event *event) @@ -1419,16 +1423,16 @@ static void update_event_times(struct perf_event *event) else if (ctx->is_active) run_end = ctx->time; else - run_end = event->tstamp_stopped; + run_end = event->tstamp->stopped; - event->total_time_enabled = run_end - event->tstamp_enabled; + event->total_time_enabled = run_end - event->tstamp->enabled; if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; + run_end = event->tstamp->stopped; else run_end = perf_event_time(event); - event->total_time_running = run_end - event->tstamp_running; + event->total_time_running = run_end - event->tstamp->running; } @@ -1458,8 +1462,12 @@ static enum event_type_t 
get_event_type(struct perf_event *event) return event_type; } -static struct list_head * -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +/* + * Extract pinned or flexible groups from the context + * based on event attrs bits; + */ +static struct rb_root * +get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; @@ -1467,6 +1475,204 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) return &ctx->flexible_groups; } +static void +perf_event_groups_insert(struct rb_root *groups, + struct perf_event *event); + +static void +perf_event_groups_delete(struct rb_root *groups, + struct perf_event *event); + +/* + * Helper function to insert event into the pinned or + * flexible groups; + */ +static void +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct rb_root *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_insert(groups, event); +} + +/* + * Helper function to delete event from its groups; + */ +static void +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) +{ + struct rb_root *groups; + + groups = get_event_groups(event, ctx); + perf_event_groups_delete(groups, event); +} + +/* + * Insert a group into a tree using event->cpu as a key. If event->cpu node + * is already attached to the tree then the event is added to the attached + * group's group_list list. + */ +static void +perf_event_groups_insert(struct rb_root *groups, struct perf_event *event) +{ + struct rb_node **node; + struct rb_node *parent; + struct perf_event *node_event; + + WARN_ON_ONCE(!groups || !event); + WARN_ON_ONCE(!list_empty(&event->group_entry)); + + node = &groups->rb_node; + parent = *node; + + while (*node) { + parent = *node; + node_event = container_of(*node, + struct perf_event, group_node); + + if (event->cpu < node_event->cpu) { + node = &parent->rb_left; + } else if (event->cpu > node_event->cpu) { + node = &parent->rb_right; + } else { + list_add_tail(&event->group_entry, + &node_event->group_list); + return; + } + } + + list_add_tail(&event->group_entry, &event->group_list); + + rb_link_node(&event->group_node, parent, node); + rb_insert_color(&event->group_node, groups); +} + +/* + * Delete a group from a tree. If the group is directly attached to the tree + * it also detaches all groups on the group's group_list list. + */ +static void +perf_event_groups_delete(struct rb_root *groups, struct perf_event *event) +{ + struct perf_event *next; + + WARN_ON_ONCE(!event); + WARN_ON_ONCE(list_empty(&event->group_entry)); + + list_del_init(&event->group_entry); + + if (!RB_EMPTY_NODE(&event->group_node)) { + WARN_ON_ONCE(!groups); + if (!RB_EMPTY_ROOT(groups)) { + if (list_empty(&event->group_list)) { + rb_erase(&event->group_node, groups); + } else { + next = list_first_entry(&event->group_list, + struct perf_event, group_entry); + list_replace_init(&event->group_list, + &next->group_list); + rb_replace_node(&event->group_node, + &next->group_node, groups); + + } + } + RB_CLEAR_NODE(&event->group_node); + } +} + +/* + * Find group list by a cpu key and rotate it. 
+ */ +static void +perf_event_groups_rotate(struct rb_root *groups, int cpu) +{ + struct rb_node *node; + struct perf_event *node_event; + + WARN_ON_ONCE(!groups); + + node = groups->rb_node; + + while (node) { + node_event = container_of(node, + struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + list_rotate_left(&node_event->group_list); + break; + } + } +} + +/* + * Find group_list list by a cpu key and call provided callback for every + * group on the list. + */ + +typedef int(*perf_event_groups_iterate_f)(struct perf_event *, void *); + +static void +perf_event_groups_iterate_cpu(struct rb_root *groups, int cpu, + perf_event_groups_iterate_f callback, void *data) +{ + struct rb_node *node; + struct perf_event *event, *node_event; + + WARN_ON_ONCE(!groups); + + node = groups->rb_node; + + while (node) { + node_event = container_of(node, + struct perf_event, group_node); + + if (cpu < node_event->cpu) { + node = node->rb_left; + } else if (cpu > node_event->cpu) { + node = node->rb_right; + } else { + list_for_each_entry(event, &node_event->group_list, + group_entry) + callback(event, data); + break; + } + } +} + +/* + * Iterate event groups and call provided callback for every group in the tree. + * Iteration stops if the callback returns non zero. + */ +static int +perf_event_groups_iterate(struct rb_root *groups, + perf_event_groups_iterate_f callback, void *data) +{ + int ret = 0; + struct rb_node *node; + struct perf_event *event, *node_event; + + WARN_ON_ONCE(!groups); + + for (node = rb_first(groups); node; node = rb_next(node)) { + node_event = container_of(node, struct perf_event, group_node); + list_for_each_entry(event, &node_event->group_list, + group_entry) { + WARN_ON_ONCE(!(event->cpu == node_event->cpu)); + ret = callback(event, data); + if (ret) { + return ret; + } + } + } + + return ret; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -1485,12 +1691,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { - struct list_head *list; - event->group_caps = event->event_caps; - - list = ctx_group_list(event, ctx); - list_add_tail(&event->group_entry, list); + add_event_to_groups(event, ctx); } list_update_cgroup_event(event, ctx, true); @@ -1681,7 +1883,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list_del_rcu(&event->event_entry); if (event->group_leader == event) - list_del_init(&event->group_entry); + del_event_from_groups(event, ctx); update_group_times(event); @@ -1701,7 +1903,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; - struct list_head *list = NULL; lockdep_assert_held(&event->ctx->lock); @@ -1722,22 +1923,23 @@ static void perf_group_detach(struct perf_event *event) goto out; } - if (!list_empty(&event->group_entry)) - list = &event->group_entry; - /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. 
*/ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - if (list) - list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; + if (!list_empty(&event->group_entry)) { + list_del_init(&sibling->group_entry); + add_event_to_groups(sibling, event->ctx); + } + WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -1806,9 +2008,13 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = tstamp - event->tstamp_stopped; - event->tstamp_running += delta; - event->tstamp_stopped = tstamp; + delta = tstamp - event->tstamp->stopped; + event->tstamp->running += delta; + event->tstamp->stopped = tstamp; + if (event->tstamp != &event->tstamp_data) { + event->tstamp_data = *event->tstamp; + event->tstamp = &event->tstamp_data; + } } if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -1816,7 +2022,7 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); - event->tstamp_stopped = tstamp; + event->tstamp->stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; @@ -1861,6 +2067,22 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct group_sched_params { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + int can_add_hw; +}; + +static int +group_sched_out_callback(struct perf_event *event, void *data) +{ + struct group_sched_params *params = data; + + group_sched_out(event, params->cpuctx, params->ctx); + + return 0; +} + #define DETACH_GROUP 0x01UL /* @@ -2091,7 +2313,7 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; + event->tstamp->running += tstamp - event->tstamp->stopped; if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -2163,8 +2385,8 @@ group_sched_in(struct perf_event *group_event, simulate = true; if (simulate) { - event->tstamp_running += now - event->tstamp_stopped; - event->tstamp_stopped = now; + event->tstamp->running += now - event->tstamp->stopped; + event->tstamp->stopped = now; } else { event_sched_out(event, cpuctx, ctx); } @@ -2216,43 +2438,45 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + event->tstamp->enabled = tstamp; + event->tstamp->running = tstamp; + event->tstamp->stopped = tstamp; } static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, int mux); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task); + struct task_struct *task, int mux); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, enum event_type_t event_type) { + int mux = 0; + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, cpuctx, event_type, mux); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct task_struct *task) + struct task_struct *task, int mux) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task, mux); if (ctx) - 
ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task, mux); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task, mux); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task, mux); } /* @@ -2276,6 +2500,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, { enum event_type_t ctx_event_type = event_type & EVENT_ALL; bool cpu_event = !!(event_type & EVENT_CPU); + int mux = 0; /* * If pinned groups are involved, flexible groups also need to be @@ -2296,11 +2521,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. */ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); + cpu_ctx_sched_out(cpuctx, ctx_event_type, mux); else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx, current, mux); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2318,6 +2543,7 @@ static int __perf_install_in_context(void *info) struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; + int mux = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { @@ -2344,7 +2570,7 @@ static int __perf_install_in_context(void *info) } if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { @@ -2463,10 +2689,10 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + event->tstamp->enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->tstamp->enabled = tstamp - sub->total_time_enabled; } } @@ -2480,13 +2706,14 @@ static void __perf_event_enable(struct perf_event *event, { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; + int mux = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux); __perf_event_mark_enabled(event); @@ -2496,7 +2723,7 @@ static void __perf_event_enable(struct perf_event *event, if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux); return; } @@ -2505,7 +2732,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. 
*/ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux); return; } @@ -2701,10 +2928,14 @@ EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, int mux) { int is_active = ctx->is_active; - struct perf_event *event; + struct group_sched_params params = { + .cpuctx = cpuctx, + .ctx = ctx + }; + int cpu = smp_processor_id(); lockdep_assert_held(&ctx->lock); @@ -2751,13 +2982,27 @@ static void ctx_sched_out(struct perf_event_context *ctx, perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { - list_for_each_entry(event, &ctx->pinned_groups, group_entry) - group_sched_out(event, cpuctx, ctx); + if (mux) { + perf_event_groups_iterate_cpu(&ctx->pinned_groups, -1, + group_sched_out_callback, ¶ms); + perf_event_groups_iterate_cpu(&ctx->pinned_groups, cpu, + group_sched_out_callback, ¶ms); + } else { + perf_event_groups_iterate(&ctx->pinned_groups, + group_sched_out_callback, ¶ms); + } } if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry(event, &ctx->flexible_groups, group_entry) - group_sched_out(event, cpuctx, ctx); + if (mux) { + perf_event_groups_iterate_cpu(&ctx->flexible_groups, -1, + group_sched_out_callback, ¶ms); + perf_event_groups_iterate_cpu(&ctx->flexible_groups, cpu, + group_sched_out_callback, ¶ms); + } else { + perf_event_groups_iterate(&ctx->flexible_groups, + group_sched_out_callback, ¶ms); + } } perf_pmu_enable(ctx->pmu); } @@ -3046,78 +3291,85 @@ void __perf_event_task_sched_out(struct task_struct *task, * Called with IRQs disabled */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, int mux) { - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type, mux); } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int +ctx_pinned_sched_in(struct perf_event *event, void *data) { - struct perf_event *event; + struct group_sched_params *params = data; - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - if (!event_filter_match(event)) - continue; + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + if (!event_filter_match(event)) { + if (event->tstamp != ¶ms->ctx->tstamp_data) + event->tstamp = ¶ms->ctx->tstamp_data; + return 0; + } - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, params->ctx); - if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx); + if (group_can_go_on(event, params->cpuctx, 1)) + group_sched_in(event, params->cpuctx, params->ctx); - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_ERROR; - } + /* + * If this pinned group hasn't been scheduled, + * put it in error state. 
+ */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; } + + return 0; } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static int +ctx_flexible_sched_in(struct perf_event *event, void *data) { - struct perf_event *event; - int can_add_hw = 1; + struct group_sched_params *params = data; - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - /* Ignore events in OFF or ERROR state */ - if (event->state <= PERF_EVENT_STATE_OFF) - continue; - /* - * Listen to the 'cpu' scheduling filter constraint - * of events: - */ - if (!event_filter_match(event)) - continue; + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (!event_filter_match(event)) { + if (event->tstamp != ¶ms->ctx->tstamp_data) + event->tstamp = ¶ms->ctx->tstamp_data; + return 0; + } - /* may need to reset tstamp_enabled */ - if (is_cgroup_event(event)) - perf_cgroup_mark_enabled(event, ctx); + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, params->ctx); - if (group_can_go_on(event, cpuctx, can_add_hw)) { - if (group_sched_in(event, cpuctx, ctx)) - can_add_hw = 0; - } + if (group_can_go_on(event, params->cpuctx, params->can_add_hw)) { + if (group_sched_in(event, params->cpuctx, params->ctx)) + params->can_add_hw = 0; } + + return 0; } static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task) + struct task_struct *task, int mux) { int is_active = ctx->is_active; - u64 now; + struct group_sched_params params = { + .cpuctx = cpuctx, + .ctx = ctx + }; + int cpu = smp_processor_id(); lockdep_assert_held(&ctx->lock); @@ -3136,7 +3388,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* start ctx time */ - now = perf_clock(); + u64 now = perf_clock(); ctx->timestamp = now; perf_cgroup_set_timestamp(task, ctx); } @@ -3145,27 +3397,56 @@ ctx_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ - if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + + if (is_active & EVENT_PINNED) { + if (mux) { + perf_event_groups_iterate_cpu(&ctx->pinned_groups, + -1, ctx_pinned_sched_in, + ¶ms); + perf_event_groups_iterate_cpu(&ctx->pinned_groups, + cpu, ctx_pinned_sched_in, + ¶ms); + } else { + perf_event_groups_iterate(&ctx->pinned_groups, + ctx_pinned_sched_in, + ¶ms); + } + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + if (is_active & EVENT_FLEXIBLE) { + if (mux) { + params.can_add_hw = 1; + perf_event_groups_iterate_cpu(&ctx->flexible_groups, + -1, ctx_flexible_sched_in, + ¶ms); + params.can_add_hw = 1; + perf_event_groups_iterate_cpu(&ctx->flexible_groups, + cpu, ctx_flexible_sched_in, + ¶ms); + } else { + params.can_add_hw = 1; + perf_event_groups_iterate(&ctx->flexible_groups, + ctx_flexible_sched_in, + ¶ms); + } + } } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, enum event_type_t event_type, - struct task_struct *task) + struct task_struct *task, int mux) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); + ctx_sched_in(ctx, cpuctx, event_type, task, mux); } static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + int mux = 0; cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) @@ -3181,9 +3462,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!list_empty(&ctx->pinned_groups)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups)) + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux); + perf_event_sched_in(cpuctx, ctx, task, mux); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -3416,14 +3697,19 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. 
*/ - if (!ctx->rotate_disable) - list_rotate_left(&ctx->flexible_groups); + if (!ctx->rotate_disable) { + int cpu = smp_processor_id(); + + perf_event_groups_rotate(&ctx->flexible_groups, -1); + perf_event_groups_rotate(&ctx->flexible_groups, cpu); + } } static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; int rotate = 0; + int mux = 1; if (cpuctx->ctx.nr_events) { if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) @@ -3442,15 +3728,15 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE, mux); if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE, mux); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); + perf_event_sched_in(cpuctx, ctx, current, mux); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3502,6 +3788,7 @@ static void perf_event_enable_on_exec(int ctxn) struct perf_event *event; unsigned long flags; int enabled = 0; + int mux = 0; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; @@ -3510,7 +3797,7 @@ static void perf_event_enable_on_exec(int ctxn) cpuctx = __get_cpu_context(ctx); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, cpuctx, EVENT_TIME, mux); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -3523,7 +3810,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current, mux); } perf_ctx_unlock(cpuctx, ctx); @@ -3743,8 +4030,8 @@ static void __perf_event_init_context(struct perf_event_context *ctx) raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->active_ctx_list); - INIT_LIST_HEAD(&ctx->pinned_groups); - INIT_LIST_HEAD(&ctx->flexible_groups); + ctx->pinned_groups = RB_ROOT; + ctx->flexible_groups = RB_ROOT; INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); } @@ -4843,8 +5130,8 @@ static void calc_timer_values(struct perf_event *event, *now = perf_clock(); ctx_time = event->shadow_ctx_time + *now; - *enabled = ctx_time - event->tstamp_enabled; - *running = ctx_time - event->tstamp_running; + *enabled = ctx_time - event->tstamp->enabled; + *running = ctx_time - event->tstamp->running; } static void perf_event_init_userpage(struct perf_event *event) @@ -9379,6 +9666,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + RB_CLEAR_NODE(&event->group_node); + INIT_LIST_HEAD(&event->group_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); @@ -9392,6 +9681,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); + event->tstamp = &event->tstamp_data; event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -10767,6 +11057,14 @@ static int inherit_group(struct perf_event *parent_event, return 0; } +struct inherit_task_group_params { + struct task_struct *parent; + struct 
perf_event_context *parent_ctx; + struct task_struct *child; + int ctxn; + int inherited_all; +}; + /* * Creates the child task context and tries to inherit the event-group. * @@ -10779,20 +11077,18 @@ static int inherit_group(struct perf_event *parent_event, * - <0 on error */ static int -inherit_task_group(struct perf_event *event, struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, - int *inherited_all) +inherit_task_group(struct perf_event *event, void *data) { int ret; struct perf_event_context *child_ctx; + struct inherit_task_group_params *params = data; if (!event->attr.inherit) { - *inherited_all = 0; + params->inherited_all = 0; return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = params->child->perf_event_ctxp[params->ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -10800,18 +11096,19 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(params->parent_ctx->pmu, + params->child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + params->child->perf_event_ctxp[params->ctxn] = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); + ret = inherit_group(event, params->parent, params->parent_ctx, + params->child, child_ctx); if (ret) - *inherited_all = 0; + params->inherited_all = 0; return ret; } @@ -10823,11 +11120,15 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; - struct perf_event *event; struct task_struct *parent = current; - int inherited_all = 1; unsigned long flags; int ret = 0; + struct inherit_task_group_params params = { + .parent = parent, + .child = child, + .ctxn = ctxn, + .inherited_all = 1 + }; if (likely(!parent->perf_event_ctxp[ctxn])) return 0; @@ -10840,6 +11141,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) if (!parent_ctx) return 0; + params.parent_ctx = parent_ctx; + /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL @@ -10857,13 +11160,10 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - goto out_unlock; - } - + ret = perf_event_groups_iterate(&parent_ctx->pinned_groups, + inherit_task_group, ¶ms); + if (ret) + goto out_unlock; /* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because @@ -10873,19 +11173,17 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) parent_ctx->rotate_disable = 1; raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); - list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); - if (ret) - goto out_unlock; - } + ret = perf_event_groups_iterate(&parent_ctx->flexible_groups, + inherit_task_group, ¶ms); + if (ret) + goto out_unlock; raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 
0; child_ctx = child->perf_event_ctxp[ctxn]; - if (child_ctx && inherited_all) { + if (child_ctx && params.inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of.
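
For readers who want to experiment with the data-structure change outside the kernel, below is a minimal userspace sketch of the idea the patch implements: a container keyed by event->cpu, where all groups sharing a key sit on a per-key list, so the scheduler can walk only the cpu == -1 and cpu == <current> entries (the mux path) and rotate a single per-CPU list. This is an illustrative model only, not the patch's code: the struct and function names (group, bucket, groups_insert, ...) are invented for the example, and a sorted singly-linked bucket list stands in for the kernel's rb_root/rb_node so the sketch compiles and runs on its own.

/*
 * Standalone model of the "tree keyed by CPU, list per key" layout.
 * The kernel patch uses struct rb_root/rb_node plus a list_head per
 * node (group_node/group_list); here a sorted singly-linked bucket
 * list is the stand-in so the example needs no kernel headers.
 */
#include <stdio.h>
#include <stdlib.h>

struct group {                 /* models one event group leader */
	int cpu;               /* -1 means "any CPU", like event->cpu */
	const char *name;
	struct group *next;    /* models the per-CPU group_list linkage */
};

struct bucket {                /* models one tree node (one CPU key) */
	int cpu;
	struct group *head, *tail;
	struct bucket *next;   /* stand-in for rb-tree ordering */
};

static struct bucket *buckets; /* stand-in for a ctx->*_groups rb_root */

/* Insert keyed by cpu; same-key groups are appended to the bucket's
 * list, mirroring perf_event_groups_insert() in the patch. */
static void groups_insert(struct group *g)
{
	struct bucket **link = &buckets, *b;

	while (*link && (*link)->cpu < g->cpu)
		link = &(*link)->next;

	if (!*link || (*link)->cpu != g->cpu) {
		b = calloc(1, sizeof(*b));
		b->cpu = g->cpu;
		b->next = *link;
		*link = b;
	}
	b = *link;
	g->next = NULL;
	if (b->tail)
		b->tail->next = g;
	else
		b->head = g;
	b->tail = g;
}

/* Rotate one per-CPU list left, like the list_rotate_left() call made
 * by perf_event_groups_rotate() for the matching cpu key. */
static void groups_rotate(int cpu)
{
	struct bucket *b;
	struct group *first;

	for (b = buckets; b; b = b->next) {
		if (b->cpu != cpu)
			continue;
		first = b->head;
		if (!first || first == b->tail)
			return;
		b->head = first->next;
		first->next = NULL;
		b->tail->next = first;
		b->tail = first;
		return;
	}
}

/* Visit only the groups relevant for one CPU: the -1 ("any CPU")
 * bucket first, then the bucket for that CPU -- the mux == 1 path. */
static void groups_visit_cpu(int cpu)
{
	struct bucket *b;
	struct group *g;

	for (b = buckets; b; b = b->next)
		if (b->cpu == -1 || b->cpu == cpu)
			for (g = b->head; g; g = g->next)
				printf("  cpu %2d: %s\n", b->cpu, g->name);
}

int main(void)
{
	struct group g[] = {
		{ -1, "task-wide A" }, { -1, "task-wide B" },
		{  1, "cpu1 X" },      {  3, "cpu3 Y" },
	};
	for (unsigned i = 0; i < sizeof(g) / sizeof(g[0]); i++)
		groups_insert(&g[i]);

	printf("schedule for cpu 1:\n");
	groups_visit_cpu(1);
	groups_rotate(-1);
	printf("after rotating the -1 bucket:\n");
	groups_visit_cpu(1);
	return 0;
}

Running the model shows the fairness property the patch preserves: rotating only the -1 bucket reorders the task-wide groups without touching the per-CPU ones, and a scheduling pass for CPU 1 never walks the cpu 3 entries at all.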
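
The other half of the patch is the tstamp indirection: an event's tstamp pointer normally refers to its own tstamp_data, but events filtered out on the current CPU can be pointed at the single per-context copy so that one context-time update covers all of them. Below is a minimal sketch of that pointer-aliasing idea, assuming simplified stand-in types rather than the real perf_event / perf_event_context layouts.

/*
 * Sketch of the tstamp redirection.  struct ctx and struct event are
 * simplified stand-ins; only the pointer-aliasing pattern matches the
 * patch.
 */
#include <stdio.h>

struct tstamp { unsigned long long enabled, running, stopped; };

struct ctx   { struct tstamp tstamp_data; };
struct event { struct tstamp *tstamp; struct tstamp tstamp_data; };

int main(void)
{
	struct ctx ctx = { { 0, 0, 0 } };
	struct event ev = { .tstamp = &ev.tstamp_data };

	/* event filtered out: share the context-wide timestamps */
	ev.tstamp = &ctx.tstamp_data;

	/* one context-time update now covers every redirected event */
	ctx.tstamp_data.running += 100;
	ctx.tstamp_data.stopped += 100;

	printf("event sees running=%llu\n", ev.tstamp->running); /* 100 */
	return 0;
}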