linux-kernel - [RFC PATCH 1/4] perf/core: split context's event group list into pinned and non-pinned lists

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1257711206-12243-2-git-send-email-fweisbec@gmail.com>
Date:	Sun,  8 Nov 2009 21:13:23 +0100
From:	Frederic Weisbecker <fweisbec@...il.com>
To:	Ingo Molnar <mingo@...e.hu>
Cc:	LKML <linux-kernel@...r.kernel.org>,
	Frederic Weisbecker <fweisbec@...il.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Arnaldo Carvalho de Melo <acme@...hat.com>,
	Mike Galbraith <efault@....de>,
	Paul Mackerras <paulus@...ba.org>,
	Thomas Gleixner <tglx@...utronix.de>
Subject: [RFC PATCH 1/4] perf/core: split context's event group list into pinned and non-pinned lists

Split struct perf_event_context::group_list into pinned_grp_list
and volatile_grp_list (non-pinned).

At first this appears to be useless, as it duplicates the various
loops that walk the group list.

But it scales better in the fast path in perf_sched_in(). We no
longer iterate twice through the entire list to separate pinned and
non-pinned scheduling. Instead we iterate through two distinct lists.

Another desired effect is that it makes it easier to apply distinct
scheduling rules to each list.

Signed-off-by: Frederic Weisbecker <fweisbec@...il.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Arnaldo Carvalho de Melo <acme@...hat.com>
Cc: Mike Galbraith <efault@....de>
Cc: Paul Mackerras <paulus@...ba.org>
Cc: Thomas Gleixner <tglx@...utronix.de>
---
 include/linux/perf_event.h |    3 +-
 kernel/perf_event.c        |  177 +++++++++++++++++++++++++++++++-------------
 2 files changed, 127 insertions(+), 53 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6ff7c3b..659351c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -662,7 +662,8 @@ struct perf_event_context {
 	 */
 	struct mutex			mutex;
 
-	struct list_head		group_list;
+	struct list_head		pinned_grp_list;
+	struct list_head		volatile_grp_list;
 	struct list_head		event_list;
 	int				nr_events;
 	int				nr_active;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6f4ed3b..b3a31c8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -259,9 +259,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * add it straight to the context's event list, or to the group
 	 * leader's sibling list:
 	 */
-	if (group_leader == event)
-		list_add_tail(&event->group_entry, &ctx->group_list);
-	else {
+	if (group_leader == event) {
+		struct list_head *list;
+
+		if (event->attr.pinned)
+			list = &ctx->pinned_grp_list;
+		else
+			list = &ctx->volatile_grp_list;
+		list_add_tail(&event->group_entry, list);
+	} else {
 		list_add_tail(&event->group_entry, &group_leader->sibling_list);
 		group_leader->nr_siblings++;
 	}
@@ -299,8 +305,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * to the context list directly:
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+		struct list_head *list;
+
+		if (sibling->attr.pinned)
+			list = &ctx->pinned_grp_list;
+		else
+			list = &ctx->volatile_grp_list;
 
-		list_move_tail(&sibling->group_entry, &ctx->group_list);
+		list_move_tail(&sibling->group_entry, list);
 		sibling->group_leader = sibling;
 	}
 }
@@ -1032,10 +1044,14 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active)
-		list_for_each_entry(event, &ctx->group_list, group_entry)
+	if (ctx->nr_active) {
+		list_for_each_entry(event, &ctx->pinned_grp_list, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 
+		list_for_each_entry(event, &ctx->volatile_grp_list, group_entry)
+			group_sched_out(event, cpuctx, ctx);
+	}
+
 	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
@@ -1249,9 +1265,8 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    !event->attr.pinned)
+	list_for_each_entry(event, &ctx->pinned_grp_list, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
 		if (event->cpu != -1 && event->cpu != cpu)
 			continue;
@@ -1269,13 +1284,12 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 		}
 	}
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
+	list_for_each_entry(event, &ctx->volatile_grp_list, group_entry) {
 		/*
 		 * Ignore events in OFF or ERROR state, and
 		 * ignore pinned events since we did them already.
 		 */
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    event->attr.pinned)
+		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
 
 		/*
@@ -1428,8 +1442,13 @@ static void rotate_ctx(struct perf_event_context *ctx)
 	 * Rotate the first entry last (works just fine for group events too):
 	 */
 	perf_disable();
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		list_move_tail(&event->group_entry, &ctx->group_list);
+	list_for_each_entry(event, &ctx->pinned_grp_list, group_entry) {
+		list_move_tail(&event->group_entry, &ctx->pinned_grp_list);
+		break;
+	}
+
+	list_for_each_entry(event, &ctx->volatile_grp_list, group_entry) {
+		list_move_tail(&event->group_entry, &ctx->volatile_grp_list);
 		break;
 	}
 	perf_enable();
@@ -1465,6 +1484,22 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
 		perf_event_task_sched_in(curr, cpu);
 }
 
+static void __perf_event_enable_on_exec(struct perf_event *event,
+					struct perf_event_context *ctx,
+					int *enabled)
+{
+	if (!event->attr.enable_on_exec)
+		return;
+
+	event->attr.enable_on_exec = 0;
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		return;
+
+	__perf_event_mark_enabled(event, ctx);
+
+	*enabled = 1;
+}
+
 /*
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
@@ -1485,15 +1520,11 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	spin_lock(&ctx->lock);
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (!event->attr.enable_on_exec)
-			continue;
-		event->attr.enable_on_exec = 0;
-		if (event->state >= PERF_EVENT_STATE_INACTIVE)
-			continue;
-		__perf_event_mark_enabled(event, ctx);
-		enabled = 1;
-	}
+	list_for_each_entry(event, &ctx->pinned_grp_list, group_entry)
+		__perf_event_enable_on_exec(event, ctx, &enabled);
+
+	list_for_each_entry(event, &ctx->volatile_grp_list, group_entry)
+		__perf_event_enable_on_exec(event, ctx, &enabled);
 
 	/*
 	 * Unclone this context if we enabled any event.
@@ -1562,7 +1593,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
 	memset(ctx, 0, sizeof(*ctx));
 	spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->group_list);
+	INIT_LIST_HEAD(&ctx->pinned_grp_list);
+	INIT_LIST_HEAD(&ctx->volatile_grp_list);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 	ctx->task = task;
@@ -4869,7 +4901,11 @@ void perf_event_exit_task(struct task_struct *child)
 	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
-	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_grp_list,
+				 group_entry)
+		__perf_event_exit_task(child_event, child_ctx, child);
+
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->volatile_grp_list,
 				 group_entry)
 		__perf_event_exit_task(child_event, child_ctx, child);
 
@@ -4878,7 +4914,8 @@ again:
 	 * its siblings to the list, but we obtained 'tmp' before that which
 	 * will still point to the list head terminating the iteration.
 	 */
-	if (!list_empty(&child_ctx->group_list))
+	if (!list_empty(&child_ctx->pinned_grp_list) ||
+	    !list_empty(&child_ctx->volatile_grp_list))
 		goto again;
 
 	mutex_unlock(&child_ctx->mutex);
@@ -4886,6 +4923,24 @@ again:
 	put_ctx(child_ctx);
 }
 
+static void perf_event_free_event(struct perf_event *event,
+				  struct perf_event_context *ctx)
+{
+	struct perf_event *parent = event->parent;
+
+	if (WARN_ON_ONCE(!parent))
+		return;
+
+	mutex_lock(&parent->child_mutex);
+	list_del_init(&event->child_list);
+	mutex_unlock(&parent->child_mutex);
+
+	fput(parent->filp);
+
+	list_del_event(event, ctx);
+	free_event(event);
+}
+
 /*
  * free an unexposed, unused context as created by inheritance by
  * init_task below, used by fork() in case of fail.
@@ -4900,23 +4955,15 @@ void perf_event_free_task(struct task_struct *task)
 
 	mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
-		struct perf_event *parent = event->parent;
-
-		if (WARN_ON_ONCE(!parent))
-			continue;
-
-		mutex_lock(&parent->child_mutex);
-		list_del_init(&event->child_list);
-		mutex_unlock(&parent->child_mutex);
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_grp_list, group_entry)
+		perf_event_free_event(event, ctx);
 
-		fput(parent->filp);
-
-		list_del_event(event, ctx);
-		free_event(event);
-	}
+	list_for_each_entry_safe(event, tmp, &ctx->volatile_grp_list,
+				 group_entry)
+		perf_event_free_event(event, ctx);
 
-	if (!list_empty(&ctx->group_list))
+	if (!list_empty(&ctx->pinned_grp_list) ||
+	    !list_empty(&ctx->volatile_grp_list))
 		goto again;
 
 	mutex_unlock(&ctx->mutex);
@@ -4924,6 +4971,29 @@ again:
 	put_ctx(ctx);
 }
 
+static int
+perf_event_inherit(struct perf_event *event, struct task_struct *parent,
+		   struct perf_event_context *parent_ctx,
+		   struct task_struct *child,
+		   struct perf_event_context *child_ctx,
+		   int *inherited_all)
+{
+	int ret;
+
+	if (!event->attr.inherit) {
+		*inherited_all = 0;
+		return 0;
+	}
+
+	ret = inherit_group(event, parent, parent_ctx,
+			    child, child_ctx);
+	if (ret)
+		*inherited_all = 0;
+
+	return ret;
+}
+
+
 /*
  * Initialize the perf_event context in task_struct
  */
@@ -4981,19 +5051,20 @@ int perf_event_init_task(struct task_struct *child)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
+	list_for_each_entry(event, &parent_ctx->pinned_grp_list, group_entry) {
 
-		if (!event->attr.inherit) {
-			inherited_all = 0;
-			continue;
-		}
+		ret = perf_event_inherit(event, parent, parent_ctx, child,
+					 child_ctx, &inherited_all);
+		if (ret)
+			break;
+	}
+
+	list_for_each_entry(event, &parent_ctx->volatile_grp_list, group_entry) {
 
-		ret = inherit_group(event, parent, parent_ctx,
-					     child, child_ctx);
-		if (ret) {
-			inherited_all = 0;
+		ret = perf_event_inherit(event, parent, parent_ctx, child,
+					 child_ctx, &inherited_all);
+		if (ret)
 			break;
-		}
 	}
 
 	if (inherited_all) {
@@ -5044,7 +5115,9 @@ static void __perf_event_exit_cpu(void *info)
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_event *event, *tmp;
 
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_grp_list, group_entry)
+		__perf_event_remove_from_context(event);
+	list_for_each_entry_safe(event, tmp, &ctx->volatile_grp_list, group_entry)
 		__perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
-- 
1.6.2.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/