Message-Id: <20190702065955.165738-5-irogers@google.com>
Date:   Mon,  1 Jul 2019 23:59:52 -0700
From:   Ian Rogers <irogers@...gle.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Jiri Olsa <jolsa@...hat.com>,
        Namhyung Kim <namhyung@...nel.org>,
        linux-kernel@...r.kernel.org
Cc:     Kan Liang <kan.liang@...ux.intel.com>,
        Stephane Eranian <eranian@...gle.com>,
        Ian Rogers <irogers@...gle.com>
Subject: [PATCH 4/7] perf: avoid a bounded set of visit_groups_merge iterators

Create a per-cpu array of iterators that gets resized when cgroup events
are added. The size of the array reflects the maximum depth of cgroups,
although not all cgroups will have events monitored within them. This
approach avoids adding storage cost to struct perf_event.
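
In outline (condensed from the __perf_install_in_context hunk below, with
error handling omitted), the per-cpu storage grows on demand whenever a
cgroup event is installed:

	/* One slot per cgroup ancestor plus one for system wide events. */
	int max_iterators = perf_event_cgroup_depth(event) + 1;

	if (max_iterators > cpuctx->visit_groups_merge_iterator_storage_size) {
		struct perf_event **storage =
			krealloc(cpuctx->visit_groups_merge_iterator_storage,
				 sizeof(struct perf_event *) * max_iterators,
				 GFP_KERNEL);
		if (storage) {
			cpuctx->visit_groups_merge_iterator_storage = storage;
			cpuctx->visit_groups_merge_iterator_storage_size =
					max_iterators;
		}
	}

visit_groups_merge() then builds its min-heap directly in this storage
instead of in a fixed-size on-stack array.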

Signed-off-by: Ian Rogers <irogers@...gle.com>
---
 include/linux/perf_event.h |  2 +
 kernel/events/core.c       | 94 ++++++++++++++++++++++++++++----------
 2 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 16e38c286d46..5c479f61622c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -802,6 +802,8 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 	struct list_head		cgrp_cpuctx_entry;
+	struct perf_event		**visit_groups_merge_iterator_storage;
+	int			       visit_groups_merge_iterator_storage_size;
 #endif
 
 	struct list_head		sched_cb_entry;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 396b5ac6dcd4..a2c5ea868de9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1711,6 +1711,20 @@ perf_event_groups_next(struct perf_event *event)
 	return next;
 }
 
+#ifdef CONFIG_CGROUP_PERF
+int perf_event_cgroup_depth(struct perf_event *event)
+{
+	struct cgroup_subsys_state *css;
+	struct perf_cgroup *cgrp = event->cgrp;
+	int depth = 0;
+
+	if (cgrp)
+		for (css = &cgrp->css; css; css = css->parent)
+			depth++;
+	return depth;
+}
+#endif
+
 /*
  * Iterate through the whole groups tree.
  */
@@ -2592,6 +2606,7 @@ static int  __perf_install_in_context(void *info)
 
 #ifdef CONFIG_CGROUP_PERF
 	if (is_cgroup_event(event)) {
+		int max_iterators;
 		/*
 		 * If the current cgroup doesn't match the event's
 		 * cgroup, we should not try to schedule it.
@@ -2599,6 +2614,30 @@ static int  __perf_install_in_context(void *info)
 		struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 		reprogram = cgroup_is_descendant(cgrp->css.cgroup,
 					event->cgrp->css.cgroup);
+
+		/*
+		 * Ensure space for visit_groups_merge iterator storage. With
+		 * cgroup profiling we may have an event at each depth plus
+		 * system wide events.
+		 */
+		max_iterators = perf_event_cgroup_depth(event) + 1;
+		if (max_iterators >
+		    cpuctx->visit_groups_merge_iterator_storage_size) {
+			struct perf_event **storage =
+			   krealloc(cpuctx->visit_groups_merge_iterator_storage,
+				    sizeof(struct perf_event *) * max_iterators,
+				    GFP_KERNEL);
+			if (storage) {
+				cpuctx->visit_groups_merge_iterator_storage
+						= storage;
+				cpuctx->visit_groups_merge_iterator_storage_size
+						= max_iterators;
+			} else {
+				WARN_ONCE(1, "Unable to increase iterator "
+					"storage for perf events with cgroups");
+				ret = -ENOMEM;
+			}
+		}
 	}
 #endif
 
@@ -3389,6 +3428,13 @@ static void min_heap_pop_push(struct perf_event_heap *heap,
 	}
 }
 
+
+/*
+ * Without cgroups, a task context only needs iterators for per-CPU
+ * events and any-CPU events, hence a minimum of two.
+ */
+#define MIN_VISIT_GROUP_MERGE_ITERATORS 2
+
 static int visit_groups_merge(struct perf_event_context *ctx,
 			      struct perf_cpu_context *cpuctx,
 			      struct perf_event_groups *groups,
@@ -3398,35 +3444,27 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 					  int *),
 			      int *data)
 {
-#ifndef CONFIG_CGROUP_PERF
-	/*
-	 * Without cgroups, with a task context, iterate over per-CPU and any
-	 * CPU events.
-	 */
-	const int max_itrs = 2;
-#else
-	/*
-	 * The depth of cgroups is limited by MAX_PATH. It is unlikely that this
-	 * many parent-child related cgroups will have perf events
-	 * monitored. Limit the number of cgroup iterators to 16.
-	 */
-	const int max_cgroups_with_events_depth = 16;
-	/*
-	 * With cgroups we either iterate for a task context (per-CPU or any CPU
-	 * events) or for per CPU the global and per cgroup events.
-	 */
-	const int max_itrs = max(2, 1 + max_cgroups_with_events_depth);
-#endif
 	/*
 	 * A set of iterators, the iterator for the visit is chosen by the
 	 * group_index.
 	 */
-	struct perf_event *itrs[max_itrs];
+#ifndef CONFIG_CGROUP_PERF
+	struct perf_event *itrs[MIN_VISIT_GROUP_MERGE_ITERATORS];
 	struct perf_event_heap heap = {
 		.storage = itrs,
 		.num_elements = 0,
-		.max_elements = max_itrs
+		.max_elements = MIN_VISIT_GROUP_MERGE_ITERATORS
 	};
+#else
+	/*
+	 * With cgroups, use the iterator storage reserved in the CPU context.
+	 */
+	struct perf_event_heap heap = {
+		.storage = cpuctx->visit_groups_merge_iterator_storage,
+		.num_elements = 0,
+		.max_elements = cpuctx->visit_groups_merge_iterator_storage_size
+	};
+#endif
 	int ret, cpu = smp_processor_id();
 
 	heap.storage[0] = perf_event_groups_first(groups, cpu, NULL);
@@ -3461,9 +3499,8 @@ static int visit_groups_merge(struct perf_event_context *ctx,
 					heap.num_elements++;
 					if (heap.num_elements ==
 					    heap.max_elements) {
-						WARN_ONCE(
-				     max_cgroups_with_events_depth,
-				     "Insufficient iterators for cgroup depth");
+						WARN_ONCE(1,
+						"per-CPU min-heap under sized");
 						break;
 					}
 				}
@@ -10155,7 +10192,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
+#ifdef CONFIG_CGROUP_PERF
+		cpuctx->visit_groups_merge_iterator_storage =
+				kmalloc_array(MIN_VISIT_GROUP_MERGE_ITERATORS,
+					      sizeof(struct perf_event *),
+					      GFP_KERNEL);
+		cpuctx->visit_groups_merge_iterator_storage_size =
+				MIN_VISIT_GROUP_MERGE_ITERATORS;
+#endif
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
 
-- 
2.22.0.410.gd8fdbe21b5-goog
