Date:   Wed,  8 Jul 2020 18:16:34 +0300
From:   Alexander Shishkin <alexander.shishkin@...ux.intel.com>
To:     Peter Zijlstra <a.p.zijlstra@...llo.nl>,
        Arnaldo Carvalho de Melo <acme@...hat.com>
Cc:     Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
        Jiri Olsa <jolsa@...nel.org>, alexey.budankov@...ux.intel.com,
        adrian.hunter@...el.com,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Andi Kleen <ak@...ux.intel.com>
Subject: [PATCH 1/2] perf: Allow closing sibling events' file descriptors

Currently, perf requires one file descriptor per event. In large groups,
this can run into the limit on open file descriptors. However, the
sibling events in a group only need their file descriptors during the
initial configuration stage; after that, the descriptors are no longer
needed.

Add an opt-in flag to the perf_event_open() syscall that retains sibling
events after their file descriptors are closed. Such events are then
torn down together with their group leader.

Signed-off-by: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Suggested-by: Andi Kleen <ak@...ux.intel.com>
---
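A note on usage (not part of the commit message): below is a minimal
userspace sketch of how the new flag might be exercised. The flag value
matches the definition added by this patch; the event selection and the
absence of error handling are for illustration only.

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <unistd.h>

	#ifndef PERF_FLAG_ALLOW_CLOSE
	#define PERF_FLAG_ALLOW_CLOSE	(1UL << 4)	/* from this patch */
	#endif

	static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
				   int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu,
			       group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		int leader, sibling;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		/* The leader keeps its fd for the group's lifetime. */
		leader = perf_event_open(&attr, 0, -1, -1,
					 PERF_FLAG_FD_CLOEXEC);
		if (leader < 0)
			return 1;

		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
		/* The sibling opts in to surviving the close of its fd. */
		sibling = perf_event_open(&attr, 0, -1, leader,
					  PERF_FLAG_FD_CLOEXEC |
					  PERF_FLAG_ALLOW_CLOSE);
		if (sibling < 0)
			return 1;

		/*
		 * With ALLOW_CLOSE this does not destroy the event; it
		 * stays in the group, freeing one fd per sibling.
		 */
		close(sibling);

		/* ... use the group through the leader's fd ... */

		close(leader);	/* tears down leader and closed siblings */
		return 0;
	}
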
 include/linux/perf_event.h      |   7 ++
 include/uapi/linux/perf_event.h |   1 +
 kernel/events/core.c            | 149 +++++++++++++++++++++++---------
 3 files changed, 115 insertions(+), 42 deletions(-)
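
Since a closed sibling no longer has an fd, all post-setup interaction
goes through the leader. For instance, the whole group can be read via
the leader's fd using PERF_FORMAT_GROUP; this is existing ABI, sketched
here (assuming the includes above plus <stdio.h>) only to illustrate why
the sibling fds become redundant after setup:

	/*
	 * Requires attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID
	 * on the leader at creation time; one read() then returns every
	 * member of the group.
	 */
	struct group_read {
		__u64 nr;		/* number of events in the group */
		struct {
			__u64 value;	/* counter value */
			__u64 id;	/* PERF_FORMAT_ID identifier */
		} cnt[];
	};

	static void read_group(int leader)
	{
		char buf[4096];
		struct group_read *gr = (void *)buf;
		__u64 i;

		if (read(leader, buf, sizeof(buf)) <= 0)
			return;

		for (i = 0; i < gr->nr; i++)
			printf("id %llu: %llu\n",
			       (unsigned long long)gr->cnt[i].id,
			       (unsigned long long)gr->cnt[i].value);
	}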

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3b22db08b6fb..46666ce2c303 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -623,6 +623,11 @@ struct perf_event {
 	 * either sufficies for read.
 	 */
 	struct list_head		sibling_list;
+	/*
+	 * ALLOW_CLOSE siblings that were actually closed; when the group
+	 * leader goes, so should they.
+	 */
+	struct list_head		closed_list;
 	struct list_head		active_list;
 	/*
 	 * Node on the pinned or flexible tree located at the event context;
@@ -644,6 +649,8 @@ struct perf_event {
 	int				event_caps;
 	/* The cumulative AND of all event_caps for events in this group. */
 	int				group_caps;
+	unsigned			allow_close	: 1,
+					closed		: 1;
 
 	struct perf_event		*group_leader;
 	struct pmu			*pmu;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 52ca2093831c..69823c0e3cbd 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1093,6 +1093,7 @@ enum perf_callchain_context {
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
 #define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
+#define PERF_FLAG_ALLOW_CLOSE		(1UL << 4) /* retain the event past fd close */
 
 #if defined(__LITTLE_ENDIAN_BITFIELD)
 union perf_mem_data_src {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7c436d705fbd..e61be9cfce98 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -352,7 +352,8 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
-		       PERF_FLAG_FD_CLOEXEC)
+		       PERF_FLAG_FD_CLOEXEC |\
+		       PERF_FLAG_ALLOW_CLOSE)
 
 /*
  * branch priv levels that need permission checks
@@ -2165,6 +2166,15 @@ static void perf_group_detach(struct perf_event *event)
 	 * to whatever list we are on.
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+		if (sibling->closed) {
+			list_move(&sibling->sibling_list, &event->closed_list);
+			event->nr_siblings--;
+			continue;
+		}
+
+		/* Proceed as if it were an ordinary sibling */
+		if (sibling->allow_close)
+			sibling->allow_close = 0;
 
 		sibling->group_leader = sibling;
 		list_del_init(&sibling->sibling_list);
@@ -2313,6 +2323,7 @@ __perf_remove_from_context(struct perf_event *event,
 			   void *info)
 {
 	unsigned long flags = (unsigned long)info;
+	struct perf_event *sibling;
 
 	if (ctx->is_active & EVENT_TIME) {
 		update_context_time(ctx);
@@ -2332,6 +2343,10 @@ __perf_remove_from_context(struct perf_event *event,
 			cpuctx->task_ctx = NULL;
 		}
 	}
+
+	flags &= ~DETACH_GROUP;
+	list_for_each_entry(sibling, &event->closed_list, sibling_list)
+		__perf_remove_from_context(sibling, cpuctx, ctx, (void *)flags);
 }
 
 /*
@@ -4906,51 +4921,12 @@ static void put_event(struct perf_event *event)
 	_free_event(event);
 }
 
-/*
- * Kill an event dead; while event:refcount will preserve the event
- * object, it will not preserve its functionality. Once the last 'user'
- * gives up the object, we'll destroy the thing.
- */
-int perf_event_release_kernel(struct perf_event *event)
+static void perf_event_free_children(struct perf_event *event)
 {
-	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *child, *tmp;
+	struct perf_event_context *ctx;
 	LIST_HEAD(free_list);
 
-	/*
-	 * If we got here through err_file: fput(event_file); we will not have
-	 * attached to a context yet.
-	 */
-	if (!ctx) {
-		WARN_ON_ONCE(event->attach_state &
-				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
-		goto no_ctx;
-	}
-
-	if (!is_kernel_event(event))
-		perf_remove_from_owner(event);
-
-	ctx = perf_event_ctx_lock(event);
-	WARN_ON_ONCE(ctx->parent_ctx);
-	perf_remove_from_context(event, DETACH_GROUP);
-
-	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * Mark this event as STATE_DEAD, there is no external reference to it
-	 * anymore.
-	 *
-	 * Anybody acquiring event->child_mutex after the below loop _must_
-	 * also see this, most importantly inherit_event() which will avoid
-	 * placing more children on the list.
-	 *
-	 * Thus this guarantees that we will in fact observe and kill _ALL_
-	 * child events.
-	 */
-	event->state = PERF_EVENT_STATE_DEAD;
-	raw_spin_unlock_irq(&ctx->lock);
-
-	perf_event_ctx_unlock(event, ctx);
-
 again:
 	mutex_lock(&event->child_mutex);
 	list_for_each_entry(child, &event->child_list, child_list) {
@@ -5016,6 +4992,82 @@ int perf_event_release_kernel(struct perf_event *event)
 		smp_mb(); /* pairs with wait_var_event() */
 		wake_up_var(var);
 	}
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_event *sibling, *tmp;
+
+	/*
+	 * If we got here through err_file: fput(event_file); we will not have
+	 * attached to a context yet.
+	 */
+	if (!ctx) {
+		WARN_ON_ONCE(event->attach_state &
+				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+		goto no_ctx;
+	}
+
+	if (!is_kernel_event(event))
+		perf_remove_from_owner(event);
+
+	ctx = perf_event_ctx_lock(event);
+	WARN_ON_ONCE(ctx->parent_ctx);
+
+	if (event->allow_close && !event->closed) {
+		event->closed = 1;
+		perf_event_ctx_unlock(event, ctx);
+		return 0;
+	}
+
+	/*
+	 * The below will also move all closed siblings to the closed_list,
+	 * so that we can reap their children and remove them from the owner.
+	 */
+	perf_remove_from_context(event, DETACH_GROUP);
+
+	raw_spin_lock_irq(&ctx->lock);
+	/*
+	 * Mark this event as STATE_DEAD, there is no external reference to it
+	 * anymore.
+	 *
+	 * Anybody acquiring event->child_mutex after the below loop _must_
+	 * also see this, most importantly inherit_event() which will avoid
+	 * placing more children on the list. It will also skip over closed
+	 * siblings, as they are also going away together with their leader.
+	 *
+	 * Thus this guarantees that we will in fact observe and kill _ALL_
+	 * child events.
+	 */
+	event->state = PERF_EVENT_STATE_DEAD;
+	raw_spin_unlock_irq(&ctx->lock);
+
+	perf_event_ctx_unlock(event, ctx);
+
+	perf_event_free_children(event);
+
+	/*
+	 * The events on the closed_list are former siblings whose file
+	 * descriptors have been closed, so this is their teardown.
+	 */
+	list_for_each_entry_safe(sibling, tmp, &event->closed_list, sibling_list) {
+		if (!is_kernel_event(sibling))
+			perf_remove_from_owner(sibling);
+		perf_event_free_children(sibling);
+		/*
+		 * The put_event() below may drop the last reference, or the
+		 * perf_event_exit_event() path may race us for it; either
+		 * way, that path is fully equipped to free events. The
+		 * _safe iteration above keeps the list walk valid here.
+		 */
+		put_event(sibling);
+	}
 
 no_ctx:
 	put_event(event); /* Must be the 'last' reference */
@@ -11118,6 +11170,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
+	INIT_LIST_HEAD(&event->closed_list);
 	INIT_LIST_HEAD(&event->active_list);
 	init_event_group(event);
 	INIT_LIST_HEAD(&event->rb_entry);
@@ -11718,6 +11771,14 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
+	if (flags & PERF_FLAG_ALLOW_CLOSE) {
+		if (!group_leader || group_leader->group_leader != group_leader) {
+			err = -EINVAL;
+			goto err_task;
+		}
+		event->allow_close = 1;
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -12498,6 +12559,10 @@ inherit_event(struct perf_event *parent_event,
 	if (parent_event->parent)
 		parent_event = parent_event->parent;
 
+	/* If group leader is getting closed, ignore its closed siblings */
+	if (!group_leader && parent_event->closed)
+		return NULL;
+
 	child_event = perf_event_alloc(&parent_event->attr,
 					   parent_event->cpu,
 					   child,
-- 
2.27.0
