Message-ID: <20160125145414.GG6375@twins.programming.kicks-ass.net>
Date:	Mon, 25 Jan 2016 15:54:14 +0100
From:	Peter Zijlstra <peterz@...radead.org>
To:	Alexei Starovoitov <alexei.starovoitov@...il.com>
Cc:	Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
	Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
	vince@...ter.net, eranian@...gle.com,
	Arnaldo Carvalho de Melo <acme@...radead.org>,
	Jiri Olsa <jolsa@...nel.org>
Subject: Re: [PATCH v2] perf: Synchronously cleanup child events

On Mon, Jan 25, 2016 at 12:48:46PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 22, 2016 at 11:44:03AM -0800, Alexei Starovoitov wrote:
> > On Fri, Jan 22, 2016 at 01:38:47PM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 22, 2016 at 01:35:40PM +0200, Alexander Shishkin wrote:
> > > > Peter Zijlstra <peterz@...radead.org> writes:
> > > > 
> > > > > So I think there's a number of problems still :-(
> > 
> > I've been looking at how perf_event->owner is handled and couldn't
> > figure out how you deal with the case of passing perf_event_fd via scm_rights.
> > It seems one process can open an event, pass it to another process,
> > but when the current process exits, owner will still point to the dead task,
> > since refcount > 0.
> > Which part am I missing?
> 
> Nothing, you raised a good point. I think this shows we cannot link
> !event->owner to an event being 'dead'.
> 
> If we keep these two states separate, the scm_rights thing should work
> again.

Alexander, Alexei,

How about the below? That uses event->state == PERF_EVENT_STATE_EXIT to
indicate the event has been given up by its 'owner' and decouples us
from the actual event->owner logic.

This retains the event->owner and event->owner_list thing purely for the
prctl(.option = PR_TASK_PERF_EVENTS_{EN,DIS}ABLE) calls, but does give
us strict 'owner' semantics in that:

  struct perf_event *my_event = perf_event_create_kernel_counter();

  /* ... */

  perf_event_release_kernel(my_event);

Or

  int fd = sys_perf_event_open(...);

  close(fd); /* last, calls fops::release */

Will destroy the event dead. event::refcount will 'retain' the object
but it will become non-functional and is strictly meant as a temporary
existence guarantee (for when RCU isn't good enough).
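
(For holders of such a temporary reference the pattern would be roughly
the below -- sketch only; the guard is the same atomic_long_inc_not_zero()
this patch uses in inherit_event():)

  if (atomic_long_inc_not_zero(&event->refcount)) {
  	/*
  	 * The object cannot go away from under us, but it
  	 * may already be dead (PERF_EVENT_STATE_EXIT).
  	 */

  	/* ... safe to touch the event here ... */

  	put_event(event);
  }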

So this should restore the scm_rights case, which preserves the fd but
could result in not having event->owner (and therefore being removed
from its owner_list), which is fine.
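
(For reference, the fd-passing in question is the regular SCM_RIGHTS
dance; minimal userspace sketch, variable names made up:)

  char buf[CMSG_SPACE(sizeof(int))], byte = 0;
  struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
  struct msghdr msg = {
  	.msg_iov = &iov, .msg_iovlen = 1,
  	.msg_control = buf, .msg_controllen = sizeof(buf),
  };
  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

  cmsg->cmsg_level = SOL_SOCKET;
  cmsg->cmsg_type = SCM_RIGHTS;
  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  memcpy(CMSG_DATA(cmsg), &perf_fd, sizeof(int));

  sendmsg(unix_sock, &msg, 0); /* receiver's fd now pins the event */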

BPF still needs to get fixed to use filedesc references instead.
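
(Which would look something like this -- hypothetical sketch, not the
actual BPF fix:)

  struct fd f = fdget(perf_event_fd);

  if (!f.file)
  	return -EBADF;

  /* ... f.file->private_data is the struct perf_event ... */

  fdput(f);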

---

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -634,9 +634,6 @@ struct perf_event_context {
 	int				nr_cgroups;	 /* cgroup evts */
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
-
-	struct delayed_work		orphans_remove;
-	bool				orphans_remove_sched;
 };
 
 /*
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
 
 #include <asm/irq_regs.h>
 
-static struct workqueue_struct *perf_wq;
-
 typedef int (*remote_function_f)(void *);
 
 struct remote_function_call {
@@ -1086,8 +1084,8 @@ static void put_ctx(struct perf_event_co
  * Lock order:
  *	task_struct::perf_event_mutex
  *	  perf_event_context::mutex
- *	    perf_event_context::lock
  *	    perf_event::child_mutex;
+ *	      perf_event_context::lock
  *	    perf_event::mmap_mutex
  *	    mmap_sem
  */
@@ -1646,45 +1644,11 @@ static void perf_group_detach(struct per
 		perf_event__header_size(tmp);
 }
 
-/*
- * User event without the task.
- */
 static bool is_orphaned_event(struct perf_event *event)
 {
-	return event && !is_kernel_event(event) && !event->owner;
-}
-
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
-	return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
-	if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
-		return;
-
-	if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
-		get_ctx(ctx);
-		ctx->orphans_remove_sched = true;
-	}
+	return event->state == PERF_EVENT_STATE_EXIT;
 }
 
-static int __init perf_workqueue_init(void)
-{
-	perf_wq = create_singlethread_workqueue("perf");
-	WARN(!perf_wq, "failed to create perf workqueue\n");
-	return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
 static inline int pmu_filter_match(struct perf_event *event)
 {
 	struct pmu *pmu = event->pmu;
@@ -1745,9 +1709,6 @@ event_sched_out(struct perf_event *event
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
 
-	if (is_orphaned_child(event))
-		schedule_orphans_remove(ctx);
-
 	perf_pmu_enable(event->pmu);
 }
 
@@ -1771,6 +1732,9 @@ group_sched_out(struct perf_event *group
 		cpuctx->exclusive = 0;
 }
 
+#define DETACH_GROUP	0x01
+#define DETACH_STATE	0x02
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -1783,13 +1747,19 @@ __perf_remove_from_context(struct perf_e
 			   struct perf_event_context *ctx,
 			   void *info)
 {
-	bool detach_group = (unsigned long)info;
+	unsigned long flags = (unsigned long)info;
 
 	event_sched_out(event, cpuctx, ctx);
-	if (detach_group)
+
+	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	list_del_event(event, ctx);
 
+	if (flags & DETACH_STATE) {
+		event->state = PERF_EVENT_STATE_EXIT;
+		perf_event_wakeup(event);
+	}
+
 	if (!ctx->nr_events && ctx->is_active) {
 		ctx->is_active = 0;
 		if (ctx->task) {
@@ -1809,12 +1779,11 @@ __perf_remove_from_context(struct perf_e
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
 {
 	lockdep_assert_held(&event->ctx->mutex);
 
-	event_function_call(event, __perf_remove_from_context,
-			    (void *)(unsigned long)detach_group);
+	event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -1980,9 +1949,6 @@ event_sched_in(struct perf_event *event,
 	if (event->attr.exclusive)
 		cpuctx->exclusive = 1;
 
-	if (is_orphaned_child(event))
-		schedule_orphans_remove(ctx);
-
 out:
 	perf_pmu_enable(event->pmu);
 
@@ -2253,7 +2219,8 @@ static void __perf_event_enable(struct p
 	struct perf_event *leader = event->group_leader;
 	struct perf_event_context *task_ctx;
 
-	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
 	update_context_time(ctx);
@@ -2298,7 +2265,8 @@ static void _perf_event_enable(struct pe
 	struct perf_event_context *ctx = event->ctx;
 
 	raw_spin_lock_irq(&ctx->lock);
-	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+	    event->state < PERF_EVENT_STATE_ERROR) {
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
@@ -3363,7 +3331,6 @@ static void __perf_event_init_context(st
 	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
-	INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
 }
 
 static struct perf_event_context *
@@ -3665,29 +3632,6 @@ static bool exclusive_event_installable(
 	return true;
 }
 
-static void __free_event(struct perf_event *event)
-{
-	if (!event->parent) {
-		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
-			put_callchain_buffers();
-	}
-
-	perf_event_free_bpf_prog(event);
-
-	if (event->destroy)
-		event->destroy(event);
-
-	if (event->ctx)
-		put_ctx(event->ctx);
-
-	if (event->pmu) {
-		exclusive_event_destroy(event);
-		module_put(event->pmu->module);
-	}
-
-	call_rcu(&event->rcu_head, free_event_rcu);
-}
-
 static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending);
@@ -3709,7 +3653,25 @@ static void _free_event(struct perf_even
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);
 
-	__free_event(event);
+	if (!event->parent) {
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			put_callchain_buffers();
+	}
+
+	perf_event_free_bpf_prog(event);
+
+	if (event->destroy)
+		event->destroy(event);
+
+	if (event->ctx)
+		put_ctx(event->ctx);
+
+	if (event->pmu) {
+		exclusive_event_destroy(event);
+		module_put(event->pmu->module);
+	}
+
+	call_rcu(&event->rcu_head, free_event_rcu);
 }
 
 /*
@@ -3771,8 +3733,10 @@ static void perf_remove_from_owner(struc
 		 * ensured they're done, and we can proceed with freeing the
 		 * event.
 		 */
-		if (event->owner)
+		if (event->owner) {
 			list_del_init(&event->owner_entry);
+			event->owner = NULL;
+		}
 		mutex_unlock(&owner->perf_event_mutex);
 		put_task_struct(owner);
 	}
@@ -3780,11 +3744,23 @@ static void perf_remove_from_owner(struc
 
 static void put_event(struct perf_event *event)
 {
-	struct perf_event_context *ctx;
-
 	if (!atomic_long_dec_and_test(&event->refcount))
 		return;
 
+	_free_event(event);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+	struct perf_event_context *ctx;
+	struct perf_event *child, *tmp;
+	LIST_HEAD(child_list);
+
 	if (!is_kernel_event(event))
 		perf_remove_from_owner(event);
 
@@ -3802,14 +3778,57 @@ static void put_event(struct perf_event
 	 */
 	ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
 	WARN_ON_ONCE(ctx->parent_ctx);
-	perf_remove_from_context(event, true);
+	perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
 	perf_event_ctx_unlock(event, ctx);
 
-	_free_event(event);
-}
+	/*
+	 * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
+	 * either from the above perf_remove_from_context() or through
+	 * __perf_event_exit_task().
+	 *
+	 * Therefore, anybody acquiring event->child_mutex after the below
+	 * list_splice_init() _must_ also see this, most importantly
+	 * inherit_event() which will avoid placing more children on the
+	 * list.
+	 *
+	 * Thus this guarantees that we will in fact observe and kill _ALL_
+	 * child events.
+	 */
+	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
 
-int perf_event_release_kernel(struct perf_event *event)
-{
+	/*
+	 * event::child_mutex nests inside ctx::mutex, so move children
+	 * to a safe place first and avoid inversion
+	 */
+	mutex_lock(&event->child_mutex);
+	list_splice_init(&event->child_list, &child_list);
+	mutex_unlock(&event->child_mutex);
+
+	list_for_each_entry_safe(child, tmp, &child_list, child_list) {
+		struct perf_event_context *ctx;
+
+		/*
+		 * This is somewhat similar to perf_free_event(),
+		 * except that these events are alive and need
+		 * proper perf_remove_from_context().
+		 */
+		ctx = perf_event_ctx_lock(child);
+		perf_remove_from_context(child, DETACH_GROUP);
+		perf_event_ctx_unlock(child, ctx);
+
+		list_del(&child->child_list);
+
+		/* Children will have exactly one reference */
+		free_event(child);
+
+		/*
+		 * This matches the refcount bump in inherit_event();
+		 * this can't be the last reference.
+		 */
+		put_event(event);
+	}
+
+	/* Must be the last reference */
 	put_event(event);
 	return 0;
 }
@@ -3820,46 +3839,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_ker
  */
 static int perf_release(struct inode *inode, struct file *file)
 {
-	put_event(file->private_data);
+	perf_event_release_kernel(file->private_data);
 	return 0;
 }
 
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
-	struct perf_event_context *ctx;
-	struct perf_event *event, *tmp;
-
-	ctx = container_of(work, struct perf_event_context,
-			   orphans_remove.work);
-
-	mutex_lock(&ctx->mutex);
-	list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
-		struct perf_event *parent_event = event->parent;
-
-		if (!is_orphaned_child(event))
-			continue;
-
-		perf_remove_from_context(event, true);
-
-		mutex_lock(&parent_event->child_mutex);
-		list_del_init(&event->child_list);
-		mutex_unlock(&parent_event->child_mutex);
-
-		free_event(event);
-		put_event(parent_event);
-	}
-
-	raw_spin_lock_irq(&ctx->lock);
-	ctx->orphans_remove_sched = false;
-	raw_spin_unlock_irq(&ctx->lock);
-	mutex_unlock(&ctx->mutex);
-
-	put_ctx(ctx);
-}
-
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
@@ -8432,11 +8415,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * See perf_event_ctx_lock() for comments on the details
 		 * of swizzling perf_event::ctx.
 		 */
-		perf_remove_from_context(group_leader, false);
+		perf_remove_from_context(group_leader, 0);
 
 		list_for_each_entry(sibling, &group_leader->sibling_list,
 				    group_entry) {
-			perf_remove_from_context(sibling, false);
+			perf_remove_from_context(sibling, 0);
 			put_ctx(gctx);
 		}
 
@@ -8616,7 +8599,7 @@ void perf_pmu_migrate_context(struct pmu
 	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
 	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
 				 event_entry) {
-		perf_remove_from_context(event, false);
+		perf_remove_from_context(event, 0);
 		unaccount_event_cpu(event, src_cpu);
 		put_ctx(src_ctx);
 		list_add(&event->migrate_entry, &events);
@@ -8665,7 +8648,7 @@ void perf_pmu_migrate_context(struct pmu
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
 static void sync_child_event(struct perf_event *child_event,
-			       struct task_struct *child)
+			     struct task_struct *child)
 {
 	struct perf_event *parent_event = child_event->parent;
 	u64 child_val;
@@ -8684,25 +8667,6 @@ static void sync_child_event(struct perf
 	atomic64_add(child_event->total_time_running,
 		     &parent_event->child_total_time_running);
 
-	/*
-	 * Remove this event from the parent's list
-	 */
-	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-	mutex_lock(&parent_event->child_mutex);
-	list_del_init(&child_event->child_list);
-	mutex_unlock(&parent_event->child_mutex);
-
-	/*
-	 * Make sure user/parent get notified, that we just
-	 * lost one event.
-	 */
-	perf_event_wakeup(parent_event);
-
-	/*
-	 * Release the parent event, if this was the last
-	 * reference to it.
-	 */
-	put_event(parent_event);
 }
 
 static void
@@ -8710,6 +8674,8 @@ __perf_event_exit_task(struct perf_event
 			 struct perf_event_context *child_ctx,
 			 struct task_struct *child)
 {
+	struct perf_event *parent_event = child_event->parent;
+
 	/*
 	 * Do not destroy the 'original' grouping; because of the context
 	 * switch optimization the original events could've ended up in a
@@ -8728,20 +8694,39 @@ __perf_event_exit_task(struct perf_event
 	if (!!child_event->parent)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
+	child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*
-	 * It can happen that the parent exits first, and has events
-	 * that are still around due to the child reference. These
-	 * events need to be zapped.
+	 * Parent events are governed by their filedesc, retain them.
 	 */
-	if (child_event->parent) {
-		sync_child_event(child_event, child);
-		free_event(child_event);
-	} else {
-		child_event->state = PERF_EVENT_STATE_EXIT;
+	if (!child_event->parent) {
 		perf_event_wakeup(child_event);
+		return;
 	}
+	/*
+	 * This is a child event, cleanup and free.
+	 */
+
+	/*
+	 * Fold delta back into parent counts.
+	 */
+	sync_child_event(child_event, child);
+
+	/*
+	 * Remove this event from the parent's list.
+	 */
+	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+	mutex_lock(&parent_event->child_mutex);
+	list_del_init(&child_event->child_list);
+	mutex_unlock(&parent_event->child_mutex);
+
+	/*
+	 * Kick perf_poll for is_event_hup().
+	 */
+	perf_event_wakeup(parent_event);
+	free_event(child_event);
+	put_event(parent_event);
 }
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -8973,8 +8958,15 @@ inherit_event(struct perf_event *parent_
 	if (IS_ERR(child_event))
 		return child_event;
 
+	/*
+	 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+	 * must be under the same lock in order to serialize against
+	 * perf_event_release_kernel().
+	 */
+	mutex_lock(&parent_event->child_mutex);
 	if (is_orphaned_event(parent_event) ||
 	    !atomic_long_inc_not_zero(&parent_event->refcount)) {
+		mutex_unlock(&parent_event->child_mutex);
 		free_event(child_event);
 		return NULL;
 	}
@@ -9022,8 +9014,6 @@ inherit_event(struct perf_event *parent_
 	/*
 	 * Link this into the parent event's child list
 	 */
-	WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-	mutex_lock(&parent_event->child_mutex);
 	list_add_tail(&child_event->child_list, &parent_event->child_list);
 	mutex_unlock(&parent_event->child_mutex);
 
