Message-ID: <tip-ad3a37de81c45f6c20d410ece86004b98f7b6d84@git.kernel.org>
Date:	Fri, 29 May 2009 12:03:33 GMT
From:	tip-bot for Paul Mackerras <paulus@...ba.org>
To:	linux-tip-commits@...r.kernel.org
Cc:	linux-kernel@...r.kernel.org, paulus@...ba.org, hpa@...or.com,
	mingo@...hat.com, a.p.zijlstra@...llo.nl, tglx@...utronix.de,
	mingo@...e.hu
Subject: [tip:perfcounters/core] perf_counter: Don't swap contexts containing locked mutex

Commit-ID:  ad3a37de81c45f6c20d410ece86004b98f7b6d84
Gitweb:     http://git.kernel.org/tip/ad3a37de81c45f6c20d410ece86004b98f7b6d84
Author:     Paul Mackerras <paulus@...ba.org>
AuthorDate: Fri, 29 May 2009 16:06:20 +1000
Committer:  Ingo Molnar <mingo@...e.hu>
CommitDate: Fri, 29 May 2009 11:02:46 +0200

perf_counter: Don't swap contexts containing locked mutex

Peter Zijlstra pointed out that under some circumstances, we can take
the mutex in a context or counter and then have that context or
counter swapped to another task, potentially leading to lock order
inversions or to the mutexes not protecting what they are supposed to
protect.

This fixes the problem by making sure that we never take a mutex in a
context or counter which could get swapped to another task.  Most of
the cases where we take a mutex are on a top-level counter or context,
i.e. a counter which has an fd associated with it or a context that
contains such a counter.  This adds WARN_ON_ONCE statements to verify
that.
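
For reference, the locking pattern these WARN_ON_ONCE checks are meant
to enforce can be condensed as below; example_top_level_op() is a
hypothetical helper for illustration only, not something the patch adds:

	static void example_top_level_op(struct perf_counter *counter)
	{
		struct perf_counter_context *ctx = counter->ctx;

		/* Top-level contexts are never clones; a clone here is a bug. */
		WARN_ON_ONCE(ctx->parent_ctx);
		mutex_lock(&ctx->mutex);
		/* ... walk or modify the counter list; the context cannot
		 * be swapped to another task while we hold the mutex ... */
		mutex_unlock(&ctx->mutex);
	}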

The two cases where we need to take the mutex on a context that is a
clone of another are in perf_counter_exit_task and
perf_counter_init_task.  The perf_counter_exit_task case is solved by
uncloning the context before starting to remove the counters from it.
The perf_counter_init_task case is a little trickier: we temporarily
disable context swapping for the parent (forking) task by locking its
context and, if it is a cloned context, setting its ctx->parent_gen to
the all-1s value; at the end we restore the ctx->parent_gen value,
provided the context didn't get uncloned in the meantime.
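
Condensed into hypothetical helpers (freeze_parent_gen() and
restore_parent_gen() do not exist in the patch, and the RCU retry loop
the real perf_counter_init_task() uses to pin parent_ctx is omitted),
the trick looks roughly like this:

	static u64 freeze_parent_gen(struct perf_counter_context *parent_ctx)
	{
		u64 saved_gen;

		spin_lock_irq(&parent_ctx->lock);
		saved_gen = parent_ctx->parent_gen;
		if (parent_ctx->parent_ctx)
			parent_ctx->parent_gen = ~0ull;	/* impossible generation */
		spin_unlock_irq(&parent_ctx->lock);
		return saved_gen;
	}

	static void restore_parent_gen(struct perf_counter_context *parent_ctx,
				       u64 saved_gen)
	{
		if (parent_ctx->parent_ctx) {
			spin_lock_irq(&parent_ctx->lock);
			if (parent_ctx->parent_ctx)	/* still a clone? */
				parent_ctx->parent_gen = saved_gen;
			spin_unlock_irq(&parent_ctx->lock);
		}
	}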

This also moves the increment of the context generation count into
the same critical section, protected by the context mutex, that adds
the new counter to the context.  That way, taking the mutex is
sufficient to ensure that both the counter list and the generation
count are stable.
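
Concretely, the reordered critical section in the perf_counter_open
syscall (taken from the patch below, shown out of context) becomes:

	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	++ctx->generation;	/* same critical section as the list update */
	mutex_unlock(&ctx->mutex);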

[ Impact: fix hangs, races with inherited and PID counters ]

Signed-off-by: Paul Mackerras <paulus@...ba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
LKML-Reference: <18975.31580.520676.619896@...ngo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@...e.hu>


---
 kernel/perf_counter.c |   89 +++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52e5a15..db843f8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -919,7 +919,8 @@ static int context_equiv(struct perf_counter_context *ctx1,
 			 struct perf_counter_context *ctx2)
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen;
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& ctx1->parent_gen != ~0ull;
 }
 
 /*
@@ -1339,7 +1340,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			put_ctx(parent_ctx);
 			ctx->parent_ctx = NULL;		/* no longer a clone */
 		}
-		++ctx->generation;
 		/*
 		 * Get an extra reference before dropping the lock so that
 		 * this context won't get freed if the task exits.
@@ -1414,6 +1414,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	file->private_data = NULL;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&ctx->mutex);
@@ -1445,6 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
@@ -1504,6 +1506,7 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *sibling;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	counter = counter->group_leader;
 
@@ -1524,6 +1527,7 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	func(counter);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1536,6 +1540,7 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	perf_counter_for_each_sibling(counter, func);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1741,6 +1746,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
 		struct user_struct *user = current_user();
@@ -1788,6 +1794,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
@@ -3349,8 +3356,10 @@ SYSCALL_DEFINE5(perf_counter_open,
 		goto err_free_put_context;
 
 	counter->filp = counter_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, counter, cpu);
+	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
 	counter->owner = current;
@@ -3436,6 +3445,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link this into the parent counter's child list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3485,6 +3495,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	/*
 	 * Remove this counter from the parent's list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_del_init(&child_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3527,12 +3538,17 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	child_ctx = child->perf_counter_ctxp;
-
-	if (likely(!child_ctx))
+	if (likely(!child->perf_counter_ctxp))
 		return;
 
 	local_irq_save(flags);
+	/*
+	 * We can't reschedule here because interrupts are disabled,
+	 * and either child is current or it is a task that can't be
+	 * scheduled, so we are now safe from rescheduling changing
+	 * our context.
+	 */
+	child_ctx = child->perf_counter_ctxp;
 	__perf_counter_task_sched_out(child_ctx);
 
 	/*
@@ -3542,6 +3558,15 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
+	if (child_ctx->parent_ctx) {
+		/*
+		 * This context is a clone; unclone it so it can't get
+		 * swapped to another process while we're removing all
+		 * the counters from it.
+		 */
+		put_ctx(child_ctx->parent_ctx);
+		child_ctx->parent_ctx = NULL;
+	}
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
@@ -3571,9 +3596,11 @@ again:
 int perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter_context *cloned_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	u64 cloned_gen;
 	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
@@ -3581,8 +3608,7 @@ int perf_counter_init_task(struct task_struct *child)
 	mutex_init(&child->perf_counter_mutex);
 	INIT_LIST_HEAD(&child->perf_counter_list);
 
-	parent_ctx = parent->perf_counter_ctxp;
-	if (likely(!parent_ctx || !parent_ctx->nr_counters))
+	if (likely(!parent->perf_counter_ctxp))
 		return 0;
 
 	/*
@@ -3600,6 +3626,34 @@ int perf_counter_init_task(struct task_struct *child)
 	get_task_struct(child);
 
 	/*
+	 * If the parent's context is a clone, temporarily set its
+	 * parent_gen to an impossible value (all 1s) so it won't get
+	 * swapped under us.  The rcu_read_lock makes sure that
+	 * parent_ctx continues to exist even if it gets swapped to
+	 * another process and then freed while we are trying to get
+	 * its lock.
+	 */
+	rcu_read_lock();
+ retry:
+	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+	/*
+	 * No need to check if parent_ctx != NULL here; since we saw
+	 * it non-NULL earlier, the only reason for it to become NULL
+	 * is if we exit, and since we're currently in the middle of
+	 * a fork we can't be exiting at the same time.
+	 */
+	spin_lock_irq(&parent_ctx->lock);
+	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
+		spin_unlock_irq(&parent_ctx->lock);
+		goto retry;
+	}
+	cloned_gen = parent_ctx->parent_gen;
+	if (parent_ctx->parent_ctx)
+		parent_ctx->parent_gen = ~0ull;
+	spin_unlock_irq(&parent_ctx->lock);
+	rcu_read_unlock();
+
+	/*
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
 	 */
@@ -3630,10 +3684,15 @@ int perf_counter_init_task(struct task_struct *child)
 		/*
 		 * Mark the child context as a clone of the parent
 		 * context, or of whatever the parent is a clone of.
+		 * Note that if the parent is a clone, it could get
+		 * uncloned at any point, but that doesn't matter
+		 * because the list of counters and the generation
+		 * count can't have changed since we took the mutex.
 		 */
-		if (parent_ctx->parent_ctx) {
-			child_ctx->parent_ctx = parent_ctx->parent_ctx;
-			child_ctx->parent_gen = parent_ctx->parent_gen;
+		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+		if (cloned_ctx) {
+			child_ctx->parent_ctx = cloned_ctx;
+			child_ctx->parent_gen = cloned_gen;
 		} else {
 			child_ctx->parent_ctx = parent_ctx;
 			child_ctx->parent_gen = parent_ctx->generation;
@@ -3643,6 +3702,16 @@ int perf_counter_init_task(struct task_struct *child)
 
 	mutex_unlock(&parent_ctx->mutex);
 
+	/*
+	 * Restore the clone status of the parent.
+	 */
+	if (parent_ctx->parent_ctx) {
+		spin_lock_irq(&parent_ctx->lock);
+		if (parent_ctx->parent_ctx)
+			parent_ctx->parent_gen = cloned_gen;
+		spin_unlock_irq(&parent_ctx->lock);
+	}
+
 	return ret;
 }
 
--