lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 28 Apr 2017 16:24:56 +0200
From:   Sebastian Siewior <bigeasy@...utronix.de>
To:     Thomas Gleixner <tglx@...utronix.de>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     LKML <linux-kernel@...r.kernel.org>,
        Ingo Molnar <mingo@...nel.org>,
        Steven Rostedt <rostedt@...dmis.org>
Subject: [RFC PATCH] trace/perf: cure locking issue in perf_event_open()
 error path

As trinity figured out, there is a recursive get_online_cpus() in
perf_event_open()'s error path:
| Call Trace:
|  dump_stack+0x86/0xce
|  __lock_acquire+0x2520/0x2cd0
|  lock_acquire+0x27c/0x2f0
|  get_online_cpus+0x3d/0x80
|  static_key_slow_dec+0x5a/0x70
|  sw_perf_event_destroy+0x8e/0x100
|  _free_event+0x61b/0x800
|  free_event+0x68/0x70
|  SyS_perf_event_open+0x19db/0x1d80

In order to cure, I am moving free_event() after the put_online_cpus()
block.
Besides that one, there also the error path in perf_event_alloc() which
also invokes event->destory. Here I delayed the destory work to
schedule_work().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---

I am not quite happy with the schedule_work() part.

 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 52 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..d6a874dbbd21 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -718,6 +718,7 @@ struct perf_event {
 #endif
 
 	struct list_head		sb_list;
+	struct work_struct		destroy_work;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7aed78b516fc..3358889609f8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9320,6 +9320,16 @@ static void account_event(struct perf_event *event)
 	account_pmu_sb_event(event);
 }
 
+static void perf_alloc_destroy_ev(struct work_struct *work)
+{
+	struct perf_event *event;
+
+	event = container_of(work, struct perf_event, destroy_work);
+	event->destroy(event);
+	module_put(event->pmu->module);
+	kfree(event);
+}
+
 /*
  * Allocate and initialize a event structure
  */
@@ -9334,6 +9344,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	struct pmu *pmu;
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
+	bool delay_destroy = false;
 	long err = -EINVAL;
 
 	if ((unsigned)cpu >= nr_cpu_ids) {
@@ -9497,15 +9508,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	exclusive_event_destroy(event);
 
 err_pmu:
-	if (event->destroy)
-		event->destroy(event);
-	module_put(pmu->module);
+	if (event->destroy) {
+		/* delay ->destroy due to nested get_online_cpus() */
+		INIT_WORK(&event->destroy_work, perf_alloc_destroy_ev);
+		delay_destroy = true;
+	} else {
+		module_put(pmu->module);
+	}
 err_ns:
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);
 	if (event->ns)
 		put_pid_ns(event->ns);
-	kfree(event);
+	if (delay_destroy)
+		schedule_work(&event->destroy_work);
+	else
+		kfree(event);
 
 	return ERR_PTR(err);
 }
@@ -9798,7 +9816,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_event *group_leader = NULL, *output_event = NULL;
-	struct perf_event *event, *sibling;
+	struct perf_event *event = NULL, *sibling;
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx, *uninitialized_var(gctx);
 	struct file *event_file = NULL;
@@ -9908,13 +9926,14 @@ SYSCALL_DEFINE5(perf_event_open,
 				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
+		event = NULL;
 		goto err_cred;
 	}
 
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -EOPNOTSUPP;
-			goto err_alloc;
+			goto err_cred;
 		}
 	}
 
@@ -9927,7 +9946,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (attr.use_clockid) {
 		err = perf_event_set_clock(event, attr.clockid);
 		if (err)
-			goto err_alloc;
+			goto err_cred;
 	}
 
 	if (pmu->task_ctx_nr == perf_sw_context)
@@ -9962,7 +9981,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	ctx = find_get_context(pmu, task, event);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_alloc;
+		goto err_cred;
 	}
 
 	if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
@@ -10186,18 +10205,21 @@ SYSCALL_DEFINE5(perf_event_open,
 err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
-err_alloc:
-	/*
-	 * If event_file is set, the fput() above will have called ->release()
-	 * and that will take care of freeing the event.
-	 */
-	if (!event_file)
-		free_event(event);
 err_cred:
 	if (task)
 		mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
 	put_online_cpus();
+	/*
+	 * The event cleanup should happen earlier (as per cleanup in reverse
+	 * allocation order). It is delayed after the put_online_cpus() section
+	 * so we don't invoke event->destroy in it and risk recursive invocation
+	 * of it via static_key_slow_dec().
+	 * If event_file is set, the fput() above will have called ->release()
+	 * and that will take care of freeing the event.
+	 */
+	if (event && !event_file)
+		free_event(event);
 err_task:
 	if (task)
 		put_task_struct(task);
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ