linux-kernel - [RFC PATCH V2 2/7] perf: Init/fini PMU specific data

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1578080364-5928-2-git-send-email-kan.liang@linux.intel.com>
Date:   Fri,  3 Jan 2020 11:39:19 -0800
From:   kan.liang@...ux.intel.com
To:     peterz@...radead.org, mingo@...hat.com, acme@...nel.org,
        tglx@...utronix.de, bp@...en8.de, linux-kernel@...r.kernel.org
Cc:     eranian@...gle.com, alexey.budankov@...ux.intel.com,
        vitaly.slobodskoy@...el.com, ak@...ux.intel.com,
        Kan Liang <kan.liang@...ux.intel.com>
Subject: [RFC PATCH V2 2/7] perf: Init/fini PMU specific data

From: Kan Liang <kan.liang@...ux.intel.com>

The PMU specific data for the monitored tasks only be allocated during
LBR call stack monitoring.

When a LBR call stack event is accounted, the perf_ctx_data for related
tasks will be allocated/initialized by init_perf_ctx_data().
When a LBR call stack event is unaccounted, the perf_ctx_data for
related tasks will be detached/freed by fini_task_ctx_data().

Perf supports both per-process event and system-wide event.
- For per-process event, perf only allocates the perf_ctx_data for
  current task.
  If the allocation fails, perf error out for per-process event.
- For system-wide event, perf has to allocate the perf_ctx_data for
  both existing tasks and upcoming tasks.
  The allocation for the existing tasks is done in perf_event_alloc().
  The allocation for new tasks will be done in perf_event_fork().
  If any allocation fails, perf doesn't error out for system-wide event.
  A debug message will be dumped to system log instead. LBR callstack
  may be cutoff for the task which doesn't have the space allocated.

Signed-off-by: Kan Liang <kan.liang@...ux.intel.com>
---

Changes since V1:
- Merge patch 2 and 3 of V1. Remove several helpers
  e.g. alloc/init/fini_task_ctx_data_rcu().
- Move the allocations out of spin lock
- Remove tasklist_lock. Use RCU iteration of the tasklist
- Retrieve total number of threads from global nr_threads

 kernel/events/core.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 325 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0e0f85a..64d3ee4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -47,6 +47,7 @@
 #include <linux/parser.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
 #include <linux/proc_ns.h>
 #include <linux/mount.h>
 
@@ -387,6 +388,10 @@ static atomic_t nr_switch_events __read_mostly;
 static atomic_t nr_ksymbol_events __read_mostly;
 static atomic_t nr_bpf_events __read_mostly;
 
+/* Track the number of system-wide event which requires pmu specific data */
+static atomic_t nr_task_data_sys_wide_events;
+static DEFINE_RAW_SPINLOCK(task_data_sys_wide_events_lock);
+
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
@@ -4440,6 +4445,256 @@ static void unaccount_freq_event(void)
 		atomic_dec(&nr_freq_events);
 }
 
+static int
+alloc_perf_ctx_data(size_t ctx_size, gfp_t flags,
+		    struct perf_ctx_data **task_ctx_data)
+{
+	struct perf_ctx_data *ctx_data;
+
+	ctx_data = kzalloc(sizeof(struct perf_ctx_data), flags);
+	if (!ctx_data)
+		return -ENOMEM;
+
+	ctx_data->data = kzalloc(ctx_size, flags);
+	if (!ctx_data->data) {
+		kfree(ctx_data);
+		return -ENOMEM;
+	}
+
+	ctx_data->data_size = ctx_size;
+	*task_ctx_data = ctx_data;
+
+	return 0;
+}
+
+static void
+free_perf_ctx_data(struct perf_ctx_data *ctx_data)
+{
+	kfree(ctx_data->data);
+	kfree(ctx_data);
+}
+
+static void
+free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+	struct perf_ctx_data *ctx_data;
+
+	ctx_data = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+	free_perf_ctx_data(ctx_data);
+}
+
+static int
+init_task_ctx_data(struct task_struct *task, size_t ctx_size)
+{
+	struct perf_ctx_data *ctx_data;
+
+	if (alloc_perf_ctx_data(ctx_size, GFP_KERNEL, &ctx_data))
+		return -ENOMEM;
+
+	mutex_lock(&task->perf_event_mutex);
+
+	if (task->perf_ctx_data) {
+		refcount_inc(&task->perf_ctx_data->refcount);
+		free_perf_ctx_data(ctx_data);
+	} else {
+		refcount_set(&ctx_data->refcount, 1);
+		/* System-wide event is active as well */
+		if (atomic_read(&nr_task_data_sys_wide_events))
+			refcount_inc(&ctx_data->refcount);
+		rcu_assign_pointer(task->perf_ctx_data, ctx_data);
+	}
+
+	mutex_unlock(&task->perf_event_mutex);
+	return 0;
+}
+
+static int
+init_system_wide_ctx_data(size_t ctx_size)
+{
+	struct perf_ctx_data **data;
+	struct task_struct *g, *p;
+	unsigned long flags;
+	int i, num_thread, pos = 0, nr_new_tasks = 0;
+
+	/* Retrieve total number of threads */
+	num_thread = nr_threads;
+
+	data = kcalloc(num_thread, sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		printk_once(KERN_DEBUG
+			    "Failed to allocate space for LBR callstack. "
+			    "The LBR callstack for all tasks may be cutoff.\n");
+		return -ENOMEM;
+	}
+
+	/* Do a quick allocation in first round with GFP_ATOMIC. */
+	for (i = 0; i < num_thread; i++) {
+		if (alloc_perf_ctx_data(ctx_size, GFP_ATOMIC, &data[i]))
+			break;
+	}
+	num_thread = i;
+
+	raw_spin_lock_irqsave(&task_data_sys_wide_events_lock, flags);
+
+	/*
+	 * Allocate perf_ctx_data for all existing threads with the first
+	 * system-wide event.
+	 * The perf_ctx_data for new threads will be allocated in
+	 * perf_event_fork().
+	 */
+	if (atomic_inc_return(&nr_task_data_sys_wide_events) > 1)
+		goto unlock;
+
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		mutex_lock(&p->perf_event_mutex);
+		if (p->perf_ctx_data) {
+			/*
+			 * The perf_ctx_data for this thread may has been
+			 * allocated by per-task event.
+			 * Only update refcount for the case.
+			 */
+			refcount_inc(&p->perf_ctx_data->refcount);
+			mutex_unlock(&p->perf_event_mutex);
+			continue;
+		}
+
+		if (pos < num_thread) {
+			refcount_set(&data[pos]->refcount, 1);
+			rcu_assign_pointer(p->perf_ctx_data, data[pos++]);
+		} else {
+			/*
+			 * There may be some new threads created,
+			 * when we allocate space.
+			 * Track the number in nr_new_tasks.
+			 */
+			nr_new_tasks++;
+		}
+		mutex_unlock(&p->perf_event_mutex);
+	}
+	rcu_read_unlock();
+
+	raw_spin_unlock_irqrestore(&task_data_sys_wide_events_lock, flags);
+
+	if (!nr_new_tasks)
+		goto end;
+
+	/*
+	 * Try again to allocate the space for the left tasks.
+	 */
+	for (i = 0; i < nr_new_tasks; i++) {
+		if (alloc_perf_ctx_data(ctx_size, GFP_KERNEL, &data[i]))
+			break;
+	}
+	num_thread = i;
+	pos = 0;
+
+	raw_spin_lock_irqsave(&task_data_sys_wide_events_lock, flags);
+
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+
+		mutex_lock(&p->perf_event_mutex);
+		if (p->perf_ctx_data) {
+			mutex_unlock(&p->perf_event_mutex);
+			continue;
+		}
+
+		if (pos < num_thread) {
+			refcount_set(&data[pos]->refcount, 1);
+			rcu_assign_pointer(p->perf_ctx_data, data[pos]);
+		}
+		pos++;
+		mutex_unlock(&p->perf_event_mutex);
+	}
+	rcu_read_unlock();
+
+	if (pos >= num_thread) {
+		printk_once(KERN_DEBUG
+			    "Failed to allocate space for LBR callstack. "
+			    "The LBR callstack for some tasks may be cutoff.\n");
+	}
+unlock:
+	raw_spin_unlock_irqrestore(&task_data_sys_wide_events_lock, flags);
+
+end:
+	for (; pos < num_thread; pos++)
+		free_perf_ctx_data(data[pos]);
+
+	kfree(data);
+	return 0;
+}
+
+static int
+init_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *task = event->hw.target;
+	size_t ctx_size = event->pmu->task_ctx_size;
+
+	if (task)
+		return init_task_ctx_data(task, ctx_size);
+	else
+		return init_system_wide_ctx_data(ctx_size);
+}
+
+/**
+ * Detach perf_ctx_data RCU pointer for a task
+ * @task:        Target Task
+ * @force:       Unconditionally free perf_ctx_data
+ *
+ * If force is set, free perf_ctx_data unconditionally.
+ * Otherwise, free perf_ctx_data when there are no users.
+ * Lock is required to sync the writers of perf_ctx_data RCU pointer
+ */
+static void
+fini_task_ctx_data(struct task_struct *task, bool force)
+{
+	struct perf_ctx_data *ctx_data;
+
+	mutex_lock(&task->perf_event_mutex);
+
+	ctx_data = task->perf_ctx_data;
+	if (!ctx_data)
+		goto unlock;
+
+	if (!force && !refcount_dec_and_test(&ctx_data->refcount))
+		goto unlock;
+
+	RCU_INIT_POINTER(task->perf_ctx_data, NULL);
+	call_rcu(&ctx_data->rcu_head, free_perf_ctx_data_rcu);
+
+unlock:
+	mutex_unlock(&task->perf_event_mutex);
+}
+
+static void fini_system_wide_ctx_data(void)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&task_data_sys_wide_events_lock, flags);
+	if (!atomic_dec_and_test(&nr_task_data_sys_wide_events))
+		goto unlock;
+
+	rcu_read_lock();
+	for_each_process_thread(g, p)
+		fini_task_ctx_data(p, false);
+	rcu_read_unlock();
+
+unlock:
+	raw_spin_unlock_irqrestore(&task_data_sys_wide_events_lock, flags);
+}
+
+static void fini_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *task = event->hw.target;
+
+	if (task)
+		fini_task_ctx_data(task, false);
+	else
+		fini_system_wide_ctx_data();
+}
+
 static void unaccount_event(struct perf_event *event)
 {
 	bool dec = false;
@@ -4471,6 +4726,8 @@ static void unaccount_event(struct perf_event *event)
 		atomic_dec(&nr_ksymbol_events);
 	if (event->attr.bpf_event)
 		atomic_dec(&nr_bpf_events);
+	if (event->attach_state & PERF_ATTACH_TASK_DATA)
+		fini_perf_ctx_data(event);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -7297,10 +7554,69 @@ static void perf_event_task(struct task_struct *task,
 		       task_ctx);
 }
 
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void perf_event_alloc_task_data(struct task_struct *child,
+				       struct task_struct *parent)
+{
+	struct perf_ctx_data *ctx_data;
+	size_t ctx_size = 0;
+	unsigned long flags;
+
+	if (!atomic_read(&nr_task_data_sys_wide_events))
+		return;
+
+	rcu_read_lock();
+	ctx_data = rcu_dereference(parent->perf_ctx_data);
+	if (ctx_data)
+		ctx_size = ctx_data->data_size;
+	rcu_read_unlock();
+
+	if (!ctx_size)
+		return;
+
+	if (alloc_perf_ctx_data(ctx_size, GFP_KERNEL, &ctx_data))
+		return;
+
+	raw_spin_lock_irqsave(&task_data_sys_wide_events_lock, flags);
+
+	/*
+	 * System-wide event may be unaccount when attaching the perf_ctx_data.
+	 * For example,
+	 *                CPU A                              CPU B
+	 *        perf_event_alloc_task_data():
+	 *          read(nr_task_data_events)
+	 *                                         fini_system_wide_ctx_data()
+	 *          alloc_perf_ctx_data()
+	 *          rcu_assign_pointer(perf_ctx_data);
+	 *
+	 * The perf_ctx_data may never be freed until the task is terminated.
+	 */
+	if (unlikely(!atomic_read(&nr_task_data_sys_wide_events))) {
+		free_perf_ctx_data(ctx_data);
+		goto unlock;
+	}
+
+	mutex_lock(&child->perf_event_mutex);
+	if (child->perf_ctx_data) {
+		free_perf_ctx_data(ctx_data);
+	} else {
+		refcount_set(&ctx_data->refcount, 1);
+		rcu_assign_pointer(child->perf_ctx_data, ctx_data);
+	}
+	mutex_unlock(&child->perf_event_mutex);
+
+unlock:
+	raw_spin_unlock_irqrestore(&task_data_sys_wide_events_lock, flags);
+}
+
 void perf_event_fork(struct task_struct *task)
 {
 	perf_event_task(task, NULL, 1);
 	perf_event_namespaces(task);
+	perf_event_alloc_task_data(task, current);
 }
 
 /*
@@ -10826,11 +11142,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (err)
 		goto err_callchain_buffer;
 
+	if ((event->attach_state & PERF_ATTACH_TASK_DATA) &&
+	    init_perf_ctx_data(event))
+		goto err_task_ctx_data;
+
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
 	return event;
 
+err_task_ctx_data:
+	if (!event->parent && (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
+		put_callchain_buffers();
 err_callchain_buffer:
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
@@ -11892,6 +12215,8 @@ void perf_event_exit_task(struct task_struct *child)
 	 * At this point we need to send EXIT events to cpu contexts.
 	 */
 	perf_event_task(child, NULL, 0);
+
+	fini_task_ctx_data(child, true);
 }
 
 static void perf_free_event(struct perf_event *event,
-- 
2.7.4