linux-kernel - [RFC][PATCH] perf: Tracepoint collection support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <1290445737.2072.338.camel@laptop>
Date:	Mon, 22 Nov 2010 18:08:57 +0100
From:	Peter Zijlstra <peterz@...radead.org>
To:	Ingo Molnar <mingo@...e.hu>, tglx <tglx@...utronix.de>,
	Steven Rostedt <rostedt@...dmis.org>
Cc:	fweisbec <fweisbec@...il.com>,
	Stephane Eranian <eranian@...gle.com>,
	paulus <paulus@...ba.org>, LKML <linux-kernel@...r.kernel.org>
Subject: [RFC][PATCH] perf: Tracepoint collection support

Due to popular request this implements a tracepoint collection event
{ .type = PERF_TYPE_TRACEPOINT, .config = ~0ULL }. By default it
contains no tracepoints, but tracepoints can be added using:
  ioctl(fd, PERF_EVENT_IOC_ADD_TP, tp_id);

In order to provide a dense ID space for tracepoints replace the
tracepoint ID generation with an IDR tree.

Furthermore, replace the whole trace-event <-> perf infrastructure with
multiple IDR trees.

We keep an IDR tree per perf_event, this tree collects all the
tracepoints the perf_event is interested in and stores the corresponding
'node'. This tree manages the node life-time.

Then we keep an IDR tree per task and per CPU. Both of these trees are
accumulation trees, they're the union of all events of that particular
task/CPU. It stores a list of 'node's.

Then, when a trace-event happens, we look for it in both the CPU tree as
well as the current task tree. For both, if present we iterate the node
list and deliver the event to the corresponding perf_events.

We manage the IDR trees on perf_event creation/destruction and
ioctl(ADD_TP) time. This means that pmu::{add,remove} are empty ops, the
per-task stat is taken care of in the per-task tree after all.

The patch is lightly tested and wants some serious testing/review before
merging.

Also, it has a number of open points: like the breakpoint muck, it
suffers from the pmu::event_init() vs context-attach inversion, and the
whole fancy inherited-context avoid-switch logic needs a fix.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
---
 include/linux/ftrace_event.h    |    9 
 include/linux/perf_event.h      |    9 
 include/linux/sched.h           |    9 
 include/trace/ftrace.h          |    4 
 kernel/perf_event.c             |  389 ++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_event_perf.c |   95 ++++-----
 kernel/trace/trace_kprobe.c     |   10 -
 kernel/trace/trace_output.c     |   77 +------
 kernel/trace/trace_syscalls.c   |    8 
 9 files changed, 459 insertions(+), 151 deletions(-)

Index: linux-2.6/include/linux/ftrace_event.h
===================================================================
--- linux-2.6.orig/include/linux/ftrace_event.h
+++ linux-2.6/include/linux/ftrace_event.h
@@ -87,7 +87,6 @@ struct trace_event_functions {
 };
 
 struct trace_event {
-	struct hlist_node		node;
 	struct list_head		list;
 	int				type;
 	struct trace_event_functions	*funcs;
@@ -194,7 +193,6 @@ struct ftrace_event_call {
 
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
-	struct hlist_head __percpu	*perf_events;
 #endif
 };
 
@@ -260,8 +258,9 @@ struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
-extern int  perf_trace_init(struct perf_event *event);
+extern int  perf_trace_init(struct perf_event *event, int event_id);
 extern void perf_trace_destroy(struct perf_event *event);
+extern void perf_trace_destroy_id(int id);
 extern int  perf_trace_add(struct perf_event *event, int flags);
 extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -272,9 +271,9 @@ extern void *perf_trace_buf_prepare(int 
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-		       u64 count, struct pt_regs *regs, void *head)
+		       u64 count, struct pt_regs *regs, int id)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
+	perf_tp_event(addr, count, raw_data, size, regs, rctx, id);
 }
 #endif
 
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -238,6 +238,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ADD_TP		_IO ('$', 7)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -545,6 +546,11 @@ struct hw_perf_event {
 			struct task_struct		*bp_target;
 		};
 #endif
+		/*
+		 * Same fudge as for breakpoints, trace-events needs
+		 * it too,.. convert the bp crap over..
+		 */
+		struct task_struct *event_target;
 	};
 	int				state;
 	local64_t			prev_count;
@@ -813,6 +819,7 @@ struct perf_event {
 #ifdef CONFIG_EVENT_TRACING
 	struct ftrace_event_call	*tp_event;
 	struct event_filter		*filter;
+	struct perf_tp_idr		tp_idr;
 #endif
 
 #endif /* CONFIG_PERF_EVENTS */
@@ -1076,7 +1083,7 @@ static inline bool perf_paranoid_kernel(
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
-			  struct hlist_head *head, int rctx);
+			  int rctx, int id);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
+#include <linux/idr.h>
 #include <linux/time.h>
 #include <linux/param.h>
 #include <linux/resource.h>
@@ -1179,6 +1180,11 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+struct perf_tp_idr {
+	struct mutex	lock;
+	struct idr	idr;
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1452,6 +1458,9 @@ struct task_struct {
 	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex perf_event_mutex;
 	struct list_head perf_event_list;
+#ifdef CONFIG_EVENT_TRACING
+	struct perf_tp_idr *perf_tp_idr;
+#endif
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
Index: linux-2.6/include/trace/ftrace.h
===================================================================
--- linux-2.6.orig/include/trace/ftrace.h
+++ linux-2.6/include/trace/ftrace.h
@@ -700,7 +700,6 @@ perf_trace_##call(void *__data, proto)		
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs __regs;						\
 	u64 __addr = 0, __count = 1;					\
-	struct hlist_head *head;					\
 	int __entry_size;						\
 	int __data_size;						\
 	int rctx;							\
@@ -725,9 +724,8 @@ perf_trace_##call(void *__data, proto)		
 									\
 	{ assign; }							\
 									\
-	head = this_cpu_ptr(event_call->perf_events);			\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
-		__count, &__regs, head);				\
+		__count, &__regs, event_call->event.type);		\
 }
 
 /*
Index: linux-2.6/kernel/perf_event.c
===================================================================
--- linux-2.6.orig/kernel/perf_event.c
+++ linux-2.6/kernel/perf_event.c
@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/idr.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
@@ -310,6 +311,7 @@ list_add_event(struct perf_event *event,
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+	++ctx->generation;
 }
 
 static void perf_group_attach(struct perf_event *event)
@@ -370,6 +372,7 @@ list_del_event(struct perf_event *event,
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+	++ctx->generation;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -1228,6 +1231,12 @@ void perf_event_context_sched_out(struct
 	if (!cpuctx->task_ctx)
 		return;
 
+#if 0
+	/*
+	 * Need to sort out how to make task_struct::perf_tp_idr
+	 * work with this fancy switching stuff.. tracepoints could be
+	 * in multiple contexts due to the software event muck.
+	 */
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp[ctxn];
@@ -1261,6 +1270,7 @@ void perf_event_context_sched_out(struct
 		raw_spin_unlock(&ctx->lock);
 	}
 	rcu_read_unlock();
+#endif
 
 	if (do_switch) {
 		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
@@ -2561,6 +2571,7 @@ static struct perf_event *perf_fget_ligh
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_add_tp(struct perf_event *event, int tp_id);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -2607,6 +2618,9 @@ static long perf_ioctl(struct file *file
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_ADD_TP:
+		return perf_event_add_tp(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -4741,6 +4755,9 @@ static struct pmu perf_swevent = {
 
 #ifdef CONFIG_EVENT_TRACING
 
+#include <linux/ftrace_event.h>
+#include "trace/trace_output.h"
+
 static int perf_tp_filter_match(struct perf_event *event,
 				struct perf_sample_data *data)
 {
@@ -4767,8 +4784,60 @@ static int perf_tp_event_match(struct pe
 	return 1;
 }
 
+static void perf_tp_idr_init(struct perf_tp_idr *idr)
+{
+	idr_init(&idr->idr);
+	mutex_init(&idr->lock);
+}
+
+static DEFINE_PER_CPU(struct perf_tp_idr, perf_tp_idr);
+
+struct perf_tp_node {
+	struct list_head	list;
+	struct perf_event	*event;
+	struct rcu_head		rcu;
+};
+
+static void do_perf_tp_event(struct perf_event *event, u64 count,
+			     struct perf_sample_data *data,
+			     struct pt_regs *regs)
+{
+	if (perf_tp_event_match(event, data, regs))
+		perf_swevent_event(event, count, 1, data, regs);
+}
+
+static void perf_tp_idr_event(struct perf_tp_idr *tp_idr,
+			      int id, u64 count,
+			      struct perf_sample_data *data,
+			      struct pt_regs *regs)
+{
+	struct perf_tp_node *tp_node, *node;
+	struct perf_event *event;
+
+	if (!tp_idr)
+		return;
+
+	/*
+	 * Most of this is done under rcu_read_lock_sched(), which doesn't
+	 * exclude regular RCU grace periods, but the IDR code uses call_rcu()
+	 * so we have to use rcu_read_lock() here as well.
+	 */
+	rcu_read_lock();
+	tp_node = idr_find(&tp_idr->idr, id);
+	rcu_read_unlock();
+
+	if (!tp_node)
+		return;
+
+	event = tp_node->event;
+
+	do_perf_tp_event(event, count, data, regs);
+	list_for_each_entry_rcu(node, &tp_node->list, list)
+		do_perf_tp_event(node->event, count, data, regs);
+}
+
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-		   struct pt_regs *regs, struct hlist_head *head, int rctx)
+		   struct pt_regs *regs, int rctx, int id)
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
@@ -4782,18 +4851,196 @@ void perf_tp_event(u64 addr, u64 count, 
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
-		if (perf_tp_event_match(event, &data, regs))
-			perf_swevent_event(event, count, 1, &data, regs);
-	}
+	perf_tp_idr_event(&__get_cpu_var(perf_tp_idr), id, count, &data, regs);
+	perf_tp_idr_event(current->perf_tp_idr, id, count, &data, regs);
 
 	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
+static struct perf_tp_idr *
+perf_tp_init_task(struct perf_event *event, struct task_struct *task)
+{
+	struct perf_tp_idr *idr;
+
+	mutex_lock(&task->perf_event_mutex);
+	idr = task->perf_tp_idr;
+	if (idr)
+		goto unlock;
+
+	idr = kzalloc(sizeof(struct perf_tp_idr), GFP_KERNEL);
+	if (!idr)
+		goto unlock;
+
+	perf_tp_idr_init(idr);
+
+	task->perf_tp_idr = idr;
+unlock:
+	mutex_unlock(&task->perf_event_mutex);
+
+	return idr;
+}
+
+static struct perf_tp_idr *perf_event_idr(struct perf_event *event, bool create)
+{
+	struct perf_tp_idr *tp_idr;
+	struct task_struct *task;
+
+	if (event->attach_state & PERF_ATTACH_TASK) {
+		task = event->hw.event_target;
+		tp_idr = task->perf_tp_idr;
+		if (!tp_idr && create)
+			tp_idr = perf_tp_init_task(event, task);
+	} else
+		tp_idr = &per_cpu(perf_tp_idr, event->cpu);
+
+	return tp_idr;
+}
+
+static void perf_tp_free_node(struct rcu_head *rcu)
+{
+	struct perf_tp_node *node = container_of(rcu, struct perf_tp_node, rcu);
+
+	kfree(node);
+}
+
+static int perf_tp_remove_idr(int id, void *p, void *data)
+{
+	struct perf_tp_node *node = p;
+	struct perf_tp_node *first, *next;
+	struct perf_tp_idr *tp_idr = data;
+
+	if (!tp_idr)
+		goto no_idr;
+
+	mutex_lock(&tp_idr->lock);
+	first = idr_find(&tp_idr->idr, id);
+	if (first == node) {
+		next = list_first_entry(&first->list, struct perf_tp_node, list);
+		if (next != first)
+			idr_replace(&tp_idr->idr, next, id);
+		else
+			idr_remove(&tp_idr->idr, id);
+	}
+	list_del_rcu(&node->list);
+	mutex_unlock(&tp_idr->lock);
+
+no_idr:
+	perf_trace_destroy_id(id);
+	call_rcu_sched(&node->rcu, perf_tp_free_node);
+	return 0;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	perf_trace_destroy(event);
+	/*
+	 * Since this is the free path, the fd is gone an there
+	 * can be no concurrency on event->tp_idr.
+	 */
+
+	idr_for_each(&event->tp_idr.idr, perf_tp_remove_idr,
+			perf_event_idr(event, false));
+
+	idr_remove_all(&event->tp_idr.idr);
+	idr_destroy(&event->tp_idr.idr);
+}
+
+static int __perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+	struct perf_tp_node *node, *first;
+	struct perf_tp_idr *idr;
+	int tmp_id, err, ret = -ENOMEM;
+
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		goto out;
+
+	node->event = event;
+	INIT_LIST_HEAD(&node->list);
+
+	/*
+	 * Insert the node into the event->idr, this idr tracks the
+	 * tracepoints we're interested in, it has a 1:1 relation
+	 * with the node.
+	 */
+	idr = &event->tp_idr;
+	mutex_lock(&idr->lock);
+	err = idr_pre_get(&idr->idr, GFP_KERNEL);
+	if (!err) {
+		ret = -ENOMEM;
+		goto free_node;
+	}
+
+	ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+	if (ret)
+		goto free_node;
+
+	if (WARN_ON(tp_id != tmp_id)) {
+		ret = -EBUSY;
+		goto free_idr1;
+	}
+	mutex_unlock(&idr->lock);
+
+	/*
+	 * Insert the node into the task/cpu idr, this idr tracks
+	 * all active tracepoints for the task/cpu, it has a 1:n relation
+	 * with the node.
+	 */
+	idr = perf_event_idr(event, true);
+	if (!idr) {
+		if (event->attach_state & PERF_ATTACH_CONTEXT)
+			ret = -ENOMEM;
+		else
+			ret = -ESRCH;
+		goto free_idr1_set;
+	}
+	mutex_lock(&idr->lock);
+	first = idr_find(&idr->idr, tp_id);
+	if (first) {
+		list_add_rcu(&node->list, &first->list);
+		goto unlock;
+	}
+
+	err = idr_pre_get(&idr->idr, GFP_KERNEL);
+	if (!err) {
+		ret = -ENOMEM;
+		goto free_idr1_set_unlock;
+	}
+
+	ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+	if (ret)
+		goto free_idr1_set;
+
+	if (WARN_ON(tp_id != tmp_id)) {
+		ret = -EBUSY;
+		goto free_idr2;
+	}
+unlock:
+	mutex_unlock(&idr->lock);
+
+	ret = perf_trace_init(event, tp_id);
+	if (ret)
+		goto free_all;
+
+out:
+	return ret;
+
+free_all:
+	mutex_lock(&idr->lock);
+free_idr2:
+	idr_remove(&idr->idr, tmp_id);
+free_idr1_set_unlock:
+	mutex_unlock(&idr->lock);
+free_idr1_set:
+	idr = &event->tp_idr;
+	tmp_id = tp_id;
+	mutex_lock(&idr->lock);
+free_idr1:
+	idr_remove(&idr->idr, tmp_id);
+free_node:
+	mutex_unlock(&idr->lock);
+	kfree(node);
+	goto out;
 }
 
 static int perf_tp_event_init(struct perf_event *event)
@@ -4803,21 +5050,38 @@ static int perf_tp_event_init(struct per
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -ENOENT;
 
-	err = perf_trace_init(event);
-	if (err)
-		return err;
+	perf_tp_idr_init(&event->tp_idr);
 
 	event->destroy = tp_perf_event_destroy;
 
+	if (event->attr.config != ~0ULL) {
+		err = __perf_event_add_tp(event, event->attr.config);
+		if (err)
+			return err;
+		err = perf_trace_init(event, event->attr.config);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int perf_tp_event_add(struct perf_event *event, int flags)
+{
+	event->hw.state = flags & PERF_EF_START ? 0 : PERF_HES_STOPPED;
 	return 0;
 }
 
+static void perf_tp_event_del(struct perf_event *event, int flags)
+{
+}
+
 static struct pmu perf_tracepoint = {
 	.task_ctx_nr	= perf_sw_context,
 
 	.event_init	= perf_tp_event_init,
-	.add		= perf_trace_add,
-	.del		= perf_trace_del,
+	.add		= perf_tp_event_add,
+	.del		= perf_tp_event_del,
 	.start		= perf_swevent_start,
 	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
@@ -4825,6 +5089,11 @@ static struct pmu perf_tracepoint = {
 
 static inline void perf_tp_register(void)
 {
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		perf_tp_idr_init(&per_cpu(perf_tp_idr, cpu));
+
 	perf_pmu_register(&perf_tracepoint);
 }
 
@@ -4833,7 +5102,8 @@ static int perf_event_set_filter(struct 
 	char *filter_str;
 	int ret;
 
-	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+	if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+	    event->attr.config == ~0ULL)
 		return -EINVAL;
 
 	filter_str = strndup_user(arg, PAGE_SIZE);
@@ -4851,6 +5121,59 @@ static void perf_event_free_filter(struc
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+	if (event->attr.type != PERF_TYPE_TRACEPOINT &&
+	    event->attr.config != ~0ULL)
+		return -EINVAL;
+
+	return __perf_event_add_tp(event, tp_id);
+}
+
+/*
+ * Called from the exit path, _after_ all events have been detached from it.
+ */
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+	struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+	if (!idr)
+		return;
+
+	idr_remove_all(&idr->idr);
+	idr_destroy(&idr->idr);
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+	kfree(tsk->perf_tp_idr);
+}
+
+static int perf_tp_inherit_idr(int id, void *p, void *data)
+{
+	struct perf_event *child = data;
+
+	return __perf_event_add_tp(child, id);
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+				 struct perf_event *child_event)
+{
+	int ret;
+
+	/*
+	 * The child is not yet exposed, hence no need to serialize things
+	 * on that side.
+	 */
+	mutex_lock(&parent_event->tp_idr.lock);
+	ret = idr_for_each(&parent_event->tp_idr.idr,
+			perf_tp_inherit_idr,
+			child_event);
+	mutex_unlock(&parent_event->tp_idr.lock);
+
+	return ret;
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -4866,6 +5189,25 @@ static void perf_event_free_filter(struc
 {
 }
 
+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+	return -ENOENT;
+}
+
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+				 struct perf_event *child_event)
+{
+	return 0;
+}
+
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -5290,6 +5632,9 @@ perf_event_alloc(struct perf_event_attr 
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
+#ifdef CONFIG_EVENT_TRACING
+	perf_tp_idr_init(&event->tp_idr);
+#endif
 
 	mutex_init(&event->mmap_mutex);
 
@@ -5308,6 +5653,7 @@ perf_event_alloc(struct perf_event_attr 
 
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
+		event->hw.event_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
 		 * hw_breakpoint is a bit difficult here..
@@ -5353,7 +5699,7 @@ perf_event_alloc(struct perf_event_attr 
 	if (err) {
 		if (event->ns)
 			put_pid_ns(event->ns);
-		kfree(event);
+		free_event(event);
 		return ERR_PTR(err);
 	}
 
@@ -5694,7 +6040,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
 	event->owner = current;
@@ -5763,7 +6108,6 @@ perf_event_create_kernel_counter(struct 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
 	return event;
@@ -5936,6 +6280,8 @@ void perf_event_exit_task(struct task_st
 
 	for_each_task_context_nr(ctxn)
 		perf_event_exit_task_context(child, ctxn);
+
+	perf_tp_event_exit(child);
 }
 
 static void perf_free_event(struct perf_event *event,
@@ -5998,6 +6344,8 @@ void perf_event_delayed_put(struct task_
 
 	for_each_task_context_nr(ctxn)
 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+
+	perf_tp_event_delayed_put(task);
 }
 
 /*
@@ -6013,6 +6361,7 @@ inherit_event(struct perf_event *parent_
 {
 	struct perf_event *child_event;
 	unsigned long flags;
+	int ret;
 
 	/*
 	 * Instead of creating recursive hierarchies of events,
@@ -6030,6 +6379,15 @@ inherit_event(struct perf_event *parent_
 					   NULL);
 	if (IS_ERR(child_event))
 		return child_event;
+
+	child_event->ctx = child_ctx;
+
+	ret = perf_tp_event_inherit(parent_event, child_event);
+	if (ret) {
+		free_event(child_event);
+		return ERR_PTR(ret);
+	}
+
 	get_ctx(child_ctx);
 
 	/*
@@ -6052,7 +6410,6 @@ inherit_event(struct perf_event *parent_
 		local64_set(&hwc->period_left, sample_period);
 	}
 
-	child_event->ctx = child_ctx;
 	child_event->overflow_handler = parent_event->overflow_handler;
 
 	/*
Index: linux-2.6/kernel/trace/trace_event_perf.c
===================================================================
--- linux-2.6.orig/kernel/trace/trace_event_perf.c
+++ linux-2.6/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include "trace.h"
+#include "trace_output.h"
 
 static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
@@ -47,9 +48,7 @@ static int perf_trace_event_perm(struct 
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
-	struct hlist_head __percpu *list;
 	int ret;
-	int cpu;
 
 	ret = perf_trace_event_perm(tp_event, p_event);
 	if (ret)
@@ -61,15 +60,6 @@ static int perf_trace_event_init(struct 
 
 	ret = -ENOMEM;
 
-	list = alloc_percpu(struct hlist_head);
-	if (!list)
-		goto fail;
-
-	for_each_possible_cpu(cpu)
-		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
-
-	tp_event->perf_events = list;
-
 	if (!total_ref_count) {
 		char __percpu *buf;
 		int i;
@@ -100,63 +90,40 @@ static int perf_trace_event_init(struct 
 		}
 	}
 
-	if (!--tp_event->perf_refcount) {
-		free_percpu(tp_event->perf_events);
-		tp_event->perf_events = NULL;
-	}
+	--tp_event->perf_refcount;
 
 	return ret;
 }
 
-int perf_trace_init(struct perf_event *p_event)
+int perf_trace_init(struct perf_event *p_event, int event_id)
 {
 	struct ftrace_event_call *tp_event;
-	int event_id = p_event->attr.config;
+	struct trace_event *t_event;
 	int ret = -EINVAL;
 
+	trace_event_read_lock();
+	t_event = ftrace_find_event(event_id);
+	if (!t_event)
+		goto out;
+
+	tp_event = container_of(t_event, struct ftrace_event_call, event);
+
 	mutex_lock(&event_mutex);
-	list_for_each_entry(tp_event, &ftrace_events, list) {
-		if (tp_event->event.type == event_id &&
-		    tp_event->class && tp_event->class->reg &&
-		    try_module_get(tp_event->mod)) {
-			ret = perf_trace_event_init(tp_event, p_event);
-			if (ret)
-				module_put(tp_event->mod);
-			break;
-		}
+	if (tp_event->class && tp_event->class->reg &&
+			try_module_get(tp_event->mod)) {
+		ret = perf_trace_event_init(tp_event, p_event);
+		if (ret)
+			module_put(tp_event->mod);
 	}
 	mutex_unlock(&event_mutex);
+out:
+	trace_event_read_unlock();
 
 	return ret;
 }
 
-int perf_trace_add(struct perf_event *p_event, int flags)
-{
-	struct ftrace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
-
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
-
-	if (!(flags & PERF_EF_START))
-		p_event->hw.state = PERF_HES_STOPPED;
-
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
-
-	return 0;
-}
-
-void perf_trace_del(struct perf_event *p_event, int flags)
-{
-	hlist_del_rcu(&p_event->hlist_entry);
-}
-
-void perf_trace_destroy(struct perf_event *p_event)
+static void __perf_trace_destroy(struct ftrace_event_call *tp_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
 	int i;
 
 	mutex_lock(&event_mutex);
@@ -171,9 +138,6 @@ void perf_trace_destroy(struct perf_even
 	 */
 	tracepoint_synchronize_unregister();
 
-	free_percpu(tp_event->perf_events);
-	tp_event->perf_events = NULL;
-
 	if (!--total_ref_count) {
 		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
@@ -185,6 +149,27 @@ void perf_trace_destroy(struct perf_even
 	mutex_unlock(&event_mutex);
 }
 
+void perf_trace_destroy(struct perf_event *p_event)
+{
+	__perf_trace_destroy(p_event->tp_event);
+}
+
+void perf_trace_destroy_id(int event_id)
+{
+	struct ftrace_event_call *tp_event;
+	struct trace_event *t_event;
+
+	trace_event_read_lock();
+	t_event = ftrace_find_event(event_id);
+	if (!t_event)
+		goto unlock;
+
+	tp_event = container_of(t_event, struct ftrace_event_call, event);
+	__perf_trace_destroy(tp_event);
+unlock:
+	trace_event_read_unlock();
+}
+
 __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 				       struct pt_regs *regs, int *rctxp)
 {
Index: linux-2.6/kernel/trace/trace_kprobe.c
===================================================================
--- linux-2.6.orig/kernel/trace/trace_kprobe.c
+++ linux-2.6/kernel/trace/trace_kprobe.c
@@ -1559,7 +1559,6 @@ static __kprobes void kprobe_perf_func(s
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry_head *entry;
-	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
@@ -1579,8 +1578,8 @@ static __kprobes void kprobe_perf_func(s
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
-	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs,
+			      call->event.type);
 }
 
 /* Kretprobe profile handler */
@@ -1590,7 +1589,6 @@ static __kprobes void kretprobe_perf_fun
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry_head *entry;
-	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
@@ -1610,8 +1608,8 @@ static __kprobes void kretprobe_perf_fun
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
-	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+			      regs, call->event.type);
 }
 
 static int probe_perf_enable(struct ftrace_event_call *call)
Index: linux-2.6/kernel/trace/trace_output.c
===================================================================
--- linux-2.6.orig/kernel/trace/trace_output.c
+++ linux-2.6/kernel/trace/trace_output.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <linux/idr.h>
 
 #include "trace_output.h"
 
@@ -16,9 +17,9 @@
 
 DECLARE_RWSEM(trace_event_mutex);
 
-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static const int first_event_type = __TRACE_LAST_TYPE + 1;
 
-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static DEFINE_IDR(trace_type_idr);
 
 int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
@@ -660,50 +661,11 @@ static int task_state_char(unsigned long
  */
 struct trace_event *ftrace_find_event(int type)
 {
-	struct trace_event *event;
-	struct hlist_node *n;
-	unsigned key;
-
-	key = type & (EVENT_HASHSIZE - 1);
-
-	hlist_for_each_entry(event, n, &event_hash[key], node) {
-		if (event->type == type)
-			return event;
-	}
-
-	return NULL;
+	return idr_find(&trace_type_idr, type);
 }
 
 static LIST_HEAD(ftrace_event_list);
 
-static int trace_search_list(struct list_head **list)
-{
-	struct trace_event *e;
-	int last = __TRACE_LAST_TYPE;
-
-	if (list_empty(&ftrace_event_list)) {
-		*list = &ftrace_event_list;
-		return last + 1;
-	}
-
-	/*
-	 * We used up all possible max events,
-	 * lets see if somebody freed one.
-	 */
-	list_for_each_entry(e, &ftrace_event_list, list) {
-		if (e->type != last + 1)
-			break;
-		last++;
-	}
-
-	/* Did we used up all 65 thousand events??? */
-	if ((last + 1) > FTRACE_MAX_EVENT)
-		return 0;
-
-	*list = &e->list;
-	return last + 1;
-}
-
 void trace_event_read_lock(void)
 {
 	down_read(&trace_event_mutex);
@@ -731,7 +693,6 @@ void trace_event_read_unlock(void)
  */
 int register_ftrace_event(struct trace_event *event)
 {
-	unsigned key;
 	int ret = 0;
 
 	down_write(&trace_event_mutex);
@@ -746,21 +707,23 @@ int register_ftrace_event(struct trace_e
 
 	if (!event->type) {
 		struct list_head *list = NULL;
+		int type, err;
 
-		if (next_event_type > FTRACE_MAX_EVENT) {
+		err = idr_pre_get(&trace_type_idr, GFP_KERNEL);
+		if (!err)
+			goto out;
 
-			event->type = trace_search_list(&list);
-			if (!event->type)
-				goto out;
-
-		} else {
-			
-			event->type = next_event_type++;
-			list = &ftrace_event_list;
-		}
+		ret = idr_get_new_above(&trace_type_idr, event, first_event_type, &type);
+		if (ret)
+			goto out;
 
-		if (WARN_ON(ftrace_find_event(event->type)))
+		if (type > FTRACE_MAX_EVENT) {
+			idr_remove(&trace_type_idr, type);
 			goto out;
+		}
+
+		event->type = type;
+		list = &ftrace_event_list;
 
 		list_add_tail(&event->list, list);
 
@@ -783,10 +746,6 @@ int register_ftrace_event(struct trace_e
 	if (event->funcs->binary == NULL)
 		event->funcs->binary = trace_nop_print;
 
-	key = event->type & (EVENT_HASHSIZE - 1);
-
-	hlist_add_head(&event->node, &event_hash[key]);
-
 	ret = event->type;
  out:
 	up_write(&trace_event_mutex);
@@ -800,7 +759,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event)
  */
 int __unregister_ftrace_event(struct trace_event *event)
 {
-	hlist_del(&event->node);
+	idr_remove(&trace_type_idr, event->type);
 	list_del(&event->list);
 	return 0;
 }
Index: linux-2.6/kernel/trace/trace_syscalls.c
===================================================================
--- linux-2.6.orig/kernel/trace/trace_syscalls.c
+++ linux-2.6/kernel/trace/trace_syscalls.c
@@ -489,7 +489,6 @@ static void perf_syscall_enter(void *ign
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
-	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -520,8 +519,7 @@ static void perf_syscall_enter(void *ign
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
-	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -563,7 +561,6 @@ static void perf_syscall_exit(void *igno
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
-	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -596,8 +593,7 @@ static void perf_syscall_exit(void *igno
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/