Message-ID: <20130626122525.GA5189@gmail.com>
Date:	Wed, 26 Jun 2013 14:25:25 +0200
From:	Ingo Molnar <mingo@...nel.org>
To:	Robert Richter <rric@...nel.org>
Cc:	Borislav Petkov <bp@...en8.de>,
	Peter Zijlstra <peterz@...radead.org>,
	Arnaldo Carvalho de Melo <acme@...radead.org>,
	Jiri Olsa <jolsa@...hat.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 00/14] perf, persistent: Kernel updates for perf tool
 integration


* Ingo Molnar <mingo@...nel.org> wrote:

> Note, for tracing the PERF_FLAG_FD_OUTPUT method of multiplexing 
> multiple events onto a single mmap buffer is probably useful (also 
> usable via the PERF_EVENT_IOC_SET_OUTPUT ioctl()), so please make sure 
> the scheme works naturally with that model as well, not just with 1:1 
> event+buffer mappings.
> 
> See the uses of PERF_EVENT_IOC_SET_OUTPUT in tools/perf/.
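
For reference, a minimal userspace sketch of that model (my illustration,
not part of the patch; the software-event attributes are arbitrary): two
events whose samples land in one shared mmap buffer.

	/* sketch: two events, one mmap buffer, via SET_OUTPUT */
	#define _GNU_SOURCE
	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <unistd.h>

	static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
				   int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu,
			       group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		long psz = sysconf(_SC_PAGESIZE);
		int fd1, fd2;
		void *buf;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = PERF_COUNT_SW_CPU_CLOCK;
		attr.sample_period = 100000;
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;

		fd1 = perf_event_open(&attr, 0, -1, -1, 0); /* this task */
		fd2 = perf_event_open(&attr, 0, -1, -1, 0);
		if (fd1 < 0 || fd2 < 0)
			return 1;

		/* Only fd1 owns a buffer: 1 control page + 2^n data pages. */
		buf = mmap(NULL, (1 + 8) * psz, PROT_READ | PROT_WRITE,
			   MAP_SHARED, fd1, 0);
		if (buf == MAP_FAILED)
			return 1;

		/* Route fd2's samples into fd1's buffer instead of its own. */
		if (ioctl(fd2, PERF_EVENT_IOC_SET_OUTPUT, fd1))
			return 1;

		/* ... read records for both events from buf ... */
		return 0;
	}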

Note that another facility that would be very useful for tracing is 
PeterZ's and tglx's patch that enables multiple tracepoints to be attached 
to a single event.

See the 2+ year old (bitrotten and unfinished) WIP patch below.

It adds a PERF_EVENT_IOC_ADD_TP ioctl() that attaches an additional 
tracepoint to an existing event. In essence, this lets perf-based tracing 
scale to an arbitrary number of tracepoints.
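
Usage would look something like this (hypothetical sketch, modelled on the
unfinished patch below): open one "empty" tracepoint event with
attr.config = ~0ULL, then attach tracepoint ids one by one, using the ids
from /sys/kernel/debug/tracing/events/*/*/id. The perf_event_open()
wrapper is the usual syscall(2) one from the sketch above.

	/* One event, one buffer, arbitrarily many tracepoints. */
	static int open_multi_tp_event(const int *tp_ids, int n)
	{
		struct perf_event_attr attr;
		int fd, i;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_TRACEPOINT;
		attr.config = ~0ULL;	/* no initial tracepoint, per the patch */
		attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME;
		attr.sample_period = 1;

		fd = perf_event_open(&attr, 0, -1, -1, 0);
		if (fd < 0)
			return -1;

		for (i = 0; i < n; i++) {
			if (ioctl(fd, PERF_EVENT_IOC_ADD_TP, tp_ids[i])) {
				close(fd);
				return -1;
			}
		}
		return fd;
	}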

Thanks,

	Ingo

------------------>
Subject: perf-tracepoint-idr.patch
From: Thomas Gleixner <tglx@...utronix.de>
Date: Wed, 24 Nov 2010 12:09:26 +0100

Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Signed-off-by: Ingo Molnar <mingo@...e.hu>
---
 include/linux/ftrace_event.h    |   10 
 include/linux/perf_event.h      |    9 
 include/linux/sched.h           |    9 
 include/trace/ftrace.h          |    4 
 kernel/events/core.c            |  407 ++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_event_perf.c |   95 +++------
 kernel/trace/trace_kprobe.c     |   10 
 kernel/trace/trace_output.c     |  116 +++--------
 kernel/trace/trace_syscalls.c   |    8 
 9 files changed, 498 insertions(+), 170 deletions(-)

Index: linux/include/linux/ftrace_event.h
===================================================================
--- linux.orig/include/linux/ftrace_event.h
+++ linux/include/linux/ftrace_event.h
@@ -87,8 +87,6 @@ struct trace_event_functions {
 };
 
 struct trace_event {
-	struct hlist_node		node;
-	struct list_head		list;
 	int				type;
 	struct trace_event_functions	*funcs;
 };
@@ -194,7 +192,6 @@ struct ftrace_event_call {
 
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
-	struct hlist_head __percpu	*perf_events;
 #endif
 };
 
@@ -263,8 +260,9 @@ struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
 
-extern int  perf_trace_init(struct perf_event *event);
+extern int  perf_trace_init(struct perf_event *event, int event_id);
 extern void perf_trace_destroy(struct perf_event *event);
+extern void perf_trace_destroy_id(int id);
 extern int  perf_trace_add(struct perf_event *event, int flags);
 extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -275,9 +273,9 @@ extern void *perf_trace_buf_prepare(int 
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-		       u64 count, struct pt_regs *regs, void *head)
+		       u64 count, struct pt_regs *regs, int id)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx);
+	perf_tp_event(addr, count, raw_data, size, regs, rctx, id);
 }
 #endif
 
Index: linux/include/linux/perf_event.h
===================================================================
--- linux.orig/include/linux/perf_event.h
+++ linux/include/linux/perf_event.h
@@ -247,6 +247,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
+#define PERF_EVENT_IOC_ADD_TP		_IO ('$', 7)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -568,6 +569,11 @@ struct hw_perf_event {
 			struct task_struct		*bp_target;
 		};
 #endif
+		/*
+		 * Same fudge as for breakpoints; trace events need
+		 * it too... convert the bp crap over.
+		 */
+		struct task_struct *event_target;
 	};
 	int				state;
 	local64_t			prev_count;
@@ -859,6 +865,7 @@ struct perf_event {
 #ifdef CONFIG_EVENT_TRACING
 	struct ftrace_event_call	*tp_event;
 	struct event_filter		*filter;
+	struct perf_tp_idr		tp_idr;
 #endif
 
 #ifdef CONFIG_CGROUP_PERF
@@ -1133,7 +1140,7 @@ static inline bool perf_paranoid_kernel(
 extern void perf_event_init(void);
 extern void perf_tp_event(u64 addr, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
-			  struct hlist_head *head, int rctx);
+			  int rctx, int id);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -82,6 +82,7 @@ struct sched_param {
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
+#include <linux/idr.h>
 #include <linux/time.h>
 #include <linux/param.h>
 #include <linux/resource.h>
@@ -1199,6 +1200,11 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+struct perf_tp_idr {
+	struct mutex	lock;
+	struct idr	idr;
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1485,6 +1491,9 @@ struct task_struct {
 	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex perf_event_mutex;
 	struct list_head perf_event_list;
+#ifdef CONFIG_EVENT_TRACING
+	struct perf_tp_idr *perf_tp_idr;
+#endif
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
Index: linux/include/trace/ftrace.h
===================================================================
--- linux.orig/include/trace/ftrace.h
+++ linux/include/trace/ftrace.h
@@ -708,7 +708,6 @@ perf_trace_##call(void *__data, proto)		
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs __regs;						\
 	u64 __addr = 0, __count = 1;					\
-	struct hlist_head *head;					\
 	int __entry_size;						\
 	int __data_size;						\
 	int rctx;							\
@@ -733,9 +732,8 @@ perf_trace_##call(void *__data, proto)		
 									\
 	{ assign; }							\
 									\
-	head = this_cpu_ptr(event_call->perf_events);			\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
-		__count, &__regs, head);				\
+		__count, &__regs, event_call->event.type);		\
 }
 
 /*
Index: linux/kernel/events/core.c
===================================================================
--- linux.orig/kernel/events/core.c
+++ linux/kernel/events/core.c
@@ -823,6 +823,7 @@ list_add_event(struct perf_event *event,
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+	++ctx->generation;
 }
 
 /*
@@ -976,6 +977,7 @@ list_del_event(struct perf_event *event,
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+	++ctx->generation;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -1894,6 +1896,12 @@ static void perf_event_context_sched_out
 	if (!cpuctx->task_ctx)
 		return;
 
+#if 0
+	/*
+	 * Need to sort out how to make task_struct::perf_tp_idr
+	 * work with this fancy switching stuff.. tracepoints could be
+	 * in multiple contexts due to the software event muck.
+	 */
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp[ctxn];
@@ -1927,6 +1935,7 @@ static void perf_event_context_sched_out
 		raw_spin_unlock(&ctx->lock);
 	}
 	rcu_read_unlock();
+#endif
 
 	if (do_switch) {
 		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
@@ -3261,6 +3270,7 @@ static struct perf_event *perf_fget_ligh
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_add_tp(struct perf_event *event, int tp_id);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3307,6 +3317,9 @@ static long perf_ioctl(struct file *file
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_ADD_TP:
+		return perf_event_add_tp(event, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -5471,6 +5484,9 @@ static struct pmu perf_swevent = {
 
 #ifdef CONFIG_EVENT_TRACING
 
+#include <linux/ftrace_event.h>
+#include "../trace/trace_output.h"
+
 static int perf_tp_filter_match(struct perf_event *event,
 				struct perf_sample_data *data)
 {
@@ -5485,8 +5501,9 @@ static int perf_tp_event_match(struct pe
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->hw.state & PERF_HES_STOPPED)
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return 0;
+
 	/*
 	 * All tracepoints are from kernel-space.
 	 */
@@ -5499,8 +5516,60 @@ static int perf_tp_event_match(struct pe
 	return 1;
 }
 
+static void perf_tp_idr_init(struct perf_tp_idr *idr)
+{
+	idr_init(&idr->idr);
+	mutex_init(&idr->lock);
+}
+
+static DEFINE_PER_CPU(struct perf_tp_idr, perf_tp_idr);
+
+struct perf_tp_node {
+	struct list_head	list;
+	struct perf_event	*event;
+	struct rcu_head		rcu;
+};
+
+static void do_perf_tp_event(struct perf_event *event, u64 count,
+			     struct perf_sample_data *data,
+			     struct pt_regs *regs)
+{
+	if (perf_tp_event_match(event, data, regs))
+		perf_swevent_event(event, count, 1, data, regs);
+}
+
+static void perf_tp_idr_event(struct perf_tp_idr *tp_idr,
+			      int id, u64 count,
+			      struct perf_sample_data *data,
+			      struct pt_regs *regs)
+{
+	struct perf_tp_node *tp_node, *node;
+	struct perf_event *event;
+
+	if (!tp_idr)
+		return;
+
+	/*
+	 * Most of this is done under rcu_read_lock_sched(), which doesn't
+	 * exclude regular RCU grace periods, but the IDR code uses call_rcu()
+	 * so we have to use rcu_read_lock() here as well.
+	 */
+	rcu_read_lock();
+	tp_node = idr_find(&tp_idr->idr, id);
+	rcu_read_unlock();
+
+	if (!tp_node)
+		return;
+
+	event = tp_node->event;
+
+	do_perf_tp_event(event, count, data, regs);
+	list_for_each_entry_rcu(node, &tp_node->list, list)
+		do_perf_tp_event(node->event, count, data, regs);
+}
+
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-		   struct pt_regs *regs, struct hlist_head *head, int rctx)
+		   struct pt_regs *regs, int rctx, int id)
 {
 	struct perf_sample_data data;
 	struct perf_event *event;
@@ -5514,18 +5583,197 @@ void perf_tp_event(u64 addr, u64 count, 
 	perf_sample_data_init(&data, addr);
 	data.raw = &raw;
 
-	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
-		if (perf_tp_event_match(event, &data, regs))
-			perf_swevent_event(event, count, 1, &data, regs);
-	}
+	perf_tp_idr_event(&__get_cpu_var(perf_tp_idr), id, count, &data, regs);
+	perf_tp_idr_event(current->perf_tp_idr, id, count, &data, regs);
 
 	perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
+static struct perf_tp_idr *
+perf_tp_init_task(struct perf_event *event, struct task_struct *task)
+{
+	struct perf_tp_idr *idr;
+
+	mutex_lock(&task->perf_event_mutex);
+	idr = task->perf_tp_idr;
+	if (idr)
+		goto unlock;
+
+	idr = kzalloc(sizeof(struct perf_tp_idr), GFP_KERNEL);
+	if (!idr)
+		goto unlock;
+
+	perf_tp_idr_init(idr);
+
+	task->perf_tp_idr = idr;
+unlock:
+	mutex_unlock(&task->perf_event_mutex);
+
+	return idr;
+}
+
+static struct perf_tp_idr *perf_event_idr(struct perf_event *event, bool create)
+{
+	struct perf_tp_idr *tp_idr;
+	struct task_struct *task;
+
+	if (event->attach_state & PERF_ATTACH_TASK) {
+		task = event->hw.event_target;
+		tp_idr = task->perf_tp_idr;
+		if (!tp_idr && create)
+			tp_idr = perf_tp_init_task(event, task);
+	} else
+		tp_idr = &per_cpu(perf_tp_idr, event->cpu);
+
+	return tp_idr;
+}
+
+static void perf_tp_free_node(struct rcu_head *rcu)
+{
+	struct perf_tp_node *node = container_of(rcu, struct perf_tp_node, rcu);
+
+	kfree(node);
+}
+
+static int perf_tp_remove_idr(int id, void *p, void *data)
+{
+	struct perf_tp_node *node = p;
+	struct perf_tp_node *first, *next;
+	struct perf_tp_idr *tp_idr = data;
+
+	if (!tp_idr)
+		goto no_idr;
+
+	mutex_lock(&tp_idr->lock);
+	first = idr_find(&tp_idr->idr, id);
+	if (first == node) {
+		next = list_first_entry(&first->list, struct perf_tp_node, list);
+		if (next != first)
+			idr_replace(&tp_idr->idr, next, id);
+		else
+			idr_remove(&tp_idr->idr, id);
+	}
+	list_del_rcu(&node->list);
+	mutex_unlock(&tp_idr->lock);
+
+no_idr:
+	perf_trace_destroy_id(id);
+	call_rcu_sched(&node->rcu, perf_tp_free_node);
+	return 0;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	perf_trace_destroy(event);
+	/*
+	 * Since this is the free path, the fd is gone and there
+	 * can be no concurrency on event->tp_idr.
+	 */
+
+	idr_for_each(&event->tp_idr.idr, perf_tp_remove_idr,
+			perf_event_idr(event, false));
+
+	idr_remove_all(&event->tp_idr.idr);
+	idr_destroy(&event->tp_idr.idr);
+}
+
+static int __perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+	struct perf_tp_node *node, *first;
+	struct perf_tp_idr *idr;
+	int tmp_id, err, ret = -ENOMEM;
+
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		goto out;
+
+	node->event = event;
+	INIT_LIST_HEAD(&node->list);
+
+	/*
+	 * Insert the node into the event->idr, this idr tracks the
+	 * tracepoints we're interested in, it has a 1:1 relation
+	 * with the node.
+	 */
+	idr = &event->tp_idr;
+	mutex_lock(&idr->lock);
+	err = idr_pre_get(&idr->idr, GFP_KERNEL);
+	if (!err) {
+		ret = -ENOMEM;
+		goto free_node;
+	}
+
+	ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+	if (ret)
+		goto free_node;
+
+	if (WARN_ON(tp_id != tmp_id)) {
+		printk(KERN_ERR "fail: %d %d\n", tp_id, tmp_id);
+		ret = -EBUSY;
+		goto free_idr1;
+	}
+	mutex_unlock(&idr->lock);
+
+	/*
+	 * Insert the node into the task/cpu idr, this idr tracks
+	 * all active tracepoints for the task/cpu, it has a 1:n relation
+	 * with the node.
+	 */
+	idr = perf_event_idr(event, true);
+	if (!idr) {
+		if (event->attach_state & PERF_ATTACH_CONTEXT)
+			ret = -ENOMEM;
+		else
+			ret = -ESRCH;
+		goto free_idr1_set;
+	}
+	mutex_lock(&idr->lock);
+	first = idr_find(&idr->idr, tp_id);
+	if (first) {
+		list_add_rcu(&node->list, &first->list);
+		goto unlock;
+	}
+
+	err = idr_pre_get(&idr->idr, GFP_KERNEL);
+	if (!err) {
+		ret = -ENOMEM;
+		goto free_idr1_set_unlock;
+	}
+
+	ret = idr_get_new_above(&idr->idr, node, tp_id, &tmp_id);
+	if (ret)
+		goto free_idr1_set;
+
+	if (WARN_ON(tp_id != tmp_id)) {
+		ret = -EBUSY;
+		goto free_idr2;
+	}
+unlock:
+	mutex_unlock(&idr->lock);
+
+	ret = perf_trace_init(event, tp_id);
+	if (ret)
+		goto free_all;
+
+out:
+	return ret;
+
+free_all:
+	mutex_lock(&idr->lock);
+free_idr2:
+	idr_remove(&idr->idr, tmp_id);
+free_idr1_set_unlock:
+	mutex_unlock(&idr->lock);
+free_idr1_set:
+	idr = &event->tp_idr;
+	tmp_id = tp_id;
+	mutex_lock(&idr->lock);
+free_idr1:
+	idr_remove(&idr->idr, tmp_id);
+free_node:
+	mutex_unlock(&idr->lock);
+	kfree(node);
+	goto out;
 }
 
 static int perf_tp_event_init(struct perf_event *event)
@@ -5535,21 +5783,35 @@ static int perf_tp_event_init(struct per
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -ENOENT;
 
-	err = perf_trace_init(event);
-	if (err)
-		return err;
+	perf_tp_idr_init(&event->tp_idr);
 
 	event->destroy = tp_perf_event_destroy;
 
+	if (event->attr.config != ~0ULL) {
+		err = __perf_event_add_tp(event, event->attr.config);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
+static int perf_tp_event_add(struct perf_event *event, int flags)
+{
+	event->hw.state = flags & PERF_EF_START ? 0 : PERF_HES_STOPPED;
+	return 0;
+}
+
+static void perf_tp_event_del(struct perf_event *event, int flags)
+{
+}
+
 static struct pmu perf_tracepoint = {
 	.task_ctx_nr	= perf_sw_context,
 
 	.event_init	= perf_tp_event_init,
-	.add		= perf_trace_add,
-	.del		= perf_trace_del,
+	.add		= perf_tp_event_add,
+	.del		= perf_tp_event_del,
 	.start		= perf_swevent_start,
 	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
@@ -5557,6 +5819,11 @@ static struct pmu perf_tracepoint = {
 
 static inline void perf_tp_register(void)
 {
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		perf_tp_idr_init(&per_cpu(perf_tp_idr, cpu));
+
 	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 }
 
@@ -5565,7 +5832,8 @@ static int perf_event_set_filter(struct 
 	char *filter_str;
 	int ret;
 
-	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+	if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+	    event->attr.config == ~0ULL)
 		return -EINVAL;
 
 	filter_str = strndup_user(arg, PAGE_SIZE);
@@ -5583,6 +5851,74 @@ static void perf_event_free_filter(struc
 	ftrace_profile_free_filter(event);
 }
 
+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+	if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+	    event->attr.config != ~0ULL)
+		return -EINVAL;
+
+	return __perf_event_add_tp(event, tp_id);
+}
+
+/*
+ * Called from the exit path, _after_ all events have been detached from it.
+ */
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+	struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+	if (!idr)
+		return;
+
+	idr_remove_all(&idr->idr);
+	idr_destroy(&idr->idr);
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+	struct perf_tp_idr *idr = tsk->perf_tp_idr;
+
+	tsk->perf_tp_idr = NULL;
+	kfree(idr);
+}
+
+static int perf_tp_inherit_idr(int id, void *p, void *data)
+{
+	struct perf_event *child = data;
+
+	return __perf_event_add_tp(child, id);
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+				 struct perf_event *child_event)
+{
+	int ret;
+
+	if (parent_event->attr.type != PERF_TYPE_TRACEPOINT ||
+	    parent_event->attr.config != ~0ULL)
+		return 0;
+
+	/*
+	 * The child is not yet exposed, hence no need to serialize things
+	 * on that side.
+	 */
+	mutex_lock(&parent_event->tp_idr.lock);
+	ret = idr_for_each(&parent_event->tp_idr.idr,
+			perf_tp_inherit_idr,
+			child_event);
+	mutex_unlock(&parent_event->tp_idr.lock);
+
+	return ret;
+}
+
+static void perf_tp_event_init_task(struct task_struct *child)
+{
+	/*
+	 * Clear the idr pointer copied from the parent.
+	 */
+	child->perf_tp_idr = NULL;
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -5598,6 +5934,29 @@ static void perf_event_free_filter(struc
 {
 }
 
+static int perf_event_add_tp(struct perf_event *event, int tp_id)
+{
+	return -ENOENT;
+}
+
+static void perf_tp_event_exit(struct task_struct *tsk)
+{
+}
+
+static void perf_tp_event_delayed_put(struct task_struct *tsk)
+{
+}
+
+static int perf_tp_event_inherit(struct perf_event *parent_event,
+				 struct perf_event *child_event)
+{
+	return 0;
+}
+
+static void perf_tp_event_init_task(struct task_struct *child)
+{
+}
+
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -6173,6 +6532,9 @@ perf_event_alloc(struct perf_event_attr 
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_waitqueue_head(&event->waitq);
 	init_irq_work(&event->pending, perf_pending_event);
+#ifdef CONFIG_EVENT_TRACING
+	perf_tp_idr_init(&event->tp_idr);
+#endif
 
 	mutex_init(&event->mmap_mutex);
 
@@ -6191,6 +6553,7 @@ perf_event_alloc(struct perf_event_attr 
 
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
+		event->hw.event_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
 		 * hw_breakpoint is a bit difficult here..
@@ -6236,7 +6599,7 @@ done:
 	if (err) {
 		if (event->ns)
 			put_pid_ns(event->ns);
-		kfree(event);
+		free_event(event);
 		return ERR_PTR(err);
 	}
 
@@ -6604,7 +6967,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -6681,7 +7043,6 @@ perf_event_create_kernel_counter(struct 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -6858,6 +7219,8 @@ void perf_event_exit_task(struct task_st
 
 	for_each_task_context_nr(ctxn)
 		perf_event_exit_task_context(child, ctxn);
+
+	perf_tp_event_exit(child);
 }
 
 static void perf_free_event(struct perf_event *event,
@@ -6920,6 +7283,8 @@ void perf_event_delayed_put(struct task_
 
 	for_each_task_context_nr(ctxn)
 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+
+	perf_tp_event_delayed_put(task);
 }
 
 /*
@@ -6935,6 +7300,7 @@ inherit_event(struct perf_event *parent_
 {
 	struct perf_event *child_event;
 	unsigned long flags;
+	int ret;
 
 	/*
 	 * Instead of creating recursive hierarchies of events,
@@ -6952,6 +7318,13 @@ inherit_event(struct perf_event *parent_
 					   NULL);
 	if (IS_ERR(child_event))
 		return child_event;
+
+	ret = perf_tp_event_inherit(parent_event, child_event);
+	if (ret) {
+		free_event(child_event);
+		return ERR_PTR(ret);
+	}
+
 	get_ctx(child_ctx);
 
 	/*
@@ -7177,6 +7550,8 @@ int perf_event_init_task(struct task_str
 	mutex_init(&child->perf_event_mutex);
 	INIT_LIST_HEAD(&child->perf_event_list);
 
+	perf_tp_event_init_task(child);
+
 	for_each_task_context_nr(ctxn) {
 		ret = perf_event_init_context(child, ctxn);
 		if (ret)
Index: linux/kernel/trace/trace_event_perf.c
===================================================================
--- linux.orig/kernel/trace/trace_event_perf.c
+++ linux/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include "trace.h"
+#include "trace_output.h"
 
 static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
 
@@ -47,9 +48,7 @@ static int perf_trace_event_perm(struct 
 static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
-	struct hlist_head __percpu *list;
 	int ret;
-	int cpu;
 
 	ret = perf_trace_event_perm(tp_event, p_event);
 	if (ret)
@@ -61,15 +60,6 @@ static int perf_trace_event_init(struct 
 
 	ret = -ENOMEM;
 
-	list = alloc_percpu(struct hlist_head);
-	if (!list)
-		goto fail;
-
-	for_each_possible_cpu(cpu)
-		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
-
-	tp_event->perf_events = list;
-
 	if (!total_ref_count) {
 		char __percpu *buf;
 		int i;
@@ -100,63 +90,40 @@ fail:
 		}
 	}
 
-	if (!--tp_event->perf_refcount) {
-		free_percpu(tp_event->perf_events);
-		tp_event->perf_events = NULL;
-	}
+	--tp_event->perf_refcount;
 
 	return ret;
 }
 
-int perf_trace_init(struct perf_event *p_event)
+int perf_trace_init(struct perf_event *p_event, int event_id)
 {
 	struct ftrace_event_call *tp_event;
-	int event_id = p_event->attr.config;
+	struct trace_event *t_event;
 	int ret = -EINVAL;
 
+	trace_event_read_lock();
+	t_event = ftrace_find_event(event_id);
+	if (!t_event)
+		goto out;
+
+	tp_event = container_of(t_event, struct ftrace_event_call, event);
+
 	mutex_lock(&event_mutex);
-	list_for_each_entry(tp_event, &ftrace_events, list) {
-		if (tp_event->event.type == event_id &&
-		    tp_event->class && tp_event->class->reg &&
-		    try_module_get(tp_event->mod)) {
-			ret = perf_trace_event_init(tp_event, p_event);
-			if (ret)
-				module_put(tp_event->mod);
-			break;
-		}
+	if (tp_event->class && tp_event->class->reg &&
+			try_module_get(tp_event->mod)) {
+		ret = perf_trace_event_init(tp_event, p_event);
+		if (ret)
+			module_put(tp_event->mod);
 	}
 	mutex_unlock(&event_mutex);
+out:
+	trace_event_read_unlock();
 
 	return ret;
 }
 
-int perf_trace_add(struct perf_event *p_event, int flags)
-{
-	struct ftrace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
-
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
-
-	if (!(flags & PERF_EF_START))
-		p_event->hw.state = PERF_HES_STOPPED;
-
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
-
-	return 0;
-}
-
-void perf_trace_del(struct perf_event *p_event, int flags)
-{
-	hlist_del_rcu(&p_event->hlist_entry);
-}
-
-void perf_trace_destroy(struct perf_event *p_event)
+static void __perf_trace_destroy(struct ftrace_event_call *tp_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
 	int i;
 
 	mutex_lock(&event_mutex);
@@ -171,9 +138,6 @@ void perf_trace_destroy(struct perf_even
 	 */
 	tracepoint_synchronize_unregister();
 
-	free_percpu(tp_event->perf_events);
-	tp_event->perf_events = NULL;
-
 	if (!--total_ref_count) {
 		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
 			free_percpu(perf_trace_buf[i]);
@@ -185,6 +149,27 @@ out:
 	mutex_unlock(&event_mutex);
 }
 
+void perf_trace_destroy(struct perf_event *p_event)
+{
+	__perf_trace_destroy(p_event->tp_event);
+}
+
+void perf_trace_destroy_id(int event_id)
+{
+	struct ftrace_event_call *tp_event;
+	struct trace_event *t_event;
+
+	trace_event_read_lock();
+	t_event = ftrace_find_event(event_id);
+	if (!t_event)
+		goto unlock;
+
+	tp_event = container_of(t_event, struct ftrace_event_call, event);
+	__perf_trace_destroy(tp_event);
+unlock:
+	trace_event_read_unlock();
+}
+
 __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 				       struct pt_regs *regs, int *rctxp)
 {
Index: linux/kernel/trace/trace_kprobe.c
===================================================================
--- linux.orig/kernel/trace/trace_kprobe.c
+++ linux/kernel/trace/trace_kprobe.c
@@ -1659,7 +1659,6 @@ static __kprobes void kprobe_perf_func(s
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kprobe_trace_entry_head *entry;
-	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
@@ -1679,8 +1678,8 @@ static __kprobes void kprobe_perf_func(s
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
-	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs,
+			      call->event.type);
 }
 
 /* Kretprobe profile handler */
@@ -1690,7 +1689,6 @@ static __kprobes void kretprobe_perf_fun
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
 	struct kretprobe_trace_entry_head *entry;
-	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
@@ -1710,8 +1708,8 @@ static __kprobes void kretprobe_perf_fun
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
 
-	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+			      regs, call->event.type);
 }
 
 static int probe_perf_enable(struct ftrace_event_call *call)
Index: linux/kernel/trace/trace_output.c
===================================================================
--- linux.orig/kernel/trace/trace_output.c
+++ linux/kernel/trace/trace_output.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
+#include <linux/idr.h>
 
 #include "trace_output.h"
 
@@ -16,9 +17,9 @@
 
 DECLARE_RWSEM(trace_event_mutex);
 
-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static const int first_event_type = __TRACE_LAST_TYPE + 1;
 
-static int next_event_type = __TRACE_LAST_TYPE + 1;
+static DEFINE_IDR(trace_type_idr);
 
 int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
@@ -664,58 +665,43 @@ static int task_state_char(unsigned long
  */
 struct trace_event *ftrace_find_event(int type)
 {
-	struct trace_event *event;
-	struct hlist_node *n;
-	unsigned key;
-
-	key = type & (EVENT_HASHSIZE - 1);
-
-	hlist_for_each_entry(event, n, &event_hash[key], node) {
-		if (event->type == type)
-			return event;
-	}
-
-	return NULL;
+	return idr_find(&trace_type_idr, type);
 }
 
-static LIST_HEAD(ftrace_event_list);
+void trace_event_read_lock(void)
+{
+	down_read(&trace_event_mutex);
+}
 
-static int trace_search_list(struct list_head **list)
+void trace_event_read_unlock(void)
 {
-	struct trace_event *e;
-	int last = __TRACE_LAST_TYPE;
+	up_read(&trace_event_mutex);
+}
 
-	if (list_empty(&ftrace_event_list)) {
-		*list = &ftrace_event_list;
-		return last + 1;
-	}
+static int register_event(struct trace_event *event, int id, bool strict)
+{
+	int ret, type;
 
-	/*
-	 * We used up all possible max events,
-	 * lets see if somebody freed one.
-	 */
-	list_for_each_entry(e, &ftrace_event_list, list) {
-		if (e->type != last + 1)
-			break;
-		last++;
-	}
+	ret = idr_pre_get(&trace_type_idr, GFP_KERNEL);
+	if (!ret)
+		return 0;
 
-	/* Did we used up all 65 thousand events??? */
-	if ((last + 1) > FTRACE_MAX_EVENT)
+	ret = idr_get_new_above(&trace_type_idr, event, id, &type);
+	if (ret)
 		return 0;
 
-	*list = &e->list;
-	return last + 1;
-}
+	if (strict && id != type) {
+		idr_remove(&trace_type_idr, type);
+		return 0;
+	}
 
-void trace_event_read_lock(void)
-{
-	down_read(&trace_event_mutex);
-}
+	if (type > FTRACE_MAX_EVENT) {
+		idr_remove(&trace_type_idr, type);
+		return 0;
+	}
 
-void trace_event_read_unlock(void)
-{
-	up_read(&trace_event_mutex);
+	event->type = type;
+	return type;
 }
 
 /**
@@ -735,7 +721,6 @@ void trace_event_read_unlock(void)
  */
 int register_ftrace_event(struct trace_event *event)
 {
-	unsigned key;
 	int ret = 0;
 
 	down_write(&trace_event_mutex);
@@ -746,35 +731,18 @@ int register_ftrace_event(struct trace_e
 	if (WARN_ON(!event->funcs))
 		goto out;
 
-	INIT_LIST_HEAD(&event->list);
-
 	if (!event->type) {
-		struct list_head *list = NULL;
-
-		if (next_event_type > FTRACE_MAX_EVENT) {
-
-			event->type = trace_search_list(&list);
-			if (!event->type)
-				goto out;
-
-		} else {
-			
-			event->type = next_event_type++;
-			list = &ftrace_event_list;
-		}
-
-		if (WARN_ON(ftrace_find_event(event->type)))
+		ret = register_event(event, first_event_type, false);
+		if (!ret)
 			goto out;
-
-		list_add_tail(&event->list, list);
-
-	} else if (event->type > __TRACE_LAST_TYPE) {
-		printk(KERN_WARNING "Need to add type to trace.h\n");
-		WARN_ON(1);
-		goto out;
 	} else {
-		/* Is this event already used */
-		if (ftrace_find_event(event->type))
+		if (event->type > __TRACE_LAST_TYPE) {
+			printk(KERN_WARNING "Need to add type to trace.h\n");
+			WARN_ON(1);
+			goto out;
+		}
+		ret = register_event(event, event->type, true);
+		if (!ret)
 			goto out;
 	}
 
@@ -787,11 +755,6 @@ int register_ftrace_event(struct trace_e
 	if (event->funcs->binary == NULL)
 		event->funcs->binary = trace_nop_print;
 
-	key = event->type & (EVENT_HASHSIZE - 1);
-
-	hlist_add_head(&event->node, &event_hash[key]);
-
-	ret = event->type;
  out:
 	up_write(&trace_event_mutex);
 
@@ -804,8 +767,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event)
  */
 int __unregister_ftrace_event(struct trace_event *event)
 {
-	hlist_del(&event->node);
-	list_del(&event->list);
+	idr_remove(&trace_type_idr, event->type);
 	return 0;
 }
 
Index: linux/kernel/trace/trace_syscalls.c
===================================================================
--- linux.orig/kernel/trace/trace_syscalls.c
+++ linux/kernel/trace/trace_syscalls.c
@@ -499,7 +499,6 @@ static void perf_syscall_enter(void *ign
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
-	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -530,8 +529,7 @@ static void perf_syscall_enter(void *ign
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
-	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -573,7 +571,6 @@ static void perf_syscall_exit(void *igno
 {
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
-	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -606,8 +603,7 @@ static void perf_syscall_exit(void *igno
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, rec->ent.type);
 }
 
 int perf_sysexit_enable(struct ftrace_event_call *call)
--
