Date:   Fri, 23 Sep 2016 14:27:24 +0300
From:   Alexander Shishkin <alexander.shishkin@...ux.intel.com>
To:     Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc:     Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org,
        vince@...ter.net, eranian@...gle.com,
        Arnaldo Carvalho de Melo <acme@...radead.org>,
        tglx@...utronix.de, ak@...ux.intel.com,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Subject: [RFC PATCH 4/6] perf: Add infrastructure for using AUX data in perf samples

AUX data can be used to annotate perf events such as performance counters
or tracepoints/breakpoints by including it in sample records when the
PERF_SAMPLE_AUX flag is set. Such samples would be instrumental in debugging
and profiling by providing, for example, a history of instruction flow
leading up to the event's overflow.
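
On the consumer side, the new part of a PERF_RECORD_SAMPLE is simply a
u64 size followed by that many bytes of AUX data (see the
include/uapi/linux/perf_event.h hunk below). As a rough illustration,
not part of this patch and with a hypothetical helper name, a tool that
has already parsed the preceding sample fields could step over it like
this:

#include <linux/types.h>

/*
 * Hypothetical consumer-side helper, not part of this patch: step over
 * the PERF_SAMPLE_AUX trailer of a PERF_RECORD_SAMPLE.  The kernel pads
 * the AUX data up to 'size', so the next field starts right after it.
 */
static const void *skip_sample_aux(const void *p, const void **aux_data,
				   __u64 *aux_size)
{
	const __u64 *size = p;

	*aux_size = *size;
	*aux_data = size + 1;

	return (const char *)(size + 1) + *size;
}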

To facilitate this, the patch adds code that creates a kernel counter with
a ring buffer to track and collect AUX data, which is then copied out into
the sampled events' perf data stream as samples.

The user interface is extended to allow for this; the following new
attribute fields are added (see the usage sketch below the list):

  * aux_sample_type: specifies the PMU on which the AUX data generating
                     event is created;
  * aux_sample_config: event config (maps to the attribute's config field);
  * aux_sample_size: size of the sample to be written.
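
As a rough usage sketch (not part of this patch; it assumes a uapi header
carrying the new fields), a sampled event requesting AUX annotation could
be opened from userspace like this, with the aux_sample_type value taken
from /sys/bus/event_source/devices/<pmu>/type:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Hypothetical userspace sketch, not part of this patch: open a cycles
 * event whose samples carry aux_sample_size bytes of AUX data from the
 * PMU identified by aux_pmu_type (e.g. the value read from
 * /sys/bus/event_source/devices/intel_pt/type).
 */
static int open_aux_annotated_event(__u32 aux_pmu_type, pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size		= sizeof(attr);	/* PERF_ATTR_SIZE_VER6 */
	attr.type		= PERF_TYPE_HARDWARE;
	attr.config		= PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period	= 100000;
	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
				  PERF_SAMPLE_AUX;
	attr.exclude_kernel	= 1;

	/* AUX area sampling configuration added by this patch */
	attr.aux_sample_type	= aux_pmu_type;	/* pmu::type of the AUX PMU */
	attr.aux_sample_config	= 0;		/* that PMU's event config */
	attr.aux_sample_size	= 4096;		/* bytes of AUX data per sample */

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}

Each PERF_RECORD_SAMPLE for such an event then carries the size/data
trailer shown in the record layout further down.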

This kernel counter is configured similarly to the event being annotated
with regard to filtering (exclude_{hv,idle,user,kernel}) and enabled state
(disabled, enable_on_exec), to make sure that the sampler does not track
any out-of-context activity. One sampler can be used for multiple events.

Signed-off-by: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
---
 include/linux/perf_event.h      |  12 ++
 include/uapi/linux/perf_event.h |  16 +-
 kernel/events/core.c            | 315 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 341 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5c5362584a..7121cf7b5c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -101,6 +101,12 @@ struct perf_branch_stack {
 	struct perf_branch_entry	entries[0];
 };
 
+struct perf_aux_record {
+	u64		size;
+	unsigned long	from;
+	unsigned long	to;
+};
+
 struct task_struct;
 
 /*
@@ -532,6 +538,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 #define PERF_ATTACH_TASK_DATA	0x08
+#define PERF_ATTACH_SAMPLING	0x10
 
 struct perf_cgroup;
 struct ring_buffer;
@@ -691,6 +698,9 @@ struct perf_event {
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
+	struct perf_event		*aux_sampler;
+	atomic_long_t			aux_samplees_count;
+
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
@@ -888,6 +898,7 @@ struct perf_sample_data {
 	 */
 	u64				addr;
 	struct perf_raw_record		*raw;
+	struct perf_aux_record		aux;
 	struct perf_branch_stack	*br_stack;
 	u64				period;
 	u64				weight;
@@ -937,6 +948,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	/* remaining struct members initialized in perf_prepare_sample() */
 	data->addr = addr;
 	data->raw  = NULL;
+	data->aux.from = data->aux.to = data->aux.size = 0;
 	data->br_stack = NULL;
 	data->period = period;
 	data->weight = 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24..1bf3f2c358 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
+	PERF_SAMPLE_AUX				= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -273,6 +274,9 @@ enum perf_event_read_format {
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6	136	/* add: aux_sample_type */
+					/* add: aux_sample_config */
+					/* add: aux_sample_size */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -390,6 +394,14 @@ struct perf_event_attr {
 	__u32	aux_watermark;
 	__u16	sample_max_stack;
 	__u16	__reserved_2;	/* align to __u64 */
+
+	/*
+	 * AUX area sampling configuration
+	 */
+	__u64	aux_sample_config;	/* event config for AUX sampling */
+	__u64	aux_sample_size;	/* desired sample size */
+	__u32	aux_sample_type;	/* pmu::type of an AUX PMU */
+	__u32	__reserved_3;		/* align to __u64 */
 };
 
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
@@ -773,6 +785,8 @@ enum perf_event_type {
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+	 *	{ u64			size;
+	 *	  char			data[size]; } && PERF_SAMPLE_AUX
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b64a5c611f..fdb20fdeb1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2422,6 +2422,25 @@ static void _perf_event_enable(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
+	if (event->aux_sampler) {
+		struct perf_event_context *sctx = event->aux_sampler->ctx;
+
+		lockdep_assert_held(&ctx->mutex);
+
+		if (sctx != ctx) {
+			sctx = perf_event_ctx_lock_nested(event->aux_sampler,
+							  SINGLE_DEPTH_NESTING);
+			if (WARN_ON_ONCE(!sctx))
+				goto done;
+		}
+
+		_perf_event_enable(event->aux_sampler);
+
+		if (sctx != ctx)
+			perf_event_ctx_unlock(event->aux_sampler, sctx);
+	}
+
+done:
 	raw_spin_lock_irq(&ctx->lock);
 	if (event->state >= PERF_EVENT_STATE_INACTIVE ||
 	    event->state <  PERF_EVENT_STATE_ERROR) {
@@ -3855,6 +3874,8 @@ static void unaccount_freq_event(void)
 		atomic_dec(&nr_freq_events);
 }
 
+static void perf_aux_sampler_fini(struct perf_event *event);
+
 static void unaccount_event(struct perf_event *event)
 {
 	bool dec = false;
@@ -3886,6 +3907,9 @@ static void unaccount_event(struct perf_event *event)
 			schedule_delayed_work(&perf_sched_work, HZ);
 	}
 
+	if (event->attr.sample_type & PERF_SAMPLE_AUX)
+		perf_aux_sampler_fini(event);
+
 	unaccount_event_cpu(event, event->cpu);
 
 	unaccount_pmu_sb_event(event);
@@ -3993,6 +4017,23 @@ static void _free_event(struct perf_event *event)
 
 	unaccount_event(event);
 
+	if (kernel_rb_event(event)) {
+		struct perf_event_context *ctx = event->ctx;
+		unsigned long flags;
+
+		/*
+		 * This event may not be explicitly freed by
+		 * perf_event_release_kernel(), we still need to remove it
+		 * from its context.
+		 */
+		raw_spin_lock_irqsave(&ctx->lock, flags);
+		list_del_event(event, ctx);
+		raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+		ring_buffer_unaccount(event->rb, false);
+		rb_free_kernel(event->rb, event);
+	}
+
 	if (event->rb) {
 		/*
 		 * Can happen when we close an event with re-directed output.
@@ -5455,6 +5496,232 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 	}
 }
 
+struct perf_event *__find_sampling_counter(struct perf_event_context *ctx,
+					   struct perf_event *event,
+					   struct task_struct *task)
+{
+	struct perf_event *sampler = NULL;
+
+	list_for_each_entry(sampler, &ctx->event_list, event_entry) {
+		if (kernel_rb_event(sampler) &&
+		    sampler->cpu                  == event->cpu &&
+		    sampler->attr.type            == event->attr.aux_sample_type &&
+		    sampler->attr.config          == event->attr.aux_sample_config &&
+		    sampler->attr.exclude_hv      == event->attr.exclude_hv &&
+		    sampler->attr.exclude_idle    == event->attr.exclude_idle &&
+		    sampler->attr.exclude_user    == event->attr.exclude_user &&
+		    sampler->attr.exclude_kernel  == event->attr.exclude_kernel &&
+		    sampler->attr.aux_sample_size >= event->attr.aux_sample_size &&
+		    atomic_long_inc_not_zero(&sampler->refcount))
+			return sampler;
+	}
+
+	return NULL;
+}
+
+struct perf_event *find_sampling_counter(struct pmu *pmu,
+					 struct perf_event *event,
+					 struct task_struct *task)
+{
+	struct perf_event *sampler = NULL;
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	unsigned long flags;
+
+	if (!task) {
+		if (!cpu_online(event->cpu))
+			return NULL;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, event->cpu);
+		ctx = &cpuctx->ctx;
+		raw_spin_lock_irqsave(&ctx->lock, flags);
+	} else {
+		ctx = perf_lock_task_context(task, pmu->task_ctx_nr, &flags);
+
+		if (!ctx)
+			return NULL;
+	}
+
+	sampler = __find_sampling_counter(ctx, event, task);
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+	return sampler;
+}
+
+/*
+ * Sampling AUX data in perf events is done by means of a kernel event that
+ * collects data to its own ring_buffer. This data gets copied out into sampled
+ * event's SAMPLE_AUX records every time the sampled event overflows. One such
+ * kernel event (sampler) can be used to provide samples for multiple events
+ * (samplees) on the same context if their attributes match. Each samplee
+ * holds a reference to the sampler event; the last one out frees the sampler;
+ * perf_event_exit_task() is instructed not to free samplers directly.
+ */
+static int perf_aux_sampler_init(struct perf_event *event,
+				 struct task_struct *task,
+				 struct pmu *pmu)
+{
+	struct perf_event_attr attr;
+	struct perf_event *sampler;
+	unsigned long nr_pages;
+	int ret;
+
+	if (!pmu || !pmu->setup_aux)
+		return -ENOTSUPP;
+
+	sampler = find_sampling_counter(pmu, event, task);
+	if (!sampler) {
+		memset(&attr, 0, sizeof(attr));
+		attr.type            = pmu->type;
+		attr.config          = event->attr.aux_sample_config;
+		attr.disabled        = 1; /* see below */
+		attr.enable_on_exec  = event->attr.enable_on_exec;
+		attr.exclude_hv      = event->attr.exclude_hv;
+		attr.exclude_idle    = event->attr.exclude_idle;
+		attr.exclude_user    = event->attr.exclude_user;
+		attr.exclude_kernel  = event->attr.exclude_kernel;
+		attr.aux_sample_size = event->attr.aux_sample_size;
+
+		sampler = perf_event_create_kernel_counter(&attr, event->cpu,
+							   task, NULL, NULL);
+		if (IS_ERR(sampler))
+			return PTR_ERR(sampler);
+
+		nr_pages = 1ul << __get_order(event->attr.aux_sample_size);
+
+		ret = rb_alloc_kernel(sampler, 0, nr_pages);
+		if (ret) {
+			perf_event_release_kernel(sampler);
+			return ret;
+		}
+
+		/*
+		 * This event will be freed by the last exiting samplee;
+		 * perf_event_exit_task() should skip it over.
+		 */
+		sampler->attach_state |= PERF_ATTACH_SAMPLING;
+	}
+
+	event->aux_sampler = sampler;
+
+	if (atomic_long_inc_return(&sampler->aux_samplees_count) == 1) {
+		/*
+		 * enable the sampler here unless the original event wants
+		 * to stay disabled
+		 */
+		if (!event->attr.disabled)
+			perf_event_enable(sampler);
+	}
+
+	return 0;
+}
+
+static void perf_aux_sampler_fini(struct perf_event *event)
+{
+	struct perf_event *sampler = event->aux_sampler;
+
+	if (!sampler)
+		return;
+
+	/*
+	 * We're holding a reference to the sampler, so it's always
+	 * valid here.
+	 */
+	if (atomic_long_dec_and_test(&sampler->aux_samplees_count))
+		perf_event_disable(sampler);
+
+	/* can be last */
+	put_event(sampler);
+
+	event->aux_sampler = NULL;
+}
+
+static unsigned long perf_aux_sampler_trace(struct perf_event *event,
+					    struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->aux_sampler;
+	struct ring_buffer *rb;
+	int *disable_count;
+
+	data->aux.size = 0;
+
+	if (!sampler || READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)
+		goto out;
+
+	if (READ_ONCE(sampler->oncpu) != smp_processor_id())
+		goto out;
+
+	/*
+	 * Non-zero disable count here means that we, being the NMI
+	 * context, are racing with pmu::add or pmu::del, both of which
+	 * may lead to a dangling hardware event and all manner of mayhem.
+	 */
+	disable_count = this_cpu_ptr(sampler->pmu->pmu_disable_count);
+	if (*disable_count)
+		goto out;
+
+	perf_pmu_disable(sampler->pmu);
+
+	rb = ring_buffer_get(sampler);
+	if (!rb) {
+		perf_pmu_enable(sampler->pmu);
+		goto out;
+	}
+
+	sampler->pmu->stop(sampler, PERF_EF_UPDATE);
+
+	data->aux.to = local_read(&rb->aux_head);
+
+	if (data->aux.to < sampler->attr.aux_sample_size)
+		data->aux.from = rb->aux_nr_pages * PAGE_SIZE +
+			data->aux.to - sampler->attr.aux_sample_size;
+	else
+		data->aux.from = data->aux.to -
+			sampler->attr.aux_sample_size;
+	data->aux.size = ALIGN(sampler->attr.aux_sample_size, sizeof(u64));
+	ring_buffer_put(rb);
+
+out:
+	return data->aux.size;
+}
+
+static void perf_aux_sampler_output(struct perf_event *event,
+				    struct perf_output_handle *handle,
+				    struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->aux_sampler;
+	struct ring_buffer *rb;
+	unsigned long pad;
+	int ret;
+
+	if (WARN_ON_ONCE(!sampler || !data->aux.size))
+		goto out_enable;
+
+	rb = ring_buffer_get(sampler);
+	if (WARN_ON_ONCE(!rb))
+		goto out_enable;
+
+	ret = rb_output_aux(rb, data->aux.from, data->aux.to,
+			    (aux_copyfn)perf_output_copy, handle);
+	if (ret < 0) {
+		pr_warn_ratelimited("failed to copy trace data\n");
+		goto out;
+	}
+
+	pad = data->aux.size - ret;
+	if (pad) {
+		u64 p = 0;
+
+		perf_output_copy(handle, &p, pad);
+	}
+out:
+	ring_buffer_put(rb);
+	sampler->pmu->start(sampler, 0);
+
+out_enable:
+	perf_pmu_enable(sampler->pmu);
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -5774,6 +6041,13 @@ void perf_output_sample(struct perf_output_handle *handle,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_AUX) {
+		perf_output_put(handle, data->aux.size);
+
+		if (data->aux.size)
+			perf_aux_sampler_output(event, handle, data);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5907,6 +6181,14 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_AUX) {
+		u64 size = sizeof(u64);
+
+		size += perf_aux_sampler_trace(event, data);
+
+		header->size += size;
+	}
 }
 
 static void __always_inline
@@ -6109,6 +6391,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 		event->addr_filters_gen++;
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
+	perf_pmu_enable(event->pmu);
+
 	if (restart)
 		perf_event_stop(event, 1);
 }
@@ -6673,6 +6957,8 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 		event->addr_filters_gen++;
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
+	perf_pmu_enable(event->pmu);
+
 	if (restart)
 		perf_event_stop(event, 1);
 }
@@ -9076,10 +9362,27 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	}
 
 	if (!event->parent) {
+		if (event->attr.sample_type & PERF_SAMPLE_AUX) {
+			struct pmu *aux_pmu;
+			int idx;
+
+			err = -EINVAL;
+
+			idx = srcu_read_lock(&pmus_srcu);
+			aux_pmu = __perf_find_pmu(event->attr.aux_sample_type);
+			if (aux_pmu)
+				err = perf_aux_sampler_init(event, task,
+							    aux_pmu);
+			srcu_read_unlock(&pmus_srcu, idx);
+
+			if (err)
+				goto err_addr_filters;
+		}
+
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 			err = get_callchain_buffers(attr->sample_max_stack);
 			if (err)
-				goto err_addr_filters;
+				goto err_aux_sampler;
 		}
 	}
 
@@ -9088,6 +9391,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_aux_sampler:
+	perf_aux_sampler_fini(event);
+
 err_addr_filters:
 	kfree(event->addr_filters_offs);
 
@@ -9917,6 +10223,13 @@ perf_event_exit_event(struct perf_event *child_event,
 	struct perf_event *parent_event = child_event->parent;
 
 	/*
+	 * Skip over samplers, they are released by the last holder
+	 * of their reference.
+	 */
+	if (child_event->attach_state & PERF_ATTACH_SAMPLING)
+		return;
+
+	/*
 	 * Do not destroy the 'original' grouping; because of the context
 	 * switch optimization the original events could've ended up in a
 	 * random child task.
-- 
2.9.3
