Message-ID: <87lfw234ew.fsf@ashishki-desk.ger.corp.intel.com>
Date:   Fri, 09 Aug 2019 15:32:39 +0300
From:   Alexander Shishkin <alexander.shishkin@...ux.intel.com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     Ingo Molnar <mingo@...hat.com>,
        Arnaldo Carvalho de Melo <acme@...hat.com>,
        linux-kernel@...r.kernel.org, jolsa@...hat.com,
        alexander.shishkin@...ux.intel.com
Subject: Re: [PATCH v1 4/6] perf: Allow using AUX data in perf samples

Peter Zijlstra <peterz@...radead.org> writes:

> On Tue, Jun 19, 2018 at 01:47:25PM +0300, Alexander Shishkin wrote:
>> Right, the SW stuff may then race with event_function_call() stuff. Hmm.
>> For the HW stuff, I'm hoping that some kind of a sleight of hand may
>> suffice. Let me think some more.
>
> I currently don't see how the SW driven snapshot can ever work, see my
> comment on the last patch.

Yes, this should be solved by grouping, similarly to PEBS->PT.

>> > Why can't you just snapshot the current location and let the thing
>> > 'run' ?
>> 
>> Because the buffer will overwrite itself and the location will be useless.
>
> Not if it's large enough ;-)
>
>> We don't write the AUX data out in this 'mode' at all, only the samples,
>> which allows for much less data in the resulting perf.data, less work for
>> the consumer, less IO bandwidth etc, and as a bonus, no AUX-related
>> interrupts.
>> 
>> But actually, even to snapshot the location we need to stop the event.
>
> Maybe new methods that should only be called from NMI context? 

Right, I went with that and added a ->snapshot_aux() method that must not
change any state or do anything else that would affect the operation of
any in-IRQ callbacks the NMI may have preempted. So, if the NMI hits in the
middle of ->stop() or whatnot, we should be fine. Some context: in AUX
overwrite mode (aka "snapshot mode") we don't get PT-related NMIs; the only
NMIs are from the HW events for which we're writing samples. We use the
cpu-local ->handle_nmi (a bit of a misnomer) to tell us whether the NMI hit
between ->start() and ->stop(), in which case we can safely take the sample.
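
To spell out the resulting contract for ->snapshot_aux() (just a sketch with
made-up foo_* helper names; the real thing is pt_event_snapshot_aux() in the
patch below):

static long foo_pmu_snapshot_aux(struct perf_event *event,
				 struct perf_output_handle *handle,
				 unsigned long size)
{
	long copied;

	/*
	 * Only valid in AUX overwrite ("snapshot") mode, and only if the
	 * NMI landed between ->start() and ->stop().
	 */
	if (!foo_buf_is_snapshot() || !READ_ONCE(foo_pcpu->tracing_on))
		return 0;

	foo_hw_stop();				/* quiesce the HW, touch no SW state */
	copied = foo_copy_newest(handle, size);	/* newest 'size' bytes of the buffer */

	/* Restart only if we actually interrupted active tracing. */
	if (READ_ONCE(foo_pcpu->tracing_on))
		foo_hw_start();

	return copied;
}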

The other problem is sampling SW events, which requires taking ctx->lock to
prevent racing with event_function_call()s from other cpus. That results in
the somewhat cringy "if (!in_nmi()) raw_spin_lock(...)", but I don't have a
better idea of how to handle it.
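
In other words, essentially this around the ->snapshot_aux() call (which is
what perf_pmu_aux_sample_output() in the patch below does):

	unsigned long flags;
	long ret;

	/*
	 * In NMI context we can't take ctx->lock, but then we also can't
	 * race with the IRQ-time ->start()/->stop(); otherwise, take the
	 * lock to keep event_function_call()s from other CPUs away.
	 */
	if (!in_nmi())
		raw_spin_lock_irqsave(&event->ctx->lock, flags);

	ret = event->pmu->snapshot_aux(event, handle, size);

	if (!in_nmi())
		raw_spin_unlock_irqrestore(&event->ctx->lock, flags);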

The whole thing is squashed into one patch below for convenience.
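
For reference, the intended usage from userspace would look roughly like the
following (untested illustration only; the PT PMU type lookup and the mmap of
the AUX area are elided, and the numbers are arbitrary): a PT event in
overwrite mode as group leader, plus a HW event that requests PERF_SAMPLE_AUX
with a non-zero aux_sample_size and is opened with group_fd pointing at the
PT leader:

	struct perf_event_attr pt = {
		.size		= sizeof(pt),
		.type		= intel_pt_pmu_type,	/* from /sys/bus/event_source/devices/intel_pt/type */
	};
	struct perf_event_attr hw = {
		.size		= sizeof(hw),
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_AUX,
		.aux_sample_size = 4096,
	};
	int pt_fd, hw_fd;

	pt_fd = syscall(__NR_perf_event_open, &pt, 0, -1, -1, 0);
	/* ... mmap the PT buffer; map the AUX area read-only for overwrite mode ... */
	hw_fd = syscall(__NR_perf_event_open, &hw, 0, -1, pt_fd, 0);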

From df0b5efd9c24a9e4477a3501331888c4b4682588 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Date: Tue, 12 Jun 2018 10:51:15 +0300
Subject: [RFC v0] perf, pt: Allow using AUX data in perf samples

---
 arch/x86/events/intel/pt.c      |  40 +++++++++
 include/linux/perf_event.h      |  15 ++++
 include/uapi/linux/perf_event.h |   7 +-
 kernel/events/core.c            | 139 +++++++++++++++++++++++++++++++-
 kernel/events/internal.h        |   1 +
 5 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index d58124d93e5f..a7318c29242e 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1413,6 +1413,45 @@ static void pt_event_stop(struct perf_event *event, int mode)
 	}
 }
 
+static long pt_event_snapshot_aux(struct perf_event *event,
+				  struct perf_output_handle *handle,
+				  unsigned long size)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	unsigned long from = 0, to;
+	long ret;
+
+	if (!buf->snapshot)
+		return 0;
+
+	/*
+	 * Here, handle_nmi tells us if the tracing is on
+	 */
+	if (!READ_ONCE(pt->handle_nmi))
+		return 0;
+
+	pt_config_stop(event);
+
+	pt_read_offset(buf);
+	pt_update_head(pt);
+
+	to = local_read(&buf->data_size);
+	if (to < size)
+		from = buf->nr_pages << PAGE_SHIFT;
+	from += to - size;
+
+	ret = perf_output_copy_aux(&pt->handle, handle, from, to);
+
+	/*
+	 * If the tracing was on, restart it.
+	 */
+	if (READ_ONCE(pt->handle_nmi))
+		pt_config(event);
+
+	return ret;
+}
+
 static void pt_event_del(struct perf_event *event, int mode)
 {
 	pt_event_stop(event, PERF_EF_UPDATE);
@@ -1532,6 +1571,7 @@ static __init int pt_init(void)
 	pt_pmu.pmu.del			 = pt_event_del;
 	pt_pmu.pmu.start		 = pt_event_start;
 	pt_pmu.pmu.stop			 = pt_event_stop;
+	pt_pmu.pmu.snapshot_aux		 = pt_event_snapshot_aux;
 	pt_pmu.pmu.read			 = pt_event_read;
 	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
 	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9c8b70e8dc99..48ad35da73cd 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -103,6 +103,10 @@ struct perf_branch_stack {
 	struct perf_branch_entry	entries[0];
 };
 
+struct perf_aux_record {
+	u64		size;
+};
+
 struct task_struct;
 
 /*
@@ -248,6 +252,8 @@ struct perf_event;
 #define PERF_PMU_CAP_NO_EXCLUDE			0x80
 #define PERF_PMU_CAP_AUX_OUTPUT			0x100
 
+struct perf_output_handle;
+
 /**
  * struct pmu - generic performance monitoring unit
  */
@@ -422,6 +428,13 @@ struct pmu {
 	 */
 	void (*free_aux)		(void *aux); /* optional */
 
+	/*
+	 * Fun stuff
+	 */
+	long (*snapshot_aux)		(struct perf_event *event,
+					 struct perf_output_handle *handle,
+					 unsigned long size);
+
 	/*
 	 * Validate address range filters: make sure the HW supports the
 	 * requested configuration and number of filters; return 0 if the
@@ -960,6 +973,7 @@ struct perf_sample_data {
 		u32	reserved;
 	}				cpu_entry;
 	struct perf_callchain_entry	*callchain;
+	struct perf_aux_record		aux;
 
 	/*
 	 * regs_user may point to task_pt_regs or to regs_user_copy, depending
@@ -992,6 +1006,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
 	data->weight = 0;
 	data->data_src.val = PERF_MEM_NA;
 	data->txn = 0;
+	data->aux.size = 0;
 }
 
 extern void perf_output_sample(struct perf_output_handle *handle,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index bb7b271397a6..947f49d46f0d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -141,8 +141,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_REGS_INTR			= 1U << 18,
 	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,
+	PERF_SAMPLE_AUX				= 1U << 20,
 
-	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 21,		/* non-ABI */
 
 	__PERF_SAMPLE_CALLCHAIN_EARLY		= 1ULL << 63, /* non-ABI; internal use */
 };
@@ -300,6 +301,7 @@ enum perf_event_read_format {
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
 #define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6	120	/* add: aux_sample_size */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -425,6 +427,7 @@ struct perf_event_attr {
 	__u32	aux_watermark;
 	__u16	sample_max_stack;
 	__u16	__reserved_2;	/* align to __u64 */
+	__u64	aux_sample_size;
 };
 
 /*
@@ -864,6 +867,8 @@ enum perf_event_type {
 	 *	{ u64			abi; # enum perf_sample_regs_abi
 	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
 	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
+	 *	{ u64			size;
+	 *	  char			data[size]; } && PERF_SAMPLE_AUX
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d72b756b4ccb..9754c1af51a4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1953,7 +1953,10 @@ static int perf_get_aux_event(struct perf_event *event,
 	if (!group_leader)
 		return 0;
 
-	if (!perf_aux_output_match(event, group_leader))
+	if (event->attr.aux_output && !perf_aux_output_match(event, group_leader))
+		return 0;
+
+	if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
 		return 0;
 
 	if (!atomic_long_inc_not_zero(&group_leader->refcount))
@@ -6150,6 +6153,103 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
 	}
 }
 
+static unsigned long perf_aux_sample_size(struct perf_event *event,
+					  struct perf_sample_data *data,
+					  size_t size)
+{
+	struct perf_event *sampler = event->aux_event;
+	struct ring_buffer *rb;
+
+	data->aux.size = 0;
+
+	if (!sampler)
+		goto out;
+
+	if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
+		goto out;
+
+	if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
+		goto out;
+
+	rb = ring_buffer_get(sampler);
+	if (!rb)
+		goto out;
+
+	size = min(size, perf_aux_size(rb));
+	data->aux.size = ALIGN(size, sizeof(u64));
+	ring_buffer_put(rb);
+
+out:
+	return data->aux.size;
+}
+
+int perf_pmu_aux_sample_output(struct perf_event *event,
+			       struct perf_output_handle *handle,
+			       unsigned long size)
+{
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * NMI vs IRQ
+	 *
+	 * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
+	 * paths. If we start calling them in NMI context, they may race with
+	 * the IRQ ones, that is, for example, re-starting an event that's just
+	 * been stopped.
+	 */
+	if (!in_nmi())
+		raw_spin_lock_irqsave(&event->ctx->lock, flags);
+
+	ret = event->pmu->snapshot_aux(event, handle, size);
+
+	if (!in_nmi())
+		raw_spin_unlock_irqrestore(&event->ctx->lock, flags);
+
+	return ret;
+}
+
+static void perf_aux_sample_output(struct perf_event *event,
+				   struct perf_output_handle *handle,
+				   struct perf_sample_data *data)
+{
+	struct perf_event *sampler = event->aux_event;
+	unsigned long pad;
+	struct ring_buffer *rb;
+	int ret;
+
+	if (WARN_ON_ONCE(!sampler || !data->aux.size))
+		return;
+
+	rb = ring_buffer_get(sampler);
+	if (!rb)
+		return;
+
+	if (READ_ONCE(rb->aux_in_sampling))
+		goto out_put;
+
+	WRITE_ONCE(rb->aux_in_sampling, 1);
+
+	ret = perf_pmu_aux_sample_output(sampler, handle, data->aux.size);
+	if (ret < 0) {
+		pr_warn_ratelimited("failed to copy trace data\n");
+		goto out_clear;
+	}
+
+	pad = data->aux.size - ret;
+	if (pad) {
+		u64 p = 0;
+
+		perf_output_copy(handle, &p, pad);
+	}
+
+out_clear:
+	WRITE_ONCE(rb->aux_in_sampling, 0);
+
+out_put:
+	ring_buffer_put(rb);
+}
+
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -6469,6 +6569,13 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		perf_output_put(handle, data->phys_addr);
 
+	if (sample_type & PERF_SAMPLE_AUX) {
+		perf_output_put(handle, data->aux.size);
+
+		if (data->aux.size)
+			perf_aux_sample_output(event, handle, data);
+	}
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -6657,6 +6764,32 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (sample_type & PERF_SAMPLE_AUX) {
+		u64 size;
+
+		header->size += sizeof(u64); /* size */
+
+		/*
+		 * Given the 16bit nature of header::size, an AUX sample can
+		 * easily overflow it, what with all the preceding sample bits.
+		 * Make sure this doesn't happen by using up to U16_MAX bytes
+		 * per sample in total (rounded down to 8 byte boundary).
+		 */
+		size = min_t(size_t, U16_MAX - header->size,
+			     event->attr.aux_sample_size);
+		size = rounddown(size, 8);
+		size = perf_aux_sample_size(event, data, size);
+
+		WARN_ON_ONCE(size + header->size > U16_MAX);
+		header->size += size;
+	}
+	/*
+	 * If you're adding more sample types here, you likely need to do
+	 * something about the overflowing header::size, like repurpose the
+	 * lowest 3 bits of size, which should be always zero at the moment.
+	 */
+	WARN_ON_ONCE(header->size & 7);
 }
 
 static __always_inline int
@@ -11171,9 +11304,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
 	if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
 		goto err_locked;
 
+	if (event->attr.aux_sample_size && !perf_get_aux_event(event, group_leader))
+		goto err_locked;
+
 	/*
 	 * Must be under the same ctx::mutex as perf_install_in_context(),
 	 * because we need to serialize with concurrent event creation.
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 3aef4191798c..747d67f130cb 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -50,6 +50,7 @@ struct ring_buffer {
 	unsigned long			aux_mmap_locked;
 	void				(*free_aux)(void *);
 	refcount_t			aux_refcount;
+	int				aux_in_sampling;
 	void				**aux_pages;
 	void				*aux_priv;
 
-- 
2.20.1
