lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Tue, 24 Nov 2015 22:00:32 +0800
From:	Yunlong Song <yunlong.song@...wei.com>
To:	<a.p.zijlstra@...llo.nl>, <paulus@...ba.org>, <mingo@...hat.com>,
	<acme@...nel.org>
CC:	<linux-kernel@...r.kernel.org>, <wangnan0@...wei.com>,
	<namhyung@...nel.org>, <ast@...nel.org>,
	<masami.hiramatsu.pt@...achi.com>, <kan.liang@...el.com>,
	<adrian.hunter@...el.com>, <jolsa@...nel.org>, <dsahern@...il.com>,
	<bp@...en8.de>, <jean.pihet@...aro.org>, <rric@...nel.org>,
	<xiakaixu@...wei.com>, <hekuang@...wei.com>
Subject: [PATCH] perf record: Add snapshot mode support for perf's regular events

For aux area tracing, there is already a snapshot mode support for
intel-pt and intel-bts events. Similarly, this patch adds a snapshot
mode for perf's regular events. A user space ring buffer is allocated to
handle the tracing data from the kernel space ring buffer, and the
tracing data will only dump to perf.data when perf receives a SIGUSR2
signal.

Similarly like '-S' in aux trace snapshot mode, '-M' enables perf's
regular event's snapshot mode by defining the size (bytes) of the user
space ring buffer.

Example 1:

 $ perf record -a -M 10000000
 /*
  * Let perf record runs for some time before finally ends, and do not
  * send any SIGUSR2 signal to perf during perf's running.
  */

 $ perf report

 Error:
 The perf.data file has no samples!
 # To display the perf.data header info, please use --header/--header-only options.

As shown above, without any SIGUSR2 signal, perf record will dump no samples
to perf.data in the snapshot mode.

Example 2:

 $ perf record -a -M 10000000
 /*
  * Let perf record runs for some time before finally ends, and send
  * several times of SIGUSR2 signal to perf during perf's running.
  */

 # kill -SIGUSR2 `pidof perf`
 ...
 # kill -SIGUSR2 `pidof perf`

 $ perf report
<SNIP>
 # Total Lost Samples: 0
 #
 # Samples: 942  of event 'cycles:pp'
 # Event count (approx.): 175168972
 #
 # Overhead  Command          Shared Object            Symbol
 # ........  ...............  .......................  .........................................
 #
      8.20%  kworker/2:0      [kernel.kallsyms]        [k] default_send_IPI_mask_allbutself_phys
      6.33%  swapper          [kernel.kallsyms]        [k] intel_idle
      2.64%  pidof            [kernel.kallsyms]        [k] arch_get_unmapped_area_topdown
      2.56%  pidof            [kernel.kallsyms]        [k] unmap_region
      2.26%  pidof            [kernel.kallsyms]        [k] memcpy
      2.26%  pidof            libc-2.19.so             [.] _IO_vfscanf
      2.03%  pidof            [kernel.kallsyms]        [k] lookup_fast
      1.72%  pidof            [kernel.kallsyms]        [k] filp_close
      1.62%  pidof            [kernel.kallsyms]        [k] apparmor_file_open
      1.56%  pidof            [kernel.kallsyms]        [k] process_measurement
      1.50%  pidof            [kernel.kallsyms]        [k] find_vma
<SNIP>

As shown above, perf record will dump samples to perf.data every time
it receives a SIGUSR2 signal in the snapshot mode.

Signed-off-by: Yunlong Song <yunlong.song@...wei.com>
---
 tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 170 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 199fc31..75606a6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,16 @@
 #include <sched.h>
 #include <sys/mman.h>
 
+static volatile int memory_enabled;
+static volatile int memory_signalled;
+/* The maximum size of one perf_event is 65536*/
+#define MEMORY_SIZE_MIN 65537
+
+struct memory {
+	void *start;
+	u64 head, tail;
+	u64 size;
+};
 
 struct record {
 	struct perf_tool	tool;
@@ -51,16 +61,134 @@ struct record {
 	bool			no_buildid;
 	bool			no_buildid_cache;
 	unsigned long long	samples;
+	struct memory memory;
 };
 
-static int record__write(struct record *rec, void *bf, size_t size)
+static int buf_to_file(struct record *rec, void *buf,
+				size_t size, u64 head, u64 tail)
 {
-	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
-		pr_err("failed to write perf data, error: %m\n");
+	size_t written = 0;
+
+	if (head < tail) {
+		if (perf_data_file__write(rec->session->file,
+					buf + head, tail - head) < 0)
+			goto out;
+		written += tail - head;
+	} else if (head > tail) {
+		if (perf_data_file__write(rec->session->file,
+					buf + head, size - head) < 0)
+			goto out;
+		written += size - head;
+
+		if (perf_data_file__write(rec->session->file, buf, tail) < 0)
+			goto out;
+		written += tail;
+	}
+
+	rec->bytes_written += written;
+	return 0;
+out:
+	pr_err("failed to write perf data, error: %m\n");
+	return -1;
+}
+
+static int memory_to_file(struct record *rec)
+{
+	if (buf_to_file(rec, rec->memory.start, rec->memory.size,
+			rec->memory.head, rec->memory.tail) < 0)
 		return -1;
+	rec->memory.head = rec->memory.tail;
+
+	return 0;
+}
+
+static ssize_t perf_memory__write(struct memory *memory, void *buf, size_t size)
+{
+	void *buf_start = buf;
+	size_t left = size, written, delta, skip;
+	union perf_event *event;
+	struct perf_event_header hdr;
+	struct record *rec = container_of(memory, struct record, memory);
+
+	while (left) {
+		skip = 0;
+		written = min(left, memory->size - memory->tail);
+		if (memory->head > memory->tail)
+			delta = memory->head - memory->tail;
+		else
+			delta = memory->size - memory->tail + memory->head;
+		if (delta <= written) {
+			do {
+				if ((memory->head + skip) <= (memory->size -
+						sizeof(struct perf_event_header)))
+					event = (union perf_event *)(memory->start +
+							memory->head + skip);
+				else {
+					size_t hdr_left;
+
+					hdr_left = sizeof(struct perf_event_header) -
+								memory->size + memory->head + skip;
+					memcpy(&hdr, memory->start + memory->head + skip,
+						sizeof(struct perf_event_header) - hdr_left);
+
+					if (hdr_left <= memory->tail)
+						memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+								hdr_left, memory->start, hdr_left);
+					else if (!memory->tail)
+						memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+								hdr_left, buf, hdr_left);
+					else {
+						memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+								hdr_left, memory->start, memory->tail);
+						hdr_left -= memory->tail;
+						memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+								hdr_left, buf, hdr_left);
+					}
+
+					event = (union perf_event *)&hdr;
+					if (rec->session->header.needs_swap)
+						perf_event_header__bswap(&event->header);
+				}
+
+				if (event->header.type != PERF_RECORD_SAMPLE) {
+					if (buf_to_file(rec, memory->start, memory->size,
+						memory->head + skip, (memory->head + skip +
+						event->header.size) % memory->size) < 0)
+						return -1;
+				}
+
+				skip += event->header.size;
+			} while (skip <= written - delta);
+		}
+
+		memcpy(memory->start + memory->tail, buf, written);
+
+		memory->head = (memory->head + skip) % memory->size;
+		memory->tail = (memory->tail + written) % memory->size;
+
+		left -= written;
+		buf  += written;
+	}
+
+	BUG_ON((size_t)(buf - buf_start) != size);
+	return size;
+}
+
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+	if (rec->memory.size && memory_enabled) {
+		if (perf_memory__write(&rec->memory, bf, size) < 0) {
+			pr_err("failed to write memory data, error: %m\n");
+			return -1;
+		}
+	} else {
+		if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+			pr_err("failed to write perf data, error: %m\n");
+			return -1;
+		}
+		rec->bytes_written += size;
 	}
 
-	rec->bytes_written += size;
 	return 0;
 }
 
@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
 	if (old == head)
 		return 0;
 
+	memory_enabled = 1;
+
 	rec->samples++;
 
 	size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
 	md->prev = old;
 	perf_evlist__mmap_consume(rec->evlist, idx);
 out:
+	memory_enabled = 0;
 	return rc;
 }
 
@@ -426,8 +557,11 @@ static int record__mmap_read_all(struct record *rec)
 	 * Mark the round finished in case we wrote
 	 * at least one event.
 	 */
-	if (bytes_written != rec->bytes_written)
+	if (bytes_written != rec->bytes_written) {
+		memory_enabled = 1;
 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
+		memory_enabled = 0;
+	}
 
 out:
 	return rc;
@@ -492,7 +626,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 	signal(SIGTERM, sig_handler);
-	if (rec->opts.auxtrace_snapshot_mode)
+	if (rec->opts.auxtrace_snapshot_mode || rec->memory.size)
 		signal(SIGUSR2, snapshot_sig_handler);
 	else
 		signal(SIGUSR2, SIG_IGN);
@@ -687,6 +821,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 			}
 		}
 
+		if (memory_signalled) {
+			memory_signalled = 0;
+			if (memory_to_file(rec) < 0) {
+				err = -1;
+				goto out_child;
+			}
+		}
+
 		if (hits == rec->samples) {
 			if (done || draining)
 				break;
@@ -1009,6 +1151,12 @@ static struct record record = {
 		.mmap2		= perf_event__process_mmap2,
 		.ordered_events	= true,
 	},
+	.memory = {
+		.start = NULL,
+		.head = 0,
+		.tail = 0,
+		.size = 0,
+	},
 };
 
 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
@@ -1119,6 +1267,7 @@ struct option __record_options[] = {
 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
 		   "options passed to clang when compiling BPF scriptlets"),
 #endif
+	OPT_U64('M', "memory", &record.memory.size, "user space ring buffer memory size (bytes)"),
 	OPT_END()
 };
 
@@ -1220,19 +1369,29 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
 		goto out_symbol_exit;
 	}
 
+	if (rec->memory.size) {
+		if (rec->memory.size < MEMORY_SIZE_MIN)
+			rec->memory.size = MEMORY_SIZE_MIN;
+		rec->memory.start = malloc(rec->memory.size);
+	}
+
 	err = __cmd_record(&record, argc, argv);
 out_symbol_exit:
 	perf_evlist__delete(rec->evlist);
 	symbol__exit();
 	auxtrace_record__free(rec->itr);
+	if (rec->memory.size)
+		free(rec->memory.start);
 	return err;
 }
 
 static void snapshot_sig_handler(int sig __maybe_unused)
 {
-	if (!auxtrace_snapshot_enabled)
-		return;
-	auxtrace_snapshot_enabled = 0;
-	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
-	auxtrace_record__snapshot_started = 1;
+	if (record.opts.auxtrace_snapshot_mode && auxtrace_snapshot_enabled) {
+		auxtrace_snapshot_enabled = 0;
+		auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+		auxtrace_record__snapshot_started = 1;
+	}
+	if (record.memory.size && !memory_signalled)
+		memory_signalled = 1;
 }
-- 
1.8.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ