lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed,  6 Nov 2013 11:41:37 -0700
From:	David Ahern <dsahern@...il.com>
To:	acme@...stprotocols.net, linux-kernel@...r.kernel.org
Cc:	mingo@...nel.org, jolsa@...hat.com,
	David Ahern <dsahern@...il.com>,
	Frederic Weisbecker <fweisbec@...il.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Namhyung Kim <namhyung@...nel.org>,
	Mike Galbraith <efault@....de>,
	Stephane Eranian <eranian@...gle.com>
Subject: [PATCH 4/4] perf record: mmap output file - v3

When recording raw_syscalls for the entire system, e.g.,
    perf record -e raw_syscalls:*,sched:sched_switch -a -- sleep 1

you end up with a self-reinforcing feedback loop, as perf itself calls write()
fairly often and each call generates more events to record. This patch handles
the problem by mmap'ing the file in chunks of 64M at a time and copying events
from the event buffers to the file, avoiding write system calls.

Before (with write syscall):

    perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
    [ perf record: Woken up 0 times to write data ]
    [ perf record: Captured and wrote 81.843 MB /tmp/perf.data (~3575786 samples) ]

After (using mmap):

    perf record -o /tmp/perf.data -e raw_syscalls:*,sched:sched_switch -a -- sleep 1
    [ perf record: Woken up 31 times to write data ]
    [ perf record: Captured and wrote 8.203 MB /tmp/perf.data (~358388 samples) ]

In addition to the perf-trace benefits, using mmap lowers the overhead of
perf-record. For example,

  perf stat -i -- perf record -g -o /tmp/perf.data openssl speed aes

shows that time, CPU cycles, and instructions all drop by more than a
factor of 3. Jiri also ran a test that showed a big improvement.

v3: Removed use of bytes_at_mmap_start and the stat() that set it
    Added user option to control the size of the mmap for writing file.

v2: Removed msync call before munmap per Jiri's suggestion

Signed-off-by: David Ahern <dsahern@...il.com>
Cc: Ingo Molnar <mingo@...nel.org>
Cc: Frederic Weisbecker <fweisbec@...il.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Jiri Olsa <jolsa@...hat.com>
Cc: Namhyung Kim <namhyung@...nel.org>
Cc: Mike Galbraith <efault@....de>
Cc: Stephane Eranian <eranian@...gle.com>

Signed-off-by: David Ahern <dsahern@...il.com>
---
 tools/perf/Documentation/perf-record.txt |  5 ++
 tools/perf/builtin-record.c              | 97 ++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 052f7c4dc00c..5cd305eb1698 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -201,6 +201,11 @@ abort events and some memory events in precise mode on modern Intel CPUs.
 --transaction::
 Record transaction flags for transaction related events.
 
+--out-pages=::
+	Number of pages to mmap at a time while writing data to file (must be a
+	power of two). A size may be given instead by appending a unit character -
+	B/K/M/G; the size is rounded up to the nearest power-of-two number of pages.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 15280b5e5574..3cf563eb7896 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -30,6 +30,9 @@
 #include <sched.h>
 #include <sys/mman.h>
 
+/* default size, in bytes, of each chunk of the output file that is mmap'ed */
+#define MMAP_OUTPUT_SIZE   (64*1024*1024)
+
 #ifndef HAVE_ON_EXIT_SUPPORT
 #ifndef ATEXIT_MAX
 #define ATEXIT_MAX 32
@@ -65,6 +68,14 @@ static void __handle_on_exit_funcs(void)
 struct perf_record {
 	struct perf_tool	tool;
 	struct perf_record_opts	opts;
+
+	/* for MMAP based file writes */
+	void			*mmap_addr;
+	u64			mmap_offset;     /* current location within mmap */
+	unsigned int		mmap_out_pages;  /* user configurable option */
+	size_t			mmap_out_size;   /* size of mmap segments */
+	bool			use_mmap;
+
 	u64			bytes_written;
 	struct perf_data_file	file;
 	struct perf_evlist	*evlist;
@@ -76,10 +87,68 @@ struct perf_record {
 	long			samples;
 };
 
+/*
+ * Append size bytes from buf to the output file through a MAP_SHARED window
+ * of mmap_out_size bytes, mapping the next window when the current one is
+ * full.  Returns 0 on success, or -1 on failure, leaving mmap_addr NULL.
+ */
+static int do_mmap_output(struct perf_record *rec, void *buf, size_t size)
+{
+	struct perf_data_file *file = &rec->file;
+	u64 remaining;
+	off_t offset;
+
+	if (rec->mmap_addr == NULL) {
+do_mmap:
+		offset = rec->session->header.data_offset + rec->bytes_written;
+		if (offset < (ssize_t) rec->mmap_out_size) {
+			rec->mmap_offset = offset;
+			offset = 0;
+		} else
+			rec->mmap_offset = 0;
+		/* extend file to include a new mmap segment */
+		if (ftruncate(file->fd, offset + rec->mmap_out_size) != 0) {
+			pr_err("ftruncate failed\n");
+			return -1;
+		}
+		rec->mmap_addr = mmap(NULL, rec->mmap_out_size,
+				      PROT_WRITE | PROT_READ, MAP_SHARED,
+				      file->fd, offset);
+		if (rec->mmap_addr == MAP_FAILED) {
+			pr_err("mmap failed: %d: %s\n", errno, strerror(errno));
+			rec->mmap_addr = NULL;
+			/* shrink back to the data written so far */
+			ftruncate(file->fd, offset + rec->mmap_offset);
+			return -1;
+		}
+	}
+
+	remaining = rec->mmap_out_size - rec->mmap_offset;
+	if (size > remaining) {
+		/* fill the rest of this window, then map the next one */
+		memcpy(rec->mmap_addr + rec->mmap_offset, buf, remaining);
+		rec->bytes_written += remaining;
+		size -= remaining;
+		buf  += remaining;
+		munmap(rec->mmap_addr, rec->mmap_out_size);
+		rec->mmap_addr = NULL;
+		goto do_mmap;
+	}
+	if (size) {
+		memcpy(rec->mmap_addr + rec->mmap_offset, buf, size);
+		rec->bytes_written += size;
+		rec->mmap_offset += size;
+	}
+	return 0;
+}
+
 static int write_output(struct perf_record *rec, void *buf, size_t size)
 {
 	struct perf_data_file *file = &rec->file;
 
+	if (rec->use_mmap)
+		return do_mmap_output(rec, buf, size);
+
 	while (size) {
 		int ret = write(file->fd, buf, size);
 
@@ -429,6 +498,12 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 		goto out_delete_session;
 	}
 
+	if (!file->is_pipe && rec->mmap_out_size) {
+		if (rec->mmap_out_pages)
+			rec->mmap_out_size = rec->mmap_out_pages * page_size;
+		rec->use_mmap = true;
+	}
+
 	machine = &session->machines.host;
 
 	if (file->is_pipe) {
@@ -544,6 +619,24 @@ static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 		}
 	}
 
+	if (rec->use_mmap) {
+		off_t len = rec->session->header.data_offset + rec->bytes_written;
+		int fd = rec->file.fd;
+
+		rec->use_mmap = false;
+		munmap(rec->mmap_addr, rec->mmap_out_size);
+		rec->mmap_addr = NULL;
+
+		if (ftruncate(fd, len) != 0)
+			pr_err("ftruncate failed\n");
+
+		/*
+		 * Set output pointer to end of file
+		 * eg., needed for buildid processing
+		 */
+		lseek(fd, len, SEEK_SET);
+	}
+
 	if (quiet || signr == SIGUSR1)
 		return 0;
 
@@ -805,6 +898,7 @@ static struct perf_record record = {
 			.uses_mmap   = true,
 		},
 	},
+	.mmap_out_size = MMAP_OUTPUT_SIZE,
 };
 
 #define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "
@@ -891,6 +985,9 @@ const struct option record_options[] = {
 		    "sample by weight (on special events only)"),
 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
 		    "sample transaction flags (special events only)"),
+	OPT_CALLBACK(0, "out-pages", &record.mmap_out_pages, "pages",
+		     "number of pages to use for output chunks.",
+		     perf_evlist__parse_mmap_pages),
 	OPT_END()
 };
 
-- 
1.8.3.4 (Apple Git-47)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists