linux-kernel - Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20181005071615.GC3768@sejong>
Date:   Fri, 5 Oct 2018 16:16:15 +0900
From:   Namhyung Kim <namhyung@...nel.org>
To:     Alexey Budankov <alexey.budankov@...ux.intel.com>
Cc:     Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...nel.org>,
        Arnaldo Carvalho de Melo <acme@...nel.org>,
        Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
        Jiri Olsa <jolsa@...hat.com>, Andi Kleen <ak@...ux.intel.com>,
        linux-kernel <linux-kernel@...r.kernel.org>, kernel-team@....com
Subject: Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

On Wed, Oct 03, 2018 at 07:12:10PM +0300, Alexey Budankov wrote:
> 
> Trace file offset is calculated and updated linearly prior
> enqueuing aio write at record__pushfn().
> 
> record__aio_sync() blocks till completion of started AIO operation 
> and then proceeds.
> 
> record__mmap_read_sync() implements a barrier for all incomplete
> aio write requests.
> 
> Signed-off-by: Alexey Budankov <alexey.budankov@...ux.intel.com>
> ---
>  Changes in v9:
>  - enable AIO streaming only when --aio-cblocks option is specified explicitly
>  Changes in v8:
>  - split AIO completion check into separate record__aio_complete()
>  Changes in v6:
>  - handled errno == EAGAIN case from aio_write();
>  Changes in v5:
>  - data loss metrics decreased from 25% to 2x in trialed configuration;
>  - avoided nanosleep() prior calling aio_suspend();
>  - switched to per cpu multi record__aio_sync() aio
>  - record_mmap_read_sync() now does global barrier just before 
>    switching trace file or collection stop;
>  - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=100000
>  Changes in v4:
>  - converted void *bf to struct perf_mmap *md in signatures
>  - written comment in perf_mmap__push() just before perf_mmap__get();
>  - written comment in record__mmap_read_sync() on possible restarting 
>    of aio_write() operation and releasing perf_mmap object after all;
>  - added perf_mmap__put() for the cases of failed aio_write();
>  Changes in v3:
>  - written comments about nanosleep(0.5ms) call prior aio_suspend()
>    to cope with intrusiveness of its implementation in glibc;
>  - written comments about rationale behind coping profiling data 
>    into mmap->data buffer;
> ---
>  tools/perf/builtin-record.c | 154 +++++++++++++++++++++++++++++++++++++++++++-
>  tools/perf/perf.h           |   3 +
>  tools/perf/util/mmap.c      |  73 +++++++++++++++++++++
>  tools/perf/util/mmap.h      |   4 ++
>  4 files changed, 233 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 0980dfe3396b..c63fe8c021a2 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -124,6 +124,118 @@ static int record__write(struct record *rec, struct perf_mmap *map __maybe_unuse
>  	return 0;
>  }
>  
> +#ifdef HAVE_AIO_SUPPORT
> +static int record__aio_write(struct aiocb *cblock, int trace_fd,
> +		void *buf, size_t size, off_t off)
> +{
> +	int rc;
> +
> +	cblock->aio_fildes = trace_fd;
> +	cblock->aio_buf    = buf;
> +	cblock->aio_nbytes = size;
> +	cblock->aio_offset = off;
> +	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
> +
> +	do {
> +		rc = aio_write(cblock);
> +		if (rc == 0) {
> +			break;
> +		} else if (errno != EAGAIN) {
> +			cblock->aio_fildes = -1;
> +			pr_err("failed to queue perf data, error: %m\n");
> +			break;
> +		}
> +	} while (1);
> +
> +	return rc;
> +}
> +
> +static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
> +{
> +	void *rem_buf;
> +	off_t rem_off;
> +	size_t rem_size;
> +	int rc, aio_errno;
> +	ssize_t aio_ret, written;
> +
> +	aio_errno = aio_error(cblock);
> +	if (aio_errno == EINPROGRESS)
> +		return 0;
> +
> +	written = aio_ret = aio_return(cblock);
> +	if (aio_ret < 0) {
> +		if (aio_errno != EINTR)
> +			pr_err("failed to write perf data, error: %m\n");
> +		written = 0;
> +	}
> +
> +	rem_size = cblock->aio_nbytes - written;
> +
> +	if (rem_size == 0) {
> +		cblock->aio_fildes = -1;
> +		/*
> +		 * md->refcount is incremented in perf_mmap__push() for
> +		 * every enqueued aio write request so decrement it because
> +		 * the request is now complete.
> +		 */
> +		perf_mmap__put(md);
> +		rc = 1;
> +	} else {
> +		/*
> +		 * aio write request may require restart with the
> +		 * reminder if the kernel didn't write whole
> +		 * chunk at once.
> +		 */
> +		rem_off = cblock->aio_offset + written;
> +		rem_buf = (void *)(cblock->aio_buf + written);
> +		record__aio_write(cblock, cblock->aio_fildes,
> +				rem_buf, rem_size, rem_off);
> +		rc = 0;
> +	}
> +
> +	return rc;
> +}
> +
> +static void record__aio_sync(struct perf_mmap *md)
> +{
> +	struct aiocb *cblock = &md->cblock;
> +	struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
> +
> +	do {
> +		if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
> +			return;
> +
> +		while (aio_suspend((const struct aiocb**)&cblock, 1, &timeout)) {
> +			if (!(errno == EAGAIN || errno == EINTR))
> +				pr_err("failed to sync perf data, error: %m\n");

Is there somthing we can do in this error case?  Any chance it gets
stuck in the loop?


> +		}
> +	} while (1);
> +}
> +
> +static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, size_t size)
> +{
> +	off_t off;
> +	struct record *rec = to;
> +	int ret, trace_fd = rec->session->data->file.fd;
> +
> +	rec->samples++;
> +
> +	off = lseek(trace_fd, 0, SEEK_CUR);
> +	lseek(trace_fd, off + size, SEEK_SET);

It'd be nice if these lseek() could be removed and use
rec->bytes_written instead.


> +	ret = record__aio_write(cblock, trace_fd, bf, size, off);
> +	if (!ret) {
> +		rec->bytes_written += size;
> +
> +		if (switch_output_size(rec))
> +			trigger_hit(&switch_output_trigger);

Doesn't it need the _sync() before the trigger?  Maybe it should be
moved to record__mmap_read_evlist() or so..


> +	} else {
> +		lseek(trace_fd, off, SEEK_SET);
> +	}
> +
> +	return ret;
> +}
> +#endif
> +
>  static int process_synthesized_event(struct perf_tool *tool,
>  				     union perf_event *event,
>  				     struct perf_sample *sample __maybe_unused,
> @@ -136,7 +248,6 @@ static int process_synthesized_event(struct perf_tool *tool,
>  static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
>  {
>  	struct record *rec = to;
> -
>  	rec->samples++;
>  	return record__write(rec, map, bf, size);
>  }
> @@ -513,6 +624,24 @@ static struct perf_event_header finished_round_event = {
>  	.type = PERF_RECORD_FINISHED_ROUND,
>  };
>  
> +static void record__mmap_read_sync(struct record *rec __maybe_unused)
> +{
> +#ifdef HAVE_AIO_SUPPORT
> +	int i;
> +	struct perf_evlist *evlist = rec->evlist;
> +	struct perf_mmap *maps = evlist->mmap;
> +
> +	if (!rec->opts.nr_cblocks)
> +		return;
> +
> +	for (i = 0; i < evlist->nr_mmaps; i++) {
> +		struct perf_mmap *map = &maps[i];
> +		if (map->base)
> +			record__aio_sync(map);
> +	}
> +#endif
> +}
> +
>  static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
>  				    bool overwrite)
>  {
> @@ -535,10 +664,29 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
>  		struct perf_mmap *map = &maps[i];
>  
>  		if (map->base) {
> +#ifndef HAVE_AIO_SUPPORT
>  			if (perf_mmap__push(map, rec, record__pushfn) != 0) {
>  				rc = -1;
>  				goto out;
>  			}
> +#else
> +			if (!rec->opts.nr_cblocks) {
> +				if (perf_mmap__push(map, rec, record__pushfn) != 0) {
> +					rc = -1;
> +					goto out;
> +				}
> +			} else {
> +				/*
> +				 * Call record__aio_sync() to wait till map->data buffer
> +				 * becomes available after previous aio write request.
> +				 */
> +				record__aio_sync(map);
> +				if (perf_mmap__aio_push(map, rec, record__aio_pushfn) != 0) {
> +					rc = -1;
> +					goto out;
> +				}
> +			}
> +#endif

If dummy aio functions are provided, the #ifdef can be removed and
just use the #else part assuming opts.nr_cblocks == 0.

Thanks,
Namhyung


>  		}
>  
>  		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
> @@ -1057,6 +1205,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
>  
>  		if (record__mmap_read_all(rec) < 0) {
> +			record__mmap_read_sync(rec);
>  			trigger_error(&auxtrace_snapshot_trigger);
>  			trigger_error(&switch_output_trigger);
>  			err = -1;
> @@ -1068,6 +1217,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  			if (!trigger_is_error(&auxtrace_snapshot_trigger))
>  				record__read_auxtrace_snapshot(rec);
>  			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
> +				record__mmap_read_sync(rec);
>  				pr_err("AUX area tracing snapshot failed\n");
>  				err = -1;
>  				goto out_child;
> @@ -1086,6 +1236,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  			 */
>  			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
>  				continue;
> +			record__mmap_read_sync(rec);
>  			trigger_ready(&switch_output_trigger);
>  
>  			/*
> @@ -1139,6 +1290,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
>  			disabled = true;
>  		}
>  	}
> +	record__mmap_read_sync(rec);
>  	trigger_off(&auxtrace_snapshot_trigger);
>  	trigger_off(&switch_output_trigger);