linux-kernel - RE: [PATCH 3/4] perf record: event synthesization multithreading support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <37D7C6CF3E00A74B8858931C1DB2F077537D4264@SHSMSX103.ccr.corp.intel.com>
Date:   Fri, 13 Oct 2017 14:58:40 +0000
From:   "Liang, Kan" <kan.liang@...el.com>
To:     Arnaldo Carvalho de Melo <acme@...nel.org>
CC:     "peterz@...radead.org" <peterz@...radead.org>,
        "mingo@...hat.com" <mingo@...hat.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "jolsa@...nel.org" <jolsa@...nel.org>,
        "wangnan0@...wei.com" <wangnan0@...wei.com>,
        "hekuang@...wei.com" <hekuang@...wei.com>,
        "namhyung@...nel.org" <namhyung@...nel.org>,
        "alexander.shishkin@...ux.intel.com" 
        <alexander.shishkin@...ux.intel.com>,
        "Hunter, Adrian" <adrian.hunter@...el.com>,
        "ak@...ux.intel.com" <ak@...ux.intel.com>
Subject: RE: [PATCH 3/4] perf record: event synthesization multithreading
 support

> Em Fri, Oct 13, 2017 at 07:09:26AM -0700, kan.liang@...el.com escreveu:
> > From: Kan Liang <Kan.liang@...el.com>
> >
> > The process function process_synthesized_event writes the process
> > result to perf.data, which is not multithreading friendly.
> >
> > Realloc buffer for each thread to temporarily keep the processing
> > result. Write them to the perf.data at the end of event synthesization.
> > The new method doesn't impact the final result, because
> >  - The order of the synthesized events is not important.
> >  - The number of synthesized events is limited. Usually, only
> >    hundreds of kilobytes are needed to store all of them.
> >    It is unlikely to fail because of lack of memory.
> 
> Why not write to a per cpu file and then at the end merge them? 

I just thought merging from memory should be faster than merging from files.

> Leave
> the memory management to the kernel, i.e. in most cases you may not even
> end up touching the disk, when memory is plentiful, just rewind the per
> event files and go on dumping to the main perf.data file.
>

Agreed. I will do it.

> At some point we may just not do this merging, and keep per cpu files
> all the way to perf report, etc. This would be a first foray into
> that...
>

Yes, but it's a little bit complex.
I think I will do it in a separate improvement patch series later.
For now, I will do the file merge.

Thanks,
Kan
 
> - Arnaldo
> 
> > The number of threads is hard-coded to the number of online CPUs. The
> > following patch will introduce an option to set it.
> >
> > The multithreading synthesize is only available for per cpu monitoring.
> >
> > Signed-off-by: Kan Liang <Kan.liang@...el.com>
> > ---
> >  tools/perf/builtin-record.c | 86
> ++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 81 insertions(+), 5 deletions(-)
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index 4ede9bf..1978021 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -62,6 +62,11 @@ struct switch_output {
> >  	bool		 set;
> >  };
> >
> > +struct synthesize_buf {
> > +	void		*entry;
> > +	size_t		size;
> > +};
> > +
> >  struct record {
> >  	struct perf_tool	tool;
> >  	struct record_opts	opts;
> > @@ -80,6 +85,7 @@ struct record {
> >  	bool			timestamp_filename;
> >  	struct switch_output	switch_output;
> >  	unsigned long long	samples;
> > +	struct synthesize_buf	*buf;
> >  };
> >
> >  static volatile int auxtrace_record__snapshot_started;
> > @@ -120,14 +126,35 @@ static int record__write(struct record *rec, void
> *bf, size_t size)
> >  	return 0;
> >  }
> >
> > +static int buf__write(struct record *rec, int idx, void *bf, size_t size)
> > +{
> > +	size_t target_size = rec->buf[idx].size;
> > +	void *target;
> > +
> > +	target = realloc(rec->buf[idx].entry, target_size + size);
> > +	if (target == NULL)
> > +		return -ENOMEM;
> > +
> > +	memcpy(target + target_size, bf, size);
> > +
> > +	rec->buf[idx].entry = target;
> > +	rec->buf[idx].size += size;
> > +
> > +	return 0;
> > +}
> > +
> >  static int process_synthesized_event(struct perf_tool *tool,
> >  				     union perf_event *event,
> >  				     struct perf_sample *sample
> __maybe_unused,
> >  				     struct machine *machine
> __maybe_unused,
> > -				     struct thread_info *thread
> __maybe_unused)
> > +				     struct thread_info *thread)
> >  {
> >  	struct record *rec = container_of(tool, struct record, tool);
> > -	return record__write(rec, event, event->header.size);
> > +
> > +	if (!perf_singlethreaded && thread)
> > +		return buf__write(rec, thread->idx, event, event-
> >header.size);
> > +	else
> > +		return record__write(rec, event, event->header.size);
> >  }
> >
> >  static int record__pushfn(void *to, void *bf, size_t size)
> > @@ -690,6 +717,48 @@ static const struct perf_event_mmap_page
> *record__pick_pc(struct record *rec)
> >  	return NULL;
> >  }
> >
> > +static int record__multithread_synthesize(struct record *rec,
> > +					  struct machine *machine,
> > +					  struct perf_tool *tool,
> > +					  struct record_opts *opts)
> > +{
> > +	int i, err, nr_thread = sysconf(_SC_NPROCESSORS_ONLN);
> > +
> > +	if (nr_thread > 1) {
> > +		perf_set_multithreaded();
> > +
> > +		rec->buf = calloc(nr_thread, sizeof(struct synthesize_buf));
> > +		if (rec->buf == NULL) {
> > +			pr_err("Could not do multithread synthesize\n");
> > +			nr_thread = 1;
> > +			perf_set_singlethreaded();
> > +		}
> > +	}
> > +
> > +	err = __machine__synthesize_threads(machine, tool, &opts->target,
> > +					    rec->evlist->threads,
> > +					    process_synthesized_event,
> > +					    opts->sample_address,
> > +					    opts->proc_map_timeout,
> nr_thread);
> > +	if (err < 0)
> > +		goto free;
> > +
> > +	if (nr_thread > 1) {
> > +		for (i = 0; i < nr_thread; i++)
> > +			record__write(rec, rec->buf[i].entry, rec->buf[i].size);
> > +	}
> > +
> > +free:
> > +	if (nr_thread > 1) {
> > +		for (i = 0; i < nr_thread; i++)
> > +			free(rec->buf[i].entry);
> > +		free(rec->buf);
> > +		perf_set_singlethreaded();
> > +	}
> > +
> > +	return err;
> > +}
> > +
> >  static int record__synthesize(struct record *rec, bool tail)
> >  {
> >  	struct perf_session *session = rec->session;
> > @@ -766,9 +835,16 @@ static int record__synthesize(struct record *rec,
> bool tail)
> >  					 perf_event__synthesize_guest_os,
> tool);
> >  	}
> >
> > -	err = __machine__synthesize_threads(machine, tool, &opts->target,
> rec->evlist->threads,
> > -					    process_synthesized_event, opts-
> >sample_address,
> > -					    opts->proc_map_timeout, 1);
> > +	/* multithreading synthesize is only available for cpu monitoring */
> > +	if (target__has_cpu(&opts->target))
> > +		err = record__multithread_synthesize(rec, machine, tool,
> opts);
> > +	else
> > +		err = __machine__synthesize_threads(machine, tool,
> > +						    &opts->target,
> > +						    rec->evlist->threads,
> > +						    process_synthesized_event,
> > +						    opts->sample_address,
> > +						    opts->proc_map_timeout,
> 1);
> >  out:
> >  	return err;
> >  }
> > --
> > 2.7.4