[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160427214722.GU11033@kernel.org>
Date: Wed, 27 Apr 2016 18:47:22 -0300
From: Arnaldo Carvalho de Melo <acme@...nel.org>
To: Ravi Bangoria <ravi.bangoria@...ux.vnet.ibm.com>
Cc: linux-kernel@...r.kernel.org, hemant@...ux.vnet.ibm.com,
naveen.n.rao@...ux.vnet.ibm.com
Subject: Re: [RFC 1/4] perf kvm: Enable 'record' on powerpc
Em Wed, Apr 27, 2016 at 06:02:21PM +0530, Ravi Bangoria escreveu:
> Hi Arnaldo,
>
> I've worked on your patch. I'm sending this patch(diff) to check if this
> is the same idea you want to progress with. I cleanup your patch,
> removed arch specific compile time directives and changed code to
> enable cross arch reporting. I tested record on powerpc and report
> on x86 and it's working.
>
> Please give suggestion about your approach. Let me know if you have
> some other idea to progress with.
>
> Here is the diff w.r.t perf/cpumode branch:
>
> diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
> index bff6664..83ef6c6 100644
> --- a/tools/perf/builtin-kvm.c
> +++ b/tools/perf/builtin-kvm.c
> @@ -1480,6 +1480,60 @@ perf_stat:
> }
> #endif /* HAVE_KVM_STAT_SUPPORT */
>
> +#define PPC_HV_DECREMENTER 2432
> +#define PPC_HV_BIT 3
> +#define PPC_PR_BIT 49
> +#define PPC_MAX 63
> +
> +static bool perf_sample__is_hv_dec_trap(struct perf_sample *sample, struct
> perf_evsel *evsel)
> +{
> + int trap = perf_evsel__intval(evsel, sample, "trap");
> + return trap == PPC_HV_DECREMENTER;
> +}
> +
> +static void perf_kvm__munge_ppc_guest_sample(struct perf_evsel *evsel,
> struct perf_sample *sample)
> +{
> + unsigned long msr, hv, pr;
> +
> + if (!perf_sample__is_hv_dec_trap(sample, evsel))
> + return;
> +
> + sample->ip = perf_evsel__intval(evsel, sample, "pc");
> + sample->cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
> +
> + msr = perf_evsel__intval(evsel, sample, "msr");
> + hv = msr & ((unsigned long)1 << (PPC_MAX - PPC_HV_BIT));
> + pr = msr & ((unsigned long)1 << (PPC_MAX - PPC_PR_BIT));
> + if (!hv && pr)
> + sample->cpumode = PERF_RECORD_MISC_GUEST_USER;
> +}
> +
> +static bool perf_evlist__recorded_on_ppc(const struct perf_evlist *evlist)
> +{
> + if (evlist->env && evlist->env->arch) {
> + return !strcmp(evlist->env->arch, "ppc64") ||
> + !strcmp(evlist->env->arch, "ppc64le");
> + }
> + return false;
> +}
> +
> +int perf_kvm__setup_munge_ppc_guest_event(struct perf_evlist *evlist)
> +{
> + struct perf_evsel *evsel;
> + const char name[] = "kvm_hv:kvm_guest_exit";
> +
> + if (!perf_evlist__recorded_on_ppc(evlist))
> + return 0;
> +
> + evsel = perf_evlist__find_tracepoint_by_name(evlist, name);
> + if (evsel == NULL)
> + return -1;
> +
> + evsel->munge_sample = perf_kvm__munge_ppc_guest_sample;
> +
> + return 0;
> +}
> +
> static int __cmd_record(const char *file_name, int argc, const char **argv)
> {
> int rec_argc, i = 0, j;
> diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> index ab47273..7cb41f7 100644
> --- a/tools/perf/builtin-report.c
> +++ b/tools/perf/builtin-report.c
> @@ -879,6 +879,12 @@ repeat:
> if (session == NULL)
> return -1;
>
> + if (perf_guest &&
> + perf_kvm__setup_munge_ppc_guest_event(session->evlist)) {
> + pr_err("PPC event not present in %s file\n", input_name);
> + goto error;
> + }
This looks out of place, i.e. this reads: "For all cases where there is
a guest and we can't setup the ppc KVM guest related stuff, its an
error"
I think this gets clearer as:
if (perf_guest && perf_evlist__recorded_on_ppc(evlist) &&
perf_kvm__setup_munge_ppc_guest_event(session->evlist)) {
pr_err("PPC event not present in %s file\n", input_name);
goto error;
}
Then we read this as "Hey, if this was recorded on ppc, try to set
things up for ppc", but then again, what is this KVM stuff doing in the
generic 'perf report' code?
What if this is a perf.data file generated on PPC but being read on PPC?
This will not make sense to munge it, right?
This is with what I remember from this case, please bear with me.
- Arnaldo
> +
> if (report.queue_size) {
> ordered_events__set_alloc_size(&session->ordered_events,
> report.queue_size);
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 738ce22..1665171 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -216,6 +216,7 @@ void perf_evsel__init(struct perf_evsel *evsel,
> evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
> perf_evsel__calc_id_pos(evsel);
> evsel->cmdline_group_boundary = false;
> + evsel->munge_sample = NULL;
> }
>
> struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int
> idx)
> @@ -1887,6 +1888,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel,
> union perf_event *event,
> }
> }
>
> + if (evsel->munge_sample != NULL)
> + evsel->munge_sample(evsel, data);
> +
> return 0;
> }
>
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 501ea6e..4637945 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -12,6 +12,7 @@
> #include "counts.h"
>
> struct perf_evsel;
> +struct perf_sample;
>
> /*
> * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there
> are
> @@ -92,6 +93,8 @@ struct perf_evsel {
> double scale;
> const char *unit;
> struct event_format *tp_format;
> + void (*munge_sample)(struct perf_evsel *evsel,
> + struct perf_sample *sample);
> off_t id_offset;
> void *priv;
> u64 db_id;
> @@ -236,8 +239,6 @@ int perf_evsel__open(struct perf_evsel *evsel, struct
> cpu_map *cpus,
> struct thread_map *threads);
> void perf_evsel__close(struct perf_evsel *evsel, int ncpus, int nthreads);
>
> -struct perf_sample;
> -
> void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample
> *sample,
> const char *name);
> u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample
> *sample,
> diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
> index d0d50ce..ffe5bbb 100644
> --- a/tools/perf/util/util.h
> +++ b/tools/perf/util/util.h
> @@ -367,4 +367,9 @@ typedef void (*print_binary_t)(enum binary_printer_ops,
> void print_binary(unsigned char *data, size_t len,
> size_t bytes_per_line, print_binary_t printer,
> void *extra);
> +
> +struct perf_evlist;
> +
> +int perf_kvm__setup_munge_ppc_guest_event(struct perf_evlist *evlist);
> +
> #endif /* GIT_COMPAT_UTIL_H */
>
>
>
>
> On Friday 25 March 2016 02:45 AM, Arnaldo Carvalho de Melo wrote:
> >Em Tue, Mar 22, 2016 at 11:19:21PM -0300, Arnaldo Carvalho de Melo escreveu:
> >>Em Tue, Mar 22, 2016 at 04:12:11PM -0300, Arnaldo Carvalho de Melo escreveu:
> >>>Em Wed, Feb 24, 2016 at 02:37:42PM +0530, Ravi Bangoria escreveu:
> >>>>'perf kvm record' is not available on powerpc because 'perf' relies on
> >>>>the 'cycles' event (a PMU event) to profile the guest. However, for
> >>>>powerpc, this can't be used from the host because the PMUs are controlled
> >>>>by the guest rather than the host.
> >>>>
> >>>>There exists a tracepoint 'kvm_hv:kvm_guest_exit' in powerpc which is
> >>>>hit whenever any of the threads exit the guest context. The guest
> >>>>instruction pointer dumped along with this tracepoint data in the field
> >>>>'pc', can be used as guest instruction pointer.
> >>>>
> >>>>This patch changes default event as kvm_hv:kvm_guest_exit for recording
> >>>>guest data in host on powerpc. As we are using host event to record guest
> >>>>data, this approach will enable only --guest option of 'perf kvm'. Still
> >>>>--host --guest together won't work.
> >>>It should, i.e. --host --guest should translate to:
> >>>
> >>> -e cycles:H,kvm_hv:kvm_guest_exit
> >>>
> >>>I.e. both collect cycles only in the host, and also the tracepoint that
> >>>will allow us to get the guest approximation for the unavailable cycles
> >>>event, no?
> >>>
> >>>I'm putting the infrastructure work needed for this the perf/cpumode
> >>>branch. More work will be put there soon.
> >>So I took a different path and made perf_evsel__parse_sample set a new
> >>perf_sample.cpumode field, this way we'll end up having just to set a
> >>per-evsel ->post_parse_sample() callback for the event that replaces
> >>"cycles" for PPC guests where we'll just set data->ip and data->cpumode,
> >>the rest of the code remains unchanged.
> >>
> >>The changes I made looks useful in itself, as, IIRC more code was
> >>removed than added.
> >>
> >>I'll continue tomorrow and will test with the kvm:kvm_exit on x86_64 for
> >>testing, that has:
> >Ok, so the infrastructure got merged already and from there the next
> >steps are in running with:
> >
> > perf kvm --guest record -a -e cycles:H,kvm:kvm_exit
> >
> >And then, with the patch below applied, try:
> >
> >perf kvm --guestkallsyms kallsyms.guest --guestmodules modules.guest report -i perf.data.guest --munge-ppc-guest-sample kvm:kvm_exit
> >
> >I'm almost there, it is still not resolving to the kernel DSO, etc, so I
> >get:
> >
> >Samples: 1K of event 'kvm:kvm_exit', Event count (approx.): 1924
> >Overhead Command Shared Object Symbol
> > 34.77% :5343 [unknown] [g] 0xffffffff81043158
> > 16.84% :5345 [unknown] [g] 0xffffffff813f3d5a
> > 16.84% :5345 [unknown] [g] 0xffffffff813f43ec
> > 13.83% :5345 [unknown] [g] 0xffffffff81043158
> > 9.62% :5343 [unknown] [g] 0xffffffff8104301a
> > 3.79% :5345 [unknown] [g] 0xffffffff8104301a
> > 1.77% :5345 [unknown] [u] 0x0000003ae6c75dc9
> > 0.52% :5343 [unknown] [g] 0xffffffff812a29b1
> > 0.16% :5343 [unknown] [g] 0xffffffff8100bc00
> > 0.10% :5343 [unknown] [g] 0xffffffff8104315a
> > 0.10% :5343 [unknown] [g] 0xffffffff8106306f
> > 0.10% :5343 [unknown] [g] 0xffffffff8153b7fc
> > 0.10% :5345 [unknown] [g] 0xffffffff8106306f
> > 0.05% :5343 [unknown] [g] 0xffffffff8100b720
> >
> >[root@...et ~]# cat /proc/*/task/5343/comm
> >qemu-system-x86
> >[root@...et ~]#
> >
> >The patch does several of the things you did per sample, but only right after
> >opening the perf.data file, and I'll break it down in multiple patches, this is
> >just a heads up, please review it if you have the time, in the end we should
> >have a mechanism useful not just for PPC and that affects just 'perf kvm' in
> >this specific case.
> >
> >- Arnaldo
> >
> >diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
> >index bff666458b28..b7b6527446f8 100644
> >--- a/tools/perf/builtin-kvm.c
> >+++ b/tools/perf/builtin-kvm.c
> >@@ -1480,6 +1480,86 @@ perf_stat:
> > }
> > #endif /* HAVE_KVM_STAT_SUPPORT */
> >
> >+#ifdef __powerpc__
> >+#define PPC_HV_DECREMENTER 2432
> >+#define PPC_HV_BIT 3
> >+#define PPC_PR_BIT 49
> >+#define PPC_MAX 63
> >+
> >+static bool perf_sample__is_hv_dec_trap(struct perf_sample *sample, struct perf_evsel *evsel)
> >+{
> >+ int trap = perf_evsel__intval(evsel, sample, "trap");
> >+ return trap == PPC_HV_DECREMENTER;
> >+}
> >+
> >+static void perf_kvm__munge_ppc_guest_sample(struct perf_evsel *evsel, struct perf_sample *sample)
> >+{
> >+ unsigned long msr, hv, pr;
> >+
> >+ if (!perf_sample__is_hv_dec_trap(sample, evsel))
> >+ return;
> >+
> >+ sample->ip = perf_evsel__intval(evsel, sample, "pc");
> >+ sample->cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
> >+
> >+ msr = perf_evsel__intval(evsel, sample, "msr");
> >+ hv = msr & ((unsigned long)1 << (PPC_MAX - PPC_HV_BIT));
> >+ pr = msr & ((unsigned long)1 << (PPC_MAX - PPC_PR_BIT));
> >+ if (!hv && pr)
> >+ sample->cpumode = PERF_RECORD_MISC_GUEST_USER;
> >+}
> >+
> >+static bool perf_evlist__recorded_on_ppc(const struct perf_evlist *evlist)
> >+{
> >+ if (evlist->env && evlist->env->arch) {
> >+ return !strcmp(evlist->env->arch, "ppc64") ||
> >+ !strcmp(evlist->env->arch, "ppc64le");
> >+ }
> >+ return false;
> >+}
> >+#elif defined(__i386__) || defined(__x86_64__)
> >+/*
> >+ * For testing with kvm:kvm_exit
> >+ */
> >+static void perf_kvm__munge_ppc_guest_sample(struct perf_evsel *evsel, struct perf_sample *sample)
> >+{
> >+ sample->ip = perf_evsel__intval(evsel, sample, "guest_rip");
> >+ sample->cpumode = PERF_RECORD_MISC_GUEST_KERNEL;
> >+
> >+ if (sample->ip < 0xffffffff00000000)
> >+ sample->cpumode = PERF_RECORD_MISC_GUEST_USER;
> >+
> >+ if (0) {
> >+ fprintf(stderr, "%s: %s, cpumode=%d ip=%" PRIx64 "\n",
> >+ __func__, perf_evsel__name(evsel), sample->cpumode, sample->ip);
> >+ }
> >+}
> >+
> >+static bool perf_evlist__recorded_on_ppc(const struct perf_evlist *evlist __maybe_unused)
> >+{
> >+ return true;
> >+}
> >+#endif
> >+
> >+int perf_kvm__setup_munge_ppc_guest_event(struct perf_evlist *evlist, const char *name)
> >+{
> >+ struct perf_evsel *evsel;
> >+
> >+ if (!perf_evlist__recorded_on_ppc(evlist))
> >+ return 0;
> >+
> >+ evsel = perf_evlist__find_tracepoint_by_name(evlist, name);
> >+ if (evsel == NULL)
> >+ return -1;
> >+
> >+ if (0)
> >+ fprintf(stderr, "%s: %s\n", __func__, perf_evsel__name(evsel));
> >+
> >+ evsel->munge_sample = perf_kvm__munge_ppc_guest_sample;
> >+
> >+ return 0;
> >+}
> >+
> > static int __cmd_record(const char *file_name, int argc, const char **argv)
> > {
> > int rec_argc, i = 0, j;
> >diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
> >index 160ea23b45aa..ba8786d12423 100644
> >--- a/tools/perf/builtin-report.c
> >+++ b/tools/perf/builtin-report.c
> >@@ -667,6 +667,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
> > int branch_mode = -1;
> > bool branch_call_mode = false;
> > char callchain_default_opt[] = CALLCHAIN_DEFAULT_OPT;
> >+ const char *munge_ppc_guest_event = NULL;
> > const char * const report_usage[] = {
> > "perf report [<options>]",
> > NULL
> >@@ -814,6 +815,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
> > "Show raw trace event output (do not use print fmt or plugins)"),
> > OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy,
> > "Show entries in a hierarchy"),
> >+ OPT_STRING(0, "munge-ppc-guest-sample", &munge_ppc_guest_event, "event name",
> >+ "Use a different event to simulate guest cycles, not accessible on PPC hosts"),
> > OPT_END()
> > };
> > struct perf_data_file file = {
> >@@ -880,6 +883,12 @@ repeat:
> > if (session == NULL)
> > return -1;
> >
> >+ if (munge_ppc_guest_event &&
> >+ perf_kvm__setup_munge_ppc_guest_event(session->evlist, munge_ppc_guest_event)) {
> >+ pr_err("PPC event not present in %s file\n", input_name);
> >+ goto error;
> >+ }
> >+
> > if (report.queue_size) {
> > ordered_events__set_alloc_size(&session->ordered_events,
> > report.queue_size);
> >diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> >index 738ce226002b..1665171a6e9c 100644
> >--- a/tools/perf/util/evsel.c
> >+++ b/tools/perf/util/evsel.c
> >@@ -216,6 +216,7 @@ void perf_evsel__init(struct perf_evsel *evsel,
> > evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
> > perf_evsel__calc_id_pos(evsel);
> > evsel->cmdline_group_boundary = false;
> >+ evsel->munge_sample = NULL;
> > }
> >
> > struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
> >@@ -1887,6 +1888,9 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
> > }
> > }
> >
> >+ if (evsel->munge_sample != NULL)
> >+ evsel->munge_sample(evsel, data);
> >+
> > return 0;
> > }
> >
> >diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> >index 501ea6e565f1..46379451f641 100644
> >--- a/tools/perf/util/evsel.h
> >+++ b/tools/perf/util/evsel.h
> >@@ -12,6 +12,7 @@
> > #include "counts.h"
> >
> > struct perf_evsel;
> >+struct perf_sample;
> >
> > /*
> > * Per fd, to map back from PERF_SAMPLE_ID to evsel, only used when there are
> >@@ -92,6 +93,8 @@ struct perf_evsel {
> > double scale;
> > const char *unit;
> > struct event_format *tp_format;
> >+ void (*munge_sample)(struct perf_evsel *evsel,
> >+ struct perf_sample *sample);
> > off_t id_offset;
> > void *priv;
> > u64 db_id;
> >@@ -236,8 +239,6 @@ int perf_evsel__open(struct perf_evsel *evsel, struct cpu_map *cpus,
> > struct thread_map *threads);
> > void perf_evsel__close(struct perf_evsel *evsel, int ncpus, int nthreads);
> >
> >-struct perf_sample;
> >-
> > void *perf_evsel__rawptr(struct perf_evsel *evsel, struct perf_sample *sample,
> > const char *name);
> > u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
> >diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
> >index 8298d607c738..12bc51450d56 100644
> >--- a/tools/perf/util/util.h
> >+++ b/tools/perf/util/util.h
> >@@ -356,4 +356,9 @@ typedef void (*print_binary_t)(enum binary_printer_ops,
> > void print_binary(unsigned char *data, size_t len,
> > size_t bytes_per_line, print_binary_t printer,
> > void *extra);
> >+
> >+struct perf_evlist;
> >+
> >+int perf_kvm__setup_munge_ppc_guest_event(struct perf_evlist *evlist, const char *name);
> >+
> > #endif /* GIT_COMPAT_UTIL_H */
> >
Powered by blists - more mailing lists