lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <768e94b1-feb6-4e69-2034-381f97de5c27@arm.com>
Date:   Fri, 22 Jul 2022 10:30:51 +0100
From:   James Clark <james.clark@....com>
To:     Mike Leach <mike.leach@...aro.org>
Cc:     mathieu.poirier@...aro.org, peterz@...radead.org, mingo@...hat.com,
        acme@...nel.org, linux-perf-users@...r.kernel.org,
        quic_jinlmao@...cinc.com, suzuki.poulose@....com,
        coresight@...ts.linaro.org, linux-arm-kernel@...ts.infradead.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 11/13] perf: cs-etm: Handle
 PERF_RECORD_AUX_OUTPUT_HW_ID packet



On 21/07/2022 13:38, Mike Leach wrote:
> Hi James
> 
> On Wed, 20 Jul 2022 at 17:07, James Clark <james.clark@....com> wrote:
>>
>>
>>
>> On 04/07/2022 09:11, Mike Leach wrote:
>>> When using dynamically assigned CoreSight trace IDs the drivers can output
>>> the ID / CPU association as a PERF_RECORD_AUX_OUTPUT_HW_ID packet.
>>>
>>> Update cs-etm decoder to handle this packet by setting the CPU/Trace ID
>>> mapping.
>>>
>>> Signed-off-by: Mike Leach <mike.leach@...aro.org>
>>> ---
>>>  tools/include/linux/coresight-pmu.h           |  14 ++
>>>  .../perf/util/cs-etm-decoder/cs-etm-decoder.c |   9 +
>>>  tools/perf/util/cs-etm.c                      | 167 +++++++++++++++++-
>>>  3 files changed, 185 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h
>>> index 31d007fab3a6..4e8b3148f939 100644
>>> --- a/tools/include/linux/coresight-pmu.h
>>> +++ b/tools/include/linux/coresight-pmu.h
>>> @@ -7,6 +7,8 @@
>>>  #ifndef _LINUX_CORESIGHT_PMU_H
>>>  #define _LINUX_CORESIGHT_PMU_H
>>>
>>> +#include <linux/bits.h>
>>> +
>>>  #define CORESIGHT_ETM_PMU_NAME "cs_etm"
>>>
>>>  /*
>>> @@ -40,4 +42,16 @@
>>>  #define ETM4_CFG_BIT_RETSTK  12
>>>  #define ETM4_CFG_BIT_VMID_OPT        15
>>>
>>> +/*
>>> + * Interpretation of the PERF_RECORD_AUX_OUTPUT_HW_ID payload.
>>> + * Used to associate a CPU with the CoreSight Trace ID.
>>> + * [63:16] - unused SBZ
>>> + * [15:08] - Trace ID
>>> + * [07:00] - Version
>>> + */
>>> +#define CS_AUX_HW_ID_VERSION_MASK    GENMASK_ULL(7, 0)
>>> +#define CS_AUX_HW_ID_TRACE_ID_MASK   GENMASK_ULL(15, 8)
>>> +
>>> +#define CS_AUX_HW_ID_CURR_VERSION 0
>>> +
>>>  #endif
>>> diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>> index 31fa3b45134a..d1dd73310707 100644
>>> --- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>> +++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
>>> @@ -611,6 +611,8 @@ static ocsd_datapath_resp_t cs_etm_decoder__gen_trace_elem_printer(
>>>       return resp;
>>>  }
>>>
>>> +#define CS_TRACE_ID_MASK GENMASK(6, 0)
>>> +
>>>  static int
>>>  cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params,
>>>                                  struct cs_etm_trace_params *t_params,
>>> @@ -625,6 +627,7 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params,
>>>       switch (t_params->protocol) {
>>>       case CS_ETM_PROTO_ETMV3:
>>>       case CS_ETM_PROTO_PTM:
>>> +             csid = (t_params->etmv3.reg_idr & CS_TRACE_ID_MASK);
>>>               cs_etm_decoder__gen_etmv3_config(t_params, &config_etmv3);
>>>               decoder->decoder_name = (t_params->protocol == CS_ETM_PROTO_ETMV3) ?
>>>                                                       OCSD_BUILTIN_DCD_ETMV3 :
>>> @@ -632,11 +635,13 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params,
>>>               trace_config = &config_etmv3;
>>>               break;
>>>       case CS_ETM_PROTO_ETMV4i:
>>> +             csid = (t_params->etmv4.reg_traceidr & CS_TRACE_ID_MASK);
>>>               cs_etm_decoder__gen_etmv4_config(t_params, &trace_config_etmv4);
>>>               decoder->decoder_name = OCSD_BUILTIN_DCD_ETMV4I;
>>>               trace_config = &trace_config_etmv4;
>>>               break;
>>>       case CS_ETM_PROTO_ETE:
>>> +             csid = (t_params->ete.reg_traceidr & CS_TRACE_ID_MASK);
>>>               cs_etm_decoder__gen_ete_config(t_params, &trace_config_ete);
>>>               decoder->decoder_name = OCSD_BUILTIN_DCD_ETE;
>>>               trace_config = &trace_config_ete;
>>> @@ -645,6 +650,10 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params,
>>>               return -1;
>>>       }
>>>
>>> +     /* if the CPU has no trace ID associated, no decoder needed */
>>> +     if (csid == CS_UNUSED_TRACE_ID)
>>> +             return 0;
>>> +
>>>       if (d_params->operation == CS_ETM_OPERATION_DECODE) {
>>>               if (ocsd_dt_create_decoder(decoder->dcd_tree,
>>>                                          decoder->decoder_name,
>>> diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
>>> index df9d67901f8d..ffce858f21fd 100644
>>> --- a/tools/perf/util/cs-etm.c
>>> +++ b/tools/perf/util/cs-etm.c
>>> @@ -217,6 +217,139 @@ static int cs_etm__map_trace_id(u8 trace_chan_id, u64 *cpu_metadata)
>>>       return 0;
>>>  }
>>>
>>> +static int cs_etm__metadata_get_trace_id(u8 *trace_chan_id, u64 *cpu_metadata)
>>> +{
>>> +     u64 cs_etm_magic = cpu_metadata[CS_ETM_MAGIC];
>>> +
>>> +     switch (cs_etm_magic) {
>>> +     case __perf_cs_etmv3_magic:
>>> +             *trace_chan_id = cpu_metadata[CS_ETM_ETMTRACEIDR];
>>> +             break;
>>> +     case __perf_cs_etmv4_magic:
>>> +     case __perf_cs_ete_magic:
>>> +             *trace_chan_id = cpu_metadata[CS_ETMV4_TRCTRACEIDR];
>>> +             break;
>>> +
>>> +     default:
>>> +             return -EINVAL;
>>> +     }
>>> +     return 0;
>>> +}
>>> +
>>> +static int cs_etm__metadata_set_trace_id(u8 trace_chan_id, u64 *cpu_metadata)
>>> +{
>>> +     u64 cs_etm_magic = cpu_metadata[CS_ETM_MAGIC];
>>> +
>>> +     switch (cs_etm_magic) {
>>> +     case __perf_cs_etmv3_magic:
>>> +              cpu_metadata[CS_ETM_ETMTRACEIDR] = trace_chan_id;
>>> +             break;
>>> +     case __perf_cs_etmv4_magic:
>>> +     case __perf_cs_ete_magic:
>>> +             cpu_metadata[CS_ETMV4_TRCTRACEIDR] = trace_chan_id;
>>> +             break;
>>> +
>>> +     default:
>>> +             return -EINVAL;
>>> +     }
>>> +     return 0;
>>> +}
>>> +
>>> +/*
>>> + * FIELD_GET (linux/bitfield.h) not available outside kernel code,
>>> + * and the header contains too many dependencies to just copy over,
>>> + * so roll our own based on the original
>>> + */
>>> +#define __bf_shf(x) (__builtin_ffsll(x) - 1)
>>> +#define FIELD_GET(_mask, _reg)                                               \
>>> +     ({                                                              \
>>> +             (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \
>>> +     })
>>> +> +/*
>>> + * Handle the PERF_RECORD_AUX_OUTPUT_HW_ID event.
>>> + *
>>> + * The payload associates the Trace ID and the CPU.
>>> + * The routine is tolerant of seeing multiple packets with the same association,
>>> + * but a CPU / Trace ID association changing during a session is an error.
>>> + */
>>> +static int cs_etm__process_aux_output_hw_id(struct perf_session *session,
>>> +                                         union perf_event *event)
>>> +{
>>> +     struct cs_etm_auxtrace *etm;
>>> +     struct perf_sample sample;
>>> +     struct int_node *inode;
>>> +     struct evsel *evsel;
>>> +     u64 *cpu_data;
>>> +     u64 hw_id;
>>> +     int cpu, version, err;
>>> +     u8 trace_chan_id, curr_chan_id;
>>> +
>>> +     /* extract and parse the HW ID */
>>> +     hw_id = event->aux_output_hw_id.hw_id;
>>> +     version = FIELD_GET(CS_AUX_HW_ID_VERSION_MASK, hw_id);
>>> +     trace_chan_id = FIELD_GET(CS_AUX_HW_ID_TRACE_ID_MASK, hw_id);
>>> +
>>> +     /* check that we can handle this version */
>>> +     if (version > CS_AUX_HW_ID_CURR_VERSION)
>>> +             return -EINVAL;
>>> +
>>> +     /* get access to the etm metadata */
>>> +     etm = container_of(session->auxtrace, struct cs_etm_auxtrace, auxtrace);
>>> +     if (!etm || !etm->metadata)
>>> +             return -EINVAL;
>>> +
>>> +     /* parse the sample to get the CPU */
>>> +     evsel = evlist__event2evsel(session->evlist, event);
>>> +     if (!evsel)
>>> +             return -EINVAL;
>>> +     err = evsel__parse_sample(evsel, event, &sample);
>>> +     if (err)
>>> +             return err;
>>> +     cpu = sample.cpu;
>>> +     if (cpu == -1) {
>>> +             /* no CPU in the sample - possibly recorded with an old version of perf */
>>> +             pr_err("CS_ETM: no CPU AUX_OUTPUT_HW_ID sample. Use compatible perf to record.");
>>> +             return -EINVAL;
>>> +     }
>>> +
>>> +     /*
>>> +      * look to see if the metadata contains a valid trace ID.
>>> +      * if so we mapped it before and it must be the same as the ID in the packet.
>>> +      */
>>> +     cpu_data = etm->metadata[cpu];
>>> +     err = cs_etm__metadata_get_trace_id(&curr_chan_id, cpu_data);
>>> +     if (err)
>>> +             return err;
>>> +     if (CS_IS_VALID_TRACE_ID(curr_chan_id) && (curr_chan_id != trace_chan_id)) {
>>> +             pr_err("CS_ETM: mismatch between CPU trace ID and HW_ID packet ID\n");
>>> +             return -EINVAL;
>>> +     }
>>> +
>>> +     /* next see if the ID is mapped to a CPU, and it matches the current CPU */
>>> +     inode = intlist__find(traceid_list, trace_chan_id);
>>> +     if (inode) {
>>> +             cpu_data = inode->priv;
>>> +             if ((int)cpu_data[CS_ETM_CPU] != cpu) {
>>> +                     pr_err("CS_ETM: map mismatch between HW_ID packet CPU and Trace ID\n");
>>> +                     return -EINVAL;
>>> +             }
>>> +             return 0;
>>> +     }
>>> +
>>> +     /* not one we've seen before - lets map it */
>>> +     err = cs_etm__map_trace_id(trace_chan_id, cpu_data);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     /*
>>> +      * if we are picking up the association from the packet, need to plug
>>> +      * the correct trace ID into the metadata for setting up decoders later.
>>> +      */
>>> +     err = cs_etm__metadata_set_trace_id(trace_chan_id, cpu_data);
>>> +     return err;
>>> +}
>>> +
>>>  void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
>>>                                             u8 trace_chan_id)
>>>  {
>>> @@ -2433,6 +2566,8 @@ static int cs_etm__process_event(struct perf_session *session,
>>>               return cs_etm__process_itrace_start(etm, event);
>>>       else if (event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
>>>               return cs_etm__process_switch_cpu_wide(etm, event);
>>> +     else if (event->header.type ==  PERF_RECORD_AUX_OUTPUT_HW_ID)
>>> +             return cs_etm__process_aux_output_hw_id(session, event);
>>
>> This shouldn't need to be handled here because of the peek at the beginning. Although
>> it's probably harmless to do it twice, it can make deciphering the flow quite difficult.
>>
> Agreed - this was really belt and braces coding while I was testing -
> and where PT decoded it.
> Given the peek events this can be dropped next time.
> 
>>>
>>>       if (!etm->timeless_decoding && event->header.type == PERF_RECORD_AUX) {
>>>               /*
>>> @@ -2662,7 +2797,7 @@ static void cs_etm__print_auxtrace_info(__u64 *val, int num)
>>>       for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) {
>>>               if (version == 0)
>>>                       err = cs_etm__print_cpu_metadata_v0(val, &i);
>>> -             else if (version == 1)
>>> +             else if (version == 1 || version == 2)
>>>                       err = cs_etm__print_cpu_metadata_v1(val, &i);
>>>               if (err)
>>>                       return;
>>> @@ -2774,11 +2909,16 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
>>>       }
>>>
>>>       /*
>>> -      * In per-thread mode, CPU is set to -1, but TID will be set instead. See
>>> -      * auxtrace_mmap_params__set_idx(). Return 'not found' if neither CPU nor TID match.
>>> +      * In per-thread mode, auxtrace CPU is set to -1, but TID will be set instead. See
>>> +      * auxtrace_mmap_params__set_idx(). However, the sample AUX event will contain a
>>> +      * CPU as we set this always for the AUX_OUTPUT_HW_ID event.
>>> +      * So now compare only TIDs if auxtrace CPU is -1, and CPUs if auxtrace CPU is not -1.
>>> +      * Return 'not found' if mismatch.
>>>        */
>>> -     if ((auxtrace_event->cpu == (__u32) -1 && auxtrace_event->tid != sample->tid) ||
>>> -                     auxtrace_event->cpu != sample->cpu)
>>> +     if (auxtrace_event->cpu == (__u32) -1) {
>>> +             if (auxtrace_event->tid != sample->tid)
>>> +                     return 1;
>>> +     } else if (auxtrace_event->cpu != sample->cpu)
>>>               return 1;
>>>
>>>       if (aux_event->flags & PERF_AUX_FLAG_OVERWRITE) {
>>> @@ -2827,6 +2967,15 @@ static int cs_etm__queue_aux_fragment(struct perf_session *session, off_t file_o
>>>       return 1;
>>>  }
>>>
>>> +static int cs_etm__process_aux_hw_id_cb(struct perf_session *session, union perf_event *event,
>>> +                                     u64 offset __maybe_unused, void *data __maybe_unused)
>>> +{
>>> +     /* look to handle PERF_RECORD_AUX_OUTPUT_HW_ID early to ensure decoders can be set up */
>>> +     if (event->header.type == PERF_RECORD_AUX_OUTPUT_HW_ID)
>>> +             return cs_etm__process_aux_output_hw_id(session, event);
>>> +     return 0;
>>> +}
>>
>> I couldn't see the relationship between the two peeks and why they couldn't be done together
>> in one pass. I changed it so cs_etm__process_aux_hw_id_cb() is also called on the peek
>> to queue the aux records and it seemed to work. At least just opening the file and glancing.
>>
>> If there is some dependency though, I don't think two passes is excessive.
>>
> 
> I initially tried this and there are issues.
> 
>  During testing I had a --per-thread run with two buffers. One buffer
> had only a single ID, which appeared as a packet before the buffer was
> processed.
> The second had the same ID plus a new ID, which appeared after the
> first buffer and before the second.
> 
> Problem is, under the current system, once data is queued, the
> decoders are set and it meant a decoder for the second ID was never
> created, resulting in a bunch of undecoded data.
> It may well be possible to re-examine how and when decoders are
> created, but there is currently a built in assumption that all IDs are
> available before the first buffer is queued, and changing this is well
> beyond the remit of this patch set.
> 
> 
> 
>>> +
>>>  static int cs_etm__queue_aux_records_cb(struct perf_session *session, union perf_event *event,
>>>                                       u64 offset __maybe_unused, void *data __maybe_unused)
>>>  {
>>> @@ -3109,6 +3258,14 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
>>>       if (err)
>>>               goto err_delete_thread;
>>>
>>> +     /* scan for AUX_OUTPUT_HW_ID records */
>>> +     if (hdr_version >=  CS_AUX_HW_ID_VERSION_MIN) {
>>> +             err = perf_session__peek_events(session, session->header.data_offset,
>>> +                                             session->header.data_size,
>>> +                                             cs_etm__process_aux_hw_id_cb, NULL);
>>
>> This no longer works at all with piping because of this line in peek_events:
>>
>>         if (perf_data__is_pipe(session->data))
>>                 return -1;
>>
> 
> Does this not also apply to the:
> cs_etm__queue_aux_records(session);
> call immediately after this, which also uses perf_session__peek_events()?

It uses it, but it has a fallback if it isn't available where the buffers aren't
split by aux records and are processed whole as they were before the aux split change.

The if statement surrounding it checks if the index was populated first, which only
happens in non-piping mode:

  if (index && index->nr > 0)
       return perf_session__peek_events(session, session->header.data_offset,

It seems like it isn't possible to have a fallback for this trace ID change so we
can probably drop piping support entirely.

> 
>> So we should change the warning message to an error and exit earlier:
>>
>>         if (!etm->data_queued)
>>                 pr_warning("CS ETM warning: Coresight decode and TRBE support requires random file access.\n"
>>                            "Continuing with best effort decoding in piped mode.\n\n");
>>
>> And then we can also remove all the now dead code and variables related to piping like:
>>
>>    etm->data_queued = etm->queues.populated;
>>    ...
>>
>>    if (!etm->data_queued) {
>>       ...
>>    }
>>
> 
> which means that these were already dead code?

See above

> 
>>
>>> +             if (err)
>>> +                     goto err_delete_thread;
>>> +     }
>>>       err = cs_etm__queue_aux_records(session);
>>>       if (err)
>>>               goto err_delete_thread;
> 
> 
> Regards
> 
> Mike
> 
> --
> Mike Leach
> Principal Engineer, ARM Ltd.
> Manchester Design Centre. UK

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ