linux-kernel - Re: [PATCH v0 5/5] perf/x86/intel/pt: Add support for instruction trace filtering in PT

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANLsYkykMHDk71nOGXbnDbF08zXBsUa_7YDH2t3U=Yk=dgwnzQ@mail.gmail.com>
Date:	Fri, 11 Dec 2015 11:06:53 -0700
From:	Mathieu Poirier <mathieu.poirier@...aro.org>
To:	Alexander Shishkin <alexander.shishkin@...ux.intel.com>
Cc:	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Ingo Molnar <mingo@...hat.com>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	vince@...ter.net, Stephane Eranian <eranian@...gle.com>,
	Arnaldo Carvalho de Melo <acme@...radead.org>
Subject: Re: [PATCH v0 5/5] perf/x86/intel/pt: Add support for instruction
 trace filtering in PT

On 11 December 2015 at 06:36, Alexander Shishkin
<alexander.shishkin@...ux.intel.com> wrote:
> Newer versions of Intel PT support address ranges, which can be used to
> define IP address range-based filters or TraceSTOP regions. Number of
> ranges in enumerated via cpuid.
>
> This patch implements pmu callbacks and related low-level code to allow
> filter validation, configuration and programming into the hardware.
>
> Signed-off-by: Alexander Shishkin <alexander.shishkin@...ux.intel.com>
> ---
>  arch/x86/kernel/cpu/intel_pt.h            |  30 ++++++-
>  arch/x86/kernel/cpu/perf_event_intel_pt.c | 132 +++++++++++++++++++++++++++++-
>  2 files changed, 159 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
> index 6ce8cd20b9..1b36c8529d 100644
> --- a/arch/x86/kernel/cpu/intel_pt.h
> +++ b/arch/x86/kernel/cpu/intel_pt.h
> @@ -105,13 +105,39 @@ struct pt_buffer {
>         struct topa_entry       *topa_index[0];
>  };
>
> +#define PT_FILTERS_NUM 4
> +
> +/**
> + * struct pt_filter - IP range filter configuration
> + * @msr_a:     range start, goes to RTIT_ADDRn_A
> + * @msr_b:     range end, goes to RTIT_ADDRn_B
> + * @config:    4-bit field in RTIT_CTL
> + */
> +struct pt_filter {
> +       unsigned long   msr_a;
> +       unsigned long   msr_b;
> +       unsigned long   config;
> +};
> +
> +/**
> + * struct pt_filters - IP range filtering context
> + * @filter:    filters defined for this context
> + * @nr_filters:        number of defined filters in the @filter array
> + */
> +struct pt_filters {
> +       struct pt_filter        filter[PT_FILTERS_NUM];
> +       unsigned int            nr_filters;
> +};
> +
>  /**
>   * struct pt - per-cpu pt context
> - * @handle:    perf output handle
> - * @handle_nmi:        do handle PT PMI on this cpu, there's an active event
> + * @handle:            perf output handle
> + * @filters:           last configured filters
> + * @handle_nmi:                do handle PT PMI on this cpu, there's an active event
>   */
>  struct pt {
>         struct perf_output_handle handle;
> +       struct pt_filters       filters;
>         int                     handle_nmi;
>  };
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
> index 2ec25581de..f531a2f0de 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
> @@ -253,6 +253,73 @@ static bool pt_event_valid(struct perf_event *event)
>   * These all are cpu affine and operate on a local PT
>   */
>
> +/* Address ranges and their corresponding msr configuration registers */
> +static const struct pt_address_range {
> +       unsigned long   msr_a;
> +       unsigned long   msr_b;
> +       unsigned int    reg_off;
> +} pt_address_ranges[] = {
> +       {
> +               .msr_a   = MSR_IA32_RTIT_ADDR0_A,
> +               .msr_b   = MSR_IA32_RTIT_ADDR0_B,
> +               .reg_off = RTIT_CTL_ADDR0_OFFSET,
> +       },
> +       {
> +               .msr_a   = MSR_IA32_RTIT_ADDR1_A,
> +               .msr_b   = MSR_IA32_RTIT_ADDR1_B,
> +               .reg_off = RTIT_CTL_ADDR1_OFFSET,
> +       },
> +       {
> +               .msr_a   = MSR_IA32_RTIT_ADDR2_A,
> +               .msr_b   = MSR_IA32_RTIT_ADDR2_B,
> +               .reg_off = RTIT_CTL_ADDR2_OFFSET,
> +       },
> +       {
> +               .msr_a   = MSR_IA32_RTIT_ADDR3_A,
> +               .msr_b   = MSR_IA32_RTIT_ADDR3_B,
> +               .reg_off = RTIT_CTL_ADDR3_OFFSET,
> +       }
> +};
> +
> +static u64 pt_config_filters(struct perf_event *event)
> +{
> +       struct pt_filters *filters = event->hw.itrace_filters;
> +       struct pt *pt = this_cpu_ptr(&pt_ctx);
> +       unsigned int range = 0;
> +       u64 rtit_ctl = 0;
> +
> +       if (!filters)
> +               return 0;
> +
> +       for (range = 0; range < filters->nr_filters; range++) {
> +               struct pt_filter *filter = &filters->filter[range];
> +
> +               /*
> +                * Note, if the range has zero start/end addresses due
> +                * to its dynamic object not being loaded yet, we just
> +                * go ahead and program zeroed range, which will simply
> +                * produce no data. Note^2: if executable code at 0x0
> +                * is a concern, we can set up an "invalid" configuration
> +                * such as msr_b < msr_a.
> +                */
> +
> +               /* avoid redundant msr writes */
> +               if (pt->filters.filter[range].msr_a != filter->msr_a) {
> +                       wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
> +                       pt->filters.filter[range].msr_a = filter->msr_a;
> +               }
> +
> +               if (pt->filters.filter[range].msr_b != filter->msr_b) {
> +                       wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
> +                       pt->filters.filter[range].msr_b = filter->msr_b;
> +               }

You don't need checks to make sure the address range is correct?  On
the CS side a < b must be respected or the tracer will produced
invalid results.

> +
> +               rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;

I understand what you're doing here but it is probably a good idea to
make it clear with a comment.

> +       }
> +
> +       return rtit_ctl;
> +}
> +
>  static void pt_config(struct perf_event *event)
>  {
>         u64 reg;
> @@ -262,7 +329,8 @@ static void pt_config(struct perf_event *event)
>                 wrmsrl(MSR_IA32_RTIT_STATUS, 0);
>         }
>
> -       reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
> +       reg = pt_config_filters(event);
> +       reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
>
>         if (!event->attr.exclude_kernel)
>                 reg |= RTIT_CTL_OS;
> @@ -907,6 +975,60 @@ static void pt_buffer_free_aux(void *data)
>         kfree(buf);
>  }
>
> +static int pt_itrace_filters_init(struct perf_event *event)
> +{
> +       struct pt_filters *filters;
> +       int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
> +
> +       if (!pt_cap_get(PT_CAP_num_address_ranges))
> +               return 0;
> +
> +       filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
> +       if (!filters)
> +               return -ENOMEM;
> +
> +       event->hw.itrace_filters = filters;
> +
> +       return 0;
> +}
> +
> +static void pt_itrace_filters_fini(struct perf_event *event)
> +{
> +       kfree(event->hw.itrace_filters);
> +       event->hw.itrace_filters = NULL;
> +}
> +
> +static int pt_event_itrace_filter_setup(struct perf_event *event)
> +{
> +       struct pt_filters *filters = event->hw.itrace_filters;
> +       struct perf_itrace_filter *filter;
> +       int range = 0;
> +
> +       if (!filters)
> +               return -EOPNOTSUPP;
> +
> +       list_for_each_entry_rcu(filter, &event->itrace_filters, entry) {
> +               /* PT doesn't support single address triggers */
> +               if (!filter->range)
> +                       return -EOPNOTSUPP;
> +
> +               if (filter->kernel && !kernel_ip(filter->offset))
> +                       return -EINVAL;
> +
> +               filters->filter[range].msr_a  = filter->start;
> +               filters->filter[range].msr_b  = filter->end;
> +               filters->filter[range].config = filter->filter ? 1 : 2;
> +
> +               if (++range > pt_cap_get(PT_CAP_num_address_ranges))
> +                       return -EOPNOTSUPP;
> +       }
> +
> +       if (range)
> +               filters->nr_filters = range - 1;
> +
> +       return 0;
> +}
> +
>  /**
>   * intel_pt_interrupt() - PT PMI handler
>   */
> @@ -1075,6 +1197,7 @@ static void pt_event_read(struct perf_event *event)
>
>  static void pt_event_destroy(struct perf_event *event)
>  {
> +       pt_itrace_filters_fini(event);
>         x86_del_exclusive(x86_lbr_exclusive_pt);
>  }
>
> @@ -1089,6 +1212,11 @@ static int pt_event_init(struct perf_event *event)
>         if (x86_add_exclusive(x86_lbr_exclusive_pt))
>                 return -EBUSY;
>
> +       if (pt_itrace_filters_init(event)) {
> +               x86_del_exclusive(x86_lbr_exclusive_pt);
> +               return -ENOMEM;
> +       }
> +
>         event->destroy = pt_event_destroy;
>
>         return 0;
> @@ -1152,6 +1280,8 @@ static __init int pt_init(void)
>         pt_pmu.pmu.read         = pt_event_read;
>         pt_pmu.pmu.setup_aux    = pt_buffer_setup_aux;
>         pt_pmu.pmu.free_aux     = pt_buffer_free_aux;
> +       pt_pmu.pmu.itrace_filter_setup =
> +               pt_event_itrace_filter_setup;
>         ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
>
>         return ret;
> --
> 2.6.2
>

I've been scratching my head for a while now on how we could convey
address ranges to the tracers.  My initial thought was to extend the
-e cs_etm/.../ to take strings that could be parsed.  I was going to
send an RFC email to the list in January but you beat me to the punch
- the method you are putting forward is better.

I had comments about some possible race conditions in the core but
Peter got to those first.

Other than the above comment and the suggestion in 4/5, for the
portion of the work concerned with IntelPT:

Reviewed-by: Mathieu Poirier <mathieu.poirier@...aro.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/