lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 8 Apr 2010 23:18:21 +0200
From:	Stephane Eranian <eranian@...gle.com>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	linux-kernel@...r.kernel.org, mingo@...e.hu, paulus@...ba.org,
	davem@...emloft.net, fweisbec@...il.com, robert.richter@....com,
	perfmon2-devel@...ts.sf.net, eranian@...il.com
Subject: Re: [PATCH] perf_events: fix bogus warn_on(_once) in 
	perf_prepare_sample()

On Thu, Apr 8, 2010 at 11:03 PM, Peter Zijlstra <peterz@...radead.org> wrote:
> On Thu, 2010-04-08 at 22:55 +0200, Peter Zijlstra wrote:
>> On Thu, 2010-04-08 at 22:45 +0200, Stephane Eranian wrote:
>> >     There is a warn_on_once() check for PERF_SAMPLE_RAW which trips
>> >     when using PEBS on both Core and Nehalem. Core PEBS sample size is 144
>> >     bytes and 176 bytes for Nehalem. Both are multiples of 8, but the size
>> >     field is encoded as int, thus the total is never a multiple of 8 which
>> >     trips the check. I think the size should have been u64, but now it is
>> >     too late to change given it is ABI.
>>
>> PEBS hasn't seen -linus yet, so we can fix that.
>>
>> There's various things that do indeed rely on the perf buffer to always
>> be u64 aligned, so this warning isn't bogus at all.
>
> On the subject of PEBS, we need to change the ABI before it does hit
> -linus, I've got something like the below,. but I'm not quite sure of
> it.
>
> We could keep the PERF_RECORD_MISC_EXACT bit and allow non-fixed up IPs
> in PERF_SAMPLE_EVENT_IP fields, that way we can avoid having to add both
> IP and EVENT_IP to samples.
>
You always have the original IP in the PEBS sample, so why add yet another way
of getting to the same data?

> ---
> Subject: perf: Change the PEBS interface
> From: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> Date: Tue Mar 30 13:35:58 CEST 2010
>
> This patch changes the PEBS interface from perf_event_attr::precise
> and PERF_RECORD_MISC_EXACT to perf_event_attr::precise and
> PERF_SAMPLE_EVENT_IP.
>
> The rationale is that .precise=1 !PERF_SAMPLE_EVENT_IP could be used
> to implement buffered PEBS.
>
I am not sure I understand what you mean by buffered PEBS. Are you talking
about using PEBS buffer bigger than one entry?
If so, how can you do:
  - the LBR based fixups for multiple samples (on PMU interrupt)
  - attribute PID/TID in system-wide mode


> Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> LKML-Reference: <new-submission>
> ---
>  arch/x86/include/asm/perf_event.h         |    9 --
>  arch/x86/kernel/cpu/perf_event_intel_ds.c |  133 +++++++++++++-----------------
>  include/linux/perf_event.h                |    7 +
>  kernel/perf_event.c                       |    6 +
>  tools/perf/builtin-top.c                  |    9 +-
>  5 files changed, 79 insertions(+), 85 deletions(-)
>
> Index: linux-2.6/arch/x86/include/asm/perf_event.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/perf_event.h
> +++ linux-2.6/arch/x86/include/asm/perf_event.h
> @@ -128,21 +128,12 @@ extern void perf_events_lapic_init(void)
>
>  #define PERF_EVENT_INDEX_OFFSET                        0
>
> -/*
> - * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
> - * This flag is otherwise unused and ABI specified to be 0, so nobody should
> - * care what we do with it.
> - */
> -#define PERF_EFLAGS_EXACT      (1UL << 3)
> -
>  #define perf_misc_flags(regs)                          \
>  ({     int misc = 0;                                   \
>        if (user_mode(regs))                            \
>                misc |= PERF_RECORD_MISC_USER;          \
>        else                                            \
>                misc |= PERF_RECORD_MISC_KERNEL;        \
> -       if (regs->flags & PERF_EFLAGS_EXACT)            \
> -               misc |= PERF_RECORD_MISC_EXACT;         \
>        misc; })
>
>  #define perf_instruction_pointer(regs) ((regs)->ip)
> Index: linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/perf_event_intel_ds.c
> +++ linux-2.6/arch/x86/kernel/cpu/perf_event_intel_ds.c
> @@ -330,7 +330,8 @@ static void intel_pmu_pebs_enable(struct
>        cpuc->pebs_enabled |= 1ULL << hwc->idx;
>        WARN_ON_ONCE(cpuc->enabled);
>
> -       if (x86_pmu.intel_cap.pebs_trap)
> +       if (x86_pmu.intel_cap.pebs_trap &&
> +                       (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
>                intel_pmu_lbr_enable(event);
>  }
>
> @@ -345,7 +346,8 @@ static void intel_pmu_pebs_disable(struc
>
>        hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
>
> -       if (x86_pmu.intel_cap.pebs_trap)
> +       if (x86_pmu.intel_cap.pebs_trap &&
> +                       (event->attr.sample_type & PERF_SAMPLE_EVENT_IP))
>                intel_pmu_lbr_disable(event);
>  }
>
> @@ -376,12 +378,12 @@ static inline bool kernel_ip(unsigned lo
>  #endif
>  }
>
> -static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
> +static int intel_pmu_pebs_fixup_ip(unsigned long *ipp)
>  {
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>        unsigned long from = cpuc->lbr_entries[0].from;
>        unsigned long old_to, to = cpuc->lbr_entries[0].to;
> -       unsigned long ip = regs->ip;
> +       unsigned long ip = *ipp;
>
>        /*
>         * We don't need to fixup if the PEBS assist is fault like
> @@ -412,7 +414,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>         * We sampled a branch insn, rewind using the LBR stack
>         */
>        if (ip == to) {
> -               regs->ip = from;
> +               *ipp = from;
>                return 1;
>        }
>
> @@ -439,7 +441,7 @@ static int intel_pmu_pebs_fixup_ip(struc
>        } while (to < ip);
>
>        if (to == ip) {
> -               regs->ip = old_to;
> +               *ipp = old_to;
>                return 1;
>        }
>
> @@ -452,15 +454,62 @@ static int intel_pmu_pebs_fixup_ip(struc
>
>  static int intel_pmu_save_and_restart(struct perf_event *event);
>
> +static void __intel_pmu_pebs_event(struct perf_event *event,
> +                                  struct pt_regs *iregs, void *__pebs)
> +{
> +       /*
> +        * We cast to pebs_record_core since that is a subset of
> +        * both formats and we don't use the other fields in this
> +        * routine.
> +        */
> +       struct pebs_record_core *pebs = __pebs;
> +       struct perf_sample_data data;
> +       struct perf_raw_record raw;
> +       struct pt_regs regs;
> +
> +       if (!intel_pmu_save_and_restart(event))
> +               return;
> +
> +       perf_sample_data_init(&data, 0);
> +       data.period = event->hw.last_period;
> +
> +       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> +               raw.size = x86_pmu.pebs_record_size;
> +               raw.data = pebs;
> +               data.raw = &raw;
> +       }
> +
> +       /*
> +        * We use the interrupt regs as a base because the PEBS record
> +        * does not contain a full regs set, specifically it seems to
> +        * lack segment descriptors, which get used by things like
> +        * user_mode().
> +        *
> +        * In the simple case fix up only the IP and BP,SP regs, for
> +        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
> +        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
> +        */
> +       regs = *iregs;
> +       regs.ip = pebs->ip;
> +       regs.bp = pebs->bp;
> +       regs.sp = pebs->sp;
> +
> +       if (event->attr.sample_type & PERF_SAMPLE_EVENT_IP) {
> +               unsigned long event_ip = pebs->ip;
> +               if (intel_pmu_pebs_fixup_ip(&event_ip))
> +                       data.event_ip = event_ip;
> +       }
> +
> +       if (perf_event_overflow(event, 1, &data, &regs))
> +               x86_pmu_stop(event);
> +}
> +
>  static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
>  {
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>        struct debug_store *ds = cpuc->ds;
>        struct perf_event *event = cpuc->events[0]; /* PMC0 only */
>        struct pebs_record_core *at, *top;
> -       struct perf_sample_data data;
> -       struct perf_raw_record raw;
> -       struct pt_regs regs;
>        int n;
>
>        if (!ds || !x86_pmu.pebs)
> @@ -486,9 +535,6 @@ static void intel_pmu_drain_pebs_core(st
>        if (n <= 0)
>                return;
>
> -       if (!intel_pmu_save_and_restart(event))
> -               return;
> -
>        /*
>         * Should not happen, we program the threshold at 1 and do not
>         * set a reset value.
> @@ -496,37 +542,7 @@ static void intel_pmu_drain_pebs_core(st
>        WARN_ON_ONCE(n > 1);
>        at += n - 1;
>
> -       perf_sample_data_init(&data, 0);
> -       data.period = event->hw.last_period;
> -
> -       if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> -               raw.size = x86_pmu.pebs_record_size;
> -               raw.data = at;
> -               data.raw = &raw;
> -       }
> -
> -       /*
> -        * We use the interrupt regs as a base because the PEBS record
> -        * does not contain a full regs set, specifically it seems to
> -        * lack segment descriptors, which get used by things like
> -        * user_mode().
> -        *
> -        * In the simple case fix up only the IP and BP,SP regs, for
> -        * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
> -        * A possible PERF_SAMPLE_REGS will have to transfer all regs.
> -        */
> -       regs = *iregs;
> -       regs.ip = at->ip;
> -       regs.bp = at->bp;
> -       regs.sp = at->sp;
> -
> -       if (intel_pmu_pebs_fixup_ip(&regs))
> -               regs.flags |= PERF_EFLAGS_EXACT;
> -       else
> -               regs.flags &= ~PERF_EFLAGS_EXACT;
> -
> -       if (perf_event_overflow(event, 1, &data, &regs))
> -               x86_pmu_stop(event);
> +       __intel_pmu_pebs_event(event, iregs, at);
>  }
>
>  static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
> @@ -534,10 +550,7 @@ static void intel_pmu_drain_pebs_nhm(str
>        struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
>        struct debug_store *ds = cpuc->ds;
>        struct pebs_record_nhm *at, *top;
> -       struct perf_sample_data data;
>        struct perf_event *event = NULL;
> -       struct perf_raw_record raw;
> -       struct pt_regs regs;
>        u64 status = 0;
>        int bit, n;
>
> @@ -579,33 +592,7 @@ static void intel_pmu_drain_pebs_nhm(str
>                if (!event || bit >= MAX_PEBS_EVENTS)
>                        continue;
>
> -               if (!intel_pmu_save_and_restart(event))
> -                       continue;
> -
> -               perf_sample_data_init(&data, 0);
> -               data.period = event->hw.last_period;
> -
> -               if (event->attr.sample_type & PERF_SAMPLE_RAW) {
> -                       raw.size = x86_pmu.pebs_record_size;
> -                       raw.data = at;
> -                       data.raw = &raw;
> -               }
> -
> -               /*
> -                * See the comment in intel_pmu_drain_pebs_core()
> -                */
> -               regs = *iregs;
> -               regs.ip = at->ip;
> -               regs.bp = at->bp;
> -               regs.sp = at->sp;
> -
> -               if (intel_pmu_pebs_fixup_ip(&regs))
> -                       regs.flags |= PERF_EFLAGS_EXACT;
> -               else
> -                       regs.flags &= ~PERF_EFLAGS_EXACT;
> -
> -               if (perf_event_overflow(event, 1, &data, &regs))
> -                       x86_pmu_stop(event);
> +               __intel_pmu_pebs_event(event, iregs, at);
>        }
>  }
>
> Index: linux-2.6/include/linux/perf_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_event.h
> +++ linux-2.6/include/linux/perf_event.h
> @@ -125,8 +125,9 @@ enum perf_event_sample_format {
>        PERF_SAMPLE_PERIOD                      = 1U << 8,
>        PERF_SAMPLE_STREAM_ID                   = 1U << 9,
>        PERF_SAMPLE_RAW                         = 1U << 10,
> +       PERF_SAMPLE_EVENT_IP                    = 1U << 11,
>
> -       PERF_SAMPLE_MAX = 1U << 11,             /* non-ABI */
> +       PERF_SAMPLE_MAX = 1U << 12,             /* non-ABI */
>  };
>
>  /*
> @@ -294,7 +295,6 @@ struct perf_event_mmap_page {
>  #define PERF_RECORD_MISC_USER                  (2 << 0)
>  #define PERF_RECORD_MISC_HYPERVISOR            (3 << 0)
>
> -#define PERF_RECORD_MISC_EXACT                 (1 << 14)
>  /*
>  * Reserve the last bit to indicate some extended misc field
>  */
> @@ -389,6 +389,7 @@ enum perf_event_type {
>         *      struct perf_event_header        header;
>         *
>         *      { u64                   ip;       } && PERF_SAMPLE_IP
> +        *      { u64                   event_ip; } && PERF_SAMPLE_EVENT_IP
>         *      { u32                   pid, tid; } && PERF_SAMPLE_TID
>         *      { u64                   time;     } && PERF_SAMPLE_TIME
>         *      { u64                   addr;     } && PERF_SAMPLE_ADDR
> @@ -804,6 +805,7 @@ struct perf_sample_data {
>        u64                             type;
>
>        u64                             ip;
> +       u64                             event_ip;
>        struct {
>                u32     pid;
>                u32     tid;
> @@ -824,6 +826,7 @@ struct perf_sample_data {
>  static inline
>  void perf_sample_data_init(struct perf_sample_data *data, u64 addr)
>  {
> +       data->event_ip = 0;
>        data->addr = addr;
>        data->raw  = NULL;
>  }
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -3158,6 +3158,9 @@ void perf_output_sample(struct perf_outp
>        if (sample_type & PERF_SAMPLE_IP)
>                perf_output_put(handle, data->ip);
>
> +       if (sample_type & PERF_SAMPLE_EVENT_IP)
> +               perf_output_put(handle, data->event_ip);
> +
>        if (sample_type & PERF_SAMPLE_TID)
>                perf_output_put(handle, data->tid_entry);
>
> @@ -3237,6 +3240,9 @@ void perf_prepare_sample(struct perf_eve
>                header->size += sizeof(data->ip);
>        }
>
> +       if (sample_type & PERF_SAMPLE_EVENT_IP)
> +               header->size += sizeof(data->event_ip);
> +
>        if (sample_type & PERF_SAMPLE_TID) {
>                /* namespace issues */
>                data->tid_entry.pid = perf_event_pid(event, current);
> Index: linux-2.6/tools/perf/builtin-top.c
> ===================================================================
> --- linux-2.6.orig/tools/perf/builtin-top.c
> +++ linux-2.6/tools/perf/builtin-top.c
> @@ -975,8 +975,10 @@ static void event__process_sample(const
>                return;
>        }
>
> -       if (self->header.misc & PERF_RECORD_MISC_EXACT)
> +       if (ip)
>                exact_samples++;
> +       else
> +               return;
>
>        if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
>            al.filtered)
> @@ -1169,6 +1171,11 @@ static void start_counter(int i, int cou
>
>        attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
>
> +       if (attr->precise) {
> +               attr->sample_type &= ~PERF_SAMPLE_IP;
> +               attr->sample_type |= PERF_SAMPLE_EVENT_IP;
> +       }
> +
>        if (freq) {
>                attr->sample_type       |= PERF_SAMPLE_PERIOD;
>                attr->freq              = 1;
>
>
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ