Message-ID: <7c86c4470908210747u125dc6d6y76a9ca7f1af41160@mail.gmail.com>
Date: Fri, 21 Aug 2009 16:47:41 +0200
From: stephane eranian <eranian@...glemail.com>
To: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Ingo Molnar <mingo@...e.hu>, Paul Mackerras <paulus@...ba.org>,
Corey J Ashford <cjashfor@...ibm.com>,
LKML <linux-kernel@...r.kernel.org>,
perfmon2-devel <perfmon2-devel@...ts.sourceforge.net>
Subject: Re: [PATCH 1/4 -v2] perf: rework the whole read vs group stuff
Hi,

Well, I tried that, and it brings more complexity than needed,
especially with regard to extracting the IDs of events when grouping.
To extract the IDs, one has to read out a struct laid out as follows:
    { u64 nr;
      { u64 time_enabled; } && PERF_FORMAT_ENABLED
      { u64 time_running; } && PERF_FORMAT_RUNNING
      { u64 value;
        { u64 id; } && PERF_FORMAT_ID
      } cntr[nr];
    } && PERF_FORMAT_GROUP
Supposedly, you should only have to do this once per group: reading
through the group leader should yield the values and IDs of all the
other events. That is not what I observe, though. All events report
the same ID as the leader, and it is not clear why.
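For illustration, here is a minimal userspace sketch of that one-time
readout. It assumes read_format = PERF_FORMAT_ID | PERF_FORMAT_GROUP
with neither time field set, so the offsets stay simple; MAX_EVENTS
and read_group_ids() are names I made up for the example, not part of
the API:

#include <stdint.h>
#include <unistd.h>

#define MAX_EVENTS 8

struct group_entry {
	uint64_t value;
	uint64_t id;
};

struct group_read {
	uint64_t nr;				/* number of events in the group */
	struct group_entry cntr[MAX_EVENTS];	/* one entry per event */
};

/* read the whole group through the leader's fd, keep only the IDs */
static int read_group_ids(int leader_fd, uint64_t *ids)
{
	struct group_read buf;
	uint64_t i;

	if (read(leader_fd, &buf, sizeof(buf)) < 0)
		return -1;

	for (i = 0; i < buf.nr && i < MAX_EVENTS; i++)
		ids[i] = buf.cntr[i].id;	/* expected: a distinct ID per event */

	return (int)buf.nr;
}

Each cntr[i].id is what a sampling tool would later use to attribute
PERF_SAMPLE_READ records back to the right event.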
As I suggested in a previous message, I don't think all of this is
necessary. If a tool were able to pass the ID to associate with an
event, there would be no need for a read(). Furthermore, it would make
it easy to pick an ID which suits the tool's data structures. For
instance, if you create 4 events in a group, the IDs could be 0, 1, 2,
3, and would most likely map directly to indices in the array the tool
uses to manage its perf_counter structures. That would also simplify
the critical path in the signal handler: no need for a lookup table to
map a "random" kernel-chosen ID to one more relevant to the tool. The
ID does not need to be very wide, either. IDs matter only with group
sampling, so an ID only needs to identify an event within its group.
One could use a reserved field in perf_counter_attr, or add an ioctl()
to assign the ID.
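To make the signal-handler cost concrete, here is a hypothetical
sketch of the indirection that tool-assigned IDs would remove; the
names id_map[] and id_to_index() are mine, not part of any API:

#include <stdint.h>

#define NR_EVENTS 4

/* kernel-chosen ID for each event slot, filled in at setup time */
static uint64_t id_map[NR_EVENTS];

/* linear lookup on every sample, in the signal handler's hot path */
static int id_to_index(uint64_t id)
{
	int i;

	for (i = 0; i < NR_EVENTS; i++)
		if (id_map[i] == id)
			return i;
	return -1;
}

With IDs assigned by the tool as 0..3, id_to_index() disappears
entirely: the ID carried in the sample is itself the index into the
tool's event array.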
On Thu, Aug 13, 2009 at 11:47 AM, Peter Zijlstra <a.p.zijlstra@...llo.nl> wrote:
> Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
> PERF_FORMAT_GROUP to deal with group reads in a more generic way.
>
> This allows you to get group reads out of read() as well.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> ---
> include/linux/perf_counter.h | 47 +++++--
> kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++------------
> 2 files changed, 238 insertions(+), 83 deletions(-)
>
> Index: linux-2.6/include/linux/perf_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_counter.h
> +++ linux-2.6/include/linux/perf_counter.h
> @@ -115,7 +115,7 @@ enum perf_counter_sample_format {
> PERF_SAMPLE_TID = 1U << 1,
> PERF_SAMPLE_TIME = 1U << 2,
> PERF_SAMPLE_ADDR = 1U << 3,
> - PERF_SAMPLE_GROUP = 1U << 4,
> + PERF_SAMPLE_READ = 1U << 4,
> PERF_SAMPLE_CALLCHAIN = 1U << 5,
> PERF_SAMPLE_ID = 1U << 6,
> PERF_SAMPLE_CPU = 1U << 7,
> @@ -127,16 +127,32 @@ enum perf_counter_sample_format {
> };
>
> /*
> - * Bits that can be set in attr.read_format to request that
> - * reads on the counter should return the indicated quantities,
> - * in increasing order of bit value, after the counter value.
> + * The format of the data returned by read() on a perf counter fd,
> + * as specified by attr.read_format:
> + *
> + * struct read_format {
> + * { u64 value;
> + * { u64 time_enabled; } && PERF_FORMAT_ENABLED
> + * { u64 time_running; } && PERF_FORMAT_RUNNING
> + * { u64 id; } && PERF_FORMAT_ID
> + * } && !PERF_FORMAT_GROUP
> + *
> + * { u64 nr;
> + * { u64 time_enabled; } && PERF_FORMAT_ENABLED
> + * { u64 time_running; } && PERF_FORMAT_RUNNING
> + * { u64 value;
> + * { u64 id; } && PERF_FORMAT_ID
> + * } cntr[nr];
> + * } && PERF_FORMAT_GROUP
> + * };
> */
> enum perf_counter_read_format {
> PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0,
> PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
> PERF_FORMAT_ID = 1U << 2,
> + PERF_FORMAT_GROUP = 1U << 3,
>
> - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */
> + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
> };
>
> #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
> @@ -343,10 +359,8 @@ enum perf_event_type {
> * struct {
> * struct perf_event_header header;
> * u32 pid, tid;
> - * u64 value;
> - * { u64 time_enabled; } && PERF_FORMAT_ENABLED
> - * { u64 time_running; } && PERF_FORMAT_RUNNING
> - * { u64 parent_id; } && PERF_FORMAT_ID
> + *
> + * struct read_format values;
> * };
> */
> PERF_EVENT_READ = 8,
> @@ -364,11 +378,22 @@ enum perf_event_type {
> * { u32 cpu, res; } && PERF_SAMPLE_CPU
> * { u64 period; } && PERF_SAMPLE_PERIOD
> *
> - * { u64 nr;
> - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP
> + * { struct read_format values; } && PERF_SAMPLE_READ
> *
> * { u64 nr,
> * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
> + *
> + * #
> + * # The RAW record below is opaque data wrt the ABI
> + * #
> + *    # That is, the ABI doesn't make any promises wrt
> + * # the stability of its content, it may vary depending
> + * # on event, hardware, kernel version and phase of
> + * # the moon.
> + * #
> + * # In other words, PERF_SAMPLE_RAW contents are not an ABI.
> + * #
> + *
> * { u32 size;
> *     char data[size]; } && PERF_SAMPLE_RAW
> * };
> Index: linux-2.6/kernel/perf_counter.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_counter.c
> +++ linux-2.6/kernel/perf_counter.c
> @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *in
> return 0;
> }
>
> -static u64 perf_counter_read_tree(struct perf_counter *counter)
> +static int perf_counter_read_size(struct perf_counter *counter)
> +{
> + int entry = sizeof(u64); /* value */
> + int size = 0;
> + int nr = 1;
> +
> + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
> + size += sizeof(u64);
> +
> + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> + size += sizeof(u64);
> +
> + if (counter->attr.read_format & PERF_FORMAT_ID)
> + entry += sizeof(u64);
> +
> + if (counter->attr.read_format & PERF_FORMAT_GROUP) {
> + nr += counter->group_leader->nr_siblings;
> + size += sizeof(u64);
> + }
> +
> + size += entry * nr;
> +
> + return size;
> +}
> +
> +static u64 perf_counter_read_value(struct perf_counter *counter)
> {
> struct perf_counter *child;
> u64 total = 0;
> @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct
> return total;
> }
>
> +static int perf_counter_read_entry(struct perf_counter *counter,
> + u64 read_format, char __user *buf)
> +{
> + int n = 0, count = 0;
> + u64 values[2];
> +
> + values[n++] = perf_counter_read_value(counter);
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(counter);
> +
> + count = n * sizeof(u64);
> +
> + if (copy_to_user(buf, values, count))
> + return -EFAULT;
> +
> + return count;
> +}
> +
> +static int perf_counter_read_group(struct perf_counter *counter,
> + u64 read_format, char __user *buf)
> +{
> + struct perf_counter *leader = counter->group_leader, *sub;
> + int n = 0, size = 0, err = -EFAULT;
> + u64 values[3];
> +
> + values[n++] = 1 + leader->nr_siblings;
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> + values[n++] = leader->total_time_enabled +
> + atomic64_read(&leader->child_total_time_enabled);
> + }
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> + values[n++] = leader->total_time_running +
> + atomic64_read(&leader->child_total_time_running);
> + }
> +
> + size = n * sizeof(u64);
> +
> + if (copy_to_user(buf, values, size))
> + return -EFAULT;
> +
> + err = perf_counter_read_entry(leader, read_format, buf + size);
> + if (err < 0)
> + return err;
> +
> + size += err;
> +
> + list_for_each_entry(sub, &leader->sibling_list, list_entry) {
> +             err = perf_counter_read_entry(sub, read_format,
> + buf + size);
> + if (err < 0)
> + return err;
> +
> + size += err;
> + }
> +
> + return size;
> +}
> +
> +static int perf_counter_read_one(struct perf_counter *counter,
> + u64 read_format, char __user *buf)
> +{
> + u64 values[4];
> + int n = 0;
> +
> + values[n++] = perf_counter_read_value(counter);
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> + values[n++] = counter->total_time_enabled +
> + atomic64_read(&counter->child_total_time_enabled);
> + }
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> + values[n++] = counter->total_time_running +
> + atomic64_read(&counter->child_total_time_running);
> + }
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(counter);
> +
> + if (copy_to_user(buf, values, n * sizeof(u64)))
> + return -EFAULT;
> +
> + return n * sizeof(u64);
> +}
> +
> /*
> * Read the performance counter - simple non blocking version for now
> */
> static ssize_t
> perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
> {
> - u64 values[4];
> - int n;
> + u64 read_format = counter->attr.read_format;
> + int ret;
>
> /*
> * Return end-of-file for a read on a counter that is in
> @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counte
> if (counter->state == PERF_COUNTER_STATE_ERROR)
> return 0;
>
> + if (count < perf_counter_read_size(counter))
> + return -ENOSPC;
> +
> WARN_ON_ONCE(counter->ctx->parent_ctx);
> mutex_lock(&counter->child_mutex);
> - values[0] = perf_counter_read_tree(counter);
> - n = 1;
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
> - values[n++] = counter->total_time_enabled +
> - atomic64_read(&counter->child_total_time_enabled);
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> - values[n++] = counter->total_time_running +
> - atomic64_read(&counter->child_total_time_running);
> - if (counter->attr.read_format & PERF_FORMAT_ID)
> - values[n++] = primary_counter_id(counter);
> + if (read_format & PERF_FORMAT_GROUP)
> + ret = perf_counter_read_group(counter, read_format, buf);
> + else
> + ret = perf_counter_read_one(counter, read_format, buf);
> mutex_unlock(&counter->child_mutex);
>
> - if (count < n * sizeof(u64))
> - return -EINVAL;
> - count = n * sizeof(u64);
> -
> - if (copy_to_user(buf, values, count))
> - return -EFAULT;
> -
> - return count;
> + return ret;
> }
>
> static ssize_t
> @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_
> return task_pid_nr_ns(p, counter->ns);
> }
>
> +static void perf_output_read_one(struct perf_output_handle *handle,
> + struct perf_counter *counter)
> +{
> + u64 read_format = counter->attr.read_format;
> + u64 values[4];
> + int n = 0;
> +
> + values[n++] = atomic64_read(&counter->count);
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> + values[n++] = counter->total_time_enabled +
> + atomic64_read(&counter->child_total_time_enabled);
> + }
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> + values[n++] = counter->total_time_running +
> + atomic64_read(&counter->child_total_time_running);
> + }
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(counter);
> +
> + perf_output_copy(handle, values, n * sizeof(u64));
> +}
> +
> +/*
> + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
> + */
> +static void perf_output_read_group(struct perf_output_handle *handle,
> + struct perf_counter *counter)
> +{
> + struct perf_counter *leader = counter->group_leader, *sub;
> + u64 read_format = counter->attr.read_format;
> + u64 values[5];
> + int n = 0;
> +
> + values[n++] = 1 + leader->nr_siblings;
> +
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
> + values[n++] = leader->total_time_enabled;
> +
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> + values[n++] = leader->total_time_running;
> +
> + if (leader != counter)
> + leader->pmu->read(leader);
> +
> + values[n++] = atomic64_read(&leader->count);
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(leader);
> +
> + perf_output_copy(handle, values, n * sizeof(u64));
> +
> + list_for_each_entry(sub, &leader->sibling_list, list_entry) {
> + n = 0;
> +
> + if (sub != counter)
> + sub->pmu->read(sub);
> +
> + values[n++] = atomic64_read(&sub->count);
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(sub);
> +
> + perf_output_copy(handle, values, n * sizeof(u64));
> + }
> +}
> +
> +static void perf_output_read(struct perf_output_handle *handle,
> + struct perf_counter *counter)
> +{
> + if (counter->attr.read_format & PERF_FORMAT_GROUP)
> + perf_output_read_group(handle, counter);
> + else
> + perf_output_read_one(handle, counter);
> +}
> +
> void perf_counter_output(struct perf_counter *counter, int nmi,
> struct perf_sample_data *data)
> {
> @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_cou
> struct {
> u32 pid, tid;
> } tid_entry;
> - struct {
> - u64 id;
> - u64 counter;
> - } group_entry;
> struct perf_callchain_entry *callchain = NULL;
> int callchain_size = 0;
> u64 time;
> @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_cou
> if (sample_type & PERF_SAMPLE_PERIOD)
> header.size += sizeof(u64);
>
> - if (sample_type & PERF_SAMPLE_GROUP) {
> - header.size += sizeof(u64) +
> - counter->nr_siblings * sizeof(group_entry);
> - }
> + if (sample_type & PERF_SAMPLE_READ)
> + header.size += perf_counter_read_size(counter);
>
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> callchain = perf_callchain(data->regs);
> @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_cou
> if (sample_type & PERF_SAMPLE_PERIOD)
> perf_output_put(&handle, data->period);
>
> - /*
> - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
> - */
> - if (sample_type & PERF_SAMPLE_GROUP) {
> - struct perf_counter *leader, *sub;
> - u64 nr = counter->nr_siblings;
> -
> - perf_output_put(&handle, nr);
> -
> - leader = counter->group_leader;
> - list_for_each_entry(sub, &leader->sibling_list, list_entry) {
> - if (sub != counter)
> - sub->pmu->read(sub);
> -
> - group_entry.id = primary_counter_id(sub);
> - group_entry.counter = atomic64_read(&sub->count);
> -
> - perf_output_put(&handle, group_entry);
> - }
> - }
> + if (sample_type & PERF_SAMPLE_READ)
> + perf_output_read(&handle, counter);
>
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> if (callchain)
> @@ -2818,8 +2964,6 @@ struct perf_read_event {
>
> u32 pid;
> u32 tid;
> - u64 value;
> - u64 format[3];
> };
>
> static void
> @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_coun
> .header = {
> .type = PERF_EVENT_READ,
> .misc = 0,
> - .size = sizeof(event) - sizeof(event.format),
> + .size = sizeof(event) + perf_counter_read_size(counter),
> },
> .pid = perf_counter_pid(counter, task),
> .tid = perf_counter_tid(counter, task),
> - .value = atomic64_read(&counter->count),
> };
> - int ret, i = 0;
> -
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> - event.header.size += sizeof(u64);
> - event.format[i++] = counter->total_time_enabled;
> - }
> -
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> - event.header.size += sizeof(u64);
> - event.format[i++] = counter->total_time_running;
> - }
> -
> - if (counter->attr.read_format & PERF_FORMAT_ID) {
> - event.header.size += sizeof(u64);
> - event.format[i++] = primary_counter_id(counter);
> - }
> + int ret;
>
> ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
> if (ret)
> return;
>
> - perf_output_copy(&handle, &event, event.header.size);
> + perf_output_put(&handle, event);
> + perf_output_read(&handle, counter);
> +
> perf_output_end(&handle);
> }
>
> @@ -3929,9 +4059,9 @@ perf_counter_alloc(struct perf_counter_a
> atomic64_set(&hwc->period_left, hwc->sample_period);
>
> /*
> - * we currently do not support PERF_SAMPLE_GROUP on inherited counters
> + * we currently do not support PERF_FORMAT_GROUP on inherited counters
> */
> - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
> + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
> goto done;
>
> switch (attr->type) {
>
> --
>
>