Message-ID: <7c86c4470908210747u125dc6d6y76a9ca7f1af41160@mail.gmail.com>
Date: Fri, 21 Aug 2009 16:47:41 +0200
From: stephane eranian <eranian@...glemail.com>
To: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Ingo Molnar <mingo@...e.hu>, Paul Mackerras <paulus@...ba.org>,
Corey J Ashford <cjashfor@...ibm.com>,
LKML <linux-kernel@...r.kernel.org>,
perfmon2-devel <perfmon2-devel@...ts.sourceforge.net>
Subject: Re: [PATCH 1/4 -v2] perf: rework the whole read vs group stuff
Hi,

Well, I tried that, and it brings more complexity than needed,
especially with regard to extracting the IDs of events when grouping.
To extract the IDs, one has to read out a struct laid out as follows:
    { u64 nr;
      { u64 time_enabled; } && PERF_FORMAT_ENABLED
      { u64 time_running; } && PERF_FORMAT_RUNNING
      { u64 value;
        { u64 id; } && PERF_FORMAT_ID
      } cntr[nr];
    } && PERF_FORMAT_GROUP
Supposedly, you should only have to do this once per group: reading
through the group leader should yield the values and IDs of all the
other events. That is not what I observe, though. All events report
the same ID as the leader, and it is not clear why.
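For illustration, here is a minimal userspace sketch of that one-time
readout. It assumes read_format = PERF_FORMAT_ID | PERF_FORMAT_GROUP
with neither time field set, so the offsets stay simple; MAX_EVENTS
and read_group_ids() are names I made up for the example, not part of
the API:

#include <stdint.h>
#include <unistd.h>

#define MAX_EVENTS 8

struct group_entry {
	uint64_t value;
	uint64_t id;
};

struct group_read {
	uint64_t nr;				/* number of events in the group */
	struct group_entry cntr[MAX_EVENTS];	/* one entry per event */
};

/* read the whole group through the leader's fd, keep only the IDs */
static int read_group_ids(int leader_fd, uint64_t *ids)
{
	struct group_read buf;
	uint64_t i;

	if (read(leader_fd, &buf, sizeof(buf)) < 0)
		return -1;

	for (i = 0; i < buf.nr && i < MAX_EVENTS; i++)
		ids[i] = buf.cntr[i].id;	/* expected: a distinct ID per event */

	return (int)buf.nr;
}

Each cntr[i].id is what a sampling tool would later use to attribute
PERF_SAMPLE_READ records back to the right event.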
As I suggested in a previous message, I don't think all of this is
necessary. If a tool were able to pass the ID to associate with an
event, there would be no need for a read(). Furthermore, it would make
it easy to pick an ID which suits the tool's data structures. For
instance, if you create 4 events in a group, the IDs could be 0, 1, 2,
3, and would most likely map directly to indices in the array the tool
uses to manage its perf_counter structures. That would also simplify
the critical path in the signal handler: no need for a lookup table to
map a "random" kernel-chosen ID to one more relevant to the tool. The
ID does not need to be very wide, either. IDs matter only with group
sampling, so an ID only needs to identify an event within its group.
One could use a reserved field in perf_counter_attr, or add an ioctl()
to assign the ID.
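To make the signal-handler cost concrete, here is a hypothetical
sketch of the indirection that tool-assigned IDs would remove; the
names id_map[] and id_to_index() are mine, not part of any API:

#include <stdint.h>

#define NR_EVENTS 4

/* kernel-chosen ID for each event slot, filled in at setup time */
static uint64_t id_map[NR_EVENTS];

/* linear lookup on every sample, in the signal handler's hot path */
static int id_to_index(uint64_t id)
{
	int i;

	for (i = 0; i < NR_EVENTS; i++)
		if (id_map[i] == id)
			return i;
	return -1;
}

With IDs assigned by the tool as 0..3, id_to_index() disappears
entirely: the ID carried in the sample is itself the index into the
tool's event array.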
On Thu, Aug 13, 2009 at 11:47 AM, Peter Zijlstra <a.p.zijlstra@...llo.nl> wrote:
> Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
> PERF_FORMAT_GROUP to deal with group reads in a more generic way.
>
> This allows you to get group reads out of read() as well.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> ---
> include/linux/perf_counter.h | 47 +++++--
> kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++------------
> 2 files changed, 238 insertions(+), 83 deletions(-)
>
> Index: linux-2.6/include/linux/perf_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_counter.h
> +++ linux-2.6/include/linux/perf_counter.h
> @@ -115,7 +115,7 @@ enum perf_counter_sample_format {
> PERF_SAMPLE_TID = 1U << 1,
> PERF_SAMPLE_TIME = 1U << 2,
> PERF_SAMPLE_ADDR = 1U << 3,
> - PERF_SAMPLE_GROUP = 1U << 4,
> + PERF_SAMPLE_READ = 1U << 4,
> PERF_SAMPLE_CALLCHAIN = 1U << 5,
> PERF_SAMPLE_ID = 1U << 6,
> PERF_SAMPLE_CPU = 1U << 7,
> @@ -127,16 +127,32 @@ enum perf_counter_sample_format {
> };
>
> /*
> - * Bits that can be set in attr.read_format to request that
> - * reads on the counter should return the indicated quantities,
> - * in increasing order of bit value, after the counter value.
> + * The format of the data returned by read() on a perf counter fd,
> + * as specified by attr.read_format:
> + *
> + * struct read_format {
> + * { u64 value;
> + * { u64 time_enabled; } && PERF_FORMAT_ENABLED
> + * { u64 time_running; } && PERF_FORMAT_RUNNING
> + * { u64 id; } && PERF_FORMAT_ID
> + * } && !PERF_FORMAT_GROUP
> + *
> + * { u64 nr;
> + * { u64 time_enabled; } && PERF_FORMAT_ENABLED
> + * { u64 time_running; } && PERF_FORMAT_RUNNING
> + * { u64 value;
> + * { u64 id; } && PERF_FORMAT_ID
> + * } cntr[nr];
> + * } && PERF_FORMAT_GROUP
> + * };
> */
> enum perf_counter_read_format {
> PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0,
> PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
> PERF_FORMAT_ID = 1U << 2,
> + PERF_FORMAT_GROUP = 1U << 3,
>
> - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */
> + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
> };
>
> #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
> @@ -343,10 +359,8 @@ enum perf_event_type {
> * struct {
> * struct perf_event_header header;
> * u32 pid, tid;
> - * u64 value;
> - * { u64 time_enabled; } && PERF_FORMAT_ENABLED
> - * { u64 time_running; } && PERF_FORMAT_RUNNING
> - * { u64 parent_id; } && PERF_FORMAT_ID
> + *
> + * struct read_format values;
> * };
> */
> PERF_EVENT_READ = 8,
> @@ -364,11 +378,22 @@ enum perf_event_type {
> * { u32 cpu, res; } && PERF_SAMPLE_CPU
> * { u64 period; } && PERF_SAMPLE_PERIOD
> *
> - * { u64 nr;
> - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP
> + * { struct read_format values; } && PERF_SAMPLE_READ
> *
> * { u64 nr,
> * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
> + *
> + * #
> + * # The RAW record below is opaque data wrt the ABI
> + * #
> + *    # That is, the ABI doesn't make any promises wrt
> + * # the stability of its content, it may vary depending
> + * # on event, hardware, kernel version and phase of
> + * # the moon.
> + * #
> + * # In other words, PERF_SAMPLE_RAW contents are not an ABI.
> + * #
> + *
> * { u32 size;
> *     char data[size]; } && PERF_SAMPLE_RAW
> * };
> Index: linux-2.6/kernel/perf_counter.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_counter.c
> +++ linux-2.6/kernel/perf_counter.c
> @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *in
> return 0;
> }
>
> -static u64 perf_counter_read_tree(struct perf_counter *counter)
> +static int perf_counter_read_size(struct perf_counter *counter)
> +{
> + int entry = sizeof(u64); /* value */
> + int size = 0;
> + int nr = 1;
> +
> + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
> + size += sizeof(u64);
> +
> + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> + size += sizeof(u64);
> +
> + if (counter->attr.read_format & PERF_FORMAT_ID)
> + entry += sizeof(u64);
> +
> + if (counter->attr.read_format & PERF_FORMAT_GROUP) {
> + nr += counter->group_leader->nr_siblings;
> + size += sizeof(u64);
> + }
> +
> + size += entry * nr;
> +
> + return size;
> +}
> +
> +static u64 perf_counter_read_value(struct perf_counter *counter)
> {
> struct perf_counter *child;
> u64 total = 0;
> @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct
> return total;
> }
>
> +static int perf_counter_read_entry(struct perf_counter *counter,
> + u64 read_format, char __user *buf)
> +{
> + int n = 0, count = 0;
> + u64 values[2];
> +
> + values[n++] = perf_counter_read_value(counter);
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(counter);
> +
> + count = n * sizeof(u64);
> +
> + if (copy_to_user(buf, values, count))
> + return -EFAULT;
> +
> + return count;
> +}
> +
> +static int perf_counter_read_group(struct perf_counter *counter,
> + u64 read_format, char __user *buf)
> +{
> + struct perf_counter *leader = counter->group_leader, *sub;
> + int n = 0, size = 0, err = -EFAULT;
> + u64 values[3];
> +
> + values[n++] = 1 + leader->nr_siblings;
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> + values[n++] = leader->total_time_enabled +
> + atomic64_read(&leader->child_total_time_enabled);
> + }
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> + values[n++] = leader->total_time_running +
> + atomic64_read(&leader->child_total_time_running);
> + }
> +
> + size = n * sizeof(u64);
> +
> + if (copy_to_user(buf, values, size))
> + return -EFAULT;
> +
> + err = perf_counter_read_entry(leader, read_format, buf + size);
> + if (err < 0)
> + return err;
> +
> + size += err;
> +
> + list_for_each_entry(sub, &leader->sibling_list, list_entry) {
> +             err = perf_counter_read_entry(sub, read_format,
> + buf + size);
> + if (err < 0)
> + return err;
> +
> + size += err;
> + }
> +
> + return size;
> +}
> +
> +static int perf_counter_read_one(struct perf_counter *counter,
> + u64 read_format, char __user *buf)
> +{
> + u64 values[4];
> + int n = 0;
> +
> + values[n++] = perf_counter_read_value(counter);
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> + values[n++] = counter->total_time_enabled +
> + atomic64_read(&counter->child_total_time_enabled);
> + }
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> + values[n++] = counter->total_time_running +
> + atomic64_read(&counter->child_total_time_running);
> + }
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(counter);
> +
> + if (copy_to_user(buf, values, n * sizeof(u64)))
> + return -EFAULT;
> +
> + return n * sizeof(u64);
> +}
> +
> /*
> * Read the performance counter - simple non blocking version for now
> */
> static ssize_t
> perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
> {
> - u64 values[4];
> - int n;
> + u64 read_format = counter->attr.read_format;
> + int ret;
>
> /*
> * Return end-of-file for a read on a counter that is in
> @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counte
> if (counter->state == PERF_COUNTER_STATE_ERROR)
> return 0;
>
> + if (count < perf_counter_read_size(counter))
> + return -ENOSPC;
> +
> WARN_ON_ONCE(counter->ctx->parent_ctx);
> mutex_lock(&counter->child_mutex);
> - values[0] = perf_counter_read_tree(counter);
> - n = 1;
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
> - values[n++] = counter->total_time_enabled +
> - atomic64_read(&counter->child_total_time_enabled);
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> - values[n++] = counter->total_time_running +
> - atomic64_read(&counter->child_total_time_running);
> - if (counter->attr.read_format & PERF_FORMAT_ID)
> - values[n++] = primary_counter_id(counter);
> + if (read_format & PERF_FORMAT_GROUP)
> + ret = perf_counter_read_group(counter, read_format, buf);
> + else
> + ret = perf_counter_read_one(counter, read_format, buf);
> mutex_unlock(&counter->child_mutex);
>
> - if (count < n * sizeof(u64))
> - return -EINVAL;
> - count = n * sizeof(u64);
> -
> - if (copy_to_user(buf, values, count))
> - return -EFAULT;
> -
> - return count;
> + return ret;
> }
>
> static ssize_t
> @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_
> return task_pid_nr_ns(p, counter->ns);
> }
>
> +static void perf_output_read_one(struct perf_output_handle *handle,
> + struct perf_counter *counter)
> +{
> + u64 read_format = counter->attr.read_format;
> + u64 values[4];
> + int n = 0;
> +
> + values[n++] = atomic64_read(&counter->count);
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> + values[n++] = counter->total_time_enabled +
> + atomic64_read(&counter->child_total_time_enabled);
> + }
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> + values[n++] = counter->total_time_running +
> + atomic64_read(&counter->child_total_time_running);
> + }
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(counter);
> +
> + perf_output_copy(handle, values, n * sizeof(u64));
> +}
> +
> +/*
> + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
> + */
> +static void perf_output_read_group(struct perf_output_handle *handle,
> + struct perf_counter *counter)
> +{
> + struct perf_counter *leader = counter->group_leader, *sub;
> + u64 read_format = counter->attr.read_format;
> + u64 values[5];
> + int n = 0;
> +
> + values[n++] = 1 + leader->nr_siblings;
> +
> + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
> + values[n++] = leader->total_time_enabled;
> +
> + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
> + values[n++] = leader->total_time_running;
> +
> + if (leader != counter)
> + leader->pmu->read(leader);
> +
> + values[n++] = atomic64_read(&leader->count);
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(leader);
> +
> + perf_output_copy(handle, values, n * sizeof(u64));
> +
> + list_for_each_entry(sub, &leader->sibling_list, list_entry) {
> + n = 0;
> +
> + if (sub != counter)
> + sub->pmu->read(sub);
> +
> + values[n++] = atomic64_read(&sub->count);
> + if (read_format & PERF_FORMAT_ID)
> + values[n++] = primary_counter_id(sub);
> +
> + perf_output_copy(handle, values, n * sizeof(u64));
> + }
> +}
> +
> +static void perf_output_read(struct perf_output_handle *handle,
> + struct perf_counter *counter)
> +{
> + if (counter->attr.read_format & PERF_FORMAT_GROUP)
> + perf_output_read_group(handle, counter);
> + else
> + perf_output_read_one(handle, counter);
> +}
> +
> void perf_counter_output(struct perf_counter *counter, int nmi,
> struct perf_sample_data *data)
> {
> @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_cou
> struct {
> u32 pid, tid;
> } tid_entry;
> - struct {
> - u64 id;
> - u64 counter;
> - } group_entry;
> struct perf_callchain_entry *callchain = NULL;
> int callchain_size = 0;
> u64 time;
> @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_cou
> if (sample_type & PERF_SAMPLE_PERIOD)
> header.size += sizeof(u64);
>
> - if (sample_type & PERF_SAMPLE_GROUP) {
> - header.size += sizeof(u64) +
> - counter->nr_siblings * sizeof(group_entry);
> - }
> + if (sample_type & PERF_SAMPLE_READ)
> + header.size += perf_counter_read_size(counter);
>
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> callchain = perf_callchain(data->regs);
> @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_cou
> if (sample_type & PERF_SAMPLE_PERIOD)
> perf_output_put(&handle, data->period);
>
> - /*
> - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
> - */
> - if (sample_type & PERF_SAMPLE_GROUP) {
> - struct perf_counter *leader, *sub;
> - u64 nr = counter->nr_siblings;
> -
> - perf_output_put(&handle, nr);
> -
> - leader = counter->group_leader;
> - list_for_each_entry(sub, &leader->sibling_list, list_entry) {
> - if (sub != counter)
> - sub->pmu->read(sub);
> -
> - group_entry.id = primary_counter_id(sub);
> - group_entry.counter = atomic64_read(&sub->count);
> -
> - perf_output_put(&handle, group_entry);
> - }
> - }
> + if (sample_type & PERF_SAMPLE_READ)
> + perf_output_read(&handle, counter);
>
> if (sample_type & PERF_SAMPLE_CALLCHAIN) {
> if (callchain)
> @@ -2818,8 +2964,6 @@ struct perf_read_event {
>
> u32 pid;
> u32 tid;
> - u64 value;
> - u64 format[3];
> };
>
> static void
> @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_coun
> .header = {
> .type = PERF_EVENT_READ,
> .misc = 0,
> - .size = sizeof(event) - sizeof(event.format),
> + .size = sizeof(event) + perf_counter_read_size(counter),
> },
> .pid = perf_counter_pid(counter, task),
> .tid = perf_counter_tid(counter, task),
> - .value = atomic64_read(&counter->count),
> };
> - int ret, i = 0;
> -
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
> - event.header.size += sizeof(u64);
> - event.format[i++] = counter->total_time_enabled;
> - }
> -
> - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
> - event.header.size += sizeof(u64);
> - event.format[i++] = counter->total_time_running;
> - }
> -
> - if (counter->attr.read_format & PERF_FORMAT_ID) {
> - event.header.size += sizeof(u64);
> - event.format[i++] = primary_counter_id(counter);
> - }
> + int ret;
>
> ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
> if (ret)
> return;
>
> - perf_output_copy(&handle, &event, event.header.size);
> + perf_output_put(&handle, event);
> + perf_output_read(&handle, counter);
> +
> perf_output_end(&handle);
> }
>
> @@ -3929,9 +4059,9 @@ perf_counter_alloc(struct perf_counter_a
> atomic64_set(&hwc->period_left, hwc->sample_period);
>
> /*
> - * we currently do not support PERF_SAMPLE_GROUP on inherited counters
> + * we currently do not support PERF_FORMAT_GROUP on inherited counters
> */
> - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
> + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
> goto done;
>
> switch (attr->type) {
>
> --
>
>