[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <AANLkTintfpyvx=Pj1+uu8kCkFn3zT6b36h27Hb7pm=ZF@mail.gmail.com>
Date: Wed, 22 Dec 2010 11:08:02 +0100
From: Stephane Eranian <eranian@...gle.com>
To: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: Lin Ming <ming.m.lin@...el.com>, Ingo Molnar <mingo@...e.hu>,
Andi Kleen <andi@...stfloor.org>,
Frederic Weisbecker <fweisbec@...il.com>,
Arjan van de Ven <arjan@...radead.org>,
lkml <linux-kernel@...r.kernel.org>, paulus <paulus@...ba.org>
Subject: Re: [RFC PATCH] perf: Add load latency monitoring on Intel Nehalem/Westmere
Hi,
On Wed, Dec 22, 2010 at 10:00 AM, Peter Zijlstra <a.p.zijlstra@...llo.nl> wrote:
> On Wed, 2010-12-22 at 16:12 +0800, Lin Ming wrote:
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>> index ed6ff11..2a02529 100644
>> --- a/arch/x86/kernel/cpu/perf_event.c
>> +++ b/arch/x86/kernel/cpu/perf_event.c
>> @@ -197,18 +197,25 @@ struct extra_reg {
>> unsigned int extra_shift;
>> u64 config_mask;
>> u64 valid_mask;
>> + u64 flags;
>> };
>>
>> -#define EVENT_EXTRA_REG(e, ms, m, vm, es) { \
>> +#define EVENT_EXTRA_REG(e, ms, m, vm, es, f) { \
>> .event = (e), \
>> .msr = (ms), \
>> .config_mask = (m), \
>> .valid_mask = (vm), \
>> .extra_shift = (es), \
>> + .flags = (f), \
>> }
>> #define INTEL_EVENT_EXTRA_REG(event, msr, vm, es) \
>> - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, es)
>> -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, 0)
>> + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, es, 0)
>> +#define INTEL_EVENT_EXTRA_REG2(event, msr, vm, es, f) \
>> + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
>> + ARCH_PERFMON_EVENTSEL_UMASK, vm, es, f)
>> +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, 0, 0)
>
> You'll need to increment MAX_EXTRA_REGS to 3 I think.
>
>> +#define EXTRA_REG_LD_LAT 0x1
>
> I'm not quite sure we actually need the whole flags business.
>
>> union perf_capabilities {
>> struct {
>> @@ -384,6 +391,11 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
>> if (extra & ~er->valid_mask)
>> return -EINVAL;
>> event->hw.extra_config = extra;
>> + event->hw.extra_flags = er->flags;
>> +
>> + /* The minimum value that may be programmed into MSR_PEBS_LD_LAT is 3 */
>> + if ((er->flags & EXTRA_REG_LD_LAT) && extra < 3)
>> + event->hw.extra_config = 3;
>
> if (er->msr == MSR_PEBS_LD_LAT_THRESHOLD && extra < 3)
> event->hw.extra_config = 3;
>
>> break;
>> }
>> return 0;
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
>> index bc4afb1..7e2b873 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
>> @@ -89,6 +89,8 @@ static struct event_constraint intel_nehalem_event_constraints[] =
>> static struct extra_reg intel_nehalem_extra_regs[] =
>> {
>> INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff, 32), /* OFFCORE_RESPONSE */
>> + /* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
>> + INTEL_EVENT_EXTRA_REG2(0x100b, 0x3f6, 0xffff, 32, EXTRA_REG_LD_LAT),
>> EVENT_EXTRA_END
>> };
>
> Maybe use the MSR names instead of the numbers.
>
>
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
>> index b7dcd9f..d008c40 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
>> @@ -376,6 +376,7 @@ static struct event_constraint intel_core_pebs_events[] = {
>> };
>>
>> static struct event_constraint intel_nehalem_pebs_events[] = {
>> + PEBS_EVENT_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD */
>> PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
>> PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
>> PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
>> @@ -414,6 +415,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
>> hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
>>
>> cpuc->pebs_enabled |= 1ULL << hwc->idx;
>> + if (hwc->extra_flags & EXTRA_REG_LD_LAT)
>> + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
>
> if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
> cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
>
>> WARN_ON_ONCE(cpuc->enabled);
>>
>> if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
>> @@ -426,6 +429,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
>> struct hw_perf_event *hwc = &event->hw;
>>
>> cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
>> + if (hwc->extra_flags & EXTRA_REG_LD_LAT)
>> + cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
>
> if (hwc->extra_reg == MSR_PEBS_LD_LAT_THRESHOLD)
> cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
>
>> if (cpuc->enabled)
>> wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index d24d9ab..38bffa4 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -541,6 +541,7 @@ struct hw_perf_event {
>> int last_cpu;
>> unsigned int extra_reg;
>> u64 extra_config;
>> + u64 extra_flags;
>> };
>> struct { /* software */
>> struct hrtimer hrtimer;
>>
>
> Which then also obviates the need for this extra field.
>
> You also need some extra goo in intel_pmu_drain_pebs_nhm(), we can
> already use the PERF_SAMPLE_ADDR for the linear data address provided by
> the pebs-ll thing, and we might need to add:
>
> PERF_SAMPLE_LATENCY -- Stephane said other archs can also use this
>
Extracting the instruction address alone is not so useful. You need the
instruction and data addresses, the latency, and the data source. As Peter
pointed out, it is true that you can use PERF_SAMPLE_ADDR for the data address.
We would also need a PERF_SAMPLE_DATA_SRC to extract
the data source information. Other archs also have that.
Note that PEBS-Load latency needs the IP+1 correction. It points to the
instruction address after the load/lfetch. But I suspect your patch already
takes care of that.
> Not quite sure what to do for the source bits, POWER also has some extra
> bits, but I'm not sure they qualify as purely source bits. And
> interpreting them is going to be inherently arch specific, which
> sucks :/
>
>
Yes, I think there is more to it than just data source, unfortunately.
If you want to avoid returning an opaque u64 (PERF_SAMPLE_EXTRA), then
you need to break it down: PERF_SAMPLE_DATA_SRC, PERF_SAMPLE_XX
and so on.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists