Message-ID: <32f2d870-9b9c-0b10-880f-cc99cf969bf6@arm.com>
Date: Fri, 18 May 2018 14:49:55 +0100
From: Robin Murphy <robin.murphy@....com>
To: Suzuki K Poulose <suzuki.poulose@....com>,
linux-arm-kernel@...ts.infradead.org
Cc: linux-kernel@...r.kernel.org, mark.rutland@....com,
will.deacon@....com
Subject: Re: [PATCH 6/6] arm64: perf: Add support for chaining counters
On 18/05/18 11:22, Suzuki K Poulose wrote:
> Add support for chained event counters. PMUv3 allows chaining
> a pair of adjacent PMU counters (with the lower counter number
> always being even). The low counter is programmed to count
> the event of interest and the high (odd-numbered) counter is
> programmed with a special event code (0x1e - Chain). Thus
> we need a special allocation scheme to make full use of the
> available counters: chained counters are allocated in pairs
> from the lower end, while normal counters are allocated from
> the higher end. Also make the necessary changes to handle a
> chained event as a single event backed by two counters.
>
> Cc: Mark Rutland <mark.rutland@....com>
> Cc: Will Deacon <will.deacon@....com>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@....com>
> ---
> arch/arm64/kernel/perf_event.c | 226 ++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 202 insertions(+), 24 deletions(-)
>
> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> index ea8e060..5f81cd0 100644
> --- a/arch/arm64/kernel/perf_event.c
> +++ b/arch/arm64/kernel/perf_event.c
> @@ -446,9 +446,11 @@ static struct attribute_group armv8_pmuv3_events_attr_group = {
> };
>
> PMU_FORMAT_ATTR(event, "config:0-15");
> +PMU_FORMAT_ATTR(chain, "config1:0");
>
> static struct attribute *armv8_pmuv3_format_attrs[] = {
> &format_attr_event.attr,
> + &format_attr_chain.attr,
> NULL,
> };
>
> @@ -457,6 +459,12 @@ static struct attribute_group armv8_pmuv3_format_attr_group = {
> .attrs = armv8_pmuv3_format_attrs,
> };
>
> +static bool armv8pmu_event_is_chained(struct perf_event *event)
> +{
> + return event->attr.config1 & 0x1;
> +}
> +
> +
> /*
> * Perf Events' indices
> */
> @@ -512,6 +520,36 @@ static inline int armv8pmu_select_counter(int idx)
> return idx;
> }
>
> +static inline u32 armv8pmu_read_evcntr(int idx)
> +{
> + return (armv8pmu_select_counter(idx) == idx) ?
> + read_sysreg(pmxevcntr_el0) : 0;
> +}
> +
> +static inline u64 armv8pmu_read_chain_counter(int idx)
> +{
> + u64 prev_hi, hi, lo;
> +
> + do {
> + prev_hi = armv8pmu_read_evcntr(idx);
> + isb();
> + lo = armv8pmu_read_evcntr(idx - 1);
> + isb();
> + hi = armv8pmu_read_evcntr(idx);
> + isb();
> + } while (prev_hi != hi);
Is it worth trying to elide that last isb() in the highly likely case
that we don't need it?
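Totally untested sketch, but maybe just rotating the loop would do it,
such that we only pay for the extra synchronisation when we actually
race with the low half overflowing:

	hi = armv8pmu_read_evcntr(idx);
	do {
		prev_hi = hi;
		isb();
		lo = armv8pmu_read_evcntr(idx - 1);
		isb();
		hi = armv8pmu_read_evcntr(idx);
	} while (prev_hi != hi);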
> +
> + return (hi << 32) | lo;
> +}
> +
> +static inline u64 armv8pmu_read_hw_counter(struct perf_event *event)
> +{
> + int idx = event->hw.idx;
> +
> + return armv8pmu_event_is_chained(event) ?
> + armv8pmu_read_chain_counter(idx) : armv8pmu_read_evcntr(idx);
> +}
> +
> static inline u64 armv8pmu_read_counter(struct perf_event *event)
> {
> struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
> @@ -524,12 +562,37 @@ static inline u64 armv8pmu_read_counter(struct perf_event *event)
> smp_processor_id(), idx);
> else if (idx == ARMV8_IDX_CYCLE_COUNTER)
> value = read_sysreg(pmccntr_el0);
> - else if (armv8pmu_select_counter(idx) == idx)
> - value = read_sysreg(pmxevcntr_el0);
> + else
> + value = armv8pmu_read_hw_counter(event);
>
> return value;
> }
>
> +static inline void armv8pmu_write_evcntr(int idx, u32 value)
> +{
> + if (armv8pmu_select_counter(idx) == idx)
> + write_sysreg(value, pmxevcntr_el0);
> +}
> +
> +static inline void armv8pmu_write_chain_counter(int idx, u64 value)
> +{
> + armv8pmu_write_evcntr(idx, value >> 32);
> + isb();
> + armv8pmu_write_evcntr(idx - 1, value);
> + isb();
Either that isb() is unnecessary, or we are (and have been) missing one
after a non-chained write.
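If it's the latter, then maybe the isb() wants to live in
armv8pmu_write_evcntr() itself - e.g. (again untested):

	static inline void armv8pmu_write_evcntr(int idx, u32 value)
	{
		if (armv8pmu_select_counter(idx) == idx) {
			write_sysreg(value, pmxevcntr_el0);
			isb();
		}
	}

at which point the chained case wouldn't need any explicit barriers of
its own.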
> +}
> +
> +static inline void armv8pmu_write_hw_counter(struct perf_event *event,
> + u64 value)
> +{
> + int idx = event->hw.idx;
> +
> + if (armv8pmu_event_is_chained(event))
> + armv8pmu_write_chain_counter(idx, value);
> + else
> + armv8pmu_write_evcntr(idx, value);
> +}
> +
> static inline void armv8pmu_write_counter(struct perf_event *event, u64 value)
> {
> struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
> @@ -541,8 +604,8 @@ static inline void armv8pmu_write_counter(struct perf_event *event, u64 value)
> smp_processor_id(), idx);
> else if (idx == ARMV8_IDX_CYCLE_COUNTER)
> write_sysreg(value, pmccntr_el0);
> - else if (armv8pmu_select_counter(idx) == idx)
> - write_sysreg(value, pmxevcntr_el0);
> + else
> + armv8pmu_write_hw_counter(event, value);
> }
>
> static inline void armv8pmu_write_evtype(int idx, u32 val)
> @@ -553,6 +616,28 @@ static inline void armv8pmu_write_evtype(int idx, u32 val)
> }
> }
>
> +static inline void armv8pmu_write_event_type(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> + int idx = hwc->idx;
> +
> + /*
> + * For chained events, write the high counter event type
> + * followed by the low counter.
> + */
> + if (armv8pmu_event_is_chained(event)) {
> + u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN;
> +
> + /* Set the filters as that of the main event for chain */
> + chain_evt |= hwc->config_base & ~ARMV8_PMU_EVTYPE_EVENT;
> + armv8pmu_write_evtype(idx, chain_evt);
> + isb();
> + idx--;
> + }
> +
> + armv8pmu_write_evtype(idx, hwc->config_base);
> +}
> +
> static inline int armv8pmu_enable_counter(int idx)
> {
> u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> @@ -560,6 +645,23 @@ static inline int armv8pmu_enable_counter(int idx)
> return idx;
> }
>
> +static inline void armv8pmu_enable_event_counter(struct perf_event *event)
> +{
> + int idx = event->hw.idx;
> +
> + /*
> + * For chained events, we enable the high counter followed by
> + * the low counter.
> + */
> + armv8pmu_enable_counter(idx);
> +
> + if (armv8pmu_event_is_chained(event)) {
> + isb();
> + armv8pmu_enable_counter(idx - 1);
> + }
> +
> +}
> +
> static inline int armv8pmu_disable_counter(int idx)
> {
> u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> @@ -567,6 +669,24 @@ static inline int armv8pmu_disable_counter(int idx)
> return idx;
> }
>
> +static inline void armv8pmu_disable_event_counter(struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
Nit: might as well drop this and be consistent with the enable case.
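i.e. just:

	int idx = event->hw.idx;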
> + int idx = hwc->idx;
> +
> + /*
> + * Disable the low counter followed by the high counter
> + * for chained events.
> + */
> + if (armv8pmu_event_is_chained(event)) {
> + armv8pmu_disable_counter(idx - 1);
> + isb();
> + }
> +
> + armv8pmu_disable_counter(idx);
> +}
> +
> +
> static inline int armv8pmu_enable_intens(int idx)
> {
> u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> @@ -574,6 +694,12 @@ static inline int armv8pmu_enable_intens(int idx)
> return idx;
> }
>
> +static inline int armv8pmu_enable_event_irq(struct perf_event *event)
> +{
> + /* For chained events, enable the interrupt for only the high counter */
> + return armv8pmu_enable_intens(event->hw.idx);
> +}
> +
> static inline int armv8pmu_disable_intens(int idx)
> {
> u32 counter = ARMV8_IDX_TO_COUNTER(idx);
> @@ -586,6 +712,11 @@ static inline int armv8pmu_disable_intens(int idx)
> return idx;
> }
>
> +static inline int armv8pmu_disable_event_irq(struct perf_event *event)
> +{
> + return armv8pmu_disable_intens(event->hw.idx);
> +}
> +
> static inline u32 armv8pmu_getreset_flags(void)
> {
> u32 value;
> @@ -603,10 +734,8 @@ static inline u32 armv8pmu_getreset_flags(void)
> static void armv8pmu_enable_event(struct perf_event *event)
> {
> unsigned long flags;
> - struct hw_perf_event *hwc = &event->hw;
> struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
> struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events);
> - int idx = hwc->idx;
>
> /*
> * Enable counter and interrupt, and set the counter to count
> @@ -617,22 +746,22 @@ static void armv8pmu_enable_event(struct perf_event *event)
> /*
> * Disable counter
> */
> - armv8pmu_disable_counter(idx);
> + armv8pmu_disable_event_counter(event);
>
> /*
> * Set event (if destined for PMNx counters).
> */
> - armv8pmu_write_evtype(idx, hwc->config_base);
> + armv8pmu_write_event_type(event);
>
> /*
> * Enable interrupt for this counter
> */
> - armv8pmu_enable_intens(idx);
> + armv8pmu_enable_event_irq(event);
>
> /*
> * Enable counter
> */
> - armv8pmu_enable_counter(idx);
> + armv8pmu_enable_event_counter(event);
>
> raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
> }
> @@ -640,10 +769,8 @@ static void armv8pmu_enable_event(struct perf_event *event)
> static void armv8pmu_disable_event(struct perf_event *event)
> {
> unsigned long flags;
> - struct hw_perf_event *hwc = &event->hw;
> struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
> struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events);
> - int idx = hwc->idx;
>
> /*
> * Disable counter and interrupt
> @@ -651,14 +778,14 @@ static void armv8pmu_disable_event(struct perf_event *event)
> raw_spin_lock_irqsave(&events->pmu_lock, flags);
>
> /*
> - * Disable counter
> + * Disable counter for this event
> */
> - armv8pmu_disable_counter(idx);
> + armv8pmu_disable_event_counter(event);
>
> /*
> - * Disable interrupt for this counter
> + * Disable interrupt for this event counter
> */
> - armv8pmu_disable_intens(idx);
> + armv8pmu_disable_event_irq(event);
>
> raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
> }
> @@ -747,6 +874,39 @@ static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
> raw_spin_unlock_irqrestore(&events->pmu_lock, flags);
> }
>
> +static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
> + struct arm_pmu *cpu_pmu)
> +{
> + int idx;
> +
> + for (idx = cpu_pmu->num_events - 1; idx >= ARMV8_IDX_COUNTER0; --idx)
> + if (!test_and_set_bit(idx, cpuc->used_mask))
> + return idx;
> + return -EAGAIN;
> +}
> +
> +static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
> + struct arm_pmu *cpu_pmu)
> +{
> + int idx;
> +
> + /*
> + * Chaining requires two consecutive event counters, where
> + * the lower idx must be even. We allocate chain events
> + * from the lower index (i.e., counter0) and the single events
> + * from the higher end to maximise the utilisation.
> + */
> + for (idx = ARMV8_IDX_COUNTER0 + 1; idx < cpu_pmu->num_events; idx += 2)
> + if (!test_and_set_bit(idx, cpuc->used_mask)) {
> + /* Check if the preceding even counter is available */
> + if (!test_and_set_bit(idx - 1, cpuc->used_mask))
> + return idx;
> + /* Release the odd counter */
> + clear_bit(idx, cpuc->used_mask);
> + }
> + return -EAGAIN;
> +}
> +
> static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
> struct perf_event *event)
> {
> @@ -755,7 +915,10 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
> struct hw_perf_event *hwc = &event->hw;
> unsigned long evtype = hwc->config_base & ARMV8_PMU_EVTYPE_EVENT;
>
> - /* Always prefer to place a cycle counter into the cycle counter. */
> + /*
> + * Always prefer to place a cycle counter into the cycle counter
> + * irrespective of whether we are counting 32bit/64bit
I don't think that comment change adds much :/
> + */
> if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
> if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask))
> return ARMV8_IDX_CYCLE_COUNTER;
> @@ -764,13 +927,21 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
> /*
> * Otherwise use events counters
> */
> - for (idx = ARMV8_IDX_COUNTER0; idx < cpu_pmu->num_events; ++idx) {
> - if (!test_and_set_bit(idx, cpuc->used_mask))
> - return idx;
> - }
> + idx = armv8pmu_event_is_chained(event) ?
> + armv8pmu_get_chain_idx(cpuc, cpu_pmu) :
> + armv8pmu_get_single_idx(cpuc, cpu_pmu);
>
> - /* The counters are all in use. */
> - return -EAGAIN;
> + return idx;
> +}
> +
> +static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
> + struct perf_event *event)
> +{
> + struct hw_perf_event *hwc = &event->hw;
> +
> + clear_bit(hwc->idx, cpuc->used_mask);
> + if (armv8pmu_event_is_chained(event))
> + clear_bit(hwc->idx - 1, cpuc->used_mask);
> }
>
> /*
> @@ -845,8 +1016,14 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
> &armv8_pmuv3_perf_cache_map,
> ARMV8_PMU_EVTYPE_EVENT);
>
> - if (hw_event_id == ARMV8_PMUV3_PERFCTR_CPU_CYCLES)
> + if (hw_event_id == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
> + /* Prevent chaining for cycle counter */
Why? Sure, we want to avoid executing the chaining logic if we're
scheduling a cycles event in the dedicated counter (which is perhaps
what the comment above wanted to say), but if one ends up allocated into
a regular counter (e.g. if the user asks for multiple cycle counts with
different filters), then I don't see any reason to forbid that being
chained.
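i.e. I'd have thought something like this (untested) would suffice:

	if (hw_event_id == ARMV8_PMUV3_PERFCTR_CPU_CYCLES ||
	    armv8pmu_event_is_chained(event))
		event->hw.flags |= ARMPMU_EVT_LONG;

provided the counter-management code then keys the chaining logic off
the allocated index, so that a "chained" event which does land in the
cycle counter is simply treated as 64-bit.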
Robin.
> + if (armv8pmu_event_is_chained(event))
> + return -EINVAL;
> event->hw.flags |= ARMPMU_EVT_LONG;
> + } else if (armv8pmu_event_is_chained(event)) {
> + event->hw.flags |= ARMPMU_EVT_LONG;
> + }
>
> /* Only expose micro/arch events supported by this PMU */
> if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
> @@ -954,6 +1131,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu)
> cpu_pmu->read_counter = armv8pmu_read_counter,
> cpu_pmu->write_counter = armv8pmu_write_counter,
> cpu_pmu->get_event_idx = armv8pmu_get_event_idx,
> + cpu_pmu->clear_event_idx = armv8pmu_clear_event_idx,
> cpu_pmu->start = armv8pmu_start,
> cpu_pmu->stop = armv8pmu_stop,
> cpu_pmu->reset = armv8pmu_reset,
>