Message-ID: <20241114141918.GA39245@noisy.programming.kicks-ass.net>
Date: Thu, 14 Nov 2024 15:19:18 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: kan.liang@...ux.intel.com
Cc: mingo@...hat.com, linux-kernel@...r.kernel.org, acme@...nel.org,
namhyung@...nel.org, irogers@...gle.com, eranian@...gle.com,
ak@...ux.intel.com, Dapeng Mi <dapeng1.mi@...ux.intel.com>
Subject: Re: [PATCH 2/2] perf/x86/intel/ds: Simplify the PEBS records
processing for adaptive PEBS
On Wed, Nov 13, 2024 at 07:14:27AM -0800, kan.liang@...ux.intel.com wrote:
> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 4d0f7c49295a..cbf2ab9ed4c8 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -2400,12 +2400,38 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
> }
> }
>
> +static inline void __intel_pmu_pebs_event_output(struct perf_event *event,
> + struct pt_regs *iregs,
> + void *record, bool last,
> + struct perf_sample_data *data)
> +{
> + struct x86_perf_regs perf_regs;
> + struct pt_regs *regs = &perf_regs.regs;
> + static struct pt_regs dummy_iregs;
> +
> + if (!iregs)
> + iregs = &dummy_iregs;
> +
> + setup_pebs_adaptive_sample_data(event, iregs, record, data, regs);
> + if (last) {
> + /*
> + * All but the last records are processed.
> + * The last one is left to be able to call the overflow handler.
> + */
> + if (perf_event_overflow(event, data, regs))
> + x86_pmu_stop(event, 0);
> + } else
> + perf_event_output(event, data, regs);
> +}
*sigh*... more unification please.
> static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
> {
> short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
> + void *unprocessed[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
> struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
> struct debug_store *ds = cpuc->ds;
> struct perf_event *event;
> + struct pebs_basic *basic;
> void *base, *at, *top;
> int bit;
> u64 mask;
> @@ -2426,30 +2452,63 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
> return;
> }
>
> - for (at = base; at < top; at += cpuc->pebs_record_size) {
> + for (at = base; at < top; at += basic->format_size) {
> u64 pebs_status;
>
> - pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
> - pebs_status &= mask;
> + basic = at;
> + if (WARN_ON_ONCE(basic->format_size != cpuc->pebs_record_size))
> + continue;
> +
> + pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
> + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
> + event = cpuc->events[bit];
> +
> + if (WARN_ON_ONCE(!event) ||
> + WARN_ON_ONCE(!event->attr.precise_ip))
> + continue;
> +
> + /*
> + * Need at least one record to call the overflow handler later.
> + * Initialize the unprocessed[] variable with the first record.
> + */
> + if (!counts[bit]++) {
> + unprocessed[bit] = at;
> + continue;
> + }
> +
> + __intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
>
> - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
> - counts[bit]++;
> + unprocessed[bit] = at;
> + }
> }
>
> for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
> - if (counts[bit] == 0)
> + if (!counts[bit])
> continue;
>
> event = cpuc->events[bit];
> - if (WARN_ON_ONCE(!event))
> - continue;
>
> - if (WARN_ON_ONCE(!event->attr.precise_ip))
> - continue;
> + if (!iregs) {
> + /*
> + * The PEBS records may be drained in the non-overflow context,
> + * e.g., large PEBS + context switch. Perf should treat the
> + * last record the same as other PEBS records, and doesn't
> + * invoke the generic overflow handler.
> + */
> + __intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
> + } else
> + __intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], true, data);
*sigh*, this is confusing as all hell. Both calls are for the last record,
but one says last=false.
> - __intel_pmu_pebs_event(event, iregs, data, base,
> - top, bit, counts[bit],
> - setup_pebs_adaptive_sample_data);
> + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) {
> + /*
> + * Now, auto-reload is only enabled in fixed period mode.
> + * The reload value is always hwc->sample_period.
> + * May need to change it, if auto-reload is enabled in
> + * freq mode later.
> + */
> + intel_pmu_save_and_restart_reload(event, counts[bit]);
> + } else
> + intel_pmu_save_and_restart(event);
And this is randomly ignoring the intel_pmu_save_and_restart() return
value, where previously we would abort.
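For reference, the original __intel_pmu_pebs_event() flow (the lines the
delta below removes) is roughly:

	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
		intel_pmu_save_and_restart_reload(event, count);
	} else if (!intel_pmu_save_and_restart(event)) {
		/* bail out: no samples are emitted for this event */
		return;
	}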
> }
> }
How's this completely untested delta?
---
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2167,24 +2167,33 @@ intel_pmu_save_and_restart_reload(struct
return 0;
}
+typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
+ struct perf_sample_data *, struct pt_regs *);
+
+static struct pt_regs dummy_iregs;
+
static __always_inline void
__intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs,
- struct perf_sample_data *data,
- void *base, void *top,
- int bit, int count,
- void (*setup_sample)(struct perf_event *,
- struct pt_regs *,
- void *,
- struct perf_sample_data *,
- struct pt_regs *))
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ void *at,
+ setup_fn setup_sample)
+{
+ setup_sample(event, iregs, at, data, regs);
+ perf_event_output(event, data, regs);
+}
+
+static __always_inline void
+__intel_pmu_pebs_last_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ void *at,
+ int count,
+ setup_fn setup_sample)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct x86_perf_regs perf_regs;
- struct pt_regs *regs = &perf_regs.regs;
- void *at = get_next_pebs_record_by_bit(base, top, bit);
- static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
@@ -2194,21 +2203,12 @@ __intel_pmu_pebs_event(struct perf_event
* freq mode later.
*/
intel_pmu_save_and_restart_reload(event, count);
- } else if (!intel_pmu_save_and_restart(event))
- return;
-
- if (!iregs)
- iregs = &dummy_iregs;
-
- while (count > 1) {
- setup_sample(event, iregs, at, data, regs);
- perf_event_output(event, data, regs);
- at += cpuc->pebs_record_size;
- at = get_next_pebs_record_by_bit(at, top, bit);
- count--;
+ } else {
+ intel_pmu_save_and_restart(event);
}
setup_sample(event, iregs, at, data, regs);
+
if (iregs == &dummy_iregs) {
/*
* The PEBS records may be drained in the non-overflow context,
@@ -2227,6 +2227,34 @@ __intel_pmu_pebs_event(struct perf_event
}
}
+static __always_inline void
+__intel_pmu_pebs_events(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct perf_sample_data *data,
+ void *base, void *top,
+ int bit, int count,
+ setup_fn setup_sample)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_regs perf_regs;
+ struct pt_regs *regs = &perf_regs.regs;
+ void *at = get_next_pebs_record_by_bit(base, top, bit);
+ int cnt = count;
+
+ if (!iregs)
+ iregs = &dummy_iregs;
+
+ while (cnt > 1) {
+ __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
+ at += cpuc->pebs_record_size;
+ at = get_next_pebs_record_by_bit(at, top, bit);
+ cnt--;
+ }
+
+ __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
+}
+
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2261,7 +2289,7 @@ static void intel_pmu_drain_pebs_core(st
return;
}
- __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
+ __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
}
@@ -2393,43 +2421,21 @@ static void intel_pmu_drain_pebs_nhm(str
}
if (counts[bit]) {
- __intel_pmu_pebs_event(event, iregs, data, base,
- top, bit, counts[bit],
- setup_pebs_fixed_sample_data);
+ __intel_pmu_pebs_events(event, iregs, data, base,
+ top, bit, counts[bit],
+ setup_pebs_fixed_sample_data);
}
}
}
-static inline void __intel_pmu_pebs_event_output(struct perf_event *event,
- struct pt_regs *iregs,
- void *record, bool last,
- struct perf_sample_data *data)
-{
- struct x86_perf_regs perf_regs;
- struct pt_regs *regs = &perf_regs.regs;
- static struct pt_regs dummy_iregs;
-
- if (!iregs)
- iregs = &dummy_iregs;
-
- setup_pebs_adaptive_sample_data(event, iregs, record, data, regs);
- if (last) {
- /*
- * All but the last records are processed.
- * The last one is left to be able to call the overflow handler.
- */
- if (perf_event_overflow(event, data, regs))
- x86_pmu_stop(event, 0);
- } else
- perf_event_output(event, data, regs);
-}
-
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
- void *unprocessed[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
+ void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
+ struct x86_perf_regs perf_regs;
+ struct pt_regs *regs = &perf_regs.regs;
struct perf_event *event;
struct pebs_basic *basic;
void *base, *at, *top;
@@ -2452,6 +2458,12 @@ static void intel_pmu_drain_pebs_icl(str
return;
}
+ if (!iregs)
+ iregs = &dummy_iregs;
+
+ /*
+ * Process all but the last event for each counter.
+ */
for (at = base; at < top; at += basic->format_size) {
u64 pebs_status;
@@ -2467,18 +2479,12 @@ static void intel_pmu_drain_pebs_icl(str
WARN_ON_ONCE(!event->attr.precise_ip))
continue;
- /*
- * Need at least one record to call the overflow handler later.
- * Initialize the unprocessed[] variable with the first record.
- */
- if (!counts[bit]++) {
- unprocessed[bit] = at;
- continue;
+ if (counts[bit]++) {
+ __intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
+ setup_pebs_adaptive_sample_data);
}
- __intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
-
- unprocessed[bit] = at;
+ last[bit] = at;
}
}
@@ -2487,28 +2493,8 @@ static void intel_pmu_drain_pebs_icl(str
continue;
event = cpuc->events[bit];
-
- if (!iregs) {
- /*
- * The PEBS records may be drained in the non-overflow context,
- * e.g., large PEBS + context switch. Perf should treat the
- * last record the same as other PEBS records, and doesn't
- * invoke the generic overflow handler.
- */
- __intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
- } else
- __intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], true, data);
-
- if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) {
- /*
- * Now, auto-reload is only enabled in fixed period mode.
- * The reload value is always hwc->sample_period.
- * May need to change it, if auto-reload is enabled in
- * freq mode later.
- */
- intel_pmu_save_and_restart_reload(event, counts[bit]);
- } else
- intel_pmu_save_and_restart(event);
+ __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
+ counts[bit], setup_pebs_adaptive_sample_data);
}
}