Message-ID: <20241114141918.GA39245@noisy.programming.kicks-ass.net>
Date: Thu, 14 Nov 2024 15:19:18 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: kan.liang@...ux.intel.com
Cc: mingo@...hat.com, linux-kernel@...r.kernel.org, acme@...nel.org,
	namhyung@...nel.org, irogers@...gle.com, eranian@...gle.com,
	ak@...ux.intel.com, Dapeng Mi <dapeng1.mi@...ux.intel.com>
Subject: Re: [PATCH 2/2] perf/x86/intel/ds: Simplify the PEBS records
 processing for adaptive PEBS

On Wed, Nov 13, 2024 at 07:14:27AM -0800, kan.liang@...ux.intel.com wrote:

> diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
> index 4d0f7c49295a..cbf2ab9ed4c8 100644
> --- a/arch/x86/events/intel/ds.c
> +++ b/arch/x86/events/intel/ds.c
> @@ -2400,12 +2400,38 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
>  	}
>  }
>  
> +static inline void __intel_pmu_pebs_event_output(struct perf_event *event,
> +						 struct pt_regs *iregs,
> +						 void *record, bool last,
> +						 struct perf_sample_data *data)
> +{
> +	struct x86_perf_regs perf_regs;
> +	struct pt_regs *regs = &perf_regs.regs;
> +	static struct pt_regs dummy_iregs;
> +
> +	if (!iregs)
> +		iregs = &dummy_iregs;
> +
> +	setup_pebs_adaptive_sample_data(event, iregs, record, data, regs);
> +	if (last) {
> +		/*
> +		 * All but the last records are processed.
> +		 * The last one is left to be able to call the overflow handler.
> +		 */
> +		if (perf_event_overflow(event, data, regs))
> +			x86_pmu_stop(event, 0);
> +	} else
> +		perf_event_output(event, data, regs);
> +}

*sigh*... more unification please.

>  static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
>  {
>  	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
> +	void *unprocessed[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
>  	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
>  	struct debug_store *ds = cpuc->ds;
>  	struct perf_event *event;
> +	struct pebs_basic *basic;
>  	void *base, *at, *top;
>  	int bit;
>  	u64 mask;
> @@ -2426,30 +2452,63 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
>  		return;
>  	}
>  
> -	for (at = base; at < top; at += cpuc->pebs_record_size) {
> +	for (at = base; at < top; at += basic->format_size) {
>  		u64 pebs_status;
>  
> -		pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
> -		pebs_status &= mask;
> +		basic = at;
> +		if (WARN_ON_ONCE(basic->format_size != cpuc->pebs_record_size))
> +			continue;
> +
> +		pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
> +		for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
> +			event = cpuc->events[bit];
> +
> +			if (WARN_ON_ONCE(!event) ||
> +			    WARN_ON_ONCE(!event->attr.precise_ip))
> +				continue;
> +
> +			/*
> +			 * Need at least one record to call the overflow handler later.
> +			 * Initialize the unprocessed[] variable with the first record.
> +			 */
> +			if (!counts[bit]++) {
> +				unprocessed[bit] = at;
> +				continue;
> +			}
> +
> +			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
>  
> -		for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
> -			counts[bit]++;
> +			unprocessed[bit] = at;
> +		}
>  	}
>  
>  	for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
> -		if (counts[bit] == 0)
> +		if (!counts[bit])
>  			continue;
>  
>  		event = cpuc->events[bit];
> -		if (WARN_ON_ONCE(!event))
> -			continue;
>  
> -		if (WARN_ON_ONCE(!event->attr.precise_ip))
> -			continue;
> +		if (!iregs) {
> +			/*
> +			 * The PEBS records may be drained in the non-overflow context,
> +			 * e.g., large PEBS + context switch. Perf should treat the
> +			 * last record the same as other PEBS records, and doesn't
> +			 * invoke the generic overflow handler.
> +			 */
> +			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
> +		} else
> +			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], true, data);

*sigh*, this is confusing as all hell. Both are last, but one says
last=false.
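
As a standalone illustration (plain C with stand-in names, not kernel code), the
pattern both call sites implement is: push every record through the normal
output path except the last one remembered per counter, and hand that final
record to the overflow handler only when the drain was triggered by a real
overflow. The boolean really selects the delivery path, not whether the record
is last.

/*
 * Standalone sketch, plain C with stand-in names; not kernel code.
 * Models the two-pass drain pattern discussed above.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_COUNTERS 4

struct record { int counter; int payload; };

static void emit_output(const struct record *r)
{
	printf("output   c%d p%d\n", r->counter, r->payload);
}

static void emit_overflow(const struct record *r)
{
	printf("overflow c%d p%d\n", r->counter, r->payload);
}

static void drain(const struct record *recs, int nr, bool from_overflow)
{
	const struct record *last[NR_COUNTERS] = { 0 };
	int counts[NR_COUNTERS] = { 0 };

	/* Pass 1: emit all but the last record seen for each counter. */
	for (int i = 0; i < nr; i++) {
		int c = recs[i].counter;

		if (counts[c]++)
			emit_output(last[c]);
		last[c] = &recs[i];
	}

	/* Pass 2: the remembered last record of each active counter. */
	for (int c = 0; c < NR_COUNTERS; c++) {
		if (!counts[c])
			continue;
		if (from_overflow)
			emit_overflow(last[c]);	/* may throttle/stop the event */
		else
			emit_output(last[c]);	/* e.g. large PEBS + context switch */
	}
}

int main(void)
{
	struct record recs[] = { {0, 1}, {1, 2}, {0, 3}, {0, 4}, {1, 5} };

	drain(recs, (int)(sizeof(recs) / sizeof(recs[0])), true);
	return 0;
}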

> -		__intel_pmu_pebs_event(event, iregs, data, base,
> -				       top, bit, counts[bit],
> -				       setup_pebs_adaptive_sample_data);
> +		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) {
> +			/*
> +			 * Now, auto-reload is only enabled in fixed period mode.
> +			 * The reload value is always hwc->sample_period.
> +			 * May need to change it, if auto-reload is enabled in
> +			 * freq mode later.
> +			 */
> +			intel_pmu_save_and_restart_reload(event, counts[bit]);
> +		} else
> +			intel_pmu_save_and_restart(event);

And this is randomly ignoring the return value where previously we would
abort.
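
To make the difference concrete with a toy example (plain C, stand-in names;
not the kernel helpers): the old flow bailed out when the counter could not be
re-armed, so no sample was emitted, whereas the rewritten path drops that
return value and emits regardless.

/* Toy contrast; save_and_restart() stands in for intel_pmu_save_and_restart(). */
#include <stdbool.h>
#include <stdio.h>

static bool save_and_restart(bool rearm_ok)
{
	return rearm_ok;		/* pretend re-arming failed when false */
}

static void old_flow(bool rearm_ok)
{
	if (!save_and_restart(rearm_ok))
		return;			/* abort: no sample emitted */
	puts("old flow: sample emitted");
}

static void new_flow(bool rearm_ok)
{
	save_and_restart(rearm_ok);	/* return value ignored */
	puts("new flow: sample emitted");
}

int main(void)
{
	old_flow(false);		/* silent */
	new_flow(false);		/* still emits a sample */
	return 0;
}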

>  	}
>  }

How's this completely untested delta?

---
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2167,24 +2167,33 @@ intel_pmu_save_and_restart_reload(struct
 	return 0;
 }
 
+typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
+			  struct perf_sample_data *, struct pt_regs *);
+
+static struct pt_regs dummy_iregs;
+
 static __always_inline void
 __intel_pmu_pebs_event(struct perf_event *event,
 		       struct pt_regs *iregs,
-		       struct perf_sample_data *data,
-		       void *base, void *top,
-		       int bit, int count,
-		       void (*setup_sample)(struct perf_event *,
-					    struct pt_regs *,
-					    void *,
-					    struct perf_sample_data *,
-					    struct pt_regs *))
+		       struct pt_regs *regs,
+		       struct perf_sample_data *data,
+		       void *at,
+		       setup_fn setup_sample)
+{
+	setup_sample(event, iregs, at, data, regs);
+	perf_event_output(event, data, regs);
+}
+
+static __always_inline void
+__intel_pmu_pebs_last_event(struct perf_event *event,
+			    struct pt_regs *iregs,
+			    struct pt_regs *regs,
+			    struct perf_sample_data *data,
+			    void *at,
+			    int count,
+			    setup_fn setup_sample)
 {
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	struct x86_perf_regs perf_regs;
-	struct pt_regs *regs = &perf_regs.regs;
-	void *at = get_next_pebs_record_by_bit(base, top, bit);
-	static struct pt_regs dummy_iregs;
 
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		/*
@@ -2194,21 +2203,12 @@ __intel_pmu_pebs_event(struct perf_event
 		 * freq mode later.
 		 */
 		intel_pmu_save_and_restart_reload(event, count);
-	} else if (!intel_pmu_save_and_restart(event))
-		return;
-
-	if (!iregs)
-		iregs = &dummy_iregs;
-
-	while (count > 1) {
-		setup_sample(event, iregs, at, data, regs);
-		perf_event_output(event, data, regs);
-		at += cpuc->pebs_record_size;
-		at = get_next_pebs_record_by_bit(at, top, bit);
-		count--;
+	} else {
+		intel_pmu_save_and_restart(event);
 	}
 
 	setup_sample(event, iregs, at, data, regs);
+
 	if (iregs == &dummy_iregs) {
 		/*
 		 * The PEBS records may be drained in the non-overflow context,
@@ -2227,6 +2227,34 @@ __intel_pmu_pebs_event(struct perf_event
 	}
 }
 
+static __always_inline void
+__intel_pmu_pebs_events(struct perf_event *event,
+			struct pt_regs *iregs,
+			struct perf_sample_data *data,
+			void *base, void *top,
+			int bit, int count,
+			setup_fn setup_sample)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	struct x86_perf_regs perf_regs;
+	struct pt_regs *regs = &perf_regs.regs;
+	void *at = get_next_pebs_record_by_bit(base, top, bit);
+	int cnt = count;
+
+	if (!iregs)
+		iregs = &dummy_iregs;
+
+	while (cnt > 1) {
+		__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
+		at += cpuc->pebs_record_size;
+		at = get_next_pebs_record_by_bit(at, top, bit);
+		cnt--;
+	}
+
+	__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
+}
+
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2261,7 +2289,7 @@ static void intel_pmu_drain_pebs_core(st
 		return;
 	}
 
-	__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
+	__intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
 			       setup_pebs_fixed_sample_data);
 }
 
@@ -2393,43 +2421,21 @@ static void intel_pmu_drain_pebs_nhm(str
 		}
 
 		if (counts[bit]) {
-			__intel_pmu_pebs_event(event, iregs, data, base,
-					       top, bit, counts[bit],
-					       setup_pebs_fixed_sample_data);
+			__intel_pmu_pebs_events(event, iregs, data, base,
+						top, bit, counts[bit],
+						setup_pebs_fixed_sample_data);
 		}
 	}
 }
 
-static inline void __intel_pmu_pebs_event_output(struct perf_event *event,
-						 struct pt_regs *iregs,
-						 void *record, bool last,
-						 struct perf_sample_data *data)
-{
-	struct x86_perf_regs perf_regs;
-	struct pt_regs *regs = &perf_regs.regs;
-	static struct pt_regs dummy_iregs;
-
-	if (!iregs)
-		iregs = &dummy_iregs;
-
-	setup_pebs_adaptive_sample_data(event, iregs, record, data, regs);
-	if (last) {
-		/*
-		 * All but the last records are processed.
-		 * The last one is left to be able to call the overflow handler.
-		 */
-		if (perf_event_overflow(event, data, regs))
-			x86_pmu_stop(event, 0);
-	} else
-		perf_event_output(event, data, regs);
-}
-
 static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
-	void *unprocessed[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
+	void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
+	struct x86_perf_regs perf_regs;
+	struct pt_regs *regs = &perf_regs.regs;
 	struct perf_event *event;
 	struct pebs_basic *basic;
 	void *base, *at, *top;
@@ -2452,6 +2458,12 @@ static void intel_pmu_drain_pebs_icl(str
 		return;
 	}
 
+	if (!iregs)
+		iregs = &dummy_iregs;
+
+	/*
+	 * Process all but the last event for each counter.
+	 */
 	for (at = base; at < top; at += basic->format_size) {
 		u64 pebs_status;
 
@@ -2467,18 +2479,12 @@ static void intel_pmu_drain_pebs_icl(str
 			    WARN_ON_ONCE(!event->attr.precise_ip))
 				continue;
 
-			/*
-			 * Need at least one record to call the overflow handler later.
-			 * Initialize the unprocessed[] variable with the first record.
-			 */
-			if (!counts[bit]++) {
-				unprocessed[bit] = at;
-				continue;
+			if (counts[bit]++) {
+				__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
+						       setup_pebs_adaptive_sample_data);
 			}
 
-			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
-
-			unprocessed[bit] = at;
+			last[bit] = at;
 		}
 	}
 
@@ -2487,28 +2493,8 @@ static void intel_pmu_drain_pebs_icl(str
 			continue;
 
 		event = cpuc->events[bit];
-
-		if (!iregs) {
-			/*
-			 * The PEBS records may be drained in the non-overflow context,
-			 * e.g., large PEBS + context switch. Perf should treat the
-			 * last record the same as other PEBS records, and doesn't
-			 * invoke the generic overflow handler.
-			 */
-			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
-		} else
-			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], true, data);
-
-		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) {
-			/*
-			 * Now, auto-reload is only enabled in fixed period mode.
-			 * The reload value is always hwc->sample_period.
-			 * May need to change it, if auto-reload is enabled in
-			 * freq mode later.
-			 */
-			intel_pmu_save_and_restart_reload(event, counts[bit]);
-		} else
-			intel_pmu_save_and_restart(event);
+		__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
+					    counts[bit], setup_pebs_adaptive_sample_data);
 	}
 }
 
