Message-Id: <20241113151427.677169-2-kan.liang@linux.intel.com>
Date: Wed, 13 Nov 2024 07:14:27 -0800
From: kan.liang@...ux.intel.com
To: peterz@...radead.org,
	mingo@...hat.com,
	linux-kernel@...r.kernel.org
Cc: acme@...nel.org,
	namhyung@...nel.org,
	irogers@...gle.com,
	eranian@...gle.com,
	ak@...ux.intel.com,
	Kan Liang <kan.liang@...ux.intel.com>,
	Dapeng Mi <dapeng1.mi@...ux.intel.com>
Subject: [PATCH 2/2] perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS

From: Kan Liang <kan.liang@...ux.intel.com>

The current code may iterate over all the PEBS records in the DS area
several times. The first loop finds all active events and calculates the
number of available records for each event. The whole buffer is then
iterated again and again to process the available records until all
active events have been processed.
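
For illustration only (a stand-alone toy, not the kernel code), the
multi-pass shape looks like this, where each "record" simply tags the
event it belongs to:

  #include <stdio.h>

  #define NR_EVENTS 4

  int main(void)
  {
      /* interleaved buffer; each entry tags the event it belongs to */
      int records[] = { 0, 1, 0, 2, 1, 0, 2, 2 };
      int nr = sizeof(records) / sizeof(records[0]);
      int counts[NR_EVENTS] = { 0 };

      /* pass 1: count the available records for each event */
      for (int i = 0; i < nr; i++)
          counts[records[i]]++;

      /* passes 2..N: re-walk the whole buffer once per active event */
      for (int bit = 0; bit < NR_EVENTS; bit++) {
          if (!counts[bit])
              continue;
          for (int i = 0; i < nr; i++) {
              if (records[i] == bit)
                  printf("event %d: record %d\n", bit, i);
          }
      }
      return 0;
  }

The removed kernel code in the diff below has the same structure: one
counting walk, then one __intel_pmu_pebs_event() call per active event
that re-walks base..top, so the cost grows with the number of active
events times the number of records in the buffer.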

The algorithm is inherited from older hardware generations. The old
PEBS hardware does not deal well with events that occur close to each
other, so SW has to drop the error records, which requires multiple
iterations.

The hardware limitation has been addressed on newer platforms with
adaptive PEBS. A simple one-iteration algorithm is introduced.

With the patch, the samples are output in record order rather than in
event order. This doesn't impact post-processing: the perf tool always
sorts the records by time before presenting them to the end user.

In an NMI, the last record of each event has to be handled specially.
Add an unprocessed[] array to track the last unprocessed record of each
event.

Save and restart the event after all records are processed.
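
For contrast, again purely as illustration (not the kernel code), the
same toy buffer drained with the one-pass scheme: the most recent record
of each event is kept pending in unprocessed[], so all but the last
record are output immediately and the last one gets the special
handling:

  #include <stdio.h>

  #define NR_EVENTS 4

  int main(void)
  {
      int records[] = { 0, 1, 0, 2, 1, 0, 2, 2 };
      int nr = sizeof(records) / sizeof(records[0]);
      int counts[NR_EVENTS] = { 0 };
      int unprocessed[NR_EVENTS];   /* pending record per event */

      for (int i = 0; i < nr; i++) {
          int bit = records[i];

          /* first record of this event: just remember it */
          if (!counts[bit]++) {
              unprocessed[bit] = i;
              continue;
          }
          /* output the previously pending record, lag by one */
          printf("event %d: record %d\n", bit, unprocessed[bit]);
          unprocessed[bit] = i;
      }

      for (int bit = 0; bit < NR_EVENTS; bit++) {
          if (!counts[bit])
              continue;
          /* the held-back last record; the kernel invokes the
           * overflow handler here and then saves/restarts the event */
          printf("event %d: last record %d\n", bit, unprocessed[bit]);
      }
      return 0;
  }

In the patch itself, the held-back record is the one passed to
__intel_pmu_pebs_event_output() with last == true, which invokes the
generic overflow handler.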

Test:

11 PEBS events are used in the perf test. Only the basic information is
collected.

  perf record -e instructions:up,...,instructions:up -c 2000003 benchmark

ftrace is used to record the duration of intel_pmu_drain_pebs_icl().

The average duration is reduced from 62.04us to 57.94us (roughly a 6.6%
reduction).

A small improvement can be observed with the new algorithm, and the
implementation becomes simpler and more straightforward.

Suggested-by: Stephane Eranian <eranian@...gle.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@...ux.intel.com>
Signed-off-by: Kan Liang <kan.liang@...ux.intel.com>
---
 arch/x86/events/intel/ds.c | 85 ++++++++++++++++++++++++++++++++------
 1 file changed, 72 insertions(+), 13 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 4d0f7c49295a..cbf2ab9ed4c8 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2400,12 +2400,38 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
 	}
 }
 
+static inline void __intel_pmu_pebs_event_output(struct perf_event *event,
+						 struct pt_regs *iregs,
+						 void *record, bool last,
+						 struct perf_sample_data *data)
+{
+	struct x86_perf_regs perf_regs;
+	struct pt_regs *regs = &perf_regs.regs;
+	static struct pt_regs dummy_iregs;
+
+	if (!iregs)
+		iregs = &dummy_iregs;
+
+	setup_pebs_adaptive_sample_data(event, iregs, record, data, regs);
+	if (last) {
+		/*
+		 * All but the last records are processed.
+		 * The last one is left to be able to call the overflow handler.
+		 */
+		if (perf_event_overflow(event, data, regs))
+			x86_pmu_stop(event, 0);
+	} else
+		perf_event_output(event, data, regs);
+}
+
 static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
 {
 	short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+	void *unprocessed[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct perf_event *event;
+	struct pebs_basic *basic;
 	void *base, *at, *top;
 	int bit;
 	u64 mask;
@@ -2426,30 +2452,63 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
 		return;
 	}
 
-	for (at = base; at < top; at += cpuc->pebs_record_size) {
+	for (at = base; at < top; at += basic->format_size) {
 		u64 pebs_status;
 
-		pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
-		pebs_status &= mask;
+		basic = at;
+		if (WARN_ON_ONCE(basic->format_size != cpuc->pebs_record_size))
+			continue;
+
+		pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
+		for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
+			event = cpuc->events[bit];
+
+			if (WARN_ON_ONCE(!event) ||
+			    WARN_ON_ONCE(!event->attr.precise_ip))
+				continue;
+
+			/*
+			 * Need at least one record to call the overflow handler later.
+			 * Initialize the unprocessed[] variable with the first record.
+			 */
+			if (!counts[bit]++) {
+				unprocessed[bit] = at;
+				continue;
+			}
+
+			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
 
-		for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
-			counts[bit]++;
+			unprocessed[bit] = at;
+		}
 	}
 
 	for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
-		if (counts[bit] == 0)
+		if (!counts[bit])
 			continue;
 
 		event = cpuc->events[bit];
-		if (WARN_ON_ONCE(!event))
-			continue;
 
-		if (WARN_ON_ONCE(!event->attr.precise_ip))
-			continue;
+		if (!iregs) {
+			/*
+			 * The PEBS records may be drained in the non-overflow context,
+			 * e.g., large PEBS + context switch. Perf should treat the
+			 * last record the same as other PEBS records, and doesn't
+			 * invoke the generic overflow handler.
+			 */
+			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], false, data);
+		} else
+			__intel_pmu_pebs_event_output(event, iregs, unprocessed[bit], true, data);
 
-		__intel_pmu_pebs_event(event, iregs, data, base,
-				       top, bit, counts[bit],
-				       setup_pebs_adaptive_sample_data);
+		if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) {
+			/*
+			 * Now, auto-reload is only enabled in fixed period mode.
+			 * The reload value is always hwc->sample_period.
+			 * May need to change it, if auto-reload is enabled in
+			 * freq mode later.
+			 */
+			intel_pmu_save_and_restart_reload(event, counts[bit]);
+		} else
+			intel_pmu_save_and_restart(event);
 	}
 }
 
-- 
2.38.1

