lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20241218121437.1717-7-ravi.bangoria@amd.com>
Date: Wed, 18 Dec 2024 12:14:37 +0000
From: Ravi Bangoria <ravi.bangoria@....com>
To: <peterz@...radead.org>, <mingo@...hat.com>, <namhyung@...nel.org>
CC: <ravi.bangoria@....com>, <acme@...nel.org>, <eranian@...gle.com>,
	<jolsa@...nel.org>, <irogers@...gle.com>, <kan.liang@...ux.intel.com>,
	<bp@...en8.de>, <x86@...nel.org>, <linux-perf-users@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, <santosh.shukla@....com>,
	<ananth.narayan@....com>, <sandipan.das@....com>
Subject: [PATCH v2 6/6] perf mem/c2c amd: Add ldlat support

Perf mem and c2c uses IBS Op PMU on AMD platforms. IBS Op PMU on Zen5
uarch has added support for Load Latency filtering. Implement perf mem/
c2c --ldlat using IBS Op Load Latency filtering capability.

Some subtle differences between AMD and other arch:
o --ldlat is disabled by default on AMD
o Supported values are 128 to 2048.

Signed-off-by: Ravi Bangoria <ravi.bangoria@....com>
---
 tools/perf/Documentation/perf-c2c.txt      | 10 ++++++--
 tools/perf/Documentation/perf-mem.txt      | 12 +++++++--
 tools/perf/arch/x86/util/mem-events.c      |  6 +++++
 tools/perf/arch/x86/util/mem-events.h      |  1 +
 tools/perf/arch/x86/util/pmu.c             | 20 ++++++++++++---
 tools/perf/tests/shell/test_data_symbol.sh | 29 +++++++++++++++++++---
 tools/perf/util/pmu.c                      | 11 ++++++++
 tools/perf/util/pmu.h                      |  2 ++
 8 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index 856f0dfb8e5a..3e2f690f127a 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -54,8 +54,14 @@ RECORD OPTIONS
 
 -l::
 --ldlat::
-	Configure mem-loads latency. Supported on Intel and Arm64 processors
-	only. Ignored on other archs.
+	Configure mem-loads latency. Supported on Intel, Arm64 and some variants
+	of AMD platforms. Ignored on other archs.
+
+	On AMD platforms:
+	- Supported latency values are 128 to 2048 (both inclusive).
+	- Latency value which is a multiple of 128 incurs a little less profiling
+	  overhead compared to other values.
+	- Load latency filtering is disabled by default.
 
 -k::
 --all-kernel::
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 8a1bd9ff0f86..f28837d6b783 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
 Due to the statistical nature of SPE sampling, not every memory operation will
 be sampled.
 
+On AMD this use IBS Op PMU to sample load-store operations.
+
 COMMON OPTIONS
 --------------
 -f::
@@ -67,8 +69,14 @@ RECORD OPTIONS
 	Configure all used events to run in user space.
 
 --ldlat <n>::
-	Specify desired latency for loads event. Supported on Intel and Arm64
-	processors only. Ignored on other archs.
+	Specify desired latency for loads event. Supported on Intel, Arm64 and
+	some variants of AMD platforms. Ignored on other archs.
+
+	On AMD platforms:
+	- Supported latency values are 128 to 2048 (both inclusive).
+	- Latency value which is a multiple of 128 incurs a little less profiling
+	  overhead compared to other values.
+	- Load latency filtering is disabled by default.
 
 REPORT OPTIONS
 --------------
diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c
index 62df03e91c7e..b38f519020ff 100644
--- a/tools/perf/arch/x86/util/mem-events.c
+++ b/tools/perf/arch/x86/util/mem-events.c
@@ -26,3 +26,9 @@ struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
 	E(NULL,		NULL,		NULL,	false,	0),
 	E("mem-ldst",	"%s//",		NULL,	false,	0),
 };
+
+struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX] = {
+	E(NULL,		NULL,		NULL,	false,	0),
+	E(NULL,		NULL,		NULL,	false,	0),
+	E("mem-ldst",	"%s/ldlat=%u/",	NULL,	true,	0),
+};
diff --git a/tools/perf/arch/x86/util/mem-events.h b/tools/perf/arch/x86/util/mem-events.h
index f55c8d3b7d59..11e09a256f5b 100644
--- a/tools/perf/arch/x86/util/mem-events.h
+++ b/tools/perf/arch/x86/util/mem-events.h
@@ -6,5 +6,6 @@ extern struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX];
 extern struct perf_mem_event perf_mem_events_intel_aux[PERF_MEM_EVENTS__MAX];
 
 extern struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX];
+extern struct perf_mem_event perf_mem_events_amd_ldlat[PERF_MEM_EVENTS__MAX];
 
 #endif /* _X86_MEM_EVENTS_H */
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index e0060dac2a9f..8712cbbbc712 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -18,8 +18,10 @@
 #include "mem-events.h"
 #include "util/env.h"
 
-void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
+void perf_pmu__arch_init(struct perf_pmu *pmu)
 {
+	struct perf_pmu_caps *ldlat_cap;
+
 #ifdef HAVE_AUXTRACE_SUPPORT
 	if (!strcmp(pmu->name, INTEL_PT_PMU_NAME)) {
 		pmu->auxtrace = true;
@@ -33,8 +35,20 @@ void perf_pmu__arch_init(struct perf_pmu *pmu __maybe_unused)
 #endif
 
 	if (x86__is_amd_cpu()) {
-		if (!strcmp(pmu->name, "ibs_op"))
-			pmu->mem_events = perf_mem_events_amd;
+		if (strcmp(pmu->name, "ibs_op"))
+			return;
+
+		pmu->mem_events = perf_mem_events_amd;
+
+		if (!perf_pmu__caps_parse(pmu))
+			return;
+
+		ldlat_cap = perf_pmu__get_cap(pmu, "ldlat");
+		if (!ldlat_cap || strcmp(ldlat_cap->value, "1"))
+			return;
+
+		perf_mem_events__loads_ldlat = 0;
+		pmu->mem_events = perf_mem_events_amd_ldlat;
 	} else if (pmu->is_core) {
 		if (perf_pmu__have_event(pmu, "mem-loads-aux"))
 			pmu->mem_events = perf_mem_events_intel_aux;
diff --git a/tools/perf/tests/shell/test_data_symbol.sh b/tools/perf/tests/shell/test_data_symbol.sh
index c86da0235059..747f5dc9d679 100755
--- a/tools/perf/tests/shell/test_data_symbol.sh
+++ b/tools/perf/tests/shell/test_data_symbol.sh
@@ -55,11 +55,34 @@ trap cleanup_files exit term int
 
 echo "Recording workload..."
 
-# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't support
-# user/kernel filtering and per-process monitoring, spin program on
-# specific CPU and test in per-CPU mode.
 is_amd=$(grep -E -c 'vendor_id.*AuthenticAMD' /proc/cpuinfo)
 if (($is_amd >= 1)); then
+	mem_events="$(perf mem record -v -e list 2>&1)"
+	if ! [[ "$mem_events" =~ ^mem\-ldst.*ibs_op/(.*)/.*available ]]; then
+		echo "ERROR: mem-ldst event is not matching"
+		exit 1
+	fi
+
+	# --ldlat on AMD:
+	# o Zen4 and earlier uarch does not support ldlat
+	# o Even on supported platforms, it's disabled (--ldlat=0) by default.
+	ldlat=${BASH_REMATCH[1]}
+	if [[ -n $ldlat ]]; then
+		if ! [[ "$ldlat" =~ ldlat=0 ]]; then
+			echo "ERROR: ldlat not initialized to 0?"
+			exit 1
+		fi
+
+		mem_events="$(perf mem record -v --ldlat=150 -e list 2>&1)"
+		if ! [[ "$mem_events" =~ ^mem-ldst.*ibs_op/ldlat=150/.*available ]]; then
+			echo "ERROR: --ldlat not honored?"
+			exit 1
+		fi
+	fi
+
+	# perf mem/c2c internally uses IBS PMU on AMD CPU which doesn't
+	# support user/kernel filtering and per-process monitoring on older
+	# kernels, spin program on specific CPU and test in per-CPU mode.
 	perf mem record -vvv -o ${PERF_DATA} -C 0 -- taskset -c 0 $TEST_PROGRAM 2>"${ERR_FILE}" &
 else
 	perf mem record -vvv --all-user -o ${PERF_DATA} -- $TEST_PROGRAM 2>"${ERR_FILE}" &
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 08a9d0bd9301..e480fd4744dd 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -2103,6 +2103,17 @@ static void perf_pmu__del_caps(struct perf_pmu *pmu)
 	}
 }
 
+struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name)
+{
+	struct perf_pmu_caps *caps;
+
+	list_for_each_entry(caps, &pmu->caps, list) {
+		if (!strcmp(caps->name, name))
+			return caps;
+	}
+	return NULL;
+}
+
 /*
  * Reading/parsing the given pmu capabilities, which should be located at:
  * /sys/bus/event_source/devices/<dev>/caps as sysfs group attributes.
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index dbed6c243a5e..b3d4d7c5b92d 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -266,6 +266,8 @@ bool pmu_uncore_identifier_match(const char *compat, const char *id);
 
 int perf_pmu__convert_scale(const char *scale, char **end, double *sval);
 
+struct perf_pmu_caps *perf_pmu__get_cap(struct perf_pmu *pmu, const char *name);
+
 int perf_pmu__caps_parse(struct perf_pmu *pmu);
 
 void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ