lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20230517172745.5833-5-kprateek.nayak@amd.com>
Date:   Wed, 17 May 2023 22:57:44 +0530
From:   K Prateek Nayak <kprateek.nayak@....com>
To:     <linux-perf-users@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
        <acme@...nel.org>, <peterz@...radead.org>, <mingo@...hat.com>,
        <mark.rutland@....com>, <alexander.shishkin@...ux.intel.com>,
        <jolsa@...nel.org>, <namhyung@...nel.org>
CC:     <ravi.bangoria@....com>, <sandipan.das@....com>,
        <ananth.narayan@....com>, <gautham.shenoy@....com>,
        <eranian@...gle.com>, <irogers@...gle.com>, <puwen@...on.cn>
Subject: [PATCH v4 4/5] perf stat: Add "--per-cache" aggregation option and document the same

This patch adds support for "--per-cache" option for aggregation at a
particular cache level and documents the same. Following is the output
of perf stat with aggregation at L3 for the event
"ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket
3rd Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running
hackbench pinned to 4 LLCs:

  $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
    taskset -c 0-15,64-79,128-143,192-207 \
    perf bench sched messaging -p -t -l 100000 -g 8

  ...

   Performance counter stats for 'system wide':
  
  S0-D0-L3-ID0             16          9,500,803      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID8             16          6,338,099      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID16            16            355,005      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID24            16             22,067      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID32            16             16,321      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID40            16             11,619      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID48            16              4,238      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID56            16             31,158      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID64            16         28,242,452      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID72            16         22,906,973      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID80            16             72,898      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID88            16             56,907      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID96            16             20,456      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID104           16             40,913      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID112           16             78,113      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID120           16             37,897      ls_dmnd_fills_from_sys.ext_cache_remote

Also support perf stat record and perf stat report with the ability to
specify a different cache level to aggregate data at when running perf
stat report.

  $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \
    taskset -c 0-15,64-79,128-143,192-207 \
    perf bench sched messaging -p -t -l 100000 -g 8

  ...

   Performance counter stats for 'system wide':
  
  S0-D0-L2-ID0              2          1,442,061      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID1              2          1,548,994      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID2              2          1,553,557      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID3              2          1,420,122      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID4              2          1,465,461      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID5              2          1,455,153      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID6              2          1,595,237      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID7              2          1,499,321      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L2-ID8              2          1,919,025      ls_dmnd_fills_from_sys.ext_cache_remote
  ...
  S1-D1-L2-ID127            2             21,295      ls_dmnd_fills_from_sys.ext_cache_remote

  $ sudo perf stat report --per-cache=L3

   Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\
                                  taskset -c 0-15,64-79,128-143,192-207 \
                                  perf bench sched messaging -p -t -l 100000 -g 8':
  
  S0-D0-L3-ID0             16         11,979,906      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID8             16         14,257,202      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID16            16            377,484      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID24            16             27,224      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID32            16             26,816      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID40            16             14,461      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID48            16             10,499      ls_dmnd_fills_from_sys.ext_cache_remote
  S0-D0-L3-ID56            16             53,817      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID64            16         27,361,987      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID72            16         37,299,024      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID80            16             84,125      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID88            16             64,561      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID96            16             13,403      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID104           16             20,138      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID112           16             93,220      ls_dmnd_fills_from_sys.ext_cache_remote
  S1-D1-L3-ID120           16             35,465      ls_dmnd_fills_from_sys.ext_cache_remote

On the above system, the domain covered by S0-D0-L3-ID0 contains
S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is
equal to the sum of counts for L2-ID0 to L2-ID7.

Add documentation for the newly introduced "--per-cache" option.

Suggested-by: Gautham R. Shenoy <gautham.shenoy@....com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
Changelog:
o v3->v4:
  - Previously part of Patch 2.
  - Fixed errors in documentation.
---
 tools/perf/Documentation/perf-stat.txt | 16 ++++++++
 tools/perf/builtin-stat.c              | 56 ++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 29bdcfa93f04..785f0e2bcfac 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -308,6 +308,14 @@ use --per-die in addition to -a. (system-wide).  The output includes the
 die number and the number of online processors on that die. This is
 useful to gauge the amount of aggregation.
 
+--per-cache::
+Aggregate counts per cache instance for system-wide mode measurements.  By
+default, the aggregation happens for the cache level at the highest index
+in the system. To specify a particular level, mention the cache level
+alongside the option in the format [Ll][1-9][0-9]*. For example:
+Using option "--per-cache=l3" or "--per-cache=L3" will aggregate the
+information at the boundary of the level 3 cache in the system.
+
 --per-core::
 Aggregate counts per physical processor for system-wide mode measurements.  This
 is a useful mode to detect imbalance between physical cores.  To enable this mode,
@@ -379,6 +387,14 @@ Aggregate counts per processor socket for system-wide mode measurements.
 --per-die::
 Aggregate counts per processor die for system-wide mode measurements.
 
+--per-cache::
+Aggregate counts per cache instance for system-wide mode measurements.  By
+default, the aggregation happens for the cache level at the highest index
+in the system. To specify a particular level, mention the cache level
+alongside the option in the format [Ll][1-9][0-9]*. For example: Using
+option "--per-cache=l3" or "--per-cache=L3" will aggregate the
+information at the boundary of the level 3 cache in the system.
+
 --per-core::
 Aggregate counts per physical processor for system-wide mode measurements.
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index b072c4160fe1..7aafea5c7e6c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1100,6 +1100,55 @@ static int parse_hybrid_type(const struct option *opt,
 	return 0;
 }
 
+static int parse_cache_level(const struct option *opt,
+			     const char *str,
+			     int unset __maybe_unused)
+{
+	int level;
+	u32 *aggr_mode = (u32 *)opt->value;
+	u32 *aggr_level = (u32 *)opt->data;
+
+	/*
+	 * If no string is specified, aggregate based on the topology of
+	 * Last Level Cache (LLC). Since the LLC level can change from
+	 * architecture to architecture, set level greater than
+	 * MAX_CACHE_LVL which will be interpreted as LLC.
+	 */
+	if (str == NULL) {
+		level = MAX_CACHE_LVL + 1;
+		goto out;
+	}
+
+	/*
+	 * The format to specify cache level is LX or lX where X is the
+	 * cache level.
+	 */
+	if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) {
+		pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
+		       MAX_CACHE_LVL,
+		       MAX_CACHE_LVL);
+		return -EINVAL;
+	}
+
+	level = atoi(&str[1]);
+	if (level < 1) {
+		pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n",
+		       MAX_CACHE_LVL,
+		       MAX_CACHE_LVL);
+		return -EINVAL;
+	}
+
+	if (level > MAX_CACHE_LVL) {
+		pr_err("perf only supports max cache level of %d.\n"
+		       "Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL);
+		return -EINVAL;
+	}
+out:
+	*aggr_mode = AGGR_CACHE;
+	*aggr_level = level;
+	return 0;
+}
+
 static struct option stat_options[] = {
 	OPT_BOOLEAN('T', "transaction", &transaction_run,
 		    "hardware transaction statistics"),
@@ -1177,6 +1226,9 @@ static struct option stat_options[] = {
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode,
 		     "aggregate counts per processor die", AGGR_DIE),
+	OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level,
+			    "cache level", "aggregate count at this cache level (Default: LLC)",
+			    parse_cache_level),
 	OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode,
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode,
@@ -2200,6 +2252,10 @@ static int __cmd_report(int argc, const char **argv)
 		     "aggregate counts per processor socket", AGGR_SOCKET),
 	OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
 		     "aggregate counts per processor die", AGGR_DIE),
+	OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
+			    "cache level",
+			    "aggregate count at this cache level (Default: LLC)",
+			    parse_cache_level),
 	OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
 		     "aggregate counts per physical processor core", AGGR_CORE),
 	OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ