[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250719034515.2000467-13-irogers@google.com>
Date: Fri, 18 Jul 2025 20:45:08 -0700
From: Ian Rogers <irogers@...gle.com>
To: Thomas Falcon <thomas.falcon@...el.com>, Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>, Arnaldo Carvalho de Melo <acme@...nel.org>, Namhyung Kim <namhyung@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>, Jiri Olsa <jolsa@...nel.org>,
Ian Rogers <irogers@...gle.com>, Adrian Hunter <adrian.hunter@...el.com>,
Kan Liang <kan.liang@...ux.intel.com>,
"Andreas Färber" <afaerber@...e.de>, Manivannan Sadhasivam <mani@...nel.org>,
Caleb Biggers <caleb.biggers@...el.com>, Weilin Wang <weilin.wang@...el.com>,
linux-kernel@...r.kernel.org, linux-perf-users@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, linux-actions@...ts.infradead.org
Subject: [PATCH v1 12/19] perf vendor events: Update lunarlake events/metrics
Update events from v1.14 to v1.17. Update metrics from TMA 5.0 to 5.1.
The event updates come from:
https://github.com/intel/perfmon/commit/6bdcbce3e9df30ae02bd0ea51fd73bf51ee8aff4
https://github.com/intel/perfmon/commit/1684fa543fd45970759bb72dc3fc00c2ef87c0e8
Signed-off-by: Ian Rogers <irogers@...gle.com>
---
.../pmu-events/arch/x86/lunarlake/cache.json | 104 ++++++---
.../arch/x86/lunarlake/frontend.json | 40 ++--
.../arch/x86/lunarlake/lnl-metrics.json | 216 ++++++++++--------
.../pmu-events/arch/x86/lunarlake/memory.json | 22 +-
.../arch/x86/lunarlake/pipeline.json | 85 ++++---
.../x86/lunarlake/uncore-interconnect.json | 10 +
.../arch/x86/lunarlake/uncore-memory.json | 8 +
tools/perf/pmu-events/arch/x86/mapfile.csv | 2 +-
8 files changed, 294 insertions(+), 193 deletions(-)
create mode 100644 tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json
index ff37d49611c3..29bcb847178f 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json
@@ -28,6 +28,16 @@
"UMask": "0x1",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Cachelines replaced into the L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case",
+ "Counter": "0,1,2,3,4,5,6,7,8,9",
+ "EventCode": "0x51",
+ "EventName": "L1D.L1_REPLACEMENT",
+ "PublicDescription": "Counts cachelines replaced into the L1 d-cache.",
+ "SampleAfterValue": "1000003",
+ "UMask": "0x4",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Cachelines replaced into the L0 and L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case",
"Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -592,7 +602,7 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.ALL_LOADS",
- "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x81",
"Unit": "cpu_core"
@@ -603,7 +613,7 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.ALL_STORES",
- "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0",
+ "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x82",
"Unit": "cpu_core"
@@ -613,7 +623,7 @@
"Counter": "0,1,2,3",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.ALL_SWPF",
- "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0",
+ "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x84",
"Unit": "cpu_core"
@@ -624,7 +634,7 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.ANY",
- "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0",
+ "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x87",
"Unit": "cpu_core"
@@ -635,7 +645,7 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.LOCK_LOADS",
- "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x21",
"Unit": "cpu_core"
@@ -646,7 +656,7 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.SPLIT_LOADS",
- "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x41",
"Unit": "cpu_core"
@@ -657,18 +667,29 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.SPLIT_STORES",
- "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x42",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Retired instructions that hit the STLB.",
+ "Counter": "0,1,2,3",
+ "Data_LA": "1",
+ "EventCode": "0xd0",
+ "EventName": "MEM_INST_RETIRED.STLB_HIT_ANY",
+ "PublicDescription": "Number of retired instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
+ "SampleAfterValue": "100003",
+ "UMask": "0xf",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Retired load instructions that hit the STLB.",
"Counter": "0,1,2,3",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.STLB_HIT_LOADS",
- "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0",
+ "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x9",
"Unit": "cpu_core"
@@ -679,18 +700,39 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.STLB_HIT_STORES",
- "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0",
+ "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0xa",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Retired SWPF instructions that hit the STLB.",
+ "Counter": "0,1,2,3",
+ "EventCode": "0xd0",
+ "EventName": "MEM_INST_RETIRED.STLB_HIT_SWPF",
+ "PublicDescription": "Number of retired SWPF instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
+ "SampleAfterValue": "1000003",
+ "UMask": "0xc",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Retired instructions that miss the STLB.",
+ "Counter": "0,1,2,3",
+ "Data_LA": "1",
+ "EventCode": "0xd0",
+ "EventName": "MEM_INST_RETIRED.STLB_MISS_ANY",
+ "PublicDescription": "Retired instructions that miss the STLB. Available PDIST counters: 0,1",
+ "SampleAfterValue": "100003",
+ "UMask": "0x17",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Retired load instructions that miss the STLB.",
"Counter": "0,1,2,3",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.STLB_MISS_LOADS",
- "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0",
+ "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x11",
"Unit": "cpu_core"
@@ -701,18 +743,28 @@
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_INST_RETIRED.STLB_MISS_STORES",
- "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0",
+ "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x12",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Retired SWPF instructions that miss the STLB.",
+ "Counter": "0,1,2,3",
+ "EventCode": "0xd0",
+ "EventName": "MEM_INST_RETIRED.STLB_MISS_SWPF",
+ "PublicDescription": "Number of retired SWPF instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1",
+ "SampleAfterValue": "1000003",
+ "UMask": "0x14",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$)",
"Counter": "0,1,2,3",
"Data_LA": "1",
"EventCode": "0xd2",
"EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD",
- "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0,1",
"SampleAfterValue": "20011",
"UMask": "0x10",
"Unit": "cpu_core"
@@ -723,7 +775,7 @@
"Data_LA": "1",
"EventCode": "0xd2",
"EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM",
- "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0,1",
"SampleAfterValue": "20011",
"UMask": "0x4",
"Unit": "cpu_core"
@@ -734,7 +786,7 @@
"Data_LA": "1",
"EventCode": "0xd2",
"EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
- "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0",
+ "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0,1",
"SampleAfterValue": "20011",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -745,7 +797,7 @@
"Data_LA": "1",
"EventCode": "0xd2",
"EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD",
- "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0,1",
"SampleAfterValue": "20011",
"UMask": "0x2",
"Unit": "cpu_core"
@@ -756,7 +808,7 @@
"Data_LA": "1",
"EventCode": "0xd3",
"EventName": "MEM_LOAD_L3_MISS_RETIRED.MEMSIDE_CACHE",
- "PublicDescription": "Retired load instructions which data source is memory side cache. Available PDIST counters: 0",
+ "PublicDescription": "Retired load instructions which data source is memory side cache. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"Unit": "cpu_core"
},
@@ -766,7 +818,7 @@
"Data_LA": "1",
"EventCode": "0xd4",
"EventName": "MEM_LOAD_MISC_RETIRED.UC",
- "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0",
+ "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x4",
"Unit": "cpu_core"
@@ -777,7 +829,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.FB_HIT",
- "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x40",
"Unit": "cpu_core"
@@ -788,7 +840,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L1_HIT",
- "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x101",
"Unit": "cpu_core"
@@ -799,7 +851,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L1_HIT_L0",
- "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -809,7 +861,7 @@
"Counter": "0,1,2,3",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L1_HIT_L1",
- "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"Unit": "cpu_core"
},
@@ -819,7 +871,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L1_MISS",
- "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0,1",
"SampleAfterValue": "200003",
"UMask": "0x8",
"Unit": "cpu_core"
@@ -830,7 +882,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L2_HIT",
- "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0,1",
"SampleAfterValue": "200003",
"UMask": "0x2",
"Unit": "cpu_core"
@@ -841,7 +893,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L2_MISS",
- "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0,1",
"SampleAfterValue": "100021",
"UMask": "0x10",
"Unit": "cpu_core"
@@ -852,7 +904,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L3_HIT",
- "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0,1",
"SampleAfterValue": "100021",
"UMask": "0x4",
"Unit": "cpu_core"
@@ -863,7 +915,7 @@
"Data_LA": "1",
"EventCode": "0xd1",
"EventName": "MEM_LOAD_RETIRED.L3_MISS",
- "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0,1",
"SampleAfterValue": "50021",
"UMask": "0x20",
"Unit": "cpu_core"
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json
index e2facc4086e9..b21d602e9f1a 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json
@@ -108,7 +108,7 @@
"EventName": "FRONTEND_RETIRED.ANY_ANT",
"MSRIndex": "0x3F7",
"MSRValue": "0x9",
- "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0",
+ "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -120,7 +120,7 @@
"EventName": "FRONTEND_RETIRED.ANY_DSB_MISS",
"MSRIndex": "0x3F7",
"MSRValue": "0x1",
- "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -169,7 +169,7 @@
"EventName": "FRONTEND_RETIRED.DSB_MISS",
"MSRIndex": "0x3F7",
"MSRValue": "0x11",
- "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0",
+ "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -199,7 +199,7 @@
"EventName": "FRONTEND_RETIRED.ITLB_MISS",
"MSRIndex": "0x3F7",
"MSRValue": "0x14",
- "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -211,7 +211,7 @@
"EventName": "FRONTEND_RETIRED.L1I_MISS",
"MSRIndex": "0x3F7",
"MSRValue": "0x12",
- "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -223,7 +223,7 @@
"EventName": "FRONTEND_RETIRED.L2_MISS",
"MSRIndex": "0x3F7",
"MSRValue": "0x13",
- "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -235,7 +235,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_128",
"MSRIndex": "0x3F7",
"MSRValue": "0x608006",
- "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -247,7 +247,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_16",
"MSRIndex": "0x3F7",
"MSRValue": "0x601006",
- "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -259,7 +259,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_2",
"MSRIndex": "0x3F7",
"MSRValue": "0x600206",
- "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0",
+ "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -271,7 +271,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_256",
"MSRIndex": "0x3F7",
"MSRValue": "0x610006",
- "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -283,7 +283,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1",
"MSRIndex": "0x3F7",
"MSRValue": "0x100206",
- "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -295,7 +295,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_32",
"MSRIndex": "0x3F7",
"MSRValue": "0x602006",
- "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -307,7 +307,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_4",
"MSRIndex": "0x3F7",
"MSRValue": "0x600406",
- "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -319,7 +319,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_512",
"MSRIndex": "0x3F7",
"MSRValue": "0x620006",
- "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -331,7 +331,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_64",
"MSRIndex": "0x3F7",
"MSRValue": "0x604006",
- "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -343,7 +343,7 @@
"EventName": "FRONTEND_RETIRED.LATENCY_GE_8",
"MSRIndex": "0x3F7",
"MSRValue": "0x600806",
- "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -355,7 +355,7 @@
"EventName": "FRONTEND_RETIRED.MISP_ANT",
"MSRIndex": "0x3F7",
"MSRValue": "0x9",
- "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0",
+ "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x2",
"Unit": "cpu_core"
@@ -367,7 +367,7 @@
"EventName": "FRONTEND_RETIRED.MS_FLOWS",
"MSRIndex": "0x3F7",
"MSRValue": "0x8",
- "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0",
+ "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -397,7 +397,7 @@
"EventName": "FRONTEND_RETIRED.STLB_MISS",
"MSRIndex": "0x3F7",
"MSRValue": "0x15",
- "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
@@ -409,7 +409,7 @@
"EventName": "FRONTEND_RETIRED.UNKNOWN_BRANCH",
"MSRIndex": "0x3F7",
"MSRValue": "0x17",
- "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0",
+ "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x3",
"Unit": "cpu_core"
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json b/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json
index 3c740962e63e..06390a72110d 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json
@@ -1,74 +1,46 @@
[
{
"BriefDescription": "C10 residency percent per package",
- "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC",
+ "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@",
"MetricGroup": "Power",
"MetricName": "C10_Pkg_Residency",
"ScaleUnit": "100%"
},
{
"BriefDescription": "C1 residency percent per core",
- "MetricExpr": "cstate_core@c1\\-residency@ / TSC",
+ "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@",
"MetricGroup": "Power",
"MetricName": "C1_Core_Residency",
"ScaleUnit": "100%"
},
{
"BriefDescription": "C2 residency percent per package",
- "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC",
+ "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@",
"MetricGroup": "Power",
"MetricName": "C2_Pkg_Residency",
"ScaleUnit": "100%"
},
- {
- "BriefDescription": "C3 residency percent per package",
- "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC",
- "MetricGroup": "Power",
- "MetricName": "C3_Pkg_Residency",
- "ScaleUnit": "100%"
- },
{
"BriefDescription": "C6 residency percent per core",
- "MetricExpr": "cstate_core@c6\\-residency@ / TSC",
+ "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@",
"MetricGroup": "Power",
"MetricName": "C6_Core_Residency",
"ScaleUnit": "100%"
},
{
"BriefDescription": "C6 residency percent per package",
- "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC",
+ "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@",
"MetricGroup": "Power",
"MetricName": "C6_Pkg_Residency",
"ScaleUnit": "100%"
},
{
"BriefDescription": "C7 residency percent per core",
- "MetricExpr": "cstate_core@c7\\-residency@ / TSC",
+ "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@",
"MetricGroup": "Power",
"MetricName": "C7_Core_Residency",
"ScaleUnit": "100%"
},
- {
- "BriefDescription": "C7 residency percent per package",
- "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC",
- "MetricGroup": "Power",
- "MetricName": "C7_Pkg_Residency",
- "ScaleUnit": "100%"
- },
- {
- "BriefDescription": "C8 residency percent per package",
- "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC",
- "MetricGroup": "Power",
- "MetricName": "C8_Pkg_Residency",
- "ScaleUnit": "100%"
- },
- {
- "BriefDescription": "C9 residency percent per package",
- "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC",
- "MetricGroup": "Power",
- "MetricName": "C9_Pkg_Residency",
- "ScaleUnit": "100%"
- },
{
"BriefDescription": "Percentage of cycles spent in System Management Interrupts.",
"MetricExpr": "((msr@...rf@ - cycles) / msr@...rf@ if msr@smi@ > 0 else 0)",
@@ -555,7 +527,7 @@
},
{
"BriefDescription": "Average CPU Utilization",
- "MetricExpr": "cpu_atom@..._CLK_UNHALTED.REF_TSC@ / TSC",
+ "MetricExpr": "cpu_atom@..._CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_atom@",
"MetricName": "tma_info_system_cpu_utilization",
"Unit": "cpu_atom"
},
@@ -724,6 +696,13 @@
"ScaleUnit": "100%",
"Unit": "cpu_atom"
},
+ {
+ "BriefDescription": "Uncore frequency per die [GHZ]",
+ "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9",
+ "MetricGroup": "SoC",
+ "MetricName": "UNCORE_FREQ",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
"MetricExpr": "cpu_core@...S_DISPATCHED.ALU@ / (6 * tma_info_thread_clks)",
@@ -755,7 +734,7 @@
{
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
"DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_core@...down\\-be\\-bound@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-be\\-bound@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2",
@@ -767,7 +746,7 @@
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
"DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_core@...down\\-bad\\-spec@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-bad\\-spec@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15",
@@ -793,36 +772,36 @@
"PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
+ "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
+ "MetricGroup": "BvCB;Cor;tma_issueComp",
+ "MetricName": "tma_bottleneck_compute_bound_est",
+ "MetricThreshold": "tma_bottleneck_compute_bound_est > 20",
+ "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks",
- "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))",
+ "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)))",
"MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW",
- "MetricName": "tma_bottleneck_cache_memory_bandwidth",
- "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20",
+ "MetricName": "tma_bottleneck_data_cache_memory_bandwidth",
+ "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20",
"PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full",
"Unit": "cpu_core"
},
{
"BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks",
- "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_capacity / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+ "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_capacity / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
"MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat",
- "MetricName": "tma_bottleneck_cache_memory_latency",
- "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20",
+ "MetricName": "tma_bottleneck_data_cache_memory_latency",
+ "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20",
"PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency",
"Unit": "cpu_core"
},
- {
- "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation",
- "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))",
- "MetricGroup": "BvCB;Cor;tma_issueComp",
- "MetricName": "tma_bottleneck_compute_bound_est",
- "MetricThreshold": "tma_bottleneck_compute_bound_est > 20",
- "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ",
- "Unit": "cpu_core"
- },
{
"BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)",
- "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@...T_RETIRED.REP_ITERATION@ / cpu_core@...S_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code",
+ "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@...T_RETIRED.REP_ITERATION@ / cpu_core@...S_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code",
"MetricGroup": "BvFB;Fed;FetchBW;Frontend",
"MetricName": "tma_bottleneck_instruction_fetch_bw",
"MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20",
@@ -830,7 +809,7 @@
},
{
"BriefDescription": "Total pipeline cost of irregular execution (e.g",
- "MetricExpr": "100 * ((1 - cpu_core@...T_RETIRED.REP_ITERATION@ / cpu_core@...S_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@...EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_microcode_sequencer + tma_few_uops_instructions) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
+ "MetricExpr": "100 * ((1 - cpu_core@...T_RETIRED.REP_ITERATION@ / cpu_core@...S_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@...EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_microcode_sequencer + tma_few_uops_instructions) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)",
"MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS",
"MetricName": "tma_bottleneck_irregular_overhead",
"MetricThreshold": "tma_bottleneck_irregular_overhead > 10",
@@ -839,7 +818,7 @@
},
{
"BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
- "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
+ "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))",
"MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB",
"MetricName": "tma_bottleneck_memory_data_tlbs",
"MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20",
@@ -866,7 +845,7 @@
},
{
"BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end",
- "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)",
+ "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)",
"MetricGroup": "BvOB;Cor;Offcore",
"MetricName": "tma_bottleneck_other_bottlenecks",
"MetricThreshold": "tma_bottleneck_other_bottlenecks > 20",
@@ -883,7 +862,7 @@
},
{
"BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
- "MetricExpr": "cpu_core@...down\\-br\\-mispredict@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-br\\-mispredict@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
@@ -1023,7 +1002,6 @@
},
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
- "MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "(cpu_core@..._LOAD_L3_HIT_RETIRED.XSNP_MISS@ * min(cpu_core@..._LOAD_L3_HIT_RETIRED.XSNP_MISS@R, 24 * tma_info_system_core_frequency) + cpu_core@..._LOAD_L3_HIT_RETIRED.XSNP_HITM@ * min(cpu_core@..._LOAD_L3_HIT_RETIRED.XSNP_HITM@R, 25 * tma_info_system_core_frequency)) * (1 + cpu_core@..._LOAD_RETIRED.FB_HIT@ / cpu_core@..._LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
"MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_contested_accesses",
@@ -1076,7 +1054,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
- "MetricExpr": "(cpu_core@....DSB_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ + cpu_core@....DSB_UOPS@ / (cpu_core@....DSB_UOPS@ + cpu_core@....MITE_UOPS@) * (cpu_core@..._BUBBLES.CYCLES_0_UOPS_DELIV.CORE@ - cpu_core@..._BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks",
+ "MetricExpr": "(cpu_core@....DSB_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@....DSB_UOPS@ / (cpu_core@....DSB_UOPS@ + cpu_core@....MITE_UOPS@) * (cpu_core@..._BUBBLES.STARVATION_CYCLES@ - cpu_core@..._BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks",
"MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_dsb",
"MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2",
@@ -1130,7 +1108,7 @@
"MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
"MetricName": "tma_fb_full",
"MetricThreshold": "tma_fb_full > 0.3",
- "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -1147,7 +1125,7 @@
},
{
"BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
- "MetricExpr": "cpu_core@...down\\-fetch\\-lat@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-fetch\\-lat@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
@@ -1197,7 +1175,7 @@
},
{
"BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
- "MetricExpr": "cpu_core@...ARITH_INST_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricExpr": "cpu_core@...ARITH_OPS_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)",
"MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
"MetricName": "tma_fp_scalar",
"MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
@@ -1207,7 +1185,7 @@
},
{
"BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
- "MetricExpr": "cpu_core@...ARITH_INST_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricExpr": "cpu_core@...ARITH_OPS_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)",
"MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P",
"MetricName": "tma_fp_vector",
"MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)",
@@ -1217,7 +1195,7 @@
},
{
"BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
- "MetricExpr": "(cpu_core@...ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@...ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)",
+ "MetricExpr": "(cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)",
"MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
"MetricName": "tma_fp_vector_128b",
"MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
@@ -1227,7 +1205,7 @@
},
{
"BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
- "MetricExpr": "cpu_core@...ARITH_INST_RETIRED.VECTOR\\,umask\\=0x30@ / (tma_retiring * tma_info_thread_slots)",
+ "MetricExpr": "cpu_core@...ARITH_OPS_RETIRED.VECTOR\\,umask\\=0x30@ / (tma_retiring * tma_info_thread_slots)",
"MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P",
"MetricName": "tma_fp_vector_256b",
"MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))",
@@ -1238,7 +1216,7 @@
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
"DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_core@...down\\-fe\\-bound@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-fe\\-bound@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "BvFB;BvIO;Default;PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15",
@@ -1259,7 +1237,7 @@
},
{
"BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences",
- "MetricExpr": "cpu_core@...down\\-heavy\\-ops@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-heavy\\-ops@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1",
@@ -1437,7 +1415,7 @@
},
{
"BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "(cpu_core@...ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@...ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@...ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@...ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_thread_clks",
+ "MetricExpr": "(cpu_core@...ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@...ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@...ARITH_OPS_RETIRED.256B_PACKED_SINGLE@) / tma_info_thread_clks",
"MetricGroup": "Flops;Ret",
"MetricName": "tma_info_core_flopc",
"Unit": "cpu_core"
@@ -1578,7 +1556,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_INST_RETIRED.SCALAR@ + cpu_core@...ARITH_INST_RETIRED.VECTOR@)",
+ "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_OPS_RETIRED.SCALAR@ + cpu_core@...ARITH_OPS_RETIRED.VECTOR@)",
"MetricGroup": "Flops;InsType",
"MetricName": "tma_info_inst_mix_iparith",
"MetricThreshold": "tma_info_inst_mix_iparith < 10",
@@ -1587,7 +1565,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@...ARITH_INST_RETIRED.128B_PACKED_SINGLE@)",
+ "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_SINGLE@)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "tma_info_inst_mix_iparith_avx128",
"MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10",
@@ -1596,7 +1574,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@...ARITH_INST_RETIRED.256B_PACKED_SINGLE@)",
+ "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_OPS_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@...ARITH_OPS_RETIRED.256B_PACKED_SINGLE@)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "tma_info_inst_mix_iparith_avx256",
"MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10",
@@ -1605,7 +1583,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / cpu_core@...ARITH_INST_RETIRED.SCALAR_DOUBLE@",
+ "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / cpu_core@...ARITH_OPS_RETIRED.SCALAR_DOUBLE@",
"MetricGroup": "Flops;FpScalar;InsType",
"MetricName": "tma_info_inst_mix_iparith_scalar_dp",
"MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10",
@@ -1614,7 +1592,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / cpu_core@...ARITH_INST_RETIRED.SCALAR_SINGLE@",
+ "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / cpu_core@...ARITH_OPS_RETIRED.SCALAR_SINGLE@",
"MetricGroup": "Flops;FpScalar;InsType",
"MetricName": "tma_info_inst_mix_iparith_scalar_sp",
"MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10",
@@ -1639,7 +1617,7 @@
},
{
"BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
- "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@...ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@...ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@...ARITH_INST_RETIRED.256B_PACKED_SINGLE@)",
+ "MetricExpr": "cpu_core@...T_RETIRED.ANY@ / (cpu_core@...ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@...ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@...ARITH_OPS_RETIRED.256B_PACKED_SINGLE@)",
"MetricGroup": "Flops;InsType",
"MetricName": "tma_info_inst_mix_ipflop",
"MetricThreshold": "tma_info_inst_mix_ipflop < 10",
@@ -1694,7 +1672,7 @@
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "64 * cpu_core@....REPLACEMENT@ / 1e9 / tma_info_system_time",
+ "MetricExpr": "64 * cpu_core@....L1_REPLACEMENT@ / 1e9 / tma_info_system_time",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "tma_info_memory_l1d_cache_fill_bw",
"Unit": "cpu_core"
@@ -1706,6 +1684,13 @@
"MetricName": "tma_info_memory_l1dl0_cache_fill_bw",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "L0 cache true misses per kilo instruction for retired demand loads",
+ "MetricExpr": "1e3 * (cpu_core@..._LOAD_RETIRED.L1_MISS@ + cpu_core@..._LOAD_RETIRED.L1_HIT_L1@) / cpu_core@...T_RETIRED.ANY@",
+ "MetricGroup": "CacheHits;Mem",
+ "MetricName": "tma_info_memory_l1dl0_mpki",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1e3 * cpu_core@..._LOAD_RETIRED.L1_MISS@ / cpu_core@...T_RETIRED.ANY@",
@@ -1921,6 +1906,13 @@
"MetricName": "tma_info_pipeline_fetch_mite",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Average number of uops fetched from MS per cycle",
+ "MetricExpr": "cpu_core@....MS_UOPS@ / cpu_core@....MS_UOPS\\,cmask\\=1@",
+ "MetricGroup": "Fed;FetchLat;MicroSeq",
+ "MetricName": "tma_info_pipeline_fetch_ms",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Instructions per a microcode Assist invocation",
"MetricExpr": "cpu_core@...T_RETIRED.ANY@ / cpu_core@...ISTS.ANY@",
@@ -1955,7 +1947,7 @@
},
{
"BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]",
- "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time",
+ "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc\\,cpu=cpu_core@ / 1e9 / tma_info_system_time",
"MetricGroup": "Power;Summary",
"MetricName": "tma_info_system_core_frequency",
"Unit": "cpu_core"
@@ -1969,14 +1961,22 @@
},
{
"BriefDescription": "Average number of utilized CPUs",
- "MetricExpr": "cpu_core@..._CLK_UNHALTED.REF_TSC@ / TSC",
+ "MetricExpr": "cpu_core@..._CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_core@",
"MetricGroup": "Summary",
"MetricName": "tma_info_system_cpus_utilized",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+ "MetricExpr": "32 * UNC_M_TOTAL_DATA / 1e9 / tma_info_system_time",
+ "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW",
+ "MetricName": "tma_info_system_dram_bw_use",
+ "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Giga Floating Point Operations Per Second",
- "MetricExpr": "(cpu_core@...ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@...ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@...ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@...ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / tma_info_system_time",
+ "MetricExpr": "(cpu_core@...ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@...ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@...ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@...ARITH_OPS_RETIRED.256B_PACKED_SINGLE@) / 1e9 / tma_info_system_time",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "tma_info_system_gflops",
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width",
@@ -2020,6 +2020,13 @@
"MetricName": "tma_info_system_power",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Socket actual clocks when any core is active on that socket",
+ "MetricExpr": "UNC_CLOCK.SOCKET",
+ "MetricGroup": "SoC",
+ "MetricName": "tma_info_system_socket_clks",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Run duration time in seconds",
"MetricExpr": "duration_time",
@@ -2035,6 +2042,13 @@
"MetricName": "tma_info_system_turbo_utilization",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]",
+ "MetricExpr": "tma_info_system_socket_clks / 1e9 / tma_info_system_time",
+ "MetricGroup": "SoC",
+ "MetricName": "tma_info_system_uncore_frequency",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
"MetricExpr": "cpu_core@..._CLK_UNHALTED.THREAD@",
@@ -2156,12 +2170,12 @@
"Unit": "cpu_core"
},
{
- "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache",
- "MetricExpr": "4 * cpu_core@...ENDENT_LOADS.ANY@ / tma_info_thread_clks",
+ "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache",
+ "MetricExpr": "4 * cpu_core@...ENDENT_LOADS.ANY\\,cmask\\=1@ / tma_info_thread_clks",
"MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_l1_latency_dependency",
"MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS",
+ "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -2177,7 +2191,6 @@
},
{
"BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)",
- "MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "cpu_core@..._LOAD_RETIRED.L2_HIT@ * min(cpu_core@..._LOAD_RETIRED.L2_HIT@R, 3 * tma_info_system_core_frequency) * (1 + cpu_core@..._LOAD_RETIRED.FB_HIT@ / cpu_core@..._LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
"MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group",
"MetricName": "tma_l2_hit_latency",
@@ -2198,12 +2211,11 @@
},
{
"BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
- "MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "cpu_core@..._LOAD_RETIRED.L3_HIT@ * min(cpu_core@..._LOAD_RETIRED.L3_HIT@R, 9 * tma_info_system_core_frequency) * (1 + cpu_core@..._LOAD_RETIRED.FB_HIT@ / cpu_core@..._LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks",
"MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
"MetricName": "tma_l3_hit_latency",
"MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency",
+ "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -2285,6 +2297,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
+ "MetricConstraint": "NO_GROUP_EVENTS",
"MetricExpr": "cpu_core@..._INST_RETIRED.LOCK_LOADS@ * cpu_core@..._INST_RETIRED.LOCK_LOADS@R / tma_info_thread_clks",
"MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
"MetricName": "tma_lock_latency",
@@ -2295,7 +2308,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit",
- "MetricExpr": "cpu_core@....UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / tma_info_thread_clks",
+ "MetricExpr": "cpu_core@....UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / tma_info_thread_clks / 2",
"MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_lsd",
"MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2",
@@ -2320,7 +2333,7 @@
"MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
"MetricName": "tma_mem_bandwidth",
"MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -2330,13 +2343,13 @@
"MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
"MetricName": "tma_mem_latency",
"MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency",
+ "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
{
"BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
- "MetricExpr": "cpu_core@...down\\-mem\\-bound@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-mem\\-bound@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
@@ -2347,7 +2360,6 @@
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.",
- "MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "13 * cpu_core@...C2_RETIRED.LFENCE@ / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
"MetricName": "tma_memory_fence",
@@ -2386,7 +2398,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
- "MetricExpr": "(cpu_core@....MITE_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@....MITE_UOPS@ / (cpu_core@....DSB_UOPS@ + cpu_core@....MITE_UOPS@) * (cpu_core@..._BUBBLES.CYCLES_0_UOPS_DELIV.CORE@ - cpu_core@..._BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks",
+ "MetricExpr": "(cpu_core@....MITE_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@....MITE_UOPS@ / (cpu_core@....DSB_UOPS@ + cpu_core@....MITE_UOPS@) * (cpu_core@..._BUBBLES.STARVATION_CYCLES@ - cpu_core@..._BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks",
"MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_mite",
"MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2",
@@ -2406,7 +2418,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.",
- "MetricExpr": "cpu_core@....MS_CYCLES_ANY@ / tma_info_thread_clks",
+ "MetricExpr": "cpu_core@....MS_CYCLES_ANY@ / tma_info_thread_clks / 1.8",
"MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_ms",
"MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2",
@@ -2445,7 +2457,8 @@
},
{
"BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes",
- "MetricExpr": "max(0, tma_light_operations - (tma_x87_use + (cpu_core@...ARITH_INST_RETIRED.SCALAR@ + cpu_core@...ARITH_INST_RETIRED.VECTOR@) / (tma_retiring * tma_info_thread_slots) + (cpu_core@..._VEC_RETIRED.ADD_128@ + cpu_core@..._VEC_RETIRED.VNNI_128@ + cpu_core@..._VEC_RETIRED.ADD_256@ + cpu_core@..._VEC_RETIRED.MUL_256@ + cpu_core@..._VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots) + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
+ "MetricConstraint": "NO_GROUP_EVENTS",
+ "MetricExpr": "max(0, tma_light_operations - (tma_x87_use + (cpu_core@...ARITH_OPS_RETIRED.SCALAR@ + cpu_core@...ARITH_OPS_RETIRED.VECTOR@) / (tma_retiring * tma_info_thread_slots) + (cpu_core@..._VEC_RETIRED.ADD_128@ + cpu_core@..._VEC_RETIRED.VNNI_128@ + cpu_core@..._VEC_RETIRED.ADD_256@ + cpu_core@..._VEC_RETIRED.MUL_256@ + cpu_core@..._VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots) + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))",
"MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group",
"MetricName": "tma_other_light_ops",
"MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6",
@@ -2483,6 +2496,7 @@
},
{
"BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
+ "MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "((cpu_core@..._ACTIVITY.EXE_BOUND_0_PORTS@ + (cpu_core@..._ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@..._ACTIVITY.2_3_PORTS_UTIL@)) / tma_info_thread_clks if cpu_core@...TH.DIV_ACTIVE@ < cpu_core@...LE_ACTIVITY.STALLS_TOTAL@ - cpu_core@..._ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@..._ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@..._ACTIVITY.2_3_PORTS_UTIL@) / tma_info_thread_clks)",
"MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
"MetricName": "tma_ports_utilization",
@@ -2493,6 +2507,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricConstraint": "NO_THRESHOLD_AND_NMI",
"MetricExpr": "cpu_core@..._ACTIVITY.EXE_BOUND_0_PORTS@ / tma_info_thread_clks",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_0",
@@ -2503,6 +2518,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricConstraint": "NO_THRESHOLD_AND_NMI",
"MetricExpr": "cpu_core@..._ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_1",
@@ -2513,7 +2529,6 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "cpu_core@..._ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_2",
@@ -2524,7 +2539,6 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "cpu_core@...S_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks",
"MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_3m",
@@ -2545,7 +2559,7 @@
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
"DefaultMetricgroupName": "TopdownL1",
- "MetricExpr": "cpu_core@...down\\-retiring@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@) + 0 * tma_info_thread_slots",
+ "MetricExpr": "cpu_core@...down\\-retiring@ / (cpu_core@...down\\-fe\\-bound@ + cpu_core@...down\\-bad\\-spec@ + cpu_core@...down\\-retiring@ + cpu_core@...down\\-be\\-bound@)",
"MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
@@ -2576,7 +2590,6 @@
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
- "MetricConstraint": "NO_GROUP_EVENTS_NMI",
"MetricExpr": "cpu_core@..._CLK_UNHALTED.PAUSE@ / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group",
"MetricName": "tma_slow_pause",
@@ -2611,7 +2624,7 @@
"MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
"MetricName": "tma_sq_full",
"MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth",
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
@@ -2625,6 +2638,15 @@
"ScaleUnit": "100%",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "This metric estimates clocks wasted due to loads blocked due to unknown store address (did not do memory disambiguation) or due to unknown store data",
+ "MetricExpr": "7 * cpu_core@...BLOCKS.STORE_EARLY\\,cmask\\=1@ / tma_info_thread_clks",
+ "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
+ "MetricName": "tma_store_early_blk",
+ "MetricThreshold": "tma_store_early_blk > 0.2",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
"MetricExpr": "13 * cpu_core@...BLOCKS.STORE_FORWARD@ / tma_info_thread_clks",
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json
index 8021a1c7dd3b..25021cb76f61 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json
@@ -163,7 +163,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024",
"MSRIndex": "0x3F6",
"MSRValue": "0x400",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "53",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -176,7 +176,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128",
"MSRIndex": "0x3F6",
"MSRValue": "0x80",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "1009",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -189,7 +189,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16",
"MSRIndex": "0x3F6",
"MSRValue": "0x10",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "20011",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -202,7 +202,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_2048",
"MSRIndex": "0x3F6",
"MSRValue": "0x800",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "23",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -215,7 +215,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256",
"MSRIndex": "0x3F6",
"MSRValue": "0x100",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "503",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -228,7 +228,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32",
"MSRIndex": "0x3F6",
"MSRValue": "0x20",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "100007",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -241,7 +241,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4",
"MSRIndex": "0x3F6",
"MSRValue": "0x4",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "100003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -254,7 +254,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512",
"MSRIndex": "0x3F6",
"MSRValue": "0x200",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "101",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -267,7 +267,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64",
"MSRIndex": "0x3F6",
"MSRValue": "0x40",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "2003",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -280,7 +280,7 @@
"EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8",
"MSRIndex": "0x3F6",
"MSRValue": "0x8",
- "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0",
+ "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency.",
"SampleAfterValue": "50021",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -291,7 +291,7 @@
"Data_LA": "1",
"EventCode": "0xcd",
"EventName": "MEM_TRANS_RETIRED.STORE_SAMPLE",
- "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0",
+ "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x2",
"Unit": "cpu_core"
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json
index 6ac410510628..cdaa01e9a57d 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json
@@ -110,7 +110,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.ALL_BRANCHES",
- "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"Unit": "cpu_core"
},
@@ -128,7 +128,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.COND",
- "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x111",
"Unit": "cpu_core"
@@ -147,7 +147,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.COND_NTAKEN",
- "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x10",
"Unit": "cpu_core"
@@ -166,7 +166,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.COND_TAKEN",
- "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x101",
"Unit": "cpu_core"
@@ -176,7 +176,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.COND_TAKEN_BWD",
- "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -186,7 +186,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.COND_TAKEN_FWD",
- "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x102",
"Unit": "cpu_core"
@@ -205,7 +205,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.FAR_BRANCH",
- "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x40",
"Unit": "cpu_core"
@@ -224,7 +224,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.INDIRECT",
- "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0",
+ "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x80",
"Unit": "cpu_core"
@@ -261,7 +261,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.NEAR_CALL",
- "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x2",
"Unit": "cpu_core"
@@ -280,13 +280,13 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.NEAR_RETURN",
- "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x8",
"Unit": "cpu_core"
},
{
- "BriefDescription": "Counts the number of taken branch instructions retired",
+ "BriefDescription": "Counts the number of near taken branch instructions retired",
"Counter": "0,1,2,3,4,5,6,7",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.NEAR_TAKEN",
@@ -299,7 +299,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc4",
"EventName": "BR_INST_RETIRED.NEAR_TAKEN",
- "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x20",
"Unit": "cpu_core"
@@ -327,7 +327,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.ALL_BRANCHES",
- "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0",
+ "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"Unit": "cpu_core"
},
@@ -336,7 +336,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.ALL_BRANCHES_COST",
- "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x44",
"Unit": "cpu_core"
@@ -355,7 +355,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND",
- "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x111",
"Unit": "cpu_core"
@@ -365,7 +365,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_COST",
- "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x151",
"Unit": "cpu_core"
@@ -384,7 +384,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_NTAKEN",
- "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0",
+ "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x10",
"Unit": "cpu_core"
@@ -394,7 +394,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_NTAKEN_COST",
- "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x50",
"Unit": "cpu_core"
@@ -413,7 +413,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_TAKEN",
- "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x101",
"Unit": "cpu_core"
@@ -423,7 +423,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD",
- "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x1",
"Unit": "cpu_core"
@@ -433,7 +433,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD_COST",
- "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x8001",
"Unit": "cpu_core"
@@ -443,7 +443,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_TAKEN_COST",
- "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x141",
"Unit": "cpu_core"
@@ -453,7 +453,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD",
- "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"Unit": "cpu_core"
},
@@ -462,7 +462,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD_COST",
- "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x8002",
"Unit": "cpu_core"
@@ -481,7 +481,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.INDIRECT",
- "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0",
+ "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0x80",
"Unit": "cpu_core"
@@ -500,7 +500,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.INDIRECT_CALL",
- "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. Available PDIST counters: 0",
+ "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x2",
"Unit": "cpu_core"
@@ -510,7 +510,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.INDIRECT_CALL_COST",
- "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x42",
"Unit": "cpu_core"
@@ -520,7 +520,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.INDIRECT_COST",
- "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "100003",
"UMask": "0xc0",
"Unit": "cpu_core"
@@ -548,7 +548,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.NEAR_TAKEN",
- "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0",
+ "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x20",
"Unit": "cpu_core"
@@ -558,7 +558,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.NEAR_TAKEN_COST",
- "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "400009",
"UMask": "0x60",
"Unit": "cpu_core"
@@ -568,7 +568,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.RET",
- "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. Available PDIST counters: 0",
+ "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x8",
"Unit": "cpu_core"
@@ -587,7 +587,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc5",
"EventName": "BR_MISP_RETIRED.RET_COST",
- "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0",
+ "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1",
"SampleAfterValue": "100007",
"UMask": "0x48",
"Unit": "cpu_core"
@@ -906,7 +906,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc0",
"EventName": "INST_RETIRED.ANY_P",
- "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0",
+ "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0,1",
"SampleAfterValue": "2000003",
"Unit": "cpu_core"
},
@@ -915,7 +915,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc0",
"EventName": "INST_RETIRED.BR_FUSED",
- "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0",
+ "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x10",
"Unit": "cpu_core"
@@ -925,7 +925,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc0",
"EventName": "INST_RETIRED.MACRO_FUSED",
- "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0",
+ "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0,1",
"SampleAfterValue": "2000003",
"UMask": "0x30",
"Unit": "cpu_core"
@@ -935,7 +935,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc0",
"EventName": "INST_RETIRED.NOP",
- "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0",
+ "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0,1",
"SampleAfterValue": "2000003",
"UMask": "0x2",
"Unit": "cpu_core"
@@ -954,7 +954,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xc0",
"EventName": "INST_RETIRED.REP_ITERATION",
- "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0",
+ "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0,1",
"SampleAfterValue": "2000003",
"UMask": "0x8",
"Unit": "cpu_core"
@@ -1227,6 +1227,15 @@
"UMask": "0x88",
"Unit": "cpu_core"
},
+ {
+ "BriefDescription": "Counts the number of times a load got early blocked due to preceding store operation with unknown address or unknown data. Excluding in-line (immediate) wakeups",
+ "Counter": "0,1,2,3,4,5,6,7,8,9",
+ "EventCode": "0x03",
+ "EventName": "LD_BLOCKS.STORE_EARLY",
+ "SampleAfterValue": "100003",
+ "UMask": "0xa1",
+ "Unit": "cpu_core"
+ },
{
"BriefDescription": "Counts the number of occurrences a retired load gets blocked because its address partially overlaps with an older store (size mismatch) - unknown_sta/bad_forward",
"Counter": "0,1,2,3,4,5,6,7",
@@ -1451,7 +1460,7 @@
"Counter": "0,1,2,3,4,5,6,7,8,9",
"EventCode": "0xe4",
"EventName": "MISC_RETIRED.LBR_INSERTS",
- "PublicDescription": "LBR record is inserted Available PDIST counters: 0",
+ "PublicDescription": "LBR record is inserted Available PDIST counters: 0,1",
"SampleAfterValue": "1000003",
"UMask": "0x1",
"Unit": "cpu_core"
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json
new file mode 100644
index 000000000000..69ef928d57f6
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json
@@ -0,0 +1,10 @@
+[
+ {
+ "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "Counter": "FIXED",
+ "EventCode": "0xff",
+ "EventName": "UNC_CLOCK.SOCKET",
+ "PerPkg": "1",
+ "Unit": "SANTA"
+ }
+]
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json
index 7d63580302de..63c4aa2791e4 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json
@@ -32,5 +32,13 @@
"Experimental": "1",
"PerPkg": "1",
"Unit": "iMC"
+ },
+ {
+ "BriefDescription": "Total number of read and write byte transfers to/from DRAM, in 32B chunk, per DDR channel. Counter increments by 1 after sending or receiving 32B chunk data.",
+ "Counter": "0,1,2,3,4",
+ "EventCode": "0x3C",
+ "EventName": "UNC_M_TOTAL_DATA",
+ "PerPkg": "1",
+ "Unit": "iMC"
}
]
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 2d7a9fa055dd..894a3f485a4a 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -22,7 +22,7 @@ GenuineIntel-6-3A,v24,ivybridge,core
GenuineIntel-6-3E,v24,ivytown,core
GenuineIntel-6-2D,v24,jaketown,core
GenuineIntel-6-(57|85),v16,knightslanding,core
-GenuineIntel-6-BD,v1.14,lunarlake,core
+GenuineIntel-6-BD,v1.17,lunarlake,core
GenuineIntel-6-(AA|AC|B5),v1.14,meteorlake,core
GenuineIntel-6-1[AEF],v4,nehalemep,core
GenuineIntel-6-2E,v4,nehalemex,core
--
2.50.0.727.gbf7dc18ff4-goog
Powered by blists - more mailing lists