[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230515215844.653610-6-irogers@google.com>
Date: Mon, 15 May 2023 14:58:34 -0700
From: Ian Rogers <irogers@...gle.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>,
Kan Liang <kan.liang@...ux.intel.com>,
Zhengjun Xing <zhengjun.xing@...ux.intel.com>,
John Garry <john.garry@...wei.com>,
Kajol Jain <kjain@...ux.ibm.com>,
Thomas Richter <tmricht@...ux.ibm.com>,
linux-kernel@...r.kernel.org, linux-perf-users@...r.kernel.org
Subject: [PATCH v1 05/15] perf vendor events intel: Update haswell(x) metrics
Metrics are updated to make TMA info metric names
synchronized. Metrics were generated by:
https://github.com/intel/perfmon/blob/main/scripts/create_perf_json.py
Signed-off-by: Ian Rogers <irogers@...gle.com>
---
.../arch/x86/haswell/hsw-metrics.json | 484 ++++++------
.../arch/x86/haswellx/hsx-metrics.json | 700 ++++++++++++------
2 files changed, 696 insertions(+), 488 deletions(-)
diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
index 9570a88d6d1c..79d89c263677 100644
--- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json
@@ -50,7 +50,7 @@
},
{
"BriefDescription": "Uncore frequency per die [GHZ]",
- "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9",
+ "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9",
"MetricGroup": "SoC",
"MetricName": "UNCORE_FREQ"
},
@@ -71,7 +71,7 @@
},
{
"BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
- "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks",
+ "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_4k_aliasing",
"MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -81,7 +81,7 @@
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
- "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
"MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
"MetricName": "tma_alu_op_utilization",
"MetricThreshold": "tma_alu_op_utilization > 0.6",
@@ -89,7 +89,7 @@
},
{
"BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
- "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots",
+ "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
"MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
"MetricName": "tma_assists",
"MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -109,7 +109,7 @@
},
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
- "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots",
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots",
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15",
@@ -125,12 +125,12 @@
"MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
- "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks",
+ "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks",
"MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_branch_resteers",
"MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -150,7 +150,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_clks",
+ "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / tma_info_thread_clks",
"MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_contested_accesses",
"MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -171,7 +171,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks",
+ "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks",
"MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_data_sharing",
"MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -180,7 +180,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
- "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_clks",
+ "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_core_clks",
"MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group",
"MetricName": "tma_divider",
"MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -190,7 +190,7 @@
{
"BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
"MetricConstraint": "NO_GROUP_EVENTS_SMT",
- "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks",
+ "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
"MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_dram_bound",
"MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -199,25 +199,25 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
- "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2",
+ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
"MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_dsb",
- "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)",
+ "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
"PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
- "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks",
"MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
"MetricName": "tma_dsb_switches",
"MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
- "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
- "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks",
+ "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks",
"MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
"MetricName": "tma_dtlb_load",
"MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -226,7 +226,7 @@
},
{
"BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
- "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_clks",
+ "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_thread_clks",
"MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
"MetricName": "tma_dtlb_store",
"MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -235,7 +235,7 @@
},
{
"BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
- "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE / tma_info_clks",
+ "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE / tma_info_thread_clks",
"MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
"MetricName": "tma_false_sharing",
"MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -245,11 +245,11 @@
{
"BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "tma_info_load_miss_real_latency * cpu@..._PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_clks",
+ "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@..._PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_thread_clks",
"MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
"MetricName": "tma_fb_full",
"MetricThreshold": "tma_fb_full > 0.3",
- "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
"ScaleUnit": "100%"
},
{
@@ -257,14 +257,14 @@
"MetricExpr": "tma_frontend_bound - tma_fetch_latency",
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth",
- "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+ "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
- "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots",
+ "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots",
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
@@ -274,7 +274,7 @@
},
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots",
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots",
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15",
@@ -294,324 +294,324 @@
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
- "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks",
+ "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
"MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_icache_misses",
"MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
"ScaleUnit": "100%"
},
{
- "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
- "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time",
- "MetricGroup": "Power;Summary",
- "MetricName": "tma_info_average_frequency"
- },
- {
- "BriefDescription": "Branch instructions per taken branch.",
- "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
- "MetricGroup": "Branches;Fed;PGO",
- "MetricName": "tma_info_bptkbranch"
+ "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
+ "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@...MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+ "MetricGroup": "Bad;BrMispredicts",
+ "MetricName": "tma_info_bad_spec_ipmisp_indirect",
+ "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
},
{
- "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Pipeline",
- "MetricName": "tma_info_clks"
+ "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
+ "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricGroup": "Bad;BadSpec;BrMispredicts",
+ "MetricName": "tma_info_bad_spec_ipmispredict",
+ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
},
{
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
- "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))",
+ "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))",
"MetricGroup": "SMT",
- "MetricName": "tma_info_core_clks"
+ "MetricName": "tma_info_core_core_clks"
},
{
"BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks",
+ "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks",
"MetricGroup": "Ret;SMT;TmaL1;tma_L1_group",
- "MetricName": "tma_info_coreipc"
+ "MetricName": "tma_info_core_coreipc"
},
{
- "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
- "MetricExpr": "1 / tma_info_ipc",
- "MetricGroup": "Mem;Pipeline",
- "MetricName": "tma_info_cpi"
- },
- {
- "BriefDescription": "Average CPU Utilization",
- "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
- "MetricGroup": "HPC;Summary",
- "MetricName": "tma_info_cpu_utilization"
- },
- {
- "BriefDescription": "Average Parallel L2 cache miss data reads",
- "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
- "MetricGroup": "Memory_BW;Offcore",
- "MetricName": "tma_info_data_l2_mlp"
- },
- {
- "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
- "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
- "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
- "MetricName": "tma_info_dram_bw_use",
- "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
+ "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
+ "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@))",
+ "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
+ "MetricName": "tma_info_core_ilp"
},
{
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
"MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
"MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
- "MetricName": "tma_info_dsb_coverage",
- "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35",
- "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp"
+ "MetricName": "tma_info_frontend_dsb_coverage",
+ "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35",
+ "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp"
},
{
- "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
- "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@))",
- "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
- "MetricName": "tma_info_ilp"
+ "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
+ "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY",
+ "MetricGroup": "Fed",
+ "MetricName": "tma_info_frontend_ipunknown_branch"
+ },
+ {
+ "BriefDescription": "Branch instructions per taken branch.",
+ "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+ "MetricGroup": "Branches;Fed;PGO",
+ "MetricName": "tma_info_inst_mix_bptkbranch"
},
{
"BriefDescription": "Total number of retired Instructions",
"MetricExpr": "INST_RETIRED.ANY",
"MetricGroup": "Summary;TmaL1;tma_L1_group",
- "MetricName": "tma_info_instructions",
+ "MetricName": "tma_info_inst_mix_instructions",
"PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST"
},
{
"BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
"MetricGroup": "Branches;Fed;InsType",
- "MetricName": "tma_info_ipbranch",
- "MetricThreshold": "tma_info_ipbranch < 8"
- },
- {
- "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
- "MetricExpr": "INST_RETIRED.ANY / tma_info_clks",
- "MetricGroup": "Ret;Summary",
- "MetricName": "tma_info_ipc"
+ "MetricName": "tma_info_inst_mix_ipbranch",
+ "MetricThreshold": "tma_info_inst_mix_ipbranch < 8"
},
{
"BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
"MetricGroup": "Branches;Fed;PGO",
- "MetricName": "tma_info_ipcall",
- "MetricThreshold": "tma_info_ipcall < 200"
- },
- {
- "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
- "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
- "MetricGroup": "Branches;OS",
- "MetricName": "tma_info_ipfarbranch",
- "MetricThreshold": "tma_info_ipfarbranch < 1e6"
+ "MetricName": "tma_info_inst_mix_ipcall",
+ "MetricThreshold": "tma_info_inst_mix_ipcall < 200"
},
{
"BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
"MetricGroup": "InsType",
- "MetricName": "tma_info_ipload",
- "MetricThreshold": "tma_info_ipload < 3"
- },
- {
- "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
- "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@...MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
- "MetricGroup": "Bad;BrMispredicts",
- "MetricName": "tma_info_ipmisp_indirect",
- "MetricThreshold": "tma_info_ipmisp_indirect < 1e3"
- },
- {
- "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BadSpec;BrMispredicts",
- "MetricName": "tma_info_ipmispredict",
- "MetricThreshold": "tma_info_ipmispredict < 200"
+ "MetricName": "tma_info_inst_mix_ipload",
+ "MetricThreshold": "tma_info_inst_mix_ipload < 3"
},
{
"BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
"MetricGroup": "InsType",
- "MetricName": "tma_info_ipstore",
- "MetricThreshold": "tma_info_ipstore < 8"
+ "MetricName": "tma_info_inst_mix_ipstore",
+ "MetricThreshold": "tma_info_inst_mix_ipstore < 8"
},
{
"BriefDescription": "Instruction per taken branch",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
"MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB",
- "MetricName": "tma_info_iptb",
- "MetricThreshold": "tma_info_iptb < 9",
- "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp"
- },
- {
- "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
- "MetricExpr": "tma_info_instructions / BACLEARS.ANY",
- "MetricGroup": "Fed",
- "MetricName": "tma_info_ipunknown_branch"
- },
- {
- "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
- "MetricGroup": "OS",
- "MetricName": "tma_info_kernel_cpi"
- },
- {
- "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "OS",
- "MetricName": "tma_info_kernel_utilization",
- "MetricThreshold": "tma_info_kernel_utilization > 0.05"
+ "MetricName": "tma_info_inst_mix_iptb",
+ "MetricThreshold": "tma_info_inst_mix_iptb < 9",
+ "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp"
},
{
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
"MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l1d_cache_fill_bw"
- },
- {
- "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "tma_info_l1d_cache_fill_bw",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l1d_cache_fill_bw_1t"
- },
- {
- "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
- "MetricGroup": "CacheMisses;Mem",
- "MetricName": "tma_info_l1mpki"
+ "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
},
{
"BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
"MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l2_cache_fill_bw"
+ "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
},
{
- "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "tma_info_l2_cache_fill_bw",
+ "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l2_cache_fill_bw_1t"
+ "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+ },
+ {
+ "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+ "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
+ "MetricName": "tma_info_memory_l1mpki"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
"MetricGroup": "Backend;CacheMisses;Mem",
- "MetricName": "tma_info_l2mpki"
+ "MetricName": "tma_info_memory_l2mpki"
},
{
- "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "0",
- "MetricGroup": "Mem;MemoryBW;Offcore",
- "MetricName": "tma_info_l3_cache_access_bw_1t"
+ "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+ "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
+ "MetricName": "tma_info_memory_l3mpki"
},
{
- "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l3_cache_fill_bw"
+ "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+ "MetricConstraint": "NO_GROUP_EVENTS",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+ "MetricGroup": "Mem;MemoryBound;MemoryLat",
+ "MetricName": "tma_info_memory_load_miss_real_latency"
},
{
- "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "tma_info_l3_cache_fill_bw",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l3_cache_fill_bw_1t"
+ "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+ "MetricConstraint": "NO_GROUP_EVENTS",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+ "MetricGroup": "Mem;MemoryBW;MemoryBound",
+ "MetricName": "tma_info_memory_mlp",
+ "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
},
{
- "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
- "MetricGroup": "CacheMisses;Mem",
- "MetricName": "tma_info_l3mpki"
+ "BriefDescription": "Average Parallel L2 cache miss data reads",
+ "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+ "MetricGroup": "Memory_BW;Offcore",
+ "MetricName": "tma_info_memory_oro_data_l2_mlp"
},
{
"BriefDescription": "Average Latency for L2 cache miss demand Loads",
"MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
"MetricGroup": "Memory_Lat;Offcore",
- "MetricName": "tma_info_load_l2_miss_latency"
+ "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
},
{
"BriefDescription": "Average Parallel L2 cache miss demand Loads",
"MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
"MetricGroup": "Memory_BW;Offcore",
- "MetricName": "tma_info_load_l2_mlp"
+ "MetricName": "tma_info_memory_oro_load_l2_mlp"
},
{
- "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
- "MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
- "MetricGroup": "Mem;MemoryBound;MemoryLat",
- "MetricName": "tma_info_load_miss_real_latency"
+ "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
+ "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
},
{
- "BriefDescription": "Average number of parallel requests to external memory",
- "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
- "MetricGroup": "Mem;SoC",
- "MetricName": "tma_info_mem_parallel_requests",
- "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
+ "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
+ "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
},
{
- "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
- "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
- "MetricGroup": "Mem;SoC",
- "MetricName": "tma_info_mem_request_latency"
+ "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "0",
+ "MetricGroup": "Mem;MemoryBW;Offcore",
+ "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
},
{
- "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
- "MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBW;MemoryBound",
- "MetricName": "tma_info_mlp",
- "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+ "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
},
{
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
- "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_clks",
+ "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_core_clks",
"MetricGroup": "Mem;MemoryTLB",
- "MetricName": "tma_info_page_walks_utilization",
- "MetricThreshold": "tma_info_page_walks_utilization > 0.5"
+ "MetricName": "tma_info_memory_tlb_page_walks_utilization",
+ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
},
{
"BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@...S_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
"MetricGroup": "Pipeline;Ret",
- "MetricName": "tma_info_retire"
+ "MetricName": "tma_info_pipeline_retire"
},
{
- "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * tma_info_core_clks",
- "MetricGroup": "TmaL1;tma_L1_group",
- "MetricName": "tma_info_slots"
+ "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+ "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
+ "MetricGroup": "Power;Summary",
+ "MetricName": "tma_info_system_average_frequency"
+ },
+ {
+ "BriefDescription": "Average CPU Utilization",
+ "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
+ "MetricGroup": "HPC;Summary",
+ "MetricName": "tma_info_system_cpu_utilization"
+ },
+ {
+ "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+ "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / duration_time / 1e3",
+ "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+ "MetricName": "tma_info_system_dram_bw_use",
+ "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
+ },
+ {
+ "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
+ "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
+ "MetricGroup": "Branches;OS",
+ "MetricName": "tma_info_system_ipfarbranch",
+ "MetricThreshold": "tma_info_system_ipfarbranch < 1e6"
+ },
+ {
+ "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
+ "MetricGroup": "OS",
+ "MetricName": "tma_info_system_kernel_cpi"
+ },
+ {
+ "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
+ "MetricGroup": "OS",
+ "MetricName": "tma_info_system_kernel_utilization",
+ "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
+ },
+ {
+ "BriefDescription": "Average number of parallel requests to external memory",
+ "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+ "MetricGroup": "Mem;SoC",
+ "MetricName": "tma_info_system_mem_parallel_requests",
+ "PublicDescription": "Average number of parallel requests to external memory. Accounts for all requests"
+ },
+ {
+ "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
+ "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / UNC_ARB_TRK_REQUESTS.ALL",
+ "MetricGroup": "Mem;SoC",
+ "MetricName": "tma_info_system_mem_request_latency"
},
{
"BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
"MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
"MetricGroup": "SMT",
- "MetricName": "tma_info_smt_2t_utilization"
+ "MetricName": "tma_info_system_smt_2t_utilization"
},
{
"BriefDescription": "Socket actual clocks when any core is active on that socket",
"MetricExpr": "UNC_CLOCK.SOCKET",
"MetricGroup": "SoC",
- "MetricName": "tma_info_socket_clks"
+ "MetricName": "tma_info_system_socket_clks"
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
"MetricGroup": "Power",
- "MetricName": "tma_info_turbo_utilization"
+ "MetricName": "tma_info_system_turbo_utilization"
+ },
+ {
+ "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+ "MetricGroup": "Pipeline",
+ "MetricName": "tma_info_thread_clks"
+ },
+ {
+ "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
+ "MetricExpr": "1 / tma_info_thread_ipc",
+ "MetricGroup": "Mem;Pipeline",
+ "MetricName": "tma_info_thread_cpi"
+ },
+ {
+ "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
+ "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks",
+ "MetricGroup": "Ret;Summary",
+ "MetricName": "tma_info_thread_ipc"
+ },
+ {
+ "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
+ "MetricExpr": "4 * tma_info_core_core_clks",
+ "MetricGroup": "TmaL1;tma_L1_group",
+ "MetricName": "tma_info_thread_slots"
},
{
"BriefDescription": "Uops Per Instruction",
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
"MetricGroup": "Pipeline;Ret;Retire",
- "MetricName": "tma_info_uoppi",
- "MetricThreshold": "tma_info_uoppi > 1.05"
+ "MetricName": "tma_info_thread_uoppi",
+ "MetricThreshold": "tma_info_thread_uoppi > 1.05"
},
{
"BriefDescription": "Instruction per taken branch",
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN",
"MetricGroup": "Branches;Fed;FetchBW",
- "MetricName": "tma_info_uptb",
- "MetricThreshold": "tma_info_uptb < 6"
+ "MetricName": "tma_info_thread_uptb",
+ "MetricThreshold": "tma_info_thread_uptb < 6"
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
- "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks",
+ "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
"MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_itlb_misses",
"MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -620,7 +620,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
- "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_clks, 0)",
+ "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
"MetricName": "tma_l1_bound",
"MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -629,7 +629,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
- "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_clks",
+ "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l2_bound",
"MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -639,7 +639,7 @@
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
"MetricConstraint": "NO_GROUP_EVENTS_SMT",
- "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks",
+ "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
"MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -649,7 +649,7 @@
{
"BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_clks",
+ "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / tma_info_thread_clks",
"MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
"MetricName": "tma_l3_hit_latency",
"MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -658,11 +658,11 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
- "MetricExpr": "ILD_STALL.LCP / tma_info_clks",
+ "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
"MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
"MetricName": "tma_lcp",
"MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
- "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb",
"ScaleUnit": "100%"
},
{
@@ -678,7 +678,7 @@
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
- "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)",
"MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
"MetricName": "tma_load_op_utilization",
"MetricThreshold": "tma_load_op_utilization > 0.6",
@@ -688,7 +688,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks",
+ "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks",
"MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
"MetricName": "tma_lock_latency",
"MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -708,16 +708,16 @@
},
{
"BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
- "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@...CORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@...CORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
"MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
"MetricName": "tma_mem_bandwidth",
"MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
- "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
"MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
"MetricName": "tma_mem_latency",
"MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -727,7 +727,7 @@
{
"BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound",
+ "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound",
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
@@ -737,7 +737,7 @@
},
{
"BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots",
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
"MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
"MetricName": "tma_microcode_sequencer",
"MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
@@ -746,16 +746,16 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
- "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2",
+ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
"MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_mite",
- "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)",
+ "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
"PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
- "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks",
+ "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks",
"MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
"MetricName": "tma_ms_switches",
"MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -764,7 +764,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks",
"MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_0",
"MetricThreshold": "tma_port_0 > 0.6",
@@ -773,7 +773,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_1",
"MetricThreshold": "tma_port_1 > 0.6",
@@ -782,7 +782,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
"MetricName": "tma_port_2",
"MetricThreshold": "tma_port_2 > 0.6",
@@ -791,7 +791,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
"MetricName": "tma_port_3",
"MetricThreshold": "tma_port_3 > 0.6",
@@ -809,7 +809,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_5",
"MetricThreshold": "tma_port_5 > 0.6",
@@ -818,7 +818,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_6",
"MetricThreshold": "tma_port_6 > 0.6",
@@ -827,7 +827,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group",
"MetricName": "tma_port_7",
"MetricThreshold": "tma_port_7 > 0.6",
@@ -837,7 +837,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_clks",
+ "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks",
"MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
"MetricName": "tma_ports_utilization",
"MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -846,7 +846,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)",
+ "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_0",
"MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -855,7 +855,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_clks)",
+ "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks)",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_1",
"MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -864,7 +864,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks)",
+ "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks)",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_2",
"MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -873,7 +873,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
- "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks",
+ "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_3m",
"MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -881,7 +881,7 @@
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots",
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots",
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
@@ -892,7 +892,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks",
+ "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_split_loads",
"MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -901,7 +901,7 @@
},
{
"BriefDescription": "This metric represents rate of split store accesses",
- "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks",
+ "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
"MetricName": "tma_split_stores",
"MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -910,16 +910,16 @@
},
{
"BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
- "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks",
+ "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks",
"MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
"MetricName": "tma_sq_full",
"MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
- "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks",
+ "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks",
"MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_store_bound",
"MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -928,7 +928,7 @@
},
{
"BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
- "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
"MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -938,7 +938,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks",
+ "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks",
"MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
"MetricName": "tma_store_latency",
"MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -947,7 +947,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks",
"MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
"MetricName": "tma_store_op_utilization",
"MetricThreshold": "tma_store_op_utilization > 0.6",
@@ -955,7 +955,7 @@
},
{
"BriefDescription": "This metric serves as an approximation of legacy x87 usage",
- "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
"MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
"MetricName": "tma_x87_use",
"MetricThreshold": "tma_x87_use > 0.1",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
index a522202cf684..5f451948c893 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json
@@ -50,10 +50,206 @@
},
{
"BriefDescription": "Uncore frequency per die [GHZ]",
- "MetricExpr": "tma_info_socket_clks / #num_dies / duration_time / 1e9",
+ "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9",
"MetricGroup": "SoC",
"MetricName": "UNCORE_FREQ"
},
+ {
+ "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY",
+ "MetricName": "cpi",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "CPU operating frequency (in GHz)",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9",
+ "MetricName": "cpu_operating_frequency",
+ "ScaleUnit": "1GHz"
+ },
+ {
+ "BriefDescription": "Percentage of time spent in the active CPU power state C0",
+ "MetricExpr": "tma_info_system_cpu_utilization",
+ "MetricName": "cpu_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions",
+ "MetricExpr": "DTLB_LOAD_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
+ "MetricName": "dtlb_load_mpi",
+ "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data loads to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions",
+ "MetricExpr": "DTLB_STORE_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
+ "MetricName": "dtlb_store_mpi",
+ "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by demand data stores to the total number of completed instructions. This implies it missed in the DTLB and further levels of TLB.",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.",
+ "MetricExpr": "cbox@..._C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1e6 / duration_time",
+ "MetricName": "io_bandwidth_read",
+ "ScaleUnit": "1MB/s"
+ },
+ {
+ "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.",
+ "MetricExpr": "cbox@..._C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ * 64 / 1e6 / duration_time",
+ "MetricName": "io_bandwidth_write",
+ "ScaleUnit": "1MB/s"
+ },
+ {
+ "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions",
+ "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY",
+ "MetricName": "itlb_large_page_mpi",
+ "PublicDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the Instruction Translation Lookaside Buffer (ITLB) and further levels of TLB.",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions",
+ "MetricExpr": "ITLB_MISSES.WALK_COMPLETED / INST_RETIRED.ANY",
+ "MetricName": "itlb_mpi",
+ "PublicDescription": "Ratio of number of completed page walks (for all page sizes) caused by a code fetch to the total number of completed instructions. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB.",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of code read requests missing in L1 instruction cache (includes prefetches) to the total number of completed instructions",
+ "MetricExpr": "L2_RQSTS.ALL_CODE_RD / INST_RETIRED.ANY",
+ "MetricName": "l1_i_code_read_misses_with_prefetches_per_instr",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of demand load requests hitting in L1 data cache to the total number of completed instructions",
+ "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L1_HIT / INST_RETIRED.ANY",
+ "MetricName": "l1d_demand_data_read_hits_per_instr",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions",
+ "MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY",
+ "MetricName": "l1d_mpi",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of code read request missing L2 cache to the total number of completed instructions",
+ "MetricExpr": "L2_RQSTS.CODE_RD_MISS / INST_RETIRED.ANY",
+ "MetricName": "l2_demand_code_mpi",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of completed demand load requests hitting in L2 cache to the total number of completed instructions",
+ "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_HIT / INST_RETIRED.ANY",
+ "MetricName": "l2_demand_data_read_hits_per_instr",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of completed data read request missing L2 cache to the total number of completed instructions",
+ "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
+ "MetricName": "l2_demand_data_read_mpi",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions",
+ "MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY",
+ "MetricName": "l2_mpi",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Ratio of number of code read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions",
+ "MetricExpr": "(cbox@..._C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x181@ + cbox@..._C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x191@) / INST_RETIRED.ANY",
+ "MetricName": "llc_code_read_mpi_demand_plus_prefetch",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds",
+ "MetricExpr": "1e9 * (cbox@..._C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@..._C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time",
+ "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency",
+ "ScaleUnit": "1ns"
+ },
+ {
+ "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds",
+ "MetricExpr": "1e9 * (cbox@..._C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@..._C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time",
+ "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests",
+ "ScaleUnit": "1ns"
+ },
+ {
+ "BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds",
+ "MetricExpr": "1e9 * (cbox@..._C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@..._C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@) / (UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages)) * duration_time",
+ "MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests",
+ "ScaleUnit": "1ns"
+ },
+ {
+ "BriefDescription": "Ratio of number of data read requests missing last level core cache (includes demand w/ prefetches) to the total number of completed instructions",
+ "MetricExpr": "(cbox@..._C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@..._C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x192@) / INST_RETIRED.ANY",
+ "MetricName": "llc_data_read_mpi_demand_plus_prefetch",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions",
+ "MetricExpr": "MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
+ "MetricName": "loads_per_instr",
+ "ScaleUnit": "1per_instr"
+ },
+ {
+ "BriefDescription": "DDR memory read bandwidth (MB/sec)",
+ "MetricExpr": "UNC_M_CAS_COUNT.RD * 64 / 1e6 / duration_time",
+ "MetricName": "memory_bandwidth_read",
+ "ScaleUnit": "1MB/s"
+ },
+ {
+ "BriefDescription": "DDR memory bandwidth (MB/sec)",
+ "MetricExpr": "(UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) * 64 / 1e6 / duration_time",
+ "MetricName": "memory_bandwidth_total",
+ "ScaleUnit": "1MB/s"
+ },
+ {
+ "BriefDescription": "DDR memory write bandwidth (MB/sec)",
+ "MetricExpr": "UNC_M_CAS_COUNT.WR * 64 / 1e6 / duration_time",
+ "MetricName": "memory_bandwidth_write",
+ "ScaleUnit": "1MB/s"
+ },
+ {
+ "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.",
+ "MetricExpr": "cbox@..._C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / (cbox@..._C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@..._C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@)",
+ "MetricName": "numa_reads_addressed_to_local_dram",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.",
+ "MetricExpr": "cbox@..._C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / (cbox@..._C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@..._C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@)",
+ "MetricName": "numa_reads_addressed_to_remote_dram",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue",
+ "MetricExpr": "IDQ.DSB_UOPS / UOPS_ISSUED.ANY",
+ "MetricName": "percent_uops_delivered_from_decoded_icache",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue",
+ "MetricExpr": "IDQ.MITE_UOPS / UOPS_ISSUED.ANY",
+ "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Uops delivered from loop stream detector(LSD) as a percent of total uops delivered to Instruction Decode Queue",
+ "MetricExpr": "(UOPS_ISSUED.ANY - IDQ.MITE_UOPS - IDQ.MS_UOPS - IDQ.DSB_UOPS) / UOPS_ISSUED.ANY",
+ "MetricName": "percent_uops_delivered_from_loop_stream_detector",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue",
+ "MetricExpr": "IDQ.MS_UOPS / UOPS_ISSUED.ANY",
+ "MetricName": "percent_uops_delivered_from_microcode_sequencer",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Intel(R) Quick Path Interconnect (QPI) data transmit bandwidth (MB/sec)",
+ "MetricExpr": "UNC_Q_TxL_FLITS_G0.DATA * 8 / 1e6 / duration_time",
+ "MetricName": "qpi_data_transmit_bw",
+ "ScaleUnit": "1MB/s"
+ },
{
"BriefDescription": "Percentage of cycles spent in System Management Interrupts.",
"MetricExpr": "((msr@...rf@ - cycles) / msr@...rf@ if msr@smi@ > 0 else 0)",
@@ -69,9 +265,15 @@
"MetricName": "smi_num",
"ScaleUnit": "1SMI#"
},
+ {
+ "BriefDescription": "The ratio of number of completed memory store instructions to the total number completed instructions",
+ "MetricExpr": "MEM_UOPS_RETIRED.ALL_STORES / INST_RETIRED.ANY",
+ "MetricName": "stores_per_instr",
+ "ScaleUnit": "1per_instr"
+ },
{
"BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
- "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_clks",
+ "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_4k_aliasing",
"MetricThreshold": "tma_4k_aliasing > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -81,7 +283,7 @@
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
- "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_slots",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots",
"MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
"MetricName": "tma_alu_op_utilization",
"MetricThreshold": "tma_alu_op_utilization > 0.6",
@@ -89,7 +291,7 @@
},
{
"BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
- "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_slots",
+ "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / tma_info_thread_slots",
"MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group",
"MetricName": "tma_assists",
"MetricThreshold": "tma_assists > 0.1 & (tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1)",
@@ -109,7 +311,7 @@
},
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
- "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_slots",
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * (INT_MISC.RECOVERY_CYCLES_ANY / 2 if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / tma_info_thread_slots",
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15",
@@ -125,12 +327,12 @@
"MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_bad_spec_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
- "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_clks",
+ "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / tma_info_thread_clks",
"MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_branch_resteers",
"MetricThreshold": "tma_branch_resteers > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -150,7 +352,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks",
+ "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks",
"MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_contested_accesses",
"MetricThreshold": "tma_contested_accesses > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -171,7 +373,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks",
+ "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
"MetricGroup": "Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group",
"MetricName": "tma_data_sharing",
"MetricThreshold": "tma_data_sharing > 0.05 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -180,7 +382,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
- "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_clks",
+ "MetricExpr": "10 * ARITH.DIVIDER_UOPS / tma_info_core_core_clks",
"MetricGroup": "TopdownL3;tma_L3_group;tma_core_bound_group",
"MetricName": "tma_divider",
"MetricThreshold": "tma_divider > 0.2 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -190,7 +392,7 @@
{
"BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
"MetricConstraint": "NO_GROUP_EVENTS_SMT",
- "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks",
+ "MetricExpr": "(1 - MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
"MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_dram_bound",
"MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -199,25 +401,25 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
- "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_clks / 2",
+ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
"MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_dsb",
- "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)",
+ "MetricThreshold": "tma_dsb > 0.15 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
"PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
- "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_clks",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / tma_info_thread_clks",
"MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
"MetricName": "tma_dsb_switches",
"MetricThreshold": "tma_dsb_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
- "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Related metrics: tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
- "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_clks",
+ "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + DTLB_LOAD_MISSES.WALK_DURATION) / tma_info_thread_clks",
"MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_l1_bound_group",
"MetricName": "tma_dtlb_load",
"MetricThreshold": "tma_dtlb_load > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -226,7 +428,7 @@
},
{
"BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
- "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_clks",
+ "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_thread_clks",
"MetricGroup": "MemoryTLB;TopdownL4;tma_L4_group;tma_issueTLB;tma_store_bound_group",
"MetricName": "tma_dtlb_store",
"MetricThreshold": "tma_dtlb_store > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -235,7 +437,7 @@
},
{
"BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
- "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_clks",
+ "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / tma_info_thread_clks",
"MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_store_bound_group",
"MetricName": "tma_false_sharing",
"MetricThreshold": "tma_false_sharing > 0.05 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -245,11 +447,11 @@
{
"BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "tma_info_load_miss_real_latency * cpu@..._PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_clks",
+ "MetricExpr": "tma_info_memory_load_miss_real_latency * cpu@..._PEND_MISS.REQUEST_FB_FULL\\,cmask\\=1@ / tma_info_thread_clks",
"MetricGroup": "MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group",
"MetricName": "tma_fb_full",
"MetricThreshold": "tma_fb_full > 0.3",
- "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores",
"ScaleUnit": "100%"
},
{
@@ -257,14 +459,14 @@
"MetricExpr": "tma_frontend_bound - tma_fetch_latency",
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth",
- "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
+ "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
- "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb, tma_lcp",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
- "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_slots",
+ "MetricExpr": "4 * min(CPU_CLK_UNHALTED.THREAD, IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE) / tma_info_thread_slots",
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
@@ -274,7 +476,7 @@
},
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_slots",
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / tma_info_thread_slots",
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15",
@@ -294,325 +496,325 @@
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
- "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_clks",
+ "MetricExpr": "ICACHE.IFDATA_STALL / tma_info_thread_clks",
"MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_icache_misses",
"MetricThreshold": "tma_icache_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
"ScaleUnit": "100%"
},
{
- "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
- "MetricExpr": "tma_info_turbo_utilization * TSC / 1e9 / duration_time",
- "MetricGroup": "Power;Summary",
- "MetricName": "tma_info_average_frequency"
- },
- {
- "BriefDescription": "Branch instructions per taken branch.",
- "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
- "MetricGroup": "Branches;Fed;PGO",
- "MetricName": "tma_info_bptkbranch"
+ "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
+ "MetricExpr": "tma_info_inst_mix_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@...MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
+ "MetricGroup": "Bad;BrMispredicts",
+ "MetricName": "tma_info_bad_spec_ipmisp_indirect",
+ "MetricThreshold": "tma_info_bad_spec_ipmisp_indirect < 1e3"
},
{
- "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Pipeline",
- "MetricName": "tma_info_clks"
+ "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
+ "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricGroup": "Bad;BadSpec;BrMispredicts",
+ "MetricName": "tma_info_bad_spec_ipmispredict",
+ "MetricThreshold": "tma_info_bad_spec_ipmispredict < 200"
},
{
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
- "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_clks))",
+ "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / 2 * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2 if #SMT_on else tma_info_thread_clks))",
"MetricGroup": "SMT",
- "MetricName": "tma_info_core_clks"
+ "MetricName": "tma_info_core_core_clks"
},
{
"BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / tma_info_core_clks",
+ "MetricExpr": "INST_RETIRED.ANY / tma_info_core_core_clks",
"MetricGroup": "Ret;SMT;TmaL1;tma_L1_group",
- "MetricName": "tma_info_coreipc"
+ "MetricName": "tma_info_core_coreipc"
},
{
- "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
- "MetricExpr": "1 / tma_info_ipc",
- "MetricGroup": "Mem;Pipeline",
- "MetricName": "tma_info_cpi"
- },
- {
- "BriefDescription": "Average CPU Utilization",
- "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
- "MetricGroup": "HPC;Summary",
- "MetricName": "tma_info_cpu_utilization"
- },
- {
- "BriefDescription": "Average Parallel L2 cache miss data reads",
- "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
- "MetricGroup": "Memory_BW;Offcore",
- "MetricName": "tma_info_data_l2_mlp"
- },
- {
- "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
- "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
- "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
- "MetricName": "tma_info_dram_bw_use",
- "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
+ "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
+ "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@))",
+ "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
+ "MetricName": "tma_info_core_ilp"
},
{
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
"MetricExpr": "IDQ.DSB_UOPS / (IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS)",
"MetricGroup": "DSB;Fed;FetchBW;tma_issueFB",
- "MetricName": "tma_info_dsb_coverage",
- "MetricThreshold": "tma_info_dsb_coverage < 0.7 & tma_info_ipc / 4 > 0.35",
- "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_iptb, tma_lcp"
+ "MetricName": "tma_info_frontend_dsb_coverage",
+ "MetricThreshold": "tma_info_frontend_dsb_coverage < 0.7 & tma_info_thread_ipc / 4 > 0.35",
+ "PublicDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache). Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_inst_mix_iptb, tma_lcp"
},
{
- "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
- "MetricExpr": "(UOPS_EXECUTED.CORE / 2 / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@) if #SMT_on else UOPS_EXECUTED.CORE / (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=1@))",
- "MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
- "MetricName": "tma_info_ilp"
+ "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
+ "MetricExpr": "tma_info_inst_mix_instructions / BACLEARS.ANY",
+ "MetricGroup": "Fed",
+ "MetricName": "tma_info_frontend_ipunknown_branch"
+ },
+ {
+ "BriefDescription": "Branch instructions per taken branch.",
+ "MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
+ "MetricGroup": "Branches;Fed;PGO",
+ "MetricName": "tma_info_inst_mix_bptkbranch"
},
{
"BriefDescription": "Total number of retired Instructions",
"MetricExpr": "INST_RETIRED.ANY",
"MetricGroup": "Summary;TmaL1;tma_L1_group",
- "MetricName": "tma_info_instructions",
+ "MetricName": "tma_info_inst_mix_instructions",
"PublicDescription": "Total number of retired Instructions. Sample with: INST_RETIRED.PREC_DIST"
},
{
"BriefDescription": "Instructions per Branch (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.ALL_BRANCHES",
"MetricGroup": "Branches;Fed;InsType",
- "MetricName": "tma_info_ipbranch",
- "MetricThreshold": "tma_info_ipbranch < 8"
- },
- {
- "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
- "MetricExpr": "INST_RETIRED.ANY / tma_info_clks",
- "MetricGroup": "Ret;Summary",
- "MetricName": "tma_info_ipc"
+ "MetricName": "tma_info_inst_mix_ipbranch",
+ "MetricThreshold": "tma_info_inst_mix_ipbranch < 8"
},
{
"BriefDescription": "Instructions per (near) call (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_CALL",
"MetricGroup": "Branches;Fed;PGO",
- "MetricName": "tma_info_ipcall",
- "MetricThreshold": "tma_info_ipcall < 200"
- },
- {
- "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
- "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
- "MetricGroup": "Branches;OS",
- "MetricName": "tma_info_ipfarbranch",
- "MetricThreshold": "tma_info_ipfarbranch < 1e6"
+ "MetricName": "tma_info_inst_mix_ipcall",
+ "MetricThreshold": "tma_info_inst_mix_ipcall < 200"
},
{
"BriefDescription": "Instructions per Load (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_LOADS",
"MetricGroup": "InsType",
- "MetricName": "tma_info_ipload",
- "MetricThreshold": "tma_info_ipload < 3"
- },
- {
- "BriefDescription": "Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).",
- "MetricExpr": "tma_info_instructions / (UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * cpu@...MISP_EXEC.ALL_BRANCHES\\,umask\\=0xE4@)",
- "MetricGroup": "Bad;BrMispredicts",
- "MetricName": "tma_info_ipmisp_indirect",
- "MetricThreshold": "tma_info_ipmisp_indirect < 1e3"
- },
- {
- "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BadSpec;BrMispredicts",
- "MetricName": "tma_info_ipmispredict",
- "MetricThreshold": "tma_info_ipmispredict < 200"
+ "MetricName": "tma_info_inst_mix_ipload",
+ "MetricThreshold": "tma_info_inst_mix_ipload < 3"
},
{
"BriefDescription": "Instructions per Store (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / MEM_UOPS_RETIRED.ALL_STORES",
"MetricGroup": "InsType",
- "MetricName": "tma_info_ipstore",
- "MetricThreshold": "tma_info_ipstore < 8"
+ "MetricName": "tma_info_inst_mix_ipstore",
+ "MetricThreshold": "tma_info_inst_mix_ipstore < 8"
},
{
"BriefDescription": "Instruction per taken branch",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.NEAR_TAKEN",
"MetricGroup": "Branches;Fed;FetchBW;Frontend;PGO;tma_issueFB",
- "MetricName": "tma_info_iptb",
- "MetricThreshold": "tma_info_iptb < 9",
- "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_lcp"
- },
- {
- "BriefDescription": "Instructions per speculative Unknown Branch Misprediction (BAClear) (lower number means higher occurrence rate)",
- "MetricExpr": "tma_info_instructions / BACLEARS.ANY",
- "MetricGroup": "Fed",
- "MetricName": "tma_info_ipunknown_branch"
- },
- {
- "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
- "MetricGroup": "OS",
- "MetricName": "tma_info_kernel_cpi"
- },
- {
- "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "OS",
- "MetricName": "tma_info_kernel_utilization",
- "MetricThreshold": "tma_info_kernel_utilization > 0.05"
+ "MetricName": "tma_info_inst_mix_iptb",
+ "MetricThreshold": "tma_info_inst_mix_iptb < 9",
+ "PublicDescription": "Instruction per taken branch. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_lcp"
},
{
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
"MetricExpr": "64 * L1D.REPLACEMENT / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l1d_cache_fill_bw"
- },
- {
- "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "tma_info_l1d_cache_fill_bw",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l1d_cache_fill_bw_1t"
- },
- {
- "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
- "MetricGroup": "CacheMisses;Mem",
- "MetricName": "tma_info_l1mpki"
+ "MetricName": "tma_info_memory_core_l1d_cache_fill_bw"
},
{
"BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
"MetricExpr": "64 * L2_LINES_IN.ALL / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l2_cache_fill_bw"
+ "MetricName": "tma_info_memory_core_l2_cache_fill_bw"
},
{
- "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "tma_info_l2_cache_fill_bw",
+ "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
"MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l2_cache_fill_bw_1t"
+ "MetricName": "tma_info_memory_core_l3_cache_fill_bw"
+ },
+ {
+ "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
+ "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
+ "MetricName": "tma_info_memory_l1mpki"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
"MetricGroup": "Backend;CacheMisses;Mem",
- "MetricName": "tma_info_l2mpki"
+ "MetricName": "tma_info_memory_l2mpki"
},
{
- "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "0",
- "MetricGroup": "Mem;MemoryBW;Offcore",
- "MetricName": "tma_info_l3_cache_access_bw_1t"
+ "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
+ "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
+ "MetricName": "tma_info_memory_l3mpki"
},
{
- "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1e9 / duration_time",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l3_cache_fill_bw"
+ "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+ "MetricConstraint": "NO_GROUP_EVENTS",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
+ "MetricGroup": "Mem;MemoryBound;MemoryLat",
+ "MetricName": "tma_info_memory_load_miss_real_latency"
},
{
- "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "tma_info_l3_cache_fill_bw",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "tma_info_l3_cache_fill_bw_1t"
+ "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
+ "MetricConstraint": "NO_GROUP_EVENTS",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+ "MetricGroup": "Mem;MemoryBW;MemoryBound",
+ "MetricName": "tma_info_memory_mlp",
+ "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
},
{
- "BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
- "MetricExpr": "1e3 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
- "MetricGroup": "CacheMisses;Mem",
- "MetricName": "tma_info_l3mpki"
+ "BriefDescription": "Average Parallel L2 cache miss data reads",
+ "MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD",
+ "MetricGroup": "Memory_BW;Offcore",
+ "MetricName": "tma_info_memory_oro_data_l2_mlp"
},
{
"BriefDescription": "Average Latency for L2 cache miss demand Loads",
"MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS.DEMAND_DATA_RD",
"MetricGroup": "Memory_Lat;Offcore",
- "MetricName": "tma_info_load_l2_miss_latency"
+ "MetricName": "tma_info_memory_oro_load_l2_miss_latency"
},
{
"BriefDescription": "Average Parallel L2 cache miss demand Loads",
"MetricExpr": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD / OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD",
"MetricGroup": "Memory_BW;Offcore",
- "MetricName": "tma_info_load_l2_mlp"
+ "MetricName": "tma_info_memory_oro_load_l2_mlp"
},
{
- "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
- "MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + MEM_LOAD_UOPS_RETIRED.HIT_LFB)",
- "MetricGroup": "Mem;MemoryBound;MemoryLat",
- "MetricName": "tma_info_load_miss_real_latency"
+ "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
+ "MetricExpr": "tma_info_memory_core_l1d_cache_fill_bw",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "tma_info_memory_thread_l1d_cache_fill_bw_1t"
},
{
- "BriefDescription": "Average number of parallel data read requests to external memory",
- "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@...ter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@...ter_opc\\=0x182\\,thresh\\=1@",
- "MetricGroup": "Mem;MemoryBW;SoC",
- "MetricName": "tma_info_mem_parallel_reads",
- "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches"
+ "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
+ "MetricExpr": "tma_info_memory_core_l2_cache_fill_bw",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "tma_info_memory_thread_l2_cache_fill_bw_1t"
},
{
- "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)",
- "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@...ter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@...ter_opc\\=0x182@) / (tma_info_socket_clks / duration_time)",
- "MetricGroup": "Mem;MemoryLat;SoC",
- "MetricName": "tma_info_mem_read_latency",
- "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
+ "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "0",
+ "MetricGroup": "Mem;MemoryBW;Offcore",
+ "MetricName": "tma_info_memory_thread_l3_cache_access_bw_1t"
},
{
- "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss",
- "MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBW;MemoryBound",
- "MetricName": "tma_info_mlp",
- "PublicDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)"
+ "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "tma_info_memory_core_l3_cache_fill_bw",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "tma_info_memory_thread_l3_cache_fill_bw_1t"
},
{
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
- "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_clks",
+ "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION) / tma_info_core_core_clks",
"MetricGroup": "Mem;MemoryTLB",
- "MetricName": "tma_info_page_walks_utilization",
- "MetricThreshold": "tma_info_page_walks_utilization > 0.5"
+ "MetricName": "tma_info_memory_tlb_page_walks_utilization",
+ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5"
},
{
"BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@...S_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
"MetricGroup": "Pipeline;Ret",
- "MetricName": "tma_info_retire"
+ "MetricName": "tma_info_pipeline_retire"
},
{
- "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * tma_info_core_clks",
- "MetricGroup": "TmaL1;tma_L1_group",
- "MetricName": "tma_info_slots"
+ "BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
+ "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / duration_time",
+ "MetricGroup": "Power;Summary",
+ "MetricName": "tma_info_system_average_frequency"
+ },
+ {
+ "BriefDescription": "Average CPU Utilization",
+ "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC",
+ "MetricGroup": "HPC;Summary",
+ "MetricName": "tma_info_system_cpu_utilization"
+ },
+ {
+ "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
+ "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / duration_time",
+ "MetricGroup": "HPC;Mem;MemoryBW;SoC;tma_issueBW",
+ "MetricName": "tma_info_system_dram_bw_use",
+ "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_fb_full, tma_mem_bandwidth, tma_sq_full"
+ },
+ {
+ "BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
+ "MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
+ "MetricGroup": "Branches;OS",
+ "MetricName": "tma_info_system_ipfarbranch",
+ "MetricThreshold": "tma_info_system_ipfarbranch < 1e6"
+ },
+ {
+ "BriefDescription": "Cycles Per Instruction for the Operating System (OS) Kernel mode",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / INST_RETIRED.ANY_P:k",
+ "MetricGroup": "OS",
+ "MetricName": "tma_info_system_kernel_cpi"
+ },
+ {
+ "BriefDescription": "Fraction of cycles spent in the Operating System (OS) Kernel mode",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD_P:k / CPU_CLK_UNHALTED.THREAD",
+ "MetricGroup": "OS",
+ "MetricName": "tma_info_system_kernel_utilization",
+ "MetricThreshold": "tma_info_system_kernel_utilization > 0.05"
+ },
+ {
+ "BriefDescription": "Average number of parallel data read requests to external memory",
+ "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@...ter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@...ter_opc\\=0x182\\,thresh\\=1@",
+ "MetricGroup": "Mem;MemoryBW;SoC",
+ "MetricName": "tma_info_system_mem_parallel_reads",
+ "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches"
+ },
+ {
+ "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)",
+ "MetricExpr": "1e9 * (UNC_C_TOR_OCCUPANCY.MISS_OPCODE@...ter_opc\\=0x182@ / UNC_C_TOR_INSERTS.MISS_OPCODE@...ter_opc\\=0x182@) / (tma_info_system_socket_clks / duration_time)",
+ "MetricGroup": "Mem;MemoryLat;SoC",
+ "MetricName": "tma_info_system_mem_read_latency",
+ "PublicDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches. ([RKL+]memory-controller only)"
},
{
"BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
"MetricExpr": "(1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0)",
"MetricGroup": "SMT",
- "MetricName": "tma_info_smt_2t_utilization"
+ "MetricName": "tma_info_system_smt_2t_utilization"
},
{
"BriefDescription": "Socket actual clocks when any core is active on that socket",
"MetricExpr": "cbox_0@...nt\\=0x0@",
"MetricGroup": "SoC",
- "MetricName": "tma_info_socket_clks"
+ "MetricName": "tma_info_system_socket_clks"
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "tma_info_clks / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "tma_info_thread_clks / CPU_CLK_UNHALTED.REF_TSC",
"MetricGroup": "Power",
- "MetricName": "tma_info_turbo_utilization"
+ "MetricName": "tma_info_system_turbo_utilization"
+ },
+ {
+ "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
+ "MetricExpr": "CPU_CLK_UNHALTED.THREAD",
+ "MetricGroup": "Pipeline",
+ "MetricName": "tma_info_thread_clks"
+ },
+ {
+ "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
+ "MetricExpr": "1 / tma_info_thread_ipc",
+ "MetricGroup": "Mem;Pipeline",
+ "MetricName": "tma_info_thread_cpi"
+ },
+ {
+ "BriefDescription": "Instructions Per Cycle (per Logical Processor)",
+ "MetricExpr": "INST_RETIRED.ANY / tma_info_thread_clks",
+ "MetricGroup": "Ret;Summary",
+ "MetricName": "tma_info_thread_ipc"
+ },
+ {
+ "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
+ "MetricExpr": "4 * tma_info_core_core_clks",
+ "MetricGroup": "TmaL1;tma_L1_group",
+ "MetricName": "tma_info_thread_slots"
},
{
"BriefDescription": "Uops Per Instruction",
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / INST_RETIRED.ANY",
"MetricGroup": "Pipeline;Ret;Retire",
- "MetricName": "tma_info_uoppi",
- "MetricThreshold": "tma_info_uoppi > 1.05"
+ "MetricName": "tma_info_thread_uoppi",
+ "MetricThreshold": "tma_info_thread_uoppi > 1.05"
},
{
"BriefDescription": "Instruction per taken branch",
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / BR_INST_RETIRED.NEAR_TAKEN",
"MetricGroup": "Branches;Fed;FetchBW",
- "MetricName": "tma_info_uptb",
- "MetricThreshold": "tma_info_uptb < 6"
+ "MetricName": "tma_info_thread_uptb",
+ "MetricThreshold": "tma_info_thread_uptb < 6"
},
{
"BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
- "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_clks",
+ "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + ITLB_MISSES.WALK_DURATION) / tma_info_thread_clks",
"MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_L3_group;tma_fetch_latency_group",
"MetricName": "tma_itlb_misses",
"MetricThreshold": "tma_itlb_misses > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -621,7 +823,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
- "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_clks, 0)",
+ "MetricExpr": "max((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) - CYCLE_ACTIVITY.STALLS_L1D_PENDING) / tma_info_thread_clks, 0)",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_issueL1;tma_issueMC;tma_memory_bound_group",
"MetricName": "tma_l1_bound",
"MetricThreshold": "tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -630,7 +832,7 @@
},
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
- "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_clks",
+ "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_PENDING - CYCLE_ACTIVITY.STALLS_L2_PENDING) / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l2_bound",
"MetricThreshold": "tma_l2_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -640,7 +842,7 @@
{
"BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
"MetricConstraint": "NO_GROUP_EVENTS_SMT",
- "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_clks",
+ "MetricExpr": "MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS) * CYCLE_ACTIVITY.STALLS_L2_PENDING / tma_info_thread_clks",
"MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_l3_bound",
"MetricThreshold": "tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -650,7 +852,7 @@
{
"BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks",
+ "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
"MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group",
"MetricName": "tma_l3_hit_latency",
"MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -659,11 +861,11 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
- "MetricExpr": "ILD_STALL.LCP / tma_info_clks",
+ "MetricExpr": "ILD_STALL.LCP / tma_info_thread_clks",
"MetricGroup": "FetchLat;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueFB",
"MetricName": "tma_lcp",
"MetricThreshold": "tma_lcp > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
- "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_dsb_coverage, tma_info_iptb",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs. Related metrics: tma_dsb_switches, tma_fetch_bandwidth, tma_info_frontend_dsb_coverage, tma_info_inst_mix_iptb",
"ScaleUnit": "100%"
},
{
@@ -679,7 +881,7 @@
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations",
"MetricConstraint": "NO_GROUP_EVENTS_NMI",
- "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_clks)",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)",
"MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
"MetricName": "tma_load_op_utilization",
"MetricThreshold": "tma_load_op_utilization > 0.6",
@@ -689,7 +891,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks",
+ "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
"MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group",
"MetricName": "tma_local_dram",
"MetricThreshold": "tma_local_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -699,7 +901,7 @@
{
"BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_clks",
+ "MetricExpr": "MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / tma_info_thread_clks",
"MetricGroup": "Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group",
"MetricName": "tma_lock_latency",
"MetricThreshold": "tma_lock_latency > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -719,16 +921,16 @@
},
{
"BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
- "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@...CORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_clks",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@...CORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=6@) / tma_info_thread_clks",
"MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW",
"MetricName": "tma_mem_bandwidth",
"MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_sq_full",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
- "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_clks - tma_mem_bandwidth",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / tma_info_thread_clks - tma_mem_bandwidth",
"MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat",
"MetricName": "tma_mem_latency",
"MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -738,7 +940,7 @@
{
"BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound",
+ "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound",
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
@@ -748,7 +950,7 @@
},
{
"BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_slots",
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots",
"MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS",
"MetricName": "tma_microcode_sequencer",
"MetricThreshold": "tma_microcode_sequencer > 0.05 & tma_heavy_operations > 0.1",
@@ -757,16 +959,16 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
- "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_clks / 2",
+ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / tma_info_core_core_clks / 2",
"MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group",
"MetricName": "tma_mite",
- "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35)",
+ "MetricThreshold": "tma_mite > 0.1 & (tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_thread_ipc / 4 > 0.35)",
"PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
- "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_clks",
+ "MetricExpr": "2 * IDQ.MS_SWITCHES / tma_info_thread_clks",
"MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_L3_group;tma_fetch_latency_group;tma_issueMC;tma_issueMS;tma_issueMV;tma_issueSO",
"MetricName": "tma_ms_switches",
"MetricThreshold": "tma_ms_switches > 0.05 & (tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15)",
@@ -775,7 +977,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / tma_info_core_core_clks",
"MetricGroup": "Compute;TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_0",
"MetricThreshold": "tma_port_0 > 0.6",
@@ -784,7 +986,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_1",
"MetricThreshold": "tma_port_1 > 0.6",
@@ -793,7 +995,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
"MetricName": "tma_port_2",
"MetricThreshold": "tma_port_2 > 0.6",
@@ -802,7 +1004,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_load_op_utilization_group",
"MetricName": "tma_port_3",
"MetricThreshold": "tma_port_3 > 0.6",
@@ -820,7 +1022,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_5",
"MetricThreshold": "tma_port_5 > 0.6",
@@ -829,7 +1031,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_alu_op_utilization_group;tma_issue2P",
"MetricName": "tma_port_6",
"MetricThreshold": "tma_port_6 > 0.6",
@@ -838,7 +1040,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / tma_info_core_core_clks",
"MetricGroup": "TopdownL6;tma_L6_group;tma_store_op_utilization_group",
"MetricName": "tma_port_7",
"MetricThreshold": "tma_port_7 > 0.6",
@@ -848,7 +1050,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_clks",
+ "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - (cpu@...S_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@...S_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks",
"MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group",
"MetricName": "tma_ports_utilization",
"MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)",
@@ -857,7 +1059,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_clks)",
+ "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_0",
"MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -866,7 +1068,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_clks)",
+ "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=1@ - cpu@...S_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks)",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_1",
"MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -875,7 +1077,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
- "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks)",
+ "MetricExpr": "((cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@...S_EXECUTED.CORE\\,cmask\\=2@ - cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks)",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_2",
"MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -884,7 +1086,7 @@
},
{
"BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
- "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_clks",
+ "MetricExpr": "(cpu@...S_EXECUTED.CORE\\,cmask\\=3@ / 2 if #SMT_on else cpu@...S_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks",
"MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group",
"MetricName": "tma_ports_utilized_3m",
"MetricThreshold": "tma_ports_utilized_3m > 0.7 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))",
@@ -893,7 +1095,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_clks",
+ "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / tma_info_thread_clks",
"MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group",
"MetricName": "tma_remote_cache",
"MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -903,7 +1105,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_clks",
+ "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks",
"MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group",
"MetricName": "tma_remote_dram",
"MetricThreshold": "tma_remote_dram > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))",
@@ -912,7 +1114,7 @@
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_slots",
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / tma_info_thread_slots",
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
@@ -923,7 +1125,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "tma_info_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_clks",
+ "MetricExpr": "tma_info_memory_load_miss_real_latency * LD_BLOCKS.NO_SR / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_split_loads",
"MetricThreshold": "tma_split_loads > 0.2 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -932,7 +1134,7 @@
},
{
"BriefDescription": "This metric represents rate of split store accesses",
- "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_clks",
+ "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / tma_info_core_core_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group",
"MetricName": "tma_split_stores",
"MetricThreshold": "tma_split_stores > 0.2 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -941,16 +1143,16 @@
},
{
"BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
- "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_clks",
+ "MetricExpr": "(OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2 if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / tma_info_core_core_clks",
"MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group",
"MetricName": "tma_sq_full",
"MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
- "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_dram_bw_use, tma_mem_bandwidth",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth",
"ScaleUnit": "100%"
},
{
"BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
- "MetricExpr": "RESOURCE_STALLS.SB / tma_info_clks",
+ "MetricExpr": "RESOURCE_STALLS.SB / tma_info_thread_clks",
"MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group",
"MetricName": "tma_store_bound",
"MetricThreshold": "tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)",
@@ -959,7 +1161,7 @@
},
{
"BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
- "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_clks",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks",
"MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group",
"MetricName": "tma_store_fwd_blk",
"MetricThreshold": "tma_store_fwd_blk > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -969,7 +1171,7 @@
{
"BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
"MetricConstraint": "NO_GROUP_EVENTS",
- "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_clks",
+ "MetricExpr": "(L2_RQSTS.RFO_HIT * 9 * (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) + (1 - MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / tma_info_thread_clks",
"MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_issueSL;tma_store_bound_group",
"MetricName": "tma_store_latency",
"MetricThreshold": "tma_store_latency > 0.1 & (tma_store_bound > 0.2 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))",
@@ -978,7 +1180,7 @@
},
{
"BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
- "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_clks",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / tma_info_core_core_clks",
"MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group",
"MetricName": "tma_store_op_utilization",
"MetricThreshold": "tma_store_op_utilization > 0.6",
@@ -986,11 +1188,17 @@
},
{
"BriefDescription": "This metric serves as an approximation of legacy x87 usage",
- "MetricExpr": "INST_RETIRED.X87 * tma_info_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricExpr": "INST_RETIRED.X87 * tma_info_thread_uoppi / UOPS_RETIRED.RETIRE_SLOTS",
"MetricGroup": "Compute;TopdownL4;tma_L4_group;tma_fp_arith_group",
"MetricName": "tma_x87_use",
"MetricThreshold": "tma_x87_use > 0.1",
"PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
"ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "Uncore operating frequency in GHz",
+ "MetricExpr": "UNC_C_CLOCKTICKS / (#num_cores / #num_packages * #num_packages) / 1e9 / duration_time",
+ "MetricName": "uncore_frequency",
+ "ScaleUnit": "1GHz"
}
]
--
2.40.1.606.ga4b1b128d6-goog
Powered by blists - more mailing lists