[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aBUbKXMS8oMYqxR9@x1>
Date: Fri, 2 May 2025 16:21:13 -0300
From: Arnaldo Carvalho de Melo <acme@...nel.org>
To: Namhyung Kim <namhyung@...nel.org>
Cc: Ian Rogers <irogers@...gle.com>, Kan Liang <kan.liang@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Adrian Hunter <adrian.hunter@...el.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...nel.org>, LKML <linux-kernel@...r.kernel.org>,
linux-perf-users@...r.kernel.org,
Ravi Bangoria <ravi.bangoria@....com>, Leo Yan <leo.yan@....com>
Subject: Re: [PATCH 11/11] perf mem: Add 'dtlb' output field
On Fri, May 02, 2025 at 11:38:35AM -0700, Namhyung Kim wrote:
> On Fri, May 02, 2025 at 01:30:35PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Wed, Apr 30, 2025 at 01:55:48PM -0700, Namhyung Kim wrote:
> > > This is a breakdown of perf_mem_data_src.mem_dtlb values. It assumes
> > > PMU drivers would set PERF_MEM_TLB_HIT bit with an appropriate level.
> > > And having PERF_MEM_TLB_MISS means that it failed to find one in any
> > > levels of TLB. For now, it doesn't use PERF_MEM_TLB_{WK,OS} bits.
> > > Also it seems Intel machines don't distinguish L1 or L2 precisely. So I
> > > added ANY_HIT (printed as "L?-Hit") to handle the case.
> > > $ perf mem report -F overhead,dtlb,dso --stdio
> > > ...
> > > # --- D-TLB ----
> > > # Overhead L?-Hit Miss Shared Object
> > > # ........ .............. .................
> > > #
> > > 67.03% 99.5% 0.5% [unknown]
> > > 31.23% 99.2% 0.8% [kernel.kallsyms]
> > > 1.08% 97.8% 2.2% [i915]
> > > 0.36% 100.0% 0.0% [JIT] tid 6853
> > > 0.12% 100.0% 0.0% [drm]
> > > 0.05% 100.0% 0.0% [drm_kms_helper]
> > > 0.05% 100.0% 0.0% [ext4]
> > > 0.02% 100.0% 0.0% [aesni_intel]
> > > 0.02% 100.0% 0.0% [crc32c_intel]
> > > 0.02% 100.0% 0.0% [dm_crypt]
> > > ...
> > root@...ber:~# perf report --header | grep cpudesc
> > # cpudesc : AMD Ryzen 9 9950X3D 16-Core Processor
> > root@...ber:~# perf mem report -F overhead,dtlb,dso --stdio | head -20
> > # To display the perf.data header info, please use --header/--header-only options.
> > #
> > #
> > # Total Lost Samples: 0
> > #
> > # Samples: 2K of event 'cycles:P'
> > # Total weight : 2637
> > # Sort order : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,local_p_stage_cyc
> > #
> > # ---------- D-TLB -----------
> > # Overhead L1-Hit L2-Hit Miss Other Shared Object
> > # ........ ............................ .................................
> > #
> > 77.47% 18.4% 0.1% 0.6% 80.9% [kernel.kallsyms]
> > 5.61% 36.5% 0.7% 1.4% 61.5% libxul.so
> > 2.77% 39.7% 0.0% 12.3% 47.9% libc.so.6
> > 2.01% 34.0% 1.9% 1.9% 62.3% libglib-2.0.so.0.8400.1
> > 1.93% 31.4% 2.0% 2.0% 64.7% [amdgpu]
> > 1.63% 48.8% 0.0% 0.0% 51.2% [JIT] tid 60168
> > 1.14% 3.3% 0.0% 0.0% 96.7% [vdso]
> > root@...ber:~#
>
> I guess it's because those samples don't have mem_info as they are not
> memory instructions.
>
> Can you please re-run the perf record with filters like below?
>
> $ perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
I tried, it got stuck for more than 1 second and then control+C took
also a while and eventually:
root@x1:~# fg
perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
^X^C^C^C^Clibbpf: prog 'perf_sample_filter': BPF program load failed: -EAGAIN
libbpf: prog 'perf_sample_filter': -- BEGIN PROG LOAD LOG --
arg#0 reference type('UNKNOWN ') size cannot be determined: -22
0: R1=ctx() R10=fp0
; kctx = bpf_cast_to_kern_ctx(ctx); @ sample_filter.bpf.c:215
0: (85) call bpf_cast_to_kern_ctx#72125 ; R0_w=trusted_ptr_bpf_perf_event_data_kern()
1: (7b) *(u64 *)(r10 -48) = r0 ; R0_w=trusted_ptr_bpf_perf_event_data_kern() R10=fp0 fp-48_w=trusted_ptr_bpf_perf_event_data_kern()
2: (b7) r6 = 0 ; R6_w=0
; k = 0; @ sample_filter.bpf.c:217
3: (63) *(u32 *)(r10 -4) = r6 ; R6_w=0 R10=fp0 fp-8=0000????
; if (use_idx_hash) { @ sample_filter.bpf.c:219
4: (18) r1 = 0xffffa9e081be6000 ; R1_w=map_value(map=sample_f.rodata,ks=4,vs=4)
6: (61) r1 = *(u32 *)(r1 +0) ; R1_w=0
7: (15) if r1 == 0x0 goto pc+42 ; R1_w=0
; k = *idx; @ sample_filter.bpf.c:239
50: (bf) r2 = r10 ; R2_w=fp0 R10=fp0
; entry = bpf_map_lookup_elem(&filters, &k); @ sample_filter.bpf.c:244
51: (07) r2 += -4 ; R2_w=fp-4
52: (18) r1 = 0xffff8ecc22be8000 ; R1_w=map_ptr(map=filters,ks=4,vs=1536)
54: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null(id=1,map=filters,ks=4,vs=1536)
55: (bf) r6 = r0 ; R0_w=map_value_or_null(id=1,map=filters,ks=4,vs=1536) R6_w=map_value_or_null(id=1,map=filters,ks=4,vs=1536)
56: (79) r5 = *(u64 *)(r10 -48) ; R5_w=trusted_ptr_bpf_perf_event_data_kern() R10=fp0 fp-48=trusted_ptr_bpf_perf_event_data_kern()
; if (entry == NULL) @ sample_filter.bpf.c:245
57: (15) if r6 == 0x0 goto pc-21 ; R6_w=map_value(map=filters,ks=4,vs=1536)
58: (b7) r8 = 0 ; R8_w=0
59: (b7) r9 = 0 ; R9_w=0
60: (b7) r1 = 0 ; R1_w=0
61: (7b) *(u64 *)(r10 -40) = r1 ; R1_w=0 R10=fp0 fp-40_w=0
62: (05) goto pc+9
; for (i = 0; i < MAX_FILTERS; i++) { @ sample_filter.bpf.c:248
72: (bf) r7 = r6 ; R6=map_value(map=filters,ks=4,vs=1536) R7_w=map_value(map=filters,ks=4,vs=1536)
73: (0f) r7 += r8 ; R7_w=map_value(map=filters,ks=4,vs=1536) R8=0
; struct perf_sample_data___new *data = (void *)kctx->data; @ sample_filter.bpf.c:77
<SNIP tons of verifier lines>
269: (65) if r2 s> 0x1 goto pc+28 ; R2_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=1,var_off=(0x0; 0x1))
270: (15) if r2 == 0x0 goto pc+45 ; R2_w=1
271: (15) if r2 == 0x1 goto pc+1 ; R2_w=1
; CHECK_RESULT(sample_data, !=, entry[i].value) @ sample_filter.bpf.c:256
273: (bf) r2 = r6 ; R2_w=map_value(map=filters,ks=4,vs=1536) R6=map_value(map=filters,ks=4,vs=1536)
274: (0f) r2 += r8 ; R2_w=map_value(map=filters,ks=4,vs=1536,off=984) R8_w=984
275: (79) r2 = *(u64 *)(r2 +16) ; R2_w=scalar()
276: (5d) if r1 != r2 goto pc+101 378: R0_w=1 R1_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R2_w=scalar() R3_w=ptr_perf_sample_data() R4_w=scalar(smin=umin=umin32=1,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R5=trusted_ptr_bpf_perf_event_data_kern() R6=map_value(map=filters,ks=4,vs=1536) R7_w=map_value(map=filters,ks=4,vs=1536,off=984) R8_w=984 R9=0 R10=fp0 fp-8=mmmm???? fp-40=0 fp-48=trusted_ptr_bpf_perf_event_data_kern()
378: (67) r9 <<= 32 ; R9_w=0
379: (bf) r1 = r9 ; R1_w=0 R9_w=0
380: (77) r1 >>= 32 ; R1_w=0
381: (b7) r9 = 0 ; R9_w=0
382: (15) if r1 == 0x0 goto pc-313 ; R1_w=0
; for (i = 0; i < MAX_FILTERS; i++) { @ sample_filter.bpf.c:248
70: (07) r8 += 24 ; R8_w=1008
71: (15) if r8 == 0x600 goto pc-25 ; R8_w=1008
72: (bf) r7 = r6 ; R6=map_value(map=filters,ks=4,vs=1536) R7_w=map_value(map=filters,ks=4,vs=1536)
73: (0f) r7 += r8 ; R7_w=map_value(map=filters,ks=4,vs=1536,off=1008) R8_w=1008
; struct perf_sample_data___new *data = (void *)kctx->data; @ sample_filter.bpf.c:77
74: (79) r3 = *(u64 *)(r5 +8)
processed 2344 insns (limit 1000000) max_states_per_insn 4 total_states 57 peak_states 57 mark_read 6
-- END PROG LOAD LOG --
libbpf: prog 'perf_sample_filter': failed to load: -EAGAIN
libbpf: failed to load object 'sample_filter_bpf'
libbpf: failed to load BPF skeleton 'sample_filter_bpf': -EAGAIN
Failed to load perf sample-filter BPF skeleton
failed to set filter "BPF" on event cpu_core/cycles/P with 11 (Resource temporarily unavailable)
^Z
[1]+ Stopped perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
root@x1:~#
root@x1:~# fg
perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
^C^C^C^C^C
^C^C^C^C^C
Well, I just had it suspended, I'm not being able to stop it:
root@x1:~# ps
PID TTY TIME CMD
1355566 pts/15 00:00:00 sudo
1355567 pts/15 00:00:00 su
1355570 pts/15 00:00:00 bash
1373907 pts/15 00:00:44 perf
1373908 pts/15 00:00:00 perf-exec
1374019 pts/15 00:00:00 ps
root@x1:~# kill -9 1373907
root@x1:~# kill -9 1373907
-bash: kill: (1373907) - No such process
[1]+ Killed perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
root@x1:~#
Ok, killed.
It gets stuck in that MAP_FREEZE sys_bpf call:
root@x1:~# perf trace perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
<SNIP>
16349.552 ( 0.023 ms): perf/1374043 bpf(uattr: (union bpf_attr){(struct){.map_type = (__u32)2,.key_size = (__u32)4,.value_size = (__u32)4,.max_entries = (__u32)1,.map_flags = (__u32)1152,.map_name = (char[16])['s','a','m','p','l','e','_','f','.','r','o','d','a','t','a',],.btf_fd = (__u32)61,.btf_value_type_id = (__u32)59,},(struct){.map_fd = (__u32)2,.key = (__u64)4294967300,(union){.value = (__u64)1152,.next_key = (__u64)1152,},.flags = (__u64)8101238451258523648,},.batch = (struct){.in_batch = (__u64)17179869186,.out_batch = (__u64)4294967300,.keys = (__u64)1152,.values = (__u64)8101238451258523648,.count = (__u32)1717527916,.map_fd = (__u32)1685025326,.elem_flags = (__u64)6386785,.flags = (__u64)61,},(struct){.prog_type = (__u32)2,.insn_cnt = (__u32)4,.insns = (__u64)4294967300,.license = (__u64)1152,.log_size = (__u32)1886216563,.log_buf = (__u64)7237128669819266412,.kern_version = (__u32)6386785,.prog_name = (char[16])['=',';',],.func_info = (__u64)140735998446044,.func_info_cnt = (__u32)520807888,.line_info = (__u64)140735998445848,.line_info_cnt = (__u32)4874802,.attach_btf_id = (__u32)4,(union){.attach_prog_fd = (__u32)2805057752,.attach_btf_obj_fd = (__u32)2805057752,},.core_relo_cnt = (__u32)32767,.fd_array = (__u64)15131936,.core_relos = (__u64)140735998445952,.core_relo_rec_size = (__u32)15049664,.prog_token_fd = (__s32)-1489909376,},(struct){.pathname = (__u64)17179869186,.bpf_fd = (__u32)4,.file_flags = (__u32)1,.path_fd = (__s32)1152,},(struct){(union){.target_fd = (__u32)2,.target_ifindex = (__u32)2,},.attach_bpf_fd = (__u32)4,.attach_type = (__u32)4,.attach_flags = (__u32)1,.replace_bpf_fd = (__u32)1152,.expected_revision = (__u64)8101238451258523648,},.test = (struct){.prog_fd = (__u32)2,.retval = (__u32)4,.data_size_in = (__u32)4,.data_size_out = (__u32)1,.data_in = (__u64)1152,.data_out = (__u64)8101238451258523648,.repeat = (__u32)1717527916,.duration = (__u32)1685025326,.ctx_size_in = (__u32)6386785,.ctx_in = (__u64)61,.ctx_out = (__u64)59,},(struct){(union){.start_id = (__u32)2,.prog_id = (__u32)2,.map_id = (__u32)2,.btf_id ) = 62
16349.576 ( 0.002 ms): perf/1374043 dup3(oldfd: 62, newfd: 60<anon_inode:bpf-map>, flags: 524288) = 60
16349.582 ( 0.001 ms): perf/1374043 close(fd: 62) = 0
16349.586 ( 0.006 ms): perf/1374043 bpf(cmd: MAP_UPDATE_ELEM, uattr: (union bpf_attr){(struct){.map_type = (__u32)60,.value_size = (__u32)2805058140,.max_entries = (__u32)32767,.map_flags = (__u32)1994993664,.inner_map_fd = (__u32)32622,.map_ifindex = (__u32)2,.btf_fd = (__u32)2805058000,.btf_key_type_id = (__u32)32767,.btf_value_type_id = (__u32)16,.btf_vmlinux_value_type_id = (__u32)48,.map_extra = (__u64)140735998446240,.value_type_btf_obj_fd = (__s32)-1489909280,.map_token_fd = (__s32)32767,},(struct){.map_fd = (__u32)60,.key = (__u64)140735998446172,(union){.value = (__u64)140112418123776,.next_key = (__u64)140112418123776,},},.batch = (struct){.in_batch = (__u64)60,.out_batch = (__u64)140735998446172,.keys = (__u64)140112418123776,.count = (__u32)7379400,.elem_flags = (__u64)9110741168,.flags = (__u64)140735998446032,},(struct){.prog_type = (__u32)60,.insns = (__u64)140735998446172,.license = (__u64)140112418123776,.log_buf = (__u64)7379400,.kern_version = (__u32)520806576,.prog_flags = (__u32)2,.prog_name = (char[16])[208,201,'1',167,255,127,16,'0',],.prog_ifindex = (__u32)2805058208,.expected_attach_type = (__u32)32767,.prog_btf_fd = (__u32)2805058016,.func_info_rec_size = (__u32)32767,.func_info = (__u64)410826951296,.func_info_cnt = (__u32)4874930,.line_info = (__u64)40,.line_info_cnt = (__u32)1,.attach_btf_id = (__u32)4,(union){.attach_prog_fd = (__u32)520846144,.attach_btf_obj_fd = (__u32)520846144,},.fd_array = (__u64)60,.core_relos = (__u64)140735998445776,.core_relo_rec_size = (__u32)2805057752,.log_true_size = (__u32)524288,},(struct){.pathname = (__u64)60,.bpf_fd = (__u32)2805058140,.file_flags = (__u32)32767,.path_fd = (__s32)1994993664,},(struct){(union){.target_fd = (__u32)60,.target_ifindex = (__u32)60,},.attach_type = (__u32)2805058140,.attach_flags = (__u32)32767,.replace_bpf_fd = (__u32)1994993664,(union){.relative_fd = (__u32)32622,.relative_id = (__u32)32622,},},.test = (struct){.prog_fd = (__u32)60,.data_size_in = (__u32)2805058140,.data_size_out = (__u32)32767,.data_in = (__u64)140112418123776,.repeat = (__u32)7379400,.) = 0
16349.594 ( 0.002 ms): perf/1374043 bpf(cmd: MAP_FREEZE, uattr: (union bpf_attr){(struct){.map_type = (__u32)60,.value_size = (__u32)2805058140,.max_entries = (__u32)32767,.map_flags = (__u32)1994993664,.inner_map_fd = (__u32)32622,.map_ifindex = (__u32)2,.btf_fd = (__u32)2805058000,.btf_key_type_id = (__u32)32767,.btf_value_type_id = (__u32)16,.btf_vmlinux_value_type_id = (__u32)48,.map_extra = (__u64)140735998446240,.value_type_btf_obj_fd = (__s32)-1489909280,.map_token_fd = (__s32)32767,},(struct){.map_fd = (__u32)60,.key = (__u64)140735998446172,(union){.value = (__u64)140112418123776,.next_key = (__u64)140112418123776,},},.batch = (struct){.in_batch = (__u64)60,.out_batch = (__u64)140735998446172,.keys = (__u64)140112418123776,.count = (__u32)7379400,.elem_flags = (__u64)9110741168,.flags = (__u64)140735998446032,},(struct){.prog_type = (__u32)60,.insns = (__u64)140735998446172,.license = (__u64)140112418123776,.log_buf = (__u64)7379400,.kern_version = (__u32)520806576,.prog_flags = (__u32)2,.prog_name = (char[16])[208,201,'1',167,255,127,16,'0',],.prog_ifindex = (__u32)2805058208,.expected_attach_type = (__u32)32767,.prog_btf_fd = (__u32)2805058016,.func_info_rec_size = (__u32)32767,.func_info = (__u64)410826951296,.func_info_cnt = (__u32)4874930,.line_info = (__u64)40,.line_info_cnt = (__u32)1,.attach_btf_id = (__u32)4,(union){.attach_prog_fd = (__u32)520846144,.attach_btf_obj_fd = (__u32)520846144,},.fd_array = (__u64)60,.core_relos = (__u64)140735998445776,.core_relo_rec_size = (__u32)2805057752,.log_true_size = (__u32)524288,},(struct){.pathname = (__u64)60,.bpf_fd = (__u32)2805058140,.file_flags = (__u32)32767,.path_fd = (__s32)1994993664,},(struct){(union){.target_fd = (__u32)60,.target_ifindex = (__u32)60,},.attach_type = (__u32)2805058140,.attach_flags = (__u32)32767,.replace_bpf_fd = (__u32)1994993664,(union){.relative_fd = (__u32)32622,.relative_id = (__u32)32622,},},.test = (struct){.prog_fd = (__u32)60,.data_size_in = (__u32)2805058140,.data_size_out = (__u32)32767,.data_in = (__u64)140112418123776,.repeat = (__u32)7379400,.ctx_s) = 0
o
16349.601 ( 0.050 ms): perf/1374043 mmap(addr: 0x7f6e76e93000, len: 4096, prot: READ, flags: SHARED|FIXED, fd: 60) = 0x7f6e76e93000
root@x1:~# uname -a
Linux x1 6.13.9-100.fc40.x86_64 #1 SMP PREEMPT_DYNAMIC Sat Mar 29 01:27:18 UTC 2025 x86_64 GNU/Linux
root@x1:~#
root@x1:~# grep -m1 "model name" /proc/cpuinfo
model name : 13th Gen Intel(R) Core(TM) i7-1365U
root@x1:~#
Now trying on another machine:
root@...ber:~# uname -a
Linux number 6.15.0-rc4+ #2 SMP PREEMPT_DYNAMIC Tue Apr 29 15:56:43 -03 2025 x86_64 GNU/Linux
root@...ber:~# grep -m1 "model name" /proc/cpuinfo
model name : AMD Ryzen 9 9950X3D 16-Core Processor
root@...ber:~# perf record -aW --sample-mem-info -e cycles:P --filter 'mem_op == load || mem_op == store' sleep 1
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 1.870 MB perf.data (1199 samples) ]
root@...ber:~#
root@...ber:~# perf mem report -F overhead,dtlb,dso --stdio | head -20
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 1K of event 'cycles:P'
# Total weight : 7879
# Sort order : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,local_p_stage_cyc
#
# ---------- D-TLB -----------
# Overhead L1-Hit L2-Hit Miss Other Shared Object
# ........ ............................ .................................
#
48.51% 44.8% 18.7% 28.3% 8.2% [kernel.kallsyms]
11.97% 2.1% 1.4% 96.0% 0.5% libc.so.6
8.58% 85.7% 14.1% 0.0% 0.3% libxul.so
6.76% 100.0% 0.0% 0.0% 0.0% libfreeblpriv3.so
6.08% 100.0% 0.0% 0.0% 0.0% libsystemd-shared-257.5-2.fc42.so
4.59% 100.0% 0.0% 0.0% 0.0% firefox
4.33% 100.0% 0.0% 0.0% 0.0% libgallium-25.0.4.so
root@...ber:~#
Looks better :-)
Having the record as an alias seems interesting, ditto for the report.
- Arnaldo
Powered by blists - more mailing lists