[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b39ffdd5-1692-46ed-86d9-726011c92036@linaro.org>
Date: Tue, 7 Oct 2025 10:10:12 +0100
From: James Clark <james.clark@...aro.org>
To: Ian Rogers <irogers@...gle.com>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Namhyung Kim <namhyung@...nel.org>, Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>, Adrian Hunter <adrian.hunter@...el.com>,
Leo Yan <leo.yan@....com>, linux-perf-users@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] perf tests: Don't retest sections in "Object code
reading"
On 06/10/2025 4:21 pm, Ian Rogers wrote:
> On Mon, Oct 6, 2025 at 6:11 AM James Clark <james.clark@...aro.org> wrote:
>>
>> We already only test each kcore map once, but on slow systems
>> (particularly with network filesystems) even the non-kcore maps are
>> slow. The test can test the same objdump output over and over, which
>> only wastes time. Generalize the skipping mechanism to track all DSOs and
>> addresses so that each section is only tested once.
>>
>> On a fully loaded Arm Juno (simulating a parallel Perf test run) with a
>> network filesystem, the original runtime is:
>>
>> real 1m51.126s
>> user 0m19.445s
>> sys 1m15.431s
>>
>> And the new runtime is:
>>
>> real 0m48.873s
>> user 0m8.031s
>> sys 0m32.353s
>>
>> Signed-off-by: James Clark <james.clark@...aro.org>
>
> Reviewed-by: Ian Rogers <irogers@...gle.com>
>
>> ---
>> tools/perf/tests/code-reading.c | 119 ++++++++++++++++++++++++++++------------
>> 1 file changed, 85 insertions(+), 34 deletions(-)
>>
>> diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
>> index 9c2091310191..4c9fbf6965c4 100644
>> --- a/tools/perf/tests/code-reading.c
>> +++ b/tools/perf/tests/code-reading.c
>> @@ -2,6 +2,7 @@
>> #include <errno.h>
>> #include <linux/kconfig.h>
>> #include <linux/kernel.h>
>> +#include <linux/rbtree.h>
>> #include <linux/types.h>
>> #include <inttypes.h>
>> #include <stdlib.h>
>> @@ -39,11 +40,64 @@
>> #define BUFSZ 1024
>> #define READLEN 128
>>
>> -struct state {
>> - u64 done[1024];
>> - size_t done_cnt;
>> +struct tested_section {
>> + struct rb_node rb_node;
>> + u64 addr;
>> + char path[PATH_MAX];
>> };
>>
>> +static bool tested_code_insert_or_exists(const char *path, u64 addr,
>> + struct rb_root *tested_sections)
>> +{
>> + struct rb_node **node = &tested_sections->rb_node;
>> + struct rb_node *parent = NULL;
>> + struct tested_section *data;
>> +
>> + while (*node) {
>> + int cmp;
>> +
>> + parent = *node;
>> + data = rb_entry(*node, struct tested_section, rb_node);
>> + cmp = strcmp(path, data->path);
>> + if (!cmp) {
>> + if (addr < data->addr)
>> + cmp = -1;
>> + else if (addr > data->addr)
>> + cmp = 1;
>> + else
>> + return true; /* already tested */
>> + }
>> +
>> + if (cmp < 0)
>> + node = &(*node)->rb_left;
>> + else
>> + node = &(*node)->rb_right;
>> + }
>> +
>> + data = zalloc(sizeof(*data));
>> + if (!data)
>> + return true;
>> +
>> + data->addr = addr;
>> + strlcpy(data->path, path, sizeof(data->path));
>
> nit: perhaps strdup rather than having 4kb per tested_section.
>
> Thanks,
> Ian
>
Oh yeah, that would have been better; not sure why I didn't do it that
way. Although the maximum number of sections I saw was around 50, and it's
usually a lot less, so it's probably not worth the churn to change it now
that Arnaldo's applied it?
Thanks
James
>> + rb_link_node(&data->rb_node, parent, node);
>> + rb_insert_color(&data->rb_node, tested_sections);
>> + return false;
>> +}
>> +
>> +static void tested_sections__free(struct rb_root *root)
>> +{
>> + while (!RB_EMPTY_ROOT(root)) {
>> + struct rb_node *node = rb_first(root);
>> + struct tested_section *ts = rb_entry(node,
>> + struct tested_section,
>> + rb_node);
>> +
>> + rb_erase(node, root);
>> + free(ts);
>> + }
>> +}
>> +
>> static size_t read_objdump_chunk(const char **line, unsigned char **buf,
>> size_t *buf_len)
>> {
>> @@ -316,13 +370,15 @@ static void dump_buf(unsigned char *buf, size_t len)
>> }
>>
>> static int read_object_code(u64 addr, size_t len, u8 cpumode,
>> - struct thread *thread, struct state *state)
>> + struct thread *thread,
>> + struct rb_root *tested_sections)
>> {
>> struct addr_location al;
>> unsigned char buf1[BUFSZ] = {0};
>> unsigned char buf2[BUFSZ] = {0};
>> size_t ret_len;
>> u64 objdump_addr;
>> + u64 skip_addr;
>> const char *objdump_name;
>> char decomp_name[KMOD_DECOMP_LEN];
>> bool decomp = false;
>> @@ -350,6 +406,18 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
>> goto out;
>> }
>>
>> + /*
>> + * Don't retest the same addresses. objdump struggles with kcore - try
>> + * each map only once even if the address is different.
>> + */
>> + skip_addr = dso__is_kcore(dso) ? map__start(al.map) : al.addr;
>> + if (tested_code_insert_or_exists(dso__long_name(dso), skip_addr,
>> + tested_sections)) {
>> + pr_debug("Already tested %s @ %#"PRIx64" - skipping\n",
>> + dso__long_name(dso), skip_addr);
>> + goto out;
>> + }
>> +
>> pr_debug("On file address is: %#"PRIx64"\n", al.addr);
>>
>> if (len > BUFSZ)
>> @@ -387,24 +455,6 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
>> goto out;
>> }
>>
>> - /* objdump struggles with kcore - try each map only once */
>> - if (dso__is_kcore(dso)) {
>> - size_t d;
>> -
>> - for (d = 0; d < state->done_cnt; d++) {
>> - if (state->done[d] == map__start(al.map)) {
>> - pr_debug("kcore map tested already");
>> - pr_debug(" - skipping\n");
>> - goto out;
>> - }
>> - }
>> - if (state->done_cnt >= ARRAY_SIZE(state->done)) {
>> - pr_debug("Too many kcore maps - skipping\n");
>> - goto out;
>> - }
>> - state->done[state->done_cnt++] = map__start(al.map);
>> - }
>> -
>> objdump_name = dso__long_name(dso);
>> if (dso__needs_decompress(dso)) {
>> if (dso__decompress_kmodule_path(dso, objdump_name,
>> @@ -471,9 +521,9 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
>> return err;
>> }
>>
>> -static int process_sample_event(struct machine *machine,
>> - struct evlist *evlist,
>> - union perf_event *event, struct state *state)
>> +static int process_sample_event(struct machine *machine, struct evlist *evlist,
>> + union perf_event *event,
>> + struct rb_root *tested_sections)
>> {
>> struct perf_sample sample;
>> struct thread *thread;
>> @@ -494,7 +544,8 @@ static int process_sample_event(struct machine *machine,
>> goto out;
>> }
>>
>> - ret = read_object_code(sample.ip, READLEN, sample.cpumode, thread, state);
>> + ret = read_object_code(sample.ip, READLEN, sample.cpumode, thread,
>> + tested_sections);
>> thread__put(thread);
>> out:
>> perf_sample__exit(&sample);
>> @@ -502,10 +553,11 @@ static int process_sample_event(struct machine *machine,
>> }
>>
>> static int process_event(struct machine *machine, struct evlist *evlist,
>> - union perf_event *event, struct state *state)
>> + union perf_event *event, struct rb_root *tested_sections)
>> {
>> if (event->header.type == PERF_RECORD_SAMPLE)
>> - return process_sample_event(machine, evlist, event, state);
>> + return process_sample_event(machine, evlist, event,
>> + tested_sections);
>>
>> if (event->header.type == PERF_RECORD_THROTTLE ||
>> event->header.type == PERF_RECORD_UNTHROTTLE)
>> @@ -525,7 +577,7 @@ static int process_event(struct machine *machine, struct evlist *evlist,
>> }
>>
>> static int process_events(struct machine *machine, struct evlist *evlist,
>> - struct state *state)
>> + struct rb_root *tested_sections)
>> {
>> union perf_event *event;
>> struct mmap *md;
>> @@ -537,7 +589,7 @@ static int process_events(struct machine *machine, struct evlist *evlist,
>> continue;
>>
>> while ((event = perf_mmap__read_event(&md->core)) != NULL) {
>> - ret = process_event(machine, evlist, event, state);
>> + ret = process_event(machine, evlist, event, tested_sections);
>> perf_mmap__consume(&md->core);
>> if (ret < 0)
>> return ret;
>> @@ -637,9 +689,7 @@ static int do_test_code_reading(bool try_kcore)
>> .uses_mmap = true,
>> },
>> };
>> - struct state state = {
>> - .done_cnt = 0,
>> - };
>> + struct rb_root tested_sections = RB_ROOT;
>> struct perf_thread_map *threads = NULL;
>> struct perf_cpu_map *cpus = NULL;
>> struct evlist *evlist = NULL;
>> @@ -773,7 +823,7 @@ static int do_test_code_reading(bool try_kcore)
>>
>> evlist__disable(evlist);
>>
>> - ret = process_events(machine, evlist, &state);
>> + ret = process_events(machine, evlist, &tested_sections);
>> if (ret < 0)
>> goto out_put;
>>
>> @@ -793,6 +843,7 @@ static int do_test_code_reading(bool try_kcore)
>> perf_thread_map__put(threads);
>> machine__delete(machine);
>> perf_env__exit(&host_env);
>> + tested_sections__free(&tested_sections);
>>
>> return err;
>> }
>>
>> ---
>> base-commit: a22d167ed82505f770340c3a7c257c04ba24dac9
>> change-id: 20251006-james-perf-object-code-reading-9646e486d595
>>
>> Best regards,
>> --
>> James Clark <james.clark@...aro.org>
>>
Powered by blists - more mailing lists