[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ae0de6b0-82d8-4016-8011-0439ae2851e2@linux.ibm.com>
Date: Wed, 21 Jan 2026 22:48:22 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: Swapnil Sapkal <swapnil.sapkal@....com>
Cc: ravi.bangoria@....com, yu.c.chen@...el.com, mark.rutland@....com,
alexander.shishkin@...ux.intel.com, jolsa@...nel.org,
rostedt@...dmis.org, vincent.guittot@...aro.org,
adrian.hunter@...el.com, kan.liang@...ux.intel.com,
gautham.shenoy@....com, kprateek.nayak@....com, juri.lelli@...hat.com,
yangjihong@...edance.com, void@...ifault.com, tj@...nel.org,
ctshao@...gle.com, quic_zhonhan@...cinc.com, thomas.falcon@...el.com,
blakejones@...gle.com, ashelat@...hat.com, leo.yan@....com,
dvyukov@...gle.com, ak@...ux.intel.com, yujie.liu@...el.com,
graham.woodward@....com, ben.gainey@....com, vineethr@...ux.ibm.com,
tim.c.chen@...ux.intel.com, linux@...blig.org, santosh.shukla@....com,
sandipan.das@....com, linux-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org, peterz@...radead.org,
mingo@...hat.com, acme@...nel.org, namhyung@...nel.org,
irogers@...gle.com, james.clark@....com
Subject: Re: [PATCH v5 02/10] perf header: Support CPU DOMAIN relation info
Hi Swapnil,
On 1/19/26 11:28 PM, Swapnil Sapkal wrote:
> '/proc/schedstat' gives the info about load balancing statistics within
> a given domain. It also contains the cpu_mask giving information about
> the sibling cpus and domain names after schedstat version 17. Storing
> this information in perf header will help tools like `perf sched stats`
> for better analysis.
>
> Signed-off-by: Swapnil Sapkal <swapnil.sapkal@....com>
> ---
> .../Documentation/perf.data-file-format.txt | 17 ++
> tools/perf/builtin-inject.c | 1 +
> tools/perf/util/env.c | 29 ++
> tools/perf/util/env.h | 17 ++
> tools/perf/util/header.c | 286 ++++++++++++++++++
> tools/perf/util/header.h | 1 +
> tools/perf/util/util.c | 42 +++
> tools/perf/util/util.h | 3 +
> 8 files changed, 396 insertions(+)
>
> diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
> index c9d4dec65344..0e4d0ecc9e12 100644
> --- a/tools/perf/Documentation/perf.data-file-format.txt
> +++ b/tools/perf/Documentation/perf.data-file-format.txt
> @@ -447,6 +447,23 @@ struct {
> } [nr_pmu];
> };
>
> + HEADER_CPU_DOMAIN_INFO = 32,
> +
> +List of cpu-domain relation info. The format of the data is as below.
> +
> +struct domain_info {
> + int domain;
> + char dname[];
> + char cpumask[];
> + char cpulist[];
> +};
> +
> +struct cpu_domain_info {
> + int cpu;
> + int nr_domains;
> + struct domain_info domains[];
> +};
> +
> other bits are reserved and should ignored for now
> HEADER_FEAT_BITS = 256,
>
> diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
> index 6080afec537d..587c180035b2 100644
> --- a/tools/perf/builtin-inject.c
> +++ b/tools/perf/builtin-inject.c
> @@ -2047,6 +2047,7 @@ static bool keep_feat(struct perf_inject *inject, int feat)
> case HEADER_CLOCK_DATA:
> case HEADER_HYBRID_TOPOLOGY:
> case HEADER_PMU_CAPS:
> + case HEADER_CPU_DOMAIN_INFO:
> return true;
> /* Information that can be updated */
> case HEADER_BUILD_ID:
> diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
> index f1626d2032cd..93d475a80f14 100644
> --- a/tools/perf/util/env.c
> +++ b/tools/perf/util/env.c
> @@ -216,6 +216,34 @@ static void perf_env__purge_bpf(struct perf_env *env __maybe_unused)
> }
> #endif // HAVE_LIBBPF_SUPPORT
>
> +void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr)
> +{
> + if (!cd_map)
> + return;
> +
> + for (u32 i = 0; i < nr; i++) {
> + if (!cd_map[i])
> + continue;
> +
> + for (u32 j = 0; j < cd_map[i]->nr_domains; j++) {
> + struct domain_info *d_info = cd_map[i]->domains[j];
> +
> + if (!d_info)
> + continue;
> +
> + if (schedstat_version >= 17)
> + zfree(&d_info->dname);
> +
> + zfree(&d_info->cpumask);
> + zfree(&d_info->cpulist);
> + zfree(&d_info);
> + }
> + zfree(&cd_map[i]->domains);
> + zfree(&cd_map[i]);
> + }
> + zfree(&cd_map);
> +}
> +
> void perf_env__exit(struct perf_env *env)
> {
> int i, j;
> @@ -265,6 +293,7 @@ void perf_env__exit(struct perf_env *env)
> zfree(&env->pmu_caps[i].pmu_name);
> }
> zfree(&env->pmu_caps);
> + free_cpu_domain_info(env->cpu_domain, env->schedstat_version, env->nr_cpus_avail);
> }
>
> void perf_env__init(struct perf_env *env)
> diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
> index 9977b85523a8..76ba1a36e9ff 100644
> --- a/tools/perf/util/env.h
> +++ b/tools/perf/util/env.h
> @@ -54,6 +54,19 @@ struct pmu_caps {
> char *pmu_name;
> };
>
> +struct domain_info {
> + u32 domain;
> + char *dname;
> + char *cpumask;
> + char *cpulist;
> +};
> +
> +struct cpu_domain_map {
> + u32 cpu;
> + u32 nr_domains;
> + struct domain_info **domains;
> +};
> +
> typedef const char *(arch_syscalls__strerrno_t)(int err);
>
> struct perf_env {
> @@ -70,6 +83,8 @@ struct perf_env {
> unsigned int max_branches;
> unsigned int br_cntr_nr;
> unsigned int br_cntr_width;
> + unsigned int schedstat_version;
> + unsigned int max_sched_domains;
> int kernel_is_64_bit;
>
> int nr_cmdline;
> @@ -92,6 +107,7 @@ struct perf_env {
> char **cpu_pmu_caps;
> struct cpu_topology_map *cpu;
> struct cpu_cache_level *caches;
> + struct cpu_domain_map **cpu_domain;
> int caches_cnt;
> u32 comp_ratio;
> u32 comp_ver;
> @@ -151,6 +167,7 @@ struct bpf_prog_info_node;
> struct btf_node;
>
> int perf_env__read_core_pmu_caps(struct perf_env *env);
> +void free_cpu_domain_info(struct cpu_domain_map **cd_map, u32 schedstat_version, u32 nr);
> void perf_env__exit(struct perf_env *env);
>
> int perf_env__kernel_is_64_bit(struct perf_env *env);
> diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
> index f5cad377c99e..673d53bb2a2c 100644
> --- a/tools/perf/util/header.c
> +++ b/tools/perf/util/header.c
> @@ -1614,6 +1614,162 @@ static int write_pmu_caps(struct feat_fd *ff,
> return 0;
> }
>
> +static struct cpu_domain_map **build_cpu_domain_map(u32 *schedstat_version, u32 *max_sched_domains,
> + u32 nr)
> +{
> + struct domain_info *domain_info;
> + struct cpu_domain_map **cd_map;
> + char dname[16], cpumask[256];
You should likely size cpumask and cpulist based on NR_CPUS to be
safe.
A 256-character buffer only works up to about 1024 CPUs, no? These
days there are systems with more CPUs than that.
Sizing it from NR_CPUS will likely cover all the crazy
cases/configurations.
> + char cpulist[1024];
> + char *line = NULL;
> + u32 cpu, domain;
> + u32 dcount = 0;
> + size_t len;
> + FILE *fp;
> +
> + fp = fopen("/proc/schedstat", "r");
> + if (!fp) {
> + pr_err("Failed to open /proc/schedstat\n");
> + return NULL;
> + }
> +
> + cd_map = zalloc(sizeof(*cd_map) * nr);
> + if (!cd_map)
> + goto out;
> +
> + while (getline(&line, &len, fp) > 0) {
> + int retval;
> +
> + if (strncmp(line, "version", 7) == 0) {
> + retval = sscanf(line, "version %d\n", schedstat_version);
> + if (retval != 1)
> + continue;
> +
> + } else if (strncmp(line, "cpu", 3) == 0) {
> + retval = sscanf(line, "cpu%u %*s", &cpu);
> + if (retval == 1) {
> + cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
> + if (!cd_map[cpu])
> + goto out_free_line;
> + cd_map[cpu]->cpu = cpu;
> + } else
> + continue;
> +
> + dcount = 0;
> + } else if (strncmp(line, "domain", 6) == 0) {
> + struct domain_info **temp_domains;
> +
> + dcount++;
> + temp_domains = realloc(cd_map[cpu]->domains, dcount * sizeof(domain_info));
> + if (!temp_domains)
> + goto out_free_line;
> + else
> + cd_map[cpu]->domains = temp_domains;
> +
> + domain_info = zalloc(sizeof(*domain_info));
> + if (!domain_info)
> + goto out_free_line;
> +
> + cd_map[cpu]->domains[dcount - 1] = domain_info;
> +
> + if (*schedstat_version >= 17) {
> + retval = sscanf(line, "domain%u %s %s %*s", &domain, dname,
> + cpumask);
> + if (retval != 3)
> + continue;
> +
> + domain_info->dname = strdup(dname);
> + if (!domain_info->dname)
> + goto out_free_line;
> + } else {
> + retval = sscanf(line, "domain%u %s %*s", &domain, cpumask);
> + if (retval != 2)
> + continue;
> + }
> +
> + domain_info->domain = domain;
> + if (domain > *max_sched_domains)
> + *max_sched_domains = domain;
> +
> + domain_info->cpumask = strdup(cpumask);
> + if (!domain_info->cpumask)
> + goto out_free_line;
> +
> + cpumask_to_cpulist(cpumask, cpulist);
> + domain_info->cpulist = strdup(cpulist);
> + if (!domain_info->cpulist)
> + goto out_free_line;
> +
> + cd_map[cpu]->nr_domains = dcount;
> + }
> + }
> +
> +out_free_line:
> + free(line);
> +out:
> + fclose(fp);
> + return cd_map;
> +}
> +
> +static int write_cpu_domain_info(struct feat_fd *ff,
> + struct evlist *evlist __maybe_unused)
> +{
> + u32 max_sched_domains = 0, schedstat_version = 0;
> + struct cpu_domain_map **cd_map;
> + u32 i, j, nr, ret;
> +
> + nr = cpu__max_present_cpu().cpu;
> +
> + cd_map = build_cpu_domain_map(&schedstat_version, &max_sched_domains, nr);
> + if (!cd_map)
> + return -1;
> +
> + ret = do_write(ff, &schedstat_version, sizeof(u32));
> + if (ret < 0)
> + goto out;
> +
> + max_sched_domains += 1;
> + ret = do_write(ff, &max_sched_domains, sizeof(u32));
> + if (ret < 0)
> + goto out;
> +
> + for (i = 0; i < nr; i++) {
> + if (!cd_map[i])
> + continue;
> +
> + ret = do_write(ff, &cd_map[i]->cpu, sizeof(u32));
> + if (ret < 0)
> + goto out;
> +
> + ret = do_write(ff, &cd_map[i]->nr_domains, sizeof(u32));
> + if (ret < 0)
> + goto out;
> +
> + for (j = 0; j < cd_map[i]->nr_domains; j++) {
> + ret = do_write(ff, &cd_map[i]->domains[j]->domain, sizeof(u32));
> + if (ret < 0)
> + goto out;
> + if (schedstat_version >= 17) {
> + ret = do_write_string(ff, cd_map[i]->domains[j]->dname);
> + if (ret < 0)
> + goto out;
> + }
> +
> + ret = do_write_string(ff, cd_map[i]->domains[j]->cpumask);
> + if (ret < 0)
> + goto out;
> +
> + ret = do_write_string(ff, cd_map[i]->domains[j]->cpulist);
> + if (ret < 0)
> + goto out;
> + }
> + }
> +
> +out:
> + free_cpu_domain_info(cd_map, schedstat_version, nr);
> + return ret;
> +}
> +
> static void print_hostname(struct feat_fd *ff, FILE *fp)
> {
> fprintf(fp, "# hostname : %s\n", ff->ph->env.hostname);
> @@ -2247,6 +2403,39 @@ static void print_mem_topology(struct feat_fd *ff, FILE *fp)
> }
> }
>
> +static void print_cpu_domain_info(struct feat_fd *ff, FILE *fp)
> +{
> + struct cpu_domain_map **cd_map = ff->ph->env.cpu_domain;
> + u32 nr = ff->ph->env.nr_cpus_avail;
> + struct domain_info *d_info;
> + u32 i, j;
> +
> + fprintf(fp, "# schedstat version : %u\n", ff->ph->env.schedstat_version);
> + fprintf(fp, "# Maximum sched domains : %u\n", ff->ph->env.max_sched_domains);
> +
> + for (i = 0; i < nr; i++) {
> + if (!cd_map[i])
> + continue;
> +
> + fprintf(fp, "# cpu : %u\n", cd_map[i]->cpu);
> + fprintf(fp, "# nr_domains : %u\n", cd_map[i]->nr_domains);
> +
> + for (j = 0; j < cd_map[i]->nr_domains; j++) {
> + d_info = cd_map[i]->domains[j];
> + if (!d_info)
> + continue;
> +
> + fprintf(fp, "# Domain : %u\n", d_info->domain);
> +
> + if (ff->ph->env.schedstat_version >= 17)
> + fprintf(fp, "# Domain name : %s\n", d_info->dname);
> +
> + fprintf(fp, "# Domain cpu map : %s\n", d_info->cpumask);
> + fprintf(fp, "# Domain cpu list : %s\n", d_info->cpulist);
> + }
> + }
> +}
> +
> static int __event_process_build_id(struct perf_record_header_build_id *bev,
> char *filename,
> struct perf_session *session)
> @@ -3388,6 +3577,102 @@ static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused)
> return ret;
> }
>
> +static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused)
> +{
> + u32 schedstat_version, max_sched_domains, cpu, domain, nr_domains;
> + struct perf_env *env = &ff->ph->env;
> + char *dname, *cpumask, *cpulist;
> + struct cpu_domain_map **cd_map;
> + struct domain_info *d_info;
> + u32 nra, nr, i, j;
> + int ret;
> +
> + nra = env->nr_cpus_avail;
> + nr = env->nr_cpus_online;
> +
> + cd_map = zalloc(sizeof(*cd_map) * nra);
> + if (!cd_map)
> + return -1;
> +
> + env->cpu_domain = cd_map;
> +
> + ret = do_read_u32(ff, &schedstat_version);
> + if (ret)
> + return ret;
> +
> + env->schedstat_version = schedstat_version;
> +
> + ret = do_read_u32(ff, &max_sched_domains);
> + if (ret)
> + return ret;
> +
> + env->max_sched_domains = max_sched_domains;
> +
> + for (i = 0; i < nr; i++) {
> + if (do_read_u32(ff, &cpu))
> + return -1;
> +
> + cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
> + if (!cd_map[cpu])
> + return -1;
> +
> + cd_map[cpu]->cpu = cpu;
> +
> + if (do_read_u32(ff, &nr_domains))
> + return -1;
> +
> + cd_map[cpu]->nr_domains = nr_domains;
> +
> + cd_map[cpu]->domains = zalloc(sizeof(*d_info) * max_sched_domains);
> + if (!cd_map[cpu]->domains)
> + return -1;
> +
> + for (j = 0; j < nr_domains; j++) {
> + if (do_read_u32(ff, &domain))
> + return -1;
> +
> + d_info = zalloc(sizeof(*d_info));
> + if (!d_info)
> + return -1;
> +
> + cd_map[cpu]->domains[domain] = d_info;
> + d_info->domain = domain;
> +
> + if (schedstat_version >= 17) {
> + dname = do_read_string(ff);
> + if (!dname)
> + return -1;
> +
> + d_info->dname = zalloc(strlen(dname) + 1);
> + if (!d_info->dname)
> + return -1;
> +
> + d_info->dname = strdup(dname);
> + }
> +
> + cpumask = do_read_string(ff);
> + if (!cpumask)
> + return -1;
> +
> + d_info->cpumask = zalloc(strlen(cpumask) + 1);
> + if (!d_info->cpumask)
> + return -1;
> + d_info->cpumask = strdup(cpumask);
> +
> + cpulist = do_read_string(ff);
> + if (!cpulist)
> + return -1;
> +
> + d_info->cpulist = zalloc(strlen(cpulist) + 1);
> + if (!d_info->cpulist)
> + return -1;
> + d_info->cpulist = strdup(cpulist);
> + }
> + }
> +
> + return ret;
> +}
> +
> #define FEAT_OPR(n, func, __full_only) \
> [HEADER_##n] = { \
> .name = __stringify(n), \
> @@ -3453,6 +3738,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
> FEAT_OPR(CLOCK_DATA, clock_data, false),
> FEAT_OPN(HYBRID_TOPOLOGY, hybrid_topology, true),
> FEAT_OPR(PMU_CAPS, pmu_caps, false),
> + FEAT_OPR(CPU_DOMAIN_INFO, cpu_domain_info, true),
> };
>
> struct header_print_data {
> diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
> index c058021c3150..c62f3275a80f 100644
> --- a/tools/perf/util/header.h
> +++ b/tools/perf/util/header.h
> @@ -53,6 +53,7 @@ enum {
> HEADER_CLOCK_DATA,
> HEADER_HYBRID_TOPOLOGY,
> HEADER_PMU_CAPS,
> + HEADER_CPU_DOMAIN_INFO,
> HEADER_LAST_FEATURE,
> HEADER_FEAT_BITS = 256,
> };
> diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
> index 0f031eb80b4c..b87ff96a9f45 100644
> --- a/tools/perf/util/util.c
> +++ b/tools/perf/util/util.c
> @@ -257,6 +257,48 @@ static int rm_rf_kcore_dir(const char *path)
> return 0;
> }
>
> +void cpumask_to_cpulist(char *cpumask, char *cpulist)
> +{
> + int i, j, bm_size, nbits;
> + int len = strlen(cpumask);
> + unsigned long *bm;
> + char cpus[1024];
> +
> + for (i = 0; i < len; i++) {
> + if (cpumask[i] == ',') {
> + for (j = i; j < len; j++)
> + cpumask[j] = cpumask[j + 1];
> + }
> + }
> +
> + len = strlen(cpumask);
> + bm_size = (len + 15) / 16;
> + nbits = bm_size * 64;
> + if (nbits <= 0)
> + return;
> +
> + bm = calloc(bm_size, sizeof(unsigned long));
> + if (!cpumask)
> + goto free_bm;
> +
> + for (i = 0; i < bm_size; i++) {
> + char blk[17];
> + int blklen = len > 16 ? 16 : len;
> +
> + strncpy(blk, cpumask + len - blklen, blklen);
> + blk[blklen] = '\0';
> + bm[i] = strtoul(blk, NULL, 16);
> + cpumask[len - blklen] = '\0';
> + len = strlen(cpumask);
> + }
> +
> + bitmap_scnprintf(bm, nbits, cpus, sizeof(cpus));
> + strcpy(cpulist, cpus);
> +
> +free_bm:
> + free(bm);
> +}
> +
> int rm_rf_perf_data(const char *path)
> {
> const char *pat[] = {
> diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
> index 3423778e39a5..1572c8cf04e5 100644
> --- a/tools/perf/util/util.h
> +++ b/tools/perf/util/util.h
> @@ -11,6 +11,7 @@
> #include <stdbool.h>
> #include <stddef.h>
> #include <linux/compiler.h>
> +#include <linux/bitmap.h>
> #include <sys/types.h>
> #ifndef __cplusplus
> #include <internal/cpumap.h>
> @@ -48,6 +49,8 @@ bool sysctl__nmi_watchdog_enabled(void);
>
> int perf_tip(char **strp, const char *dirpath);
>
> +void cpumask_to_cpulist(char *cpumask, char *cpulist);
> +
> #ifndef HAVE_SCHED_GETCPU_SUPPORT
> int sched_getcpu(void);
> #endif
Powered by blists - more mailing lists