[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <3e7a84c0-a549-4e83-ba1f-beb0fe8a2930@amd.com>
Date: Tue, 20 Jan 2026 00:05:37 +0530
From: Swapnil Sapkal <swapnil.sapkal@....com>
To: Namhyung Kim <namhyung@...nel.org>
CC: <peterz@...radead.org>, <mingo@...hat.com>, <acme@...nel.org>,
<irogers@...gle.com>, <james.clark@....com>, <ravi.bangoria@....com>,
<yu.c.chen@...el.com>, <mark.rutland@....com>,
<alexander.shishkin@...ux.intel.com>, <jolsa@...nel.org>,
<rostedt@...dmis.org>, <vincent.guittot@...aro.org>,
<adrian.hunter@...el.com>, <kan.liang@...ux.intel.com>,
<gautham.shenoy@....com>, <kprateek.nayak@....com>, <juri.lelli@...hat.com>,
<yangjihong@...edance.com>, <void@...ifault.com>, <tj@...nel.org>,
<sshegde@...ux.ibm.com>, <ctshao@...gle.com>, <quic_zhonhan@...cinc.com>,
<thomas.falcon@...el.com>, <blakejones@...gle.com>, <ashelat@...hat.com>,
<leo.yan@....com>, <dvyukov@...gle.com>, <ak@...ux.intel.com>,
<yujie.liu@...el.com>, <graham.woodward@....com>, <ben.gainey@....com>,
<vineethr@...ux.ibm.com>, <tim.c.chen@...ux.intel.com>, <linux@...blig.org>,
<linux-kernel@...r.kernel.org>, <linux-perf-users@...r.kernel.org>,
<santosh.shukla@....com>, <sandipan.das@....com>, James Clark
<james.clark@...aro.org>
Subject: Re: [PATCH RESEND v4 07/11] perf sched stats: Add support for report
subcommand
Hi Namhyung,
On 09-01-2026 16:55, Swapnil Sapkal wrote:
> Hello Namhyung,
>
> On 03-01-2026 04:05, Namhyung Kim wrote:
>> On Tue, Sep 09, 2025 at 11:42:23AM +0000, Swapnil Sapkal wrote:
>>> `perf sched stats record` captures two sets of samples. For workload
>>> profile, first set right before workload starts and second set after
>>> workload finishes. For the systemwide profile, first set at the
>>> beginning of profile and second set on receiving SIGINT signal.
>>>
>>> Add `perf sched stats report` subcommand that will read both the set
>>> of samples, get the diff and render a final report. Final report prints
>>> scheduler stat at cpu granularity as well as sched domain granularity.
>>>
>>> Example usage:
>>>
>>> # perf sched stats record
>>> # perf sched stats report
>>
>> It'd be great if you could add an example output as well.
>>
>
> Sure, will do.
>
>>>
>>> Co-developed-by: Ravi Bangoria <ravi.bangoria@....com>
>>> Signed-off-by: Ravi Bangoria <ravi.bangoria@....com>
>>> Tested-by: James Clark <james.clark@...aro.org>
>>> Signed-off-by: Swapnil Sapkal <swapnil.sapkal@....com>
>>> ---
>>> tools/perf/builtin-sched.c | 509 ++++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 508 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
>>> index 2573491fa5f8..e23018798f5b 100644
>>> --- a/tools/perf/builtin-sched.c
>>> +++ b/tools/perf/builtin-sched.c
>>> @@ -3940,6 +3940,505 @@ static int
>>> perf_sched__schedstat_record(struct perf_sched *sched,
>>> return err;
>>> }
>>> +struct schedstat_domain {
>>> + struct list_head domain_list;
>>> + struct perf_record_schedstat_domain *domain_data;
>>> +};
>>> +
>>> +struct schedstat_cpu {
>>> + struct list_head cpu_list;
>>> + struct list_head domain_head;
>>> + struct perf_record_schedstat_cpu *cpu_data;
>>> +};
>>> +
>>> +static struct list_head cpu_head = LIST_HEAD_INIT(cpu_head);
>>> +static struct schedstat_cpu *cpu_second_pass;
>>> +static struct schedstat_domain *domain_second_pass;
>>> +static bool after_workload_flag;
>>> +static bool verbose_field;
>>> +
>>> +static void store_schedtstat_cpu_diff(struct schedstat_cpu
>>> *after_workload)
>>> +{
>>> + struct perf_record_schedstat_cpu *before = cpu_second_pass-
>>> >cpu_data;
>>> + struct perf_record_schedstat_cpu *after = after_workload->cpu_data;
>>> + __u16 version = after_workload->cpu_data->version;
>>> +
>>> +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of,
>>> _ver) \
>>> + (before->_ver._name = after->_ver._name - before->_ver._name)
>>> +
>>> + if (version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +
>>> +#undef CPU_FIELD
>>> +}
>>> +
>>> +static void store_schedstat_domain_diff(struct schedstat_domain
>>> *after_workload)
>>> +{
>>> + struct perf_record_schedstat_domain *before =
>>> domain_second_pass->domain_data;
>>> + struct perf_record_schedstat_domain *after = after_workload-
>>> >domain_data;
>>> + __u16 version = after_workload->domain_data->version;
>>> +
>>> +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies,
>>> _ver) \
>>> + (before->_ver._name = after->_ver._name - before->_ver._name)
>>> +
>>> + if (version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +#undef DOMAIN_FIELD
>>> +}
>>> +
>>> +static inline void print_cpu_stats(struct perf_record_schedstat_cpu
>>> *cs)
>>> +{
>>> + printf("%-65s %12s %12s\n", "DESC", "COUNT", "PCT_CHANGE");
>>> + printf("%.*s\n", 100, graph_dotted_line);
>>> +
>>> +#define CALC_PCT(_x, _y) ((_y) ? ((double)(_x) / (_y)) * 100 : 0.0)
>>> +
>>> +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of,
>>> _ver) \
>>> + do { \
>>> + printf("%-65s: " _format, verbose_field ? _desc : #_name, \
>>> + cs->_ver._name); \
>>> + if (_is_pct) { \
>>> + printf(" ( %8.2lf%% )", \
>>> + CALC_PCT(cs->_ver._name, cs->_ver._pct_of)); \
>>> + } \
>>> + printf("\n"); \
>>> + } while (0)
>>> +
>>> + if (cs->version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (cs->version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (cs->version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +
>>> +#undef CPU_FIELD
>>> +#undef CALC_PCT
>>> +}
>>> +
>>> +static inline void print_domain_stats(struct
>>> perf_record_schedstat_domain *ds,
>>> + __u64 jiffies)
>>> +{
>>> + printf("%-65s %12s %14s\n", "DESC", "COUNT", "AVG_JIFFIES");
>>> +
>>> +#define DOMAIN_CATEGORY(_desc) \
>>> + do { \
>>> + size_t _len = strlen(_desc); \
>>> + size_t _pre_dash_cnt = (100 - _len) / 2; \
>>> + size_t _post_dash_cnt = 100 - _len - _pre_dash_cnt; \
>>> + print_separator((int)_pre_dash_cnt, _desc,
>>> (int)_post_dash_cnt);\
>>> + } while (0)
>>> +
>>> +#define CALC_AVG(_x, _y) ((_y) ? (long double)(_x) / (_y) : 0.0)
>>> +
>>> +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies,
>>> _ver) \
>>> + do { \
>>> + printf("%-65s: " _format, verbose_field ? _desc : #_name, \
>>> + ds->_ver._name); \
>>> + if (_is_jiffies) { \
>>> + printf(" $ %11.2Lf $", \
>>> + CALC_AVG(jiffies, ds->_ver._name)); \
>>> + } \
>>> + printf("\n"); \
>>> + } while (0)
>>> +
>>> +#define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z,
>>> _ver) \
>>> + printf("*%-64s: " _format "\n", verbose_field ? _desc :
>>> #_name, \
>>> + (ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z))
>>> +
>>> +#define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w,
>>> _ver) \
>>> + printf("*%-64s: " _format "\n", verbose_field ? _desc :
>>> #_name, \
>>> + CALC_AVG(ds->_ver._w, \
>>> + ((ds->_ver._x) - (ds->_ver._y) - (ds->_ver._z))))
>>> +
>>> + if (ds->version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (ds->version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (ds->version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +
>>> +#undef DERIVED_AVG_FIELD
>>> +#undef DERIVED_CNT_FIELD
>>> +#undef DOMAIN_FIELD
>>> +#undef CALC_AVG
>>> +#undef DOMAIN_CATEGORY
>>> +}
>>> +
>>> +static void summarize_schedstat_cpu(struct schedstat_cpu *summary_cpu,
>>> + struct schedstat_cpu *cptr,
>>> + int cnt, bool is_last)
>>> +{
>>> + struct perf_record_schedstat_cpu *summary_cs = summary_cpu-
>>> >cpu_data,
>>> + *temp_cs = cptr->cpu_data;
>>> +
>>> +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of,
>>> _ver) \
>>> + do { \
>>> + summary_cs->_ver._name += temp_cs->_ver._name; \
>>> + if (is_last) \
>>> + summary_cs->_ver._name /= cnt; \
>>> + } while (0)
>>> +
>>> + if (cptr->cpu_data->version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (cptr->cpu_data->version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (cptr->cpu_data->version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +#undef CPU_FIELD
>>> +}
>>> +
>>> +static void summarize_schedstat_domain(struct schedstat_domain
>>> *summary_domain,
>>> + struct schedstat_domain *dptr,
>>> + int cnt, bool is_last)
>>> +{
>>> + struct perf_record_schedstat_domain *summary_ds =
>>> summary_domain->domain_data,
>>> + *temp_ds = dptr->domain_data;
>>> +
>>> +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies,
>>> _ver) \
>>> + do { \
>>> + summary_ds->_ver._name += temp_ds->_ver._name; \
>>> + if (is_last) \
>>> + summary_ds->_ver._name /= cnt; \
>>> + } while (0)
>>> +
>>> + if (dptr->domain_data->version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (dptr->domain_data->version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (dptr->domain_data->version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +#undef DOMAIN_FIELD
>>> +}
>>> +
>>> +static int get_all_cpu_stats(struct list_head *head)
>>> +{
>>> + struct schedstat_cpu *cptr = list_first_entry(head, struct
>>> schedstat_cpu, cpu_list);
>>> + struct schedstat_cpu *summary_head = NULL;
>>> + struct perf_record_schedstat_domain *ds;
>>> + struct perf_record_schedstat_cpu *cs;
>>> + struct schedstat_domain *dptr, *tdptr;
>>> + bool is_last = false;
>>> + int cnt = 1;
>>> + int ret = 0;
>>> +
>>> + if (cptr) {
>>> + summary_head = zalloc(sizeof(*summary_head));
>>> + if (!summary_head)
>>> + return -ENOMEM;
>>> +
>>> + summary_head->cpu_data = zalloc(sizeof(*cs));
>>> + memcpy(summary_head->cpu_data, cptr->cpu_data, sizeof(*cs));
>>> +
>>> + INIT_LIST_HEAD(&summary_head->domain_head);
>>> +
>>> + list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
>>> + tdptr = zalloc(sizeof(*tdptr));
>>> + if (!tdptr)
>>> + return -ENOMEM;
>>> +
>>> + tdptr->domain_data = zalloc(sizeof(*ds));
>>> + if (!tdptr->domain_data)
>>> + return -ENOMEM;
>>
>> Please free tdptr too.
>>
>
> Ack.
Apologies for not including this in my previous reply. The memory
allocated for tdptr and tdptr->domain_data is part of a list created to
store CPU and domain related information. In the next version, I've
added a comment describing this list. The list is used by the
show_schedstat_data() function to print the output, and it is later
freed by the free_schedstat() function. That is why I did not address
this in version 5.
>
>>> +
>>> + memcpy(tdptr->domain_data, dptr->domain_data, sizeof(*ds));
>>> + list_add_tail(&tdptr->domain_list, &summary_head-
>>> >domain_head);
>>> + }
>>> + }
>>> +
>>> +
>>> + list_for_each_entry(cptr, head, cpu_list) {
>>> + if (list_is_first(&cptr->cpu_list, head))
>>> + continue;
>>> +
>>> + if (list_is_last(&cptr->cpu_list, head))
>>> + is_last = true;
>>> +
>>> + cnt++;
>>> + summarize_schedstat_cpu(summary_head, cptr, cnt, is_last);
>>> + tdptr = list_first_entry(&summary_head->domain_head, struct
>>> schedstat_domain,
>>> + domain_list);
>>> +
>>> + list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
>>> + summarize_schedstat_domain(tdptr, dptr, cnt, is_last);
>>> + tdptr = list_next_entry(tdptr, domain_list);
>>> + }
>>> + }
>>> +
>>> + list_add(&summary_head->cpu_list, head);
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static void print_field_description(struct schedstat_cpu *cptr)
>>> +{
>>> +#define CPU_FIELD(_type, _name, _desc, _format, _is_pct, _pct_of,
>>> _ver) \
>>> + printf("%-30s-> %s\n", #_name, _desc) \
>>> +
>>> +#define DOMAIN_CATEGORY(_desc) \
>>> + do { \
>>> + size_t _len = strlen(_desc); \
>>> + size_t _pre_dash_cnt = (100 - _len) / 2; \
>>> + size_t _post_dash_cnt = 100 - _len - _pre_dash_cnt; \
>>> + print_separator((int)_pre_dash_cnt, _desc,
>>> (int)_post_dash_cnt);\
>>> + } while (0)
>>> +
>>> +#define DOMAIN_FIELD(_type, _name, _desc, _format, _is_jiffies,
>>> _ver) \
>>> + printf("%-30s-> %s\n", #_name, _desc) \
>>> +
>>> +#define DERIVED_CNT_FIELD(_name, _desc, _format, _x, _y, _z,
>>> _ver) \
>>> + printf("*%-29s-> %s\n", #_name, _desc) \
>>> +
>>> +#define DERIVED_AVG_FIELD(_name, _desc, _format, _x, _y, _z, _w,
>>> _ver) \
>>> + printf("*%-29s-> %s\n", #_name, _desc) \
>>> +
>>> + if (cptr->cpu_data->version == 15) {
>>> +#include <perf/schedstat-v15.h>
>>> + } else if (cptr->cpu_data->version == 16) {
>>> +#include <perf/schedstat-v16.h>
>>> + } else if (cptr->cpu_data->version == 17) {
>>> +#include <perf/schedstat-v17.h>
>>> + }
>>> +#undef CPU_FIELD
>>> +#undef DOMAIN_CATEGORY
>>> +#undef DERIVED_CNT_FIELD
>>> +#undef DERIVED_AVG_FIELD
>>> +#undef DOMAIN_FIELD
>>> +}
>>> +
>>> +static int show_schedstat_data(struct list_head *head, struct
>>> cpu_domain_map **cd_map)
>>> +{
>>> + struct schedstat_cpu *cptr = list_first_entry(head, struct
>>> schedstat_cpu, cpu_list);
>>> + __u64 jiffies = cptr->cpu_data->timestamp;
>>> + struct perf_record_schedstat_domain *ds;
>>> + struct perf_record_schedstat_cpu *cs;
>>> + struct schedstat_domain *dptr;
>>> + bool is_summary = true;
>>> + int ret = 0;
>>> +
>>> + printf("Description\n");
>>> + print_separator(100, "", 0);
>>> + printf("%-30s-> %s\n", "DESC", "Description of the field");
>>> + printf("%-30s-> %s\n", "COUNT", "Value of the field");
>>> + printf("%-30s-> %s\n", "PCT_CHANGE", "Percent change with
>>> corresponding base value");
>>> + printf("%-30s-> %s\n", "AVG_JIFFIES",
>>> + "Avg time in jiffies between two consecutive occurrence
>>> of event");
>>> +
>>> + if (!verbose_field) {
>>> + print_separator(100, "", 0);
>>> + print_field_description(cptr);
>>> + }
>>> +
>>> + print_separator(100, "", 0);
>>> + printf("\n");
>>> +
>>> + printf("%-65s: %11llu\n", "Time elapsed (in jiffies)", jiffies);
>>> + print_separator(100, "", 0);
>>> +
>>> + ret = get_all_cpu_stats(head);
>>> +
>>> + list_for_each_entry(cptr, head, cpu_list) {
>>> + cs = cptr->cpu_data;
>>> + printf("\n");
>>> + print_separator(100, "", 0);
>>> +
>>> + if (is_summary)
>>> + printf("CPU <ALL CPUS SUMMARY>\n");
>>> + else
>>> + printf("CPU %d\n", cs->cpu);
>>> +
>>> + print_separator(100, "", 0);
>>> + print_cpu_stats(cs);
>>> + print_separator(100, "", 0);
>>> +
>>> + list_for_each_entry(dptr, &cptr->domain_head, domain_list) {
>>> + struct domain_info *dinfo;
>>> +
>>> + ds = dptr->domain_data;
>>> + dinfo = cd_map[ds->cpu]->domains[ds->domain];
>>> + if (is_summary)
>>
>> Please add a pair of braces for multiline statements.
>>
>
> Ack.
>
>>
>>> + if (dinfo->dname)
>>> + printf("CPU <ALL CPUS SUMMARY>, DOMAIN %s\n",
>>> dinfo->dname);
>>> + else
>>> + printf("CPU <ALL CPUS SUMMARY>, DOMAIN %d\n",
>>> dinfo->domain);
>>> + else {
>>> + if (dinfo->dname)
>>> + printf("CPU %d, DOMAIN %s CPUS ", cs->cpu,
>>> dinfo->dname);
>>> + else
>>> + printf("CPU %d, DOMAIN %d CPUS ", cs->cpu,
>>> dinfo->domain);
>>> +
>>> + printf("%s\n", dinfo->cpulist);
>>> + }
>>> + print_separator(100, "", 0);
>>> + print_domain_stats(ds, jiffies);
>>> + print_separator(100, "", 0);
>>> + }
>>> + is_summary = false;
>>> + }
>>> + return ret;
>>> +}
>>> +
>>> +static int perf_sched__process_schedstat(struct perf_session
>>> *session __maybe_unused,
>>> + union perf_event *event)
>>> +{
>>> + struct perf_cpu this_cpu;
>>> + static __u32 initial_cpu;
>>> +
>>> + switch (event->header.type) {
>>> + case PERF_RECORD_SCHEDSTAT_CPU:
>>> + this_cpu.cpu = event->schedstat_cpu.cpu;
>>> + break;
>>> + case PERF_RECORD_SCHEDSTAT_DOMAIN:
>>> + this_cpu.cpu = event->schedstat_domain.cpu;
>>> + break;
>>> + default:
>>> + return 0;
>>> + }
>>> +
>>> + if (user_requested_cpus && !
>>> perf_cpu_map__has(user_requested_cpus, this_cpu))
>>> + return 0;
>>> +
>>> + if (event->header.type == PERF_RECORD_SCHEDSTAT_CPU) {
>>> + struct schedstat_cpu *temp = zalloc(sizeof(*temp));
>>> +
>>> + if (!temp)
>>> + return -ENOMEM;
>>> +
>>> + temp->cpu_data = zalloc(sizeof(*temp->cpu_data));
>>> + if (!temp->cpu_data)
>>> + return -ENOMEM;
>>
>> Please free temp as well.
>>
>
> Ack.
As mentioned above, this is also freed by the free_schedstat() function.
>
>>> +
>>> + memcpy(temp->cpu_data, &event->schedstat_cpu, sizeof(*temp-
>>> >cpu_data));
>>> +
>>> + if (!list_empty(&cpu_head) && temp->cpu_data->cpu ==
>>> initial_cpu)
>>> + after_workload_flag = true;
>>> +
>>> + if (!after_workload_flag) {
>>> + if (list_empty(&cpu_head))
>>> + initial_cpu = temp->cpu_data->cpu;
>>> +
>>> + list_add_tail(&temp->cpu_list, &cpu_head);
>>> + INIT_LIST_HEAD(&temp->domain_head);
>>> + } else {
>>> + if (temp->cpu_data->cpu == initial_cpu) {
>>> + cpu_second_pass = list_first_entry(&cpu_head, struct
>>> schedstat_cpu,
>>> + cpu_list);
>>> + cpu_second_pass->cpu_data->timestamp =
>>> + temp->cpu_data->timestamp - cpu_second_pass-
>>> >cpu_data->timestamp;
>>> + } else {
>>> + cpu_second_pass = list_next_entry(cpu_second_pass,
>>> cpu_list);
>>> + }
>>> + domain_second_pass = list_first_entry(&cpu_second_pass-
>>> >domain_head,
>>> + struct schedstat_domain,
>>> domain_list);
>>> + store_schedtstat_cpu_diff(temp);
>>> + free(temp);
>>
>> What about temp->cpu_data?
>>
>
> Sure, will free it.
>
Same here.
>>
>>> + }
>>> + } else if (event->header.type == PERF_RECORD_SCHEDSTAT_DOMAIN) {
>>> + struct schedstat_cpu *cpu_tail;
>>> + struct schedstat_domain *temp = zalloc(sizeof(*temp));
>>> +
>>> + if (!temp)
>>> + return -ENOMEM;
>>> +
>>> + temp->domain_data = zalloc(sizeof(*temp->domain_data));
>>> + if (!temp->domain_data)
>>> + return -ENOMEM;
>>
>> Ditto.
>>
>
> Ack.
Same here.
>
>>> +
>>> + memcpy(temp->domain_data, &event->schedstat_domain,
>>> sizeof(*temp->domain_data));
>>> +
>>> + if (!after_workload_flag) {
>>> + cpu_tail = list_last_entry(&cpu_head, struct
>>> schedstat_cpu, cpu_list);
>>> + list_add_tail(&temp->domain_list, &cpu_tail->domain_head);
>>> + } else {
>>> + store_schedstat_domain_diff(temp);
>>> + domain_second_pass = list_next_entry(domain_second_pass,
>>> domain_list);
>>> + free(temp);
>>
>> Ditto.
>>
>
> Ack.
>
Same here.
> --
> Thanks and Regards,
> Swapnil
>
>> Thanks,
>> Namhyung
>>
--
Thanks and Regards,
Swapnil
Powered by blists - more mailing lists