Message-ID: <o6lskhkmwehl4snhpx6j7zlgv3npz3ckp6npnjizqphtmyh7dn@e4ige74hbqau>
Date: Wed, 29 Jan 2025 12:03:35 +0900
From: Koichiro Den <koichiro.den@...onical.com>
To: Dhananjay Ugwekar <Dhananjay.Ugwekar@....com>
Cc: peterz@...radead.org, mingo@...hat.com, rui.zhang@...el.com,
irogers@...gle.com, kan.liang@...ux.intel.com, tglx@...utronix.de, bp@...en8.de,
gautham.shenoy@....com, kprateek.nayak@....com, ravi.bangoria@....com, x86@...nel.org,
linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v7 10/10] perf/x86/rapl: Add core energy counter support
for AMD CPUs
On Tue, Jan 28, 2025 at 01:41:51PM GMT, Dhananjay Ugwekar wrote:
> On 1/20/2025 5:12 PM, Dhananjay Ugwekar wrote:
> > On 1/15/2025 7:53 PM, Koichiro Den wrote:
> >> On Mon, Jan 13, 2025 at 12:04:43PM GMT, Dhananjay Ugwekar wrote:
> >>> On 1/12/2025 7:12 PM, Koichiro Den wrote:
> >>>> On Fri, Nov 15, 2024 at 06:08:06AM GMT, Dhananjay Ugwekar wrote:
> >>>>> Add a new "power_core" PMU and "energy-core" event for monitoring
> >>>>> energy consumption by each individual core. The existing energy-cores
> >>>>> event aggregates the energy consumption of CPU cores at the package level.
> >>>>> This new event aligns with AMD's per-core energy counters.
> >>>>>
> >>>>> Tested the package level and core level PMU counters with workloads
> >>>>> pinned to different CPUs.
> >>>>>
> >>>>> Results with workload pinned to CPU 4 in core 4 on an AMD Zen4 Genoa
> >>>>> machine:
> >>>>>
> >>>>> $ sudo perf stat --per-core -e power_core/energy-core/ -- taskset -c 4 stress-ng --matrix 1 --timeout 5s
> >>>>> stress-ng: info: [21250] setting to a 5 second run per stressor
> >>>>> stress-ng: info: [21250] dispatching hogs: 1 matrix
> >>>>> stress-ng: info: [21250] successful run completed in 5.00s
> >>>>>
> >>>>> Performance counter stats for 'system wide':
> >>>>>
> >>>>> S0-D0-C0 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D0-C1 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D0-C2 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D0-C3 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D0-C4 1 8.43 Joules power_core/energy-core/
> >>>>> S0-D0-C5 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D0-C6 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D0-C7 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D1-C8 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D1-C9 1 0.00 Joules power_core/energy-core/
> >>>>> S0-D1-C10 1 0.00 Joules power_core/energy-core/
> >>>>>
> >>>>> Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@....com>
> >>>>> Reviewed-by: Gautham R. Shenoy <gautham.shenoy@....com>
> >>>>> ---
> >>>>> arch/x86/events/rapl.c | 185 +++++++++++++++++++++++++++++++++--------
> >>>>> 1 file changed, 152 insertions(+), 33 deletions(-)
> >>>>>
> >>>>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
> >>>>> index 6e51386ff91f..e9be1f31163d 100644
> >>>>> --- a/arch/x86/events/rapl.c
> >>>>> +++ b/arch/x86/events/rapl.c
> >>>>> @@ -39,6 +39,10 @@
> >>>>> * event: rapl_energy_psys
> >>>>> * perf code: 0x5
> >>>>> *
> >>>>> + * core counter: consumption of a single physical core
> >>>>> + * event: rapl_energy_core (power_core PMU)
> >>>>> + * perf code: 0x1
> >>>>> + *
> >>>>> * We manage those counters as free running (read-only). They may be
> >>>>> * use simultaneously by other tools, such as turbostat.
> >>>>> *
> >>>>> @@ -81,6 +85,10 @@ enum perf_rapl_pkg_events {
> >>>>> NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
> >>>>> };
> >>>>>
> >>>>> +#define PERF_RAPL_CORE 0 /* single core */
> >>>>> +#define PERF_RAPL_CORE_EVENTS_MAX 1
> >>>>> +#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
> >>>>> +
> >>>>> static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
> >>>>> "pp0-core",
> >>>>> "package",
> >>>>> @@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst
> >>>>> "psys",
> >>>>> };
> >>>>>
> >>>>> +static const char *const rapl_core_domain_name __initconst = "core";
> >>>>> +
> >>>>> /*
> >>>>> * event code: LSB 8 bits, passed in attr->config
> >>>>> * any other bit is reserved
> >>>>> @@ -141,14 +151,18 @@ enum rapl_unit_quirk {
> >>>>>
> >>>>> struct rapl_model {
> >>>>> struct perf_msr *rapl_pkg_msrs;
> >>>>> + struct perf_msr *rapl_core_msrs;
> >>>>> unsigned long pkg_events;
> >>>>> + unsigned long core_events;
> >>>>> unsigned int msr_power_unit;
> >>>>> enum rapl_unit_quirk unit_quirk;
> >>>>> };
> >>>>>
> >>>>> /* 1/2^hw_unit Joule */
> >>>>> static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
> >>>>> +static int rapl_core_hw_unit __read_mostly;
> >>>>> static struct rapl_pmus *rapl_pmus_pkg;
> >>>>> +static struct rapl_pmus *rapl_pmus_core;
> >>>>> static u64 rapl_timer_ms;
> >>>>> static struct rapl_model *rapl_model;
> >>>>>
> >>>>> @@ -156,14 +170,23 @@ static struct rapl_model *rapl_model;
> >>>>> * Helper function to get the correct topology id according to the
> >>>>> * RAPL PMU scope.
> >>>>> */
> >>>>> -static inline unsigned int get_rapl_pmu_idx(int cpu)
> >>>>> -{ /*
> >>>>> +static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
> >>>>> +{
> >>>>> + /*
> >>>>> * Returns unsigned int, which converts the '-1' return value
> >>>>> * (for non-existent mappings in topology map) to UINT_MAX, so
> >>>>> * the error check in the caller is simplified.
> >>>>> */
> >>>>> - return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
> >>>>> - topology_logical_die_id(cpu);
> >>>>> + switch (scope) {
> >>>>> + case PERF_PMU_SCOPE_PKG:
> >>>>> + return topology_logical_package_id(cpu);
> >>>>> + case PERF_PMU_SCOPE_DIE:
> >>>>> + return topology_logical_die_id(cpu);
> >>>>> + case PERF_PMU_SCOPE_CORE:
> >>>>> + return topology_logical_core_id(cpu);
> >>>>> + default:
> >>>>> + return -EINVAL;
> >>>>> + }
> >>>>> }
> >>>>>
> >>>>> static inline u64 rapl_read_counter(struct perf_event *event)
> >>>>> @@ -173,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
> >>>>> return raw;
> >>>>> }
> >>>>>
> >>>>> -static inline u64 rapl_scale(u64 v, int cfg)
> >>>>> +static inline u64 rapl_scale(u64 v, struct perf_event *event)
> >>>>> {
> >>>>> - if (cfg > NR_RAPL_PKG_DOMAINS) {
> >>>>> - pr_warn("Invalid domain %d, failed to scale data\n", cfg);
> >>>>> - return v;
> >>>>> - }
> >>>>> + int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
> >>>>> +
> >>>>> + if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
> >>>>> + hw_unit = rapl_core_hw_unit;
> >>>>> +
> >>>>> /*
> >>>>> * scale delta to smallest unit (1/2^32)
> >>>>> * users must then scale back: count * 1/(1e9*2^32) to get Joules
> >>>>> * or use ldexp(count, -32).
> >>>>> * Watts = Joules/Time delta
> >>>>> */
> >>>>> - return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
> >>>>> + return v << (32 - hw_unit);
> >>>>> }
> >>>>>
> >>>>> static u64 rapl_event_update(struct perf_event *event)
> >>>>> @@ -212,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
> >>>>> delta = (new_raw_count << shift) - (prev_raw_count << shift);
> >>>>> delta >>= shift;
> >>>>>
> >>>>> - sdelta = rapl_scale(delta, event->hw.config);
> >>>>> + sdelta = rapl_scale(delta, event);
> >>>>>
> >>>>> local64_add(sdelta, &event->count);
> >>>>>
> >>>>> @@ -341,13 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
> >>>>> static int rapl_pmu_event_init(struct perf_event *event)
> >>>>> {
> >>>>> u64 cfg = event->attr.config & RAPL_EVENT_MASK;
> >>>>> - int bit, ret = 0;
> >>>>> + int bit, rapl_pmus_scope, ret = 0;
> >>>>> struct rapl_pmu *rapl_pmu;
> >>>>> unsigned int rapl_pmu_idx;
> >>>>> + struct rapl_pmus *rapl_pmus;
> >>>>>
> >>>>> - /* only look at RAPL events */
> >>>>> - if (event->attr.type != rapl_pmus_pkg->pmu.type)
> >>>>> - return -ENOENT;
> >>>>> + /* unsupported modes and filters */
> >>>>> + if (event->attr.sample_period) /* no sampling */
> >>>>> + return -EINVAL;
> >>>>
> >>>> Hi Dhananjay,
> >>>>
> >>>> On linux-next, since this commit, it seems that simple sampling with 'perf
> >>>> record -- <command>' (i.e. the default event), 'perf top', etc. can
> >>>> unexpectedly fail, because rapl_pmu_event_init() now returns -EINVAL
> >>>> instead of -ENOENT even when the event type does not match. I observed
> >>>> that this prevents evsel__fallback() from falling back to cpu-clock or
> >>>> task-clock.
> >>>>
> >>>> Should we reorder the checks in rapl_pmu_event_init() to allow an early
> >>>> return with -ENOENT in such cases, as shown below? I'm not very familiar
> >>>> with this area and I might be missing something. I'd appreciate it if you
> >>>> could share your thoughts.
> >>>>
> >>>> --- a/arch/x86/events/rapl.c
> >>>> +++ b/arch/x86/events/rapl.c
> >>>> @@ -370,17 +370,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
> >>>> unsigned int rapl_pmu_idx;
> >>>> struct rapl_pmus *rapl_pmus;
> >>>>
> >>>> - /* unsupported modes and filters */
> >>>> - if (event->attr.sample_period) /* no sampling */
> >>>> - return -EINVAL;
> >>>> -
> >>>> - /* check only supported bits are set */
> >>>> - if (event->attr.config & ~RAPL_EVENT_MASK)
> >>>> - return -EINVAL;
> >>>> -
> >>>> - if (event->cpu < 0)
> >>>> - return -EINVAL;
> >>>> -
> >>>> rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
> >>>> if (!rapl_pmus)
> >>>> return -EINVAL;
> >>>> @@ -411,6 +400,17 @@ static int rapl_pmu_event_init(struct perf_event *event)
> >>>> } else
> >>>> return -EINVAL;
> >>>>
> >>>> + /* unsupported modes and filters */
> >>>> + if (event->attr.sample_period) /* no sampling */
> >>>> + return -EINVAL;
> >>>> +
> >>>> + /* check only supported bits are set */
> >>>> + if (event->attr.config & ~RAPL_EVENT_MASK)
> >>>> + return -EINVAL;
> >>>> +
> >>>> + if (event->cpu < 0)
> >>>> + return -EINVAL;
> >>>> +
> >>>> /* check event supported */
> >>>> if (!(rapl_pmus->cntr_mask & (1 << bit)))
> >>>> return -EINVAL;
> >>>>
> >>>> Thanks.
> >>>>
> >>>> -Koichiro
> >>>
> >>> Hello Koichiro,
> >>>
> >>> I tried reproducing the issue you mentioned using the "sudo perf record -- sleep 2"
> >>> and "sudo perf top" commands on an AMD EPYC system, and both commands worked
> >>> successfully. Could you please mention which system you are using and the exact
> >>> commands that reproduce the issue?
> >>>
> >>> My analysis is that if we run "perf record/top" with the default event, we would
> >>> not enter rapl_pmu_event_init() at all, which would render the reordering of the
> >>> type checks irrelevant. Regardless, please let me know how I can reproduce the issue.
> >>>
> >>> Thanks,
> >>> Dhananjay
> >>
> >> Hi,
> >>
> >> Apologies for the delayed response, and thank you for your comment. I
> >> confirmed that just running "perf top" on a qemu instance reproduces it.
> >> The host CPU model is Intel Core i9-13900K, which is passed through to
> >> the guest.
> >>
> >> In my case, no PMU for PERF_TYPE_RAW is registered, but the RAPL PMU is
> >> present. perf_init_event() then reaches the line marked "---->" below, and
> >> rapl_pmu_event_init() runs and returns -EINVAL before the type check.
> >>
> >> static struct pmu *perf_init_event(struct perf_event *event)
> >> {
> >> [...]
> >> if (pmu) {
> >> [...]
> >> goto unlock;
> >> }
> >>
> >> list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
> >> ----> ret = perf_try_init_event(pmu, event);
> >> if (!ret)
> >> goto unlock;
> >>
> >> if (ret != -ENOENT) {
> >> pmu = ERR_PTR(ret);
> >> goto unlock;
> >> }
> >> }
> >>
> >> I'll look into this a bit more on my side later and get back to you if
> >> something becomes clear.
> >
> > Sorry for the delayed response. Can you please try the diff below and let me
> > know if it fixes the issue?
>
> Hello Koichiro,
>
> Gentle ping: please let me know once you have tried the fix below.
Sorry for the late reply. Yes, it works.
Returning -ENOENT early when event->attr.type != event->pmu->type also seems
to be a common approach in other PMU implementations.
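
For anyone else hitting this, here is a minimal userspace sketch (my own
illustration, not part of any patch) of how the difference shows up. It
assumes a setup like mine, i.e. a guest with no hardware PMU registered but
with the RAPL PMU present, and simply opens the default sampling event
roughly the way perf does:

/*
 * Illustration only: with the original check ordering this is expected to
 * fail with EINVAL (so perf cannot fall back to cpu-clock), while with the
 * fix it fails with ENOENT.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 4000;	/* sampling, like perf record/top */
	attr.exclude_kernel = 1;

	/* measure the calling thread on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		printf("perf_event_open: %s\n", strerror(errno));
	else
		printf("perf_event_open succeeded (hardware PMU present)\n");
	return 0;
}

On a machine with a working hardware PMU the open simply succeeds, so the
errno difference is only visible in the no-hardware-PMU case described above.
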
Thanks,
Koichiro
>
> Thanks,
> Dhananjay
>
> >
> > diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
> > index d3bb3865c1b1..4952faf03e82 100644
> > --- a/arch/x86/events/rapl.c
> > +++ b/arch/x86/events/rapl.c
> > @@ -370,6 +370,10 @@ static int rapl_pmu_event_init(struct perf_event *event)
> > unsigned int rapl_pmu_idx;
> > struct rapl_pmus *rapl_pmus;
> >
> > + /* only look at RAPL events */
> > + if (event->attr.type != event->pmu->type)
> > + return -ENOENT;
> > +
> > /* unsupported modes and filters */
> > if (event->attr.sample_period) /* no sampling */
> > return -EINVAL;
> > @@ -387,10 +391,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
> > rapl_pmus_scope = rapl_pmus->pmu.scope;
> >
> > if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
> > - /* only look at RAPL package events */
> > - if (event->attr.type != rapl_pmus_pkg->pmu.type)
> > - return -ENOENT;
> > -
> > cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
> > if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
> > return -EINVAL;
> > @@ -398,10 +398,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
> > bit = cfg - 1;
> > event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
> > } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
> > - /* only look at RAPL core events */
> > - if (event->attr.type != rapl_pmus_core->pmu.type)
> > - return -ENOENT;
> > -
> > cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
> > if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
> > return -EINVAL;
> >
> >>
> >> Thanks,
> >>
> >> -Koichiro
> >
>