lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <e50d57f1-ec74-47cb-9b65-ed64b7d99e2d@amd.com>
Date: Wed, 29 Jan 2025 11:04:06 +0530
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@....com>
To: Koichiro Den <koichiro.den@...onical.com>
Cc: peterz@...radead.org, mingo@...hat.com, rui.zhang@...el.com,
 irogers@...gle.com, kan.liang@...ux.intel.com, tglx@...utronix.de,
 bp@...en8.dei, gautham.shenoy@....com, kprateek.nayak@....com,
 ravi.bangoria@....com, x86@...nel.org, linux-perf-users@...r.kernel.org,
 linux-kernel@...r.kernel.org
Subject: Re: [PATCH v7 10/10] perf/x86/rapl: Add core energy counter support
 for AMD CPUs

On 1/29/2025 8:33 AM, Koichiro Den wrote:
> On Tue, Jan 28, 2025 at 01:41:51PM GMT, Dhananjay Ugwekar wrote:
>> On 1/20/2025 5:12 PM, Dhananjay Ugwekar wrote:
>>> On 1/15/2025 7:53 PM, Koichiro Den wrote:
>>>> On Mon, Jan 13, 2025 at 12:04:43PM GMT, Dhananjay Ugwekar wrote:
>>>>> On 1/12/2025 7:12 PM, Koichiro Den wrote:
>>>>>> On Fri, Nov 15, 2024 at 06:08:06AM GMT, Dhananjay Ugwekar wrote:
>>>>>>> Add a new "power_core" PMU and "energy-core" event for monitoring
>>>>>>> energy consumption by each individual core. The existing energy-cores
>>>>>>> event aggregates the energy consumption of CPU cores at the package level.
>>>>>>> This new event aligns with the AMD's per-core energy counters.
>>>>>>>
>>>>>>> Tested the package level and core level PMU counters with workloads
>>>>>>> pinned to different CPUs.
>>>>>>>
>>>>>>> Results with workload pinned to CPU 4 in core 4 on an AMD Zen4 Genoa
>>>>>>> machine:
>>>>>>>
>>>>>>> $ sudo perf stat --per-core -e power_core/energy-core/ -- taskset -c 4 stress-ng --matrix 1 --timeout 5s
>>>>>>> stress-ng: info:  [21250] setting to a 5 second run per stressor
>>>>>>> stress-ng: info:  [21250] dispatching hogs: 1 matrix
>>>>>>> stress-ng: info:  [21250] successful run completed in 5.00s
>>>>>>>
>>>>>>>  Performance counter stats for 'system wide':
>>>>>>>
>>>>>>> S0-D0-C0              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D0-C1              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D0-C2              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D0-C3              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D0-C4              1               8.43 Joules power_core/energy-core/
>>>>>>> S0-D0-C5              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D0-C6              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D0-C7              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D1-C8              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D1-C9              1               0.00 Joules power_core/energy-core/
>>>>>>> S0-D1-C10             1               0.00 Joules power_core/energy-core/
>>>>>>>
>>>>>>> Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@....com>
>>>>>>> Reviewed-by: Gautham R. Shenoy <gautham.shenoy@....com>
>>>>>>> ---
>>>>>>>  arch/x86/events/rapl.c | 185 +++++++++++++++++++++++++++++++++--------
>>>>>>>  1 file changed, 152 insertions(+), 33 deletions(-)
>>>>>>>
>>>>>>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>>>>>>> index 6e51386ff91f..e9be1f31163d 100644
>>>>>>> --- a/arch/x86/events/rapl.c
>>>>>>> +++ b/arch/x86/events/rapl.c
>>>>>>> @@ -39,6 +39,10 @@
>>>>>>>   *	  event: rapl_energy_psys
>>>>>>>   *    perf code: 0x5
>>>>>>>   *
>>>>>>> + *  core counter: consumption of a single physical core
>>>>>>> + *	  event: rapl_energy_core (power_core PMU)
>>>>>>> + *    perf code: 0x1
>>>>>>> + *
>>>>>>>   * We manage those counters as free running (read-only). They may be
>>>>>>>   * use simultaneously by other tools, such as turbostat.
>>>>>>>   *
>>>>>>> @@ -81,6 +85,10 @@ enum perf_rapl_pkg_events {
>>>>>>>  	NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
>>>>>>>  };
>>>>>>>  
>>>>>>> +#define PERF_RAPL_CORE			0		/* single core */
>>>>>>> +#define PERF_RAPL_CORE_EVENTS_MAX	1
>>>>>>> +#define NR_RAPL_CORE_DOMAINS		PERF_RAPL_CORE_EVENTS_MAX
>>>>>>> +
>>>>>>>  static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
>>>>>>>  	"pp0-core",
>>>>>>>  	"package",
>>>>>>> @@ -89,6 +97,8 @@ static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst
>>>>>>>  	"psys",
>>>>>>>  };
>>>>>>>  
>>>>>>> +static const char *const rapl_core_domain_name __initconst = "core";
>>>>>>> +
>>>>>>>  /*
>>>>>>>   * event code: LSB 8 bits, passed in attr->config
>>>>>>>   * any other bit is reserved
>>>>>>> @@ -141,14 +151,18 @@ enum rapl_unit_quirk {
>>>>>>>  
>>>>>>>  struct rapl_model {
>>>>>>>  	struct perf_msr *rapl_pkg_msrs;
>>>>>>> +	struct perf_msr *rapl_core_msrs;
>>>>>>>  	unsigned long	pkg_events;
>>>>>>> +	unsigned long	core_events;
>>>>>>>  	unsigned int	msr_power_unit;
>>>>>>>  	enum rapl_unit_quirk	unit_quirk;
>>>>>>>  };
>>>>>>>  
>>>>>>>   /* 1/2^hw_unit Joule */
>>>>>>>  static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
>>>>>>> +static int rapl_core_hw_unit __read_mostly;
>>>>>>>  static struct rapl_pmus *rapl_pmus_pkg;
>>>>>>> +static struct rapl_pmus *rapl_pmus_core;
>>>>>>>  static u64 rapl_timer_ms;
>>>>>>>  static struct rapl_model *rapl_model;
>>>>>>>  
>>>>>>> @@ -156,14 +170,23 @@ static struct rapl_model *rapl_model;
>>>>>>>   * Helper function to get the correct topology id according to the
>>>>>>>   * RAPL PMU scope.
>>>>>>>   */
>>>>>>> -static inline unsigned int get_rapl_pmu_idx(int cpu)
>>>>>>> -{	/*
>>>>>>> +static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
>>>>>>> +{
>>>>>>> +	/*
>>>>>>>  	 * Returns unsigned int, which converts the '-1' return value
>>>>>>>  	 * (for non-existent mappings in topology map) to UINT_MAX, so
>>>>>>>  	 * the error check in the caller is simplified.
>>>>>>>  	 */
>>>>>>> -	return rapl_pkg_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
>>>>>>> -					     topology_logical_die_id(cpu);
>>>>>>> +	switch (scope) {
>>>>>>> +	case PERF_PMU_SCOPE_PKG:
>>>>>>> +		return topology_logical_package_id(cpu);
>>>>>>> +	case PERF_PMU_SCOPE_DIE:
>>>>>>> +		return topology_logical_die_id(cpu);
>>>>>>> +	case PERF_PMU_SCOPE_CORE:
>>>>>>> +		return topology_logical_core_id(cpu);
>>>>>>> +	default:
>>>>>>> +		return -EINVAL;
>>>>>>> +	}
>>>>>>>  }
>>>>>>>  
>>>>>>>  static inline u64 rapl_read_counter(struct perf_event *event)
>>>>>>> @@ -173,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
>>>>>>>  	return raw;
>>>>>>>  }
>>>>>>>  
>>>>>>> -static inline u64 rapl_scale(u64 v, int cfg)
>>>>>>> +static inline u64 rapl_scale(u64 v, struct perf_event *event)
>>>>>>>  {
>>>>>>> -	if (cfg > NR_RAPL_PKG_DOMAINS) {
>>>>>>> -		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
>>>>>>> -		return v;
>>>>>>> -	}
>>>>>>> +	int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
>>>>>>> +
>>>>>>> +	if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
>>>>>>> +		hw_unit = rapl_core_hw_unit;
>>>>>>> +
>>>>>>>  	/*
>>>>>>>  	 * scale delta to smallest unit (1/2^32)
>>>>>>>  	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
>>>>>>>  	 * or use ldexp(count, -32).
>>>>>>>  	 * Watts = Joules/Time delta
>>>>>>>  	 */
>>>>>>> -	return v << (32 - rapl_pkg_hw_unit[cfg - 1]);
>>>>>>> +	return v << (32 - hw_unit);
>>>>>>>  }
>>>>>>>  
>>>>>>>  static u64 rapl_event_update(struct perf_event *event)
>>>>>>> @@ -212,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
>>>>>>>  	delta = (new_raw_count << shift) - (prev_raw_count << shift);
>>>>>>>  	delta >>= shift;
>>>>>>>  
>>>>>>> -	sdelta = rapl_scale(delta, event->hw.config);
>>>>>>> +	sdelta = rapl_scale(delta, event);
>>>>>>>  
>>>>>>>  	local64_add(sdelta, &event->count);
>>>>>>>  
>>>>>>> @@ -341,13 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
>>>>>>>  static int rapl_pmu_event_init(struct perf_event *event)
>>>>>>>  {
>>>>>>>  	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
>>>>>>> -	int bit, ret = 0;
>>>>>>> +	int bit, rapl_pmus_scope, ret = 0;
>>>>>>>  	struct rapl_pmu *rapl_pmu;
>>>>>>>  	unsigned int rapl_pmu_idx;
>>>>>>> +	struct rapl_pmus *rapl_pmus;
>>>>>>>  
>>>>>>> -	/* only look at RAPL events */
>>>>>>> -	if (event->attr.type != rapl_pmus_pkg->pmu.type)
>>>>>>> -		return -ENOENT;
>>>>>>> +	/* unsupported modes and filters */
>>>>>>> +	if (event->attr.sample_period) /* no sampling */
>>>>>>> +		return -EINVAL;
>>>>>>
>>>>>> Hi Dhananjay,
>>>>>>
>>>>>> On linux-next, since this commit, it seems that simple sampling with 'perf
>>>>>> record -- <command>' (i.e. the default event), 'perf top' etc. can
>>>>>> unexpectedly fail because rapl_pmu_event_init() now returns -EINVAL instead
>>>>>> of -ENOENT even in such cases of a type mismatch. I observed that this
>>>>>> prevents evsel__fallback() from falling back to cpu-clock or task-clock.
>>>>>>
>>>>>> Should we reorder the checks in rapl_pmu_event_init() to allow an early
>>>>>> return with -ENOENT in such cases, as shown below? I'm not very familiar
>>>>>> with this area and I might be missing something. I'd appreciate it if you
>>>>>> could share your thoughts.
>>>>>>
>>>>>> --- a/arch/x86/events/rapl.c
>>>>>> +++ b/arch/x86/events/rapl.c
>>>>>> @@ -370,17 +370,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>>>>         unsigned int rapl_pmu_idx;
>>>>>>         struct rapl_pmus *rapl_pmus;
>>>>>>
>>>>>> -       /* unsupported modes and filters */
>>>>>> -       if (event->attr.sample_period) /* no sampling */
>>>>>> -               return -EINVAL;
>>>>>> -
>>>>>> -       /* check only supported bits are set */
>>>>>> -       if (event->attr.config & ~RAPL_EVENT_MASK)
>>>>>> -               return -EINVAL;
>>>>>> -
>>>>>> -       if (event->cpu < 0)
>>>>>> -               return -EINVAL;
>>>>>> -
>>>>>>         rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
>>>>>>         if (!rapl_pmus)
>>>>>>                 return -EINVAL;
>>>>>> @@ -411,6 +400,17 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>>>>         } else
>>>>>>                 return -EINVAL;
>>>>>>
>>>>>> +       /* unsupported modes and filters */
>>>>>> +       if (event->attr.sample_period) /* no sampling */
>>>>>> +               return -EINVAL;
>>>>>> +
>>>>>> +       /* check only supported bits are set */
>>>>>> +       if (event->attr.config & ~RAPL_EVENT_MASK)
>>>>>> +               return -EINVAL;
>>>>>> +
>>>>>> +       if (event->cpu < 0)
>>>>>> +               return -EINVAL;
>>>>>> +
>>>>>>         /* check event supported */
>>>>>>         if (!(rapl_pmus->cntr_mask & (1 << bit)))
>>>>>>                 return -EINVAL;
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>> -Koichiro
>>>>>
>>>>> Hello Koichiro,
>>>>>
>>>>> I tried reproducing the issue you mentioned using "sudo perf record -- sleep 2" and 
>>>>> "sudo perf top" commands on an AMD EPYC system, the commands worked successfully. 
>>>>> Can you please mention which system and which exact commands you're 
>>>>> running that reproduced the issue?
>>>>>
>>>>> My analysis is, if we are running "perf record/top" with the default event, we would 
>>>>> not enter the rapl_pmu_event_init() function, which renders the reordering of the type 
>>>>> checks irrelevant. Regardless, please let me know how I can reproduce the issue.
>>>>>
>>>>> Thanks,
>>>>> Dhananjay
>>>>
>>>> Hi,
>>>>
>>>> Apologies for the delayed response, and thank you for your comment. I
>>>> confirmed that just running "perf top" on a qemu instance reproduces it.
>>>> The host CPU model is Intel Core i9-13900K, which is passed through to
>>>> the guest.
>>>>
>>>> In my case, no pmu for PERF_TYPE_RAW is registered, but the rapl pmu is
>>>> present. Then, perf_init_event() reaches the line marked "---->" below, and
>>>> rapl_pmu_event_init() run, which returns -EINVAL before the type check.
>>>>
>>>>   static struct pmu *perf_init_event(struct perf_event *event)
>>>>   {
>>>>           [...]
>>>>           if (pmu) {
>>>>                   [...]
>>>>                   goto unlock;
>>>>           }
>>>>     
>>>>           list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
>>>>   ---->           ret = perf_try_init_event(pmu, event);
>>>>                   if (!ret)
>>>>                           goto unlock;
>>>>     
>>>>                   if (ret != -ENOENT) {
>>>>                           pmu = ERR_PTR(ret);
>>>>                           goto unlock;
>>>>                   }
>>>>           }
>>>>
>>>> I'll look into this a bit more on my side later and get back to you if
>>>> something becomes clear.
>>>
>>> Sorry for the delayed response, can you please try the below diff and let me know 
>>> if it fixes the issue?
>>
>> Hello Koichiro,
>>
>> Gentle ping, please let me know once you try out the below fix.
> 
> Sorry to be late. Yes it works.
> Early return with -ENOENT when event->attr.type != event->pmu->type seems
> like a common approach used in other pmu implementations as well.

Great!, I will post this as a fix for the last patch.

Thanks for your help!

Regards,
Dhananjay

> 
> Thanks,
> Koichiro
> 
>>
>> Thanks,
>> Dhananjay
>>
>>>
>>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>>> index d3bb3865c1b1..4952faf03e82 100644
>>> --- a/arch/x86/events/rapl.c
>>> +++ b/arch/x86/events/rapl.c
>>> @@ -370,6 +370,10 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>         unsigned int rapl_pmu_idx;
>>>         struct rapl_pmus *rapl_pmus;
>>>
>>> +       /* only look at RAPL events */
>>> +       if (event->attr.type != event->pmu->type)
>>> +               return -ENOENT;
>>> +
>>>         /* unsupported modes and filters */
>>>         if (event->attr.sample_period) /* no sampling */
>>>                 return -EINVAL;
>>> @@ -387,10 +391,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>         rapl_pmus_scope = rapl_pmus->pmu.scope;
>>>
>>>         if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
>>> -               /* only look at RAPL package events */
>>> -               if (event->attr.type != rapl_pmus_pkg->pmu.type)
>>> -                       return -ENOENT;
>>> -
>>>                 cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
>>>                 if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
>>>                         return -EINVAL;
>>> @@ -398,10 +398,6 @@ static int rapl_pmu_event_init(struct perf_event *event)
>>>                 bit = cfg - 1;
>>>                 event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
>>>         } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
>>> -               /* only look at RAPL core events */
>>> -               if (event->attr.type != rapl_pmus_core->pmu.type)
>>> -                       return -ENOENT;
>>> -
>>>                 cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
>>>                 if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
>>>                         return -EINVAL;
>>>
>>>>
>>>> Thanks,
>>>>
>>>> -Koichiro
>>>
>>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ