linux-kernel - Re: [PATCH V2 2/3] powercap: intel_rapl: Introduce APIs for PMU support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJZ5v0hDdTS++iWLYDnoVotV3=e=Vn2di4EjUBJzHiNLzrbaig@mail.gmail.com>
Date: Tue, 16 Apr 2024 15:58:27 +0200
From: "Rafael J. Wysocki" <rafael@...nel.org>
To: Zhang Rui <rui.zhang@...el.com>
Cc: rafael.j.wysocki@...el.com, linux-kernel@...r.kernel.org, 
	linux-pm@...r.kernel.org, srinivas.pandruvada@...el.com
Subject: Re: [PATCH V2 2/3] powercap: intel_rapl: Introduce APIs for PMU support

On Mon, Apr 8, 2024 at 5:52 AM Zhang Rui <rui.zhang@...el.com> wrote:
>
> Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu().
>
> RAPL driver can invoke these APIs to expose its supported energy
> counters via perf PMU. The new RAPL PMU is fully compatible with current
> MSR RAPL PMU, including using the same PMU name and events
> name/id/unit/scale, etc.
>
> For example, use below command
>  perf stat -e power/energy-pkg/ -e power/energy-ram/ -e power/energy-cores/ FOO
> to get the energy consumption for events in the "perf list" output.
>
> This does not introduce any conflict because TPMI RAPL is the only user
> of these APIs currently, and it never co-exists with MSR RAPL.
>
> Note that, TPMI RAPL is probed dynamically, and the events supported by
> each TPMI RAPL device can be different. Thus, for a new RAPL Package
> device that wants PMU support,
> 1. if PMU has not been registered yet, register the PMU with events
>    supported by current RAPL Package device
> 2. if PMU has been registered and no new events support is needed, do
>    nothing
> 2. or else, unregister current PMU and re-register the PMU with events
>    supported by all probed RAPL Package devices that need PMU support.
>    For example, on a dual-package system using TPMI RAPL, it is possible
>    that Package 1 behaves as TPMI domain root and supports Psys domain.
>    In this case, register PMU without Psys event when probing Package 0,
>    and re-register the PMU with Psys event when probing Package 1.
>
> Signed-off-by: Zhang Rui <rui.zhang@...el.com>
> ---
>  drivers/powercap/intel_rapl_common.c | 537 +++++++++++++++++++++++++++
>  include/linux/intel_rapl.h           |  23 ++
>  2 files changed, 560 insertions(+)
>
> diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
> index 1f4a7aa12d77..f51012f023b0 100644
> --- a/drivers/powercap/intel_rapl_common.c
> +++ b/drivers/powercap/intel_rapl_common.c
> @@ -15,6 +15,8 @@
>  #include <linux/list.h>
>  #include <linux/log2.h>
>  #include <linux/module.h>
> +#include <linux/nospec.h>
> +#include <linux/perf_event.h>
>  #include <linux/platform_device.h>
>  #include <linux/powercap.h>
>  #include <linux/processor.h>
> @@ -1506,6 +1508,541 @@ static int rapl_detect_domains(struct rapl_package *rp)
>         return 0;
>  }
>
> +#ifdef CONFIG_PERF_EVENTS
> +
> +/* Support for RAPL PMU */

It would be good to add some details regarding the implementation etc.
to this comment.

> +
> +struct rapl_pmu {
> +       struct pmu pmu;
> +       u64 timer_ms;
> +       cpumask_t cpu_mask;
> +       unsigned long domain_map;       /* Events supported by all registered RAPL packages */
> +       bool registered;
> +};

The structure fields need to be documented.

> +
> +static struct rapl_pmu rapl_pmu;
> +
> +/* PMU helpers */

A blank like here?

> +static int get_pmu_cpu(struct rapl_package *rp)
> +{
> +       int cpu;
> +
> +       if (!rp->has_pmu)
> +               return nr_cpu_ids;
> +
> +       if (rp->lead_cpu >= 0)
> +               return rp->lead_cpu;
> +
> +       /* TPMI RAPL uses any CPU in the package for PMU */
> +       for_each_online_cpu(cpu)
> +               if (topology_physical_package_id(cpu) == rp->id)
> +                       return cpu;
> +
> +       return nr_cpu_ids;
> +}
> +
> +static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
> +{
> +       if (!rp->has_pmu)
> +               return false;
> +
> +       if (rp->lead_cpu >= 0)
> +               return cpu == rp->lead_cpu;

So if the given CPU is not the lead CPU, but it is located in the same
package as the lead CPU, the function will return 'false'.  TBH, this
is somewhat confusing.

> +
> +       /* TPMI RAPL uses any CPU in the package for PMU */
> +       return topology_physical_package_id(cpu) == rp->id;
> +}
> +
> +static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
> +{
> +       struct rapl_package *rp = event->pmu_private;
> +
> +       return &rp->pmu_data;
> +}
> +
> +/* PMU event callbacks */
> +
> +/* Return 0 for unsupported events or failed read */

Can you put this comment into the function body?

> +static u64 event_read_counter(struct perf_event *event)
> +{
> +       struct rapl_package *rp = event->pmu_private;
> +       u64 val;
> +       int ret;
> +
> +       if (event->hw.idx < 0)
> +               return 0;
> +
> +       ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val);
> +       return ret ? 0 : val;

I would prefer

if (ret)
    return 0;

return val;

> +}
> +
> +static void __rapl_pmu_event_start(struct perf_event *event)
> +{
> +       struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> +
> +       if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
> +               return;
> +
> +       event->hw.state = 0;
> +
> +       list_add_tail(&event->active_entry, &data->active_list);
> +
> +       local64_set(&event->hw.prev_count, event_read_counter(event));
> +       if (++data->n_active == 1)
> +               hrtimer_start(&data->hrtimer, data->timer_interval,
> +                             HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static void rapl_pmu_event_start(struct perf_event *event, int mode)
> +{
> +       struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> +       unsigned long flags;
> +
> +       raw_spin_lock_irqsave(&data->lock, flags);
> +       __rapl_pmu_event_start(event);
> +       raw_spin_unlock_irqrestore(&data->lock, flags);

Why does it need to be raw_spin_lock_?

What exactly is protected by data->lock?

> +}
> +
> +static u64 rapl_event_update(struct perf_event *event)
> +{
> +       struct hw_perf_event *hwc = &event->hw;
> +       struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> +       u64 prev_raw_count, new_raw_count;
> +       s64 delta, sdelta;
> +
> +again:
> +       prev_raw_count = local64_read(&hwc->prev_count);
> +       new_raw_count = event_read_counter(event);
> +
> +       if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
> +                       new_raw_count) != prev_raw_count)
> +               goto again;

It would be somewhat easier to follow if an additional u64 variable were used:

do {
      prev_raw_count = local64_read(&hwc->prev_count);
      new_raw_count = event_read_counter(event);
      tmp = (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count);
} while (tmp != prev_raw_count);

> +
> +       /*
> +        * Now we have the new raw value and have updated the prev
> +        * timestamp already. We can now calculate the elapsed delta
> +        * (event-)time and add that to the generic event.
> +        */
> +       delta = new_raw_count - prev_raw_count;
> +
> +       /*
> +        * Scale delta to smallest unit (2^-32)
> +        * users must then scale back: count * 1/(1e9*2^32) to get Joules
> +        * or use ldexp(count, -32).
> +        * Watts = Joules/Time delta
> +        */
> +       sdelta = delta * data->scale[event->hw.flags];
> +
> +       local64_add(sdelta, &event->count);
> +
> +       return new_raw_count;
> +}
> +
> +static void rapl_pmu_event_stop(struct perf_event *event, int mode)
> +{
> +       struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> +       struct hw_perf_event *hwc = &event->hw;
> +       unsigned long flags;
> +
> +       raw_spin_lock_irqsave(&data->lock, flags);
> +
> +       /* Mark event as deactivated and stopped */
> +       if (!(hwc->state & PERF_HES_STOPPED)) {
> +               WARN_ON_ONCE(data->n_active <= 0);
> +               if (--data->n_active == 0)
> +                       hrtimer_cancel(&data->hrtimer);
> +
> +               list_del(&event->active_entry);
> +
> +               WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
> +               hwc->state |= PERF_HES_STOPPED;
> +       }
> +
> +       /* Check if update of sw counter is necessary */
> +       if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
> +               /*
> +                * Drain the remaining delta count out of a event
> +                * that we are disabling:
> +                */
> +               rapl_event_update(event);
> +               hwc->state |= PERF_HES_UPTODATE;
> +       }
> +
> +       raw_spin_unlock_irqrestore(&data->lock, flags);
> +}
> +
> +static int rapl_pmu_event_add(struct perf_event *event, int mode)
> +{
> +       struct rapl_package_pmu_data *data = event_to_pmu_data(event);
> +       struct hw_perf_event *hwc = &event->hw;
> +       unsigned long flags;
> +
> +       raw_spin_lock_irqsave(&data->lock, flags);
> +
> +       hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
> +
> +       if (mode & PERF_EF_START)
> +               __rapl_pmu_event_start(event);
> +
> +       raw_spin_unlock_irqrestore(&data->lock, flags);
> +
> +       return 0;
> +}
> +
> +static void rapl_pmu_event_del(struct perf_event *event, int flags)
> +{
> +       rapl_pmu_event_stop(event, PERF_EF_UPDATE);
> +}
> +
> +/*
> + * RAPL energy status counters
> + */
> +enum perf_rapl_events {
> +       PERF_RAPL_PP0 = 0,      /* all cores */
> +       PERF_RAPL_PKG,          /* entire package */
> +       PERF_RAPL_RAM,          /* DRAM */
> +       PERF_RAPL_PP1,          /* gpu */
> +       PERF_RAPL_PSYS,         /* psys */
> +};
> +#define RAPL_EVENT_MASK GENMASK(7, 0)
> +
> +static int event_to_domain(int event)
> +{
> +       switch (event) {
> +       case PERF_RAPL_PP0:
> +               return RAPL_DOMAIN_PP0;
> +       case PERF_RAPL_PKG:
> +               return RAPL_DOMAIN_PACKAGE;
> +       case PERF_RAPL_RAM:
> +               return RAPL_DOMAIN_DRAM;
> +       case PERF_RAPL_PP1:
> +               return RAPL_DOMAIN_PP1;
> +       case PERF_RAPL_PSYS:
> +               return RAPL_DOMAIN_PLATFORM;
> +       default:
> +               return RAPL_DOMAIN_MAX;
> +       }
> +}

IMV it would be somewhat cleaner to use an array instead of this
wrapper around switch ().

> +
> +static int rapl_pmu_event_init(struct perf_event *event)
> +{
> +       struct rapl_package *pos, *rp = NULL;
> +       u64 cfg = event->attr.config & RAPL_EVENT_MASK;
> +       int domain, idx;
> +
> +       /* Only look at RAPL events */
> +       if (event->attr.type != event->pmu->type)
> +               return -ENOENT;
> +
> +       /* Check only supported bits are set */
> +       if (event->attr.config & ~RAPL_EVENT_MASK)
> +               return -EINVAL;
> +
> +       if (event->cpu < 0)
> +               return -EINVAL;
> +
> +       /* Find out which Package the event belongs to */
> +       list_for_each_entry(pos, &rapl_packages, plist) {
> +               if (is_rp_pmu_cpu(pos, event->cpu)) {
> +                       rp = pos;
> +                       break;
> +               }
> +       }
> +       if (!rp)
> +               return -ENODEV;
> +
> +       /* Find out which RAPL Domain the event belongs to */
> +       domain = event_to_domain(cfg - 1);
> +       if (domain >= RAPL_DOMAIN_MAX)
> +               return -EINVAL;
> +
> +       event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
> +       event->pmu_private = rp;        /* Which package */
> +       event->hw.flags = domain;       /* Which domain */
> +
> +       event->hw.idx = -1;
> +       /* Find out the index in rp->domains[] to get domain pointer */
> +       for (idx = 0; idx < rp->nr_domains; idx++) {
> +               if (rp->domains[idx].id == domain) {
> +                       event->hw.idx = idx;
> +                       break;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static void rapl_pmu_event_read(struct perf_event *event)
> +{
> +       rapl_event_update(event);
> +}
> +
> +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
> +{
> +       struct rapl_package_pmu_data *data =
> +               container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
> +       struct perf_event *event;
> +       unsigned long flags;
> +
> +       if (!data->n_active)
> +               return HRTIMER_NORESTART;
> +
> +       raw_spin_lock_irqsave(&data->lock, flags);
> +
> +       list_for_each_entry(event, &data->active_list, active_entry)
> +               rapl_event_update(event);
> +
> +       raw_spin_unlock_irqrestore(&data->lock, flags);
> +
> +       hrtimer_forward_now(hrtimer, data->timer_interval);
> +
> +       return HRTIMER_RESTART;
> +}
> +
> +/* PMU sysfs attributes */
> +
> +/*
> + * There are no default events, but we need to create "events" group (with
> + * empty attrs) before updating it with detected events.
> + */
> +static struct attribute *attrs_empty[] = {
> +       NULL,
> +};
> +
> +static struct attribute_group pmu_events_group = {
> +       .name = "events",
> +       .attrs = attrs_empty,
> +};
> +
> +static ssize_t cpumask_show(struct device *dev,
> +                           struct device_attribute *attr, char *buf)
> +{
> +       struct rapl_package *rp;
> +       int cpu;
> +
> +       cpus_read_lock();

Is rapl_packages protected by this?

> +       cpumask_clear(&rapl_pmu.cpu_mask);

It doesn't look like rapl_pmu.cpu_mask is used outside this function,
so why is it global?

> +
> +       /* Choose a cpu for each RAPL Package */
> +       list_for_each_entry(rp, &rapl_packages, plist) {
> +               cpu = get_pmu_cpu(rp);
> +               if (cpu < nr_cpu_ids)
> +                       cpumask_set_cpu(cpu, &rapl_pmu.cpu_mask);
> +       }
> +       cpus_read_unlock();
> +
> +       return cpumap_print_to_pagebuf(true, buf, &rapl_pmu.cpu_mask);
> +}
> +
> +static DEVICE_ATTR_RO(cpumask);
> +
> +static struct attribute *pmu_cpumask_attrs[] = {
> +       &dev_attr_cpumask.attr,
> +       NULL
> +};
> +
> +static struct attribute_group pmu_cpumask_group = {
> +       .attrs = pmu_cpumask_attrs,
> +};
> +
> +PMU_FORMAT_ATTR(event, "config:0-7");
> +static struct attribute *pmu_format_attr[] = {
> +       &format_attr_event.attr,
> +       NULL
> +};
> +
> +static struct attribute_group pmu_format_group = {
> +       .name = "format",
> +       .attrs = pmu_format_attr,
> +};
> +
> +static const struct attribute_group *pmu_attr_groups[] = {
> +       &pmu_events_group,
> +       &pmu_cpumask_group,
> +       &pmu_format_group,
> +       NULL
> +};
> +
> +#define RAPL_EVENT_ATTR_STR(_name, v, str)                                     \
> +static struct perf_pmu_events_attr event_attr_##v = {                          \
> +       .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
> +       .event_str      = str,                                                  \
> +}
> +
> +RAPL_EVENT_ATTR_STR(energy-cores,      rapl_cores,     "event=0x01");
> +RAPL_EVENT_ATTR_STR(energy-pkg,                rapl_pkg,       "event=0x02");
> +RAPL_EVENT_ATTR_STR(energy-ram,                rapl_ram,       "event=0x03");
> +RAPL_EVENT_ATTR_STR(energy-gpu,                rapl_gpu,       "event=0x04");
> +RAPL_EVENT_ATTR_STR(energy-psys,       rapl_psys,      "event=0x05");
> +
> +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores,        "Joules");
> +RAPL_EVENT_ATTR_STR(energy-pkg.unit,   rapl_unit_pkg,          "Joules");
> +RAPL_EVENT_ATTR_STR(energy-ram.unit,   rapl_unit_ram,          "Joules");
> +RAPL_EVENT_ATTR_STR(energy-gpu.unit,   rapl_unit_gpu,          "Joules");
> +RAPL_EVENT_ATTR_STR(energy-psys.unit,  rapl_unit_psys,         "Joules");
> +
> +RAPL_EVENT_ATTR_STR(energy-cores.scale,        rapl_scale_cores,       "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-pkg.scale,  rapl_scale_pkg,         "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-ram.scale,  rapl_scale_ram,         "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-gpu.scale,  rapl_scale_gpu,         "2.3283064365386962890625e-10");
> +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys,        "2.3283064365386962890625e-10");
> +
> +#define RAPL_EVENT_GROUP(_name, domain)                        \
> +static struct attribute *pmu_attr_##_name[] = {                \
> +       &event_attr_rapl_##_name.attr.attr,             \
> +       &event_attr_rapl_unit_##_name.attr.attr,        \
> +       &event_attr_rapl_scale_##_name.attr.attr,       \
> +       NULL                                            \
> +};                                                     \
> +static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event)     \
> +{                                                                                      \
> +       return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0;      \
> +}                                                      \
> +static struct attribute_group pmu_group_##_name = {    \
> +       .name  = "events",                              \
> +       .attrs = pmu_attr_##_name,                      \
> +       .is_visible = is_visible_##_name,               \
> +}
> +
> +RAPL_EVENT_GROUP(cores,        RAPL_DOMAIN_PP0);
> +RAPL_EVENT_GROUP(pkg,  RAPL_DOMAIN_PACKAGE);
> +RAPL_EVENT_GROUP(ram,  RAPL_DOMAIN_DRAM);
> +RAPL_EVENT_GROUP(gpu,  RAPL_DOMAIN_PP1);
> +RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM);
> +
> +static const struct attribute_group *pmu_attr_update[] = {
> +       &pmu_group_cores,
> +       &pmu_group_pkg,
> +       &pmu_group_ram,
> +       &pmu_group_gpu,
> +       &pmu_group_psys,
> +       NULL
> +};
> +
> +static int rapl_pmu_update(struct rapl_package *rp)
> +{
> +       int ret;
> +
> +       /* Return if PMU already covers all events supported by current RAPL Package */
> +       if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
> +               return 0;
> +
> +       /* Unregister previous registered PMU */
> +       if (rapl_pmu.registered) {
> +               perf_pmu_unregister(&rapl_pmu.pmu);
> +               memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> +       }
> +
> +       rapl_pmu.domain_map |= rp->domain_map;
> +
> +       memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
> +       rapl_pmu.pmu.attr_groups = pmu_attr_groups;
> +       rapl_pmu.pmu.attr_update = pmu_attr_update;
> +       rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
> +       rapl_pmu.pmu.event_init = rapl_pmu_event_init;
> +       rapl_pmu.pmu.add = rapl_pmu_event_add;
> +       rapl_pmu.pmu.del = rapl_pmu_event_del;
> +       rapl_pmu.pmu.start = rapl_pmu_event_start;
> +       rapl_pmu.pmu.stop = rapl_pmu_event_stop;
> +       rapl_pmu.pmu.read = rapl_pmu_event_read;
> +       rapl_pmu.pmu.module = THIS_MODULE;
> +       rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
> +       ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
> +       if (ret)
> +               pr_warn("Failed to register PMU\n");
> +
> +       rapl_pmu.registered = !ret;

Why don't you set rp->has_pmu here?

> +
> +       return ret;

It looks like this could be rearranged overall for more clarity:

ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
if (ret) {
        pr_warn("Failed to register PMU\n");
        return ret;
}

rapl_pmu.registered = true;
rp->has_pmu = true;

return 0;

Also, the "Failed to register PMU\n" message is not particularly
useful AFAICS.  It would be good to add some more context to it and
maybe make it pr_info()?

> +}
> +
> +int rapl_package_add_pmu(struct rapl_package *rp)
> +{
> +       struct rapl_package_pmu_data *data = &rp->pmu_data;
> +       int idx;
> +       int ret;
> +
> +       if (rp->has_pmu)
> +               return -EEXIST;
> +
> +       guard(cpus_read_lock)();

Why does this lock need to be held around the entire code below?

> +
> +       for (idx = 0; idx < rp->nr_domains; idx++) {
> +               struct rapl_domain *rd = &rp->domains[idx];
> +               int domain = rd->id;
> +               u64 val;
> +
> +               if (!test_bit(domain, &rp->domain_map))
> +                       continue;
> +
> +               /*
> +                * The RAPL PMU granularity is 2^-32 Joules
> +                * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
> +                */
> +               val = rd->energy_unit * (1ULL << 32);
> +               do_div(val, ENERGY_UNIT_SCALE * 1000000);
> +               data->scale[domain] = val;
> +
> +               if (!rapl_pmu.timer_ms) {
> +                       struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);
> +
> +                       /*
> +                        * Calculate the timer rate:
> +                        * Use reference of 200W for scaling the timeout to avoid counter
> +                        * overflows.
> +                        *
> +                        * max_count = rpi->mask >> rpi->shift + 1
> +                        * max_energy_pj = max_count * rd->energy_unit
> +                        * max_time_sec = (max_energy_pj / 1000000000) / 200w
> +                        *
> +                        * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
> +                        */
> +                       val = (rpi->mask >> rpi->shift) + 1;
> +                       val *= rd->energy_unit;
> +                       do_div(val, 1000000 * 200 * 2);
> +                       rapl_pmu.timer_ms = val;
> +
> +                       pr_info("%llu ms ovfl timer\n", rapl_pmu.timer_ms);

s/ovfl/overflow/

And use pr_debug()?

> +               }
> +
> +               pr_info("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);

pr_debug() here too?

> +       }
> +
> +       /* Initialize per package PMU data */
> +       raw_spin_lock_init(&data->lock);
> +       INIT_LIST_HEAD(&data->active_list);
> +       data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
> +       hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> +       data->hrtimer.function = rapl_hrtimer_handle;
> +
> +       ret = rapl_pmu_update(rp);
> +       if (!ret)
> +               rp->has_pmu = true;
> +       return ret;
> +}
> +EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
> +
> +void rapl_package_remove_pmu(struct rapl_package *rp)
> +{
> +       struct rapl_package *pos;
> +
> +       if (!rp->has_pmu)
> +               return;
> +
> +       guard(cpus_read_lock)();
> +
> +       list_for_each_entry(pos, &rapl_packages, plist) {

rp can be used for walking this list, can't it?

> +               /* PMU is still needed */
> +               if (pos->has_pmu)
> +                       return;
> +       }
> +
> +       perf_pmu_unregister(&rapl_pmu.pmu);
> +       memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
> +}
> +EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
> +#endif
> +
>  /* called from CPU hotplug notifier, hotplug lock held */
>  void rapl_remove_package_cpuslocked(struct rapl_package *rp)
>  {
> diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
> index f3196f82fd8a..38e43dbb3dac 100644
> --- a/include/linux/intel_rapl.h
> +++ b/include/linux/intel_rapl.h
> @@ -158,6 +158,17 @@ struct rapl_if_priv {
>         void *rpi;
>  };
>
> +#ifdef CONFIG_PERF_EVENTS

Structure fields below need to be documented.

> +struct rapl_package_pmu_data {
> +       u64 scale[RAPL_DOMAIN_MAX];
> +       raw_spinlock_t lock;
> +       int n_active;
> +       struct list_head active_list;
> +       ktime_t timer_interval;
> +       struct hrtimer hrtimer;
> +};
> +#endif
> +
>  /* maximum rapl package domain name: package-%d-die-%d */
>  #define PACKAGE_DOMAIN_NAME_LENGTH 30
>
> @@ -176,6 +187,10 @@ struct rapl_package {
>         struct cpumask cpumask;
>         char name[PACKAGE_DOMAIN_NAME_LENGTH];
>         struct rapl_if_priv *priv;
> +#ifdef CONFIG_PERF_EVENTS
> +       bool has_pmu;
> +       struct rapl_package_pmu_data pmu_data;
> +#endif
>  };
>
>  struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
> @@ -188,4 +203,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv,
>  struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu);
>  void rapl_remove_package(struct rapl_package *rp);
>
> +#ifdef CONFIG_PERF_EVENTS
> +int rapl_package_add_pmu(struct rapl_package *rp);
> +void rapl_package_remove_pmu(struct rapl_package *rp);
> +#else
> +static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; }
> +static inline void rapl_package_remove_pmu(struct rapl_package *rp) { }
> +#endif
> +
>  #endif /* __INTEL_RAPL_H__ */
> --

Thanks!