[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0480a4a6-4b85-408f-8248-c7d28500b13e@intel.com>
Date: Tue, 2 Dec 2025 08:31:44 -0800
From: Reinette Chatre <reinette.chatre@...el.com>
To: Tony Luck <tony.luck@...el.com>, Fenghua Yu <fenghuay@...dia.com>, "Maciej
Wieczor-Retman" <maciej.wieczor-retman@...el.com>, Peter Newman
<peternewman@...gle.com>, James Morse <james.morse@....com>, Babu Moger
<babu.moger@....com>, Drew Fustini <dfustini@...libre.com>, Dave Martin
<Dave.Martin@....com>, Chen Yu <yu.c.chen@...el.com>
CC: <x86@...nel.org>, <linux-kernel@...r.kernel.org>,
<patches@...ts.linux.dev>
Subject: Re: [PATCH v14 25/32] x86/resctrl: Handle number of RMIDs supported
by RDT_RESOURCE_PERF_PKG
Hi Tony,
On 11/24/25 10:54 AM, Tony Luck wrote:
> There are now three meanings for "number of RMIDs":
>
> 1) The number for legacy features enumerated by CPUID leaf 0xF. This
> is the maximum number of distinct values that can be loaded into
> MSR_IA32_PQR_ASSOC. Note that systems with Sub-NUMA Cluster mode enabled
> will force scaling down the CPUID enumerated value by the number of SNC
> nodes per L3-cache.
Please check line lengths.
>
> 2) The number of registers in MMIO space for each event. This is enumerated
> in the XML files and is the value initialized into event_group::num_rmid.
>
> 3) The number of "hardware counters" (this isn't a strictly accurate
> description of how things work, but serves as a useful analogy that
> does describe the limitations) feeding to those MMIO registers. This
> is enumerated in telemetry_region::num_rmids returned from the call to
"from the call to" -> "by" ?
> intel_pmt_get_regions_by_feature()
>
> Event groups with insufficient "hardware counters" to track all RMIDs are
> difficult for users to use, since the system may reassign "hardware counters"
> at any time. This means that users cannot reliably collect two consecutive
> event counts to compute the rate at which events are occurring.
Please ensure changelogs remain coherent. Dropping the paragraph that describes
how under-resourced event groups are disabled may reflect new implementation but it
makes the changelog difficult to understand since the part that mentions how
a user can request to enable a disabled event group remains.
Even so, can the arch not still be expected to disable an under-resourced event group?
(more below)
>
> Limit an under-resourced event group's number of possible monitor resource
> groups to the lowest number of "hardware counters" if the user explicitly
> requests to enable it.
>
> Scan all enabled event groups and assign the RDT_RESOURCE_PERF_PKG resource
> "num_rmid" value to the smallest of these values as this value will be used
> later to compare against the number of RMIDs supported by other resources
> to determine how many monitoring resource groups are supported.
>
> N.B. Change type of resctrl_mon::num_rmid to u32 to match its usage and
> the type of event_group::num_rmid so that min(r->num_rmid, e->num_rmid)
> won't complain about mixing signed and unsigned types.
>
> Signed-off-by: Tony Luck <tony.luck@...el.com>
> ---
> include/linux/resctrl.h | 2 +-
> arch/x86/kernel/cpu/resctrl/intel_aet.c | 58 ++++++++++++++++++++++++-
> fs/resctrl/rdtgroup.c | 2 +-
> 3 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 14126d228e61..8623e450619a 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -295,7 +295,7 @@ enum resctrl_schema_fmt {
> * events of monitor groups created via mkdir.
> */
> struct resctrl_mon {
> - int num_rmid;
> + u32 num_rmid;
> unsigned int mbm_cfg_mask;
> int num_mbm_cntrs;
> bool mbm_cntr_assignable;
> diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> index 50c8b4c50790..6eff606541ad 100644
> --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c
> +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> @@ -25,6 +25,7 @@
> #include <linux/intel_pmt_features.h>
> #include <linux/intel_vsec.h>
> #include <linux/io.h>
> +#include <linux/minmax.h>
> #include <linux/overflow.h>
> #include <linux/printk.h>
> #include <linux/rculist.h>
> @@ -68,6 +69,10 @@ struct pmt_event {
> * @force_on: True when "rdt" command line overrides disable of
> * this @guid due to insufficient @num_rmid.
> * @guid: Unique number per XML description file.
> + * @num_rmid: Number of RMIDs supported by this group. May be
> + * adjusted downwards if enumeration from
> + * intel_pmt_get_regions_by_feature() indicates fewer
> + * RMIDs can be tracked simultaneously.
> * @mmio_size: Number of bytes of MMIO registers for this group.
> * @num_events: Number of events in this group.
> * @evts: Array of event descriptors.
> @@ -81,6 +86,7 @@ struct event_group {
>
> /* Remaining fields initialized from XML file. */
> u32 guid;
> + u32 num_rmid;
> size_t mmio_size;
> unsigned int num_events;
> struct pmt_event evts[] __counted_by(num_events);
> @@ -97,6 +103,7 @@ static struct event_group energy_0x26696143 = {
> .feature = FEATURE_PER_RMID_ENERGY_TELEM,
> .name = "energy",
> .guid = 0x26696143,
> + .num_rmid = 576,
> .mmio_size = XML_MMIO_SIZE(576, 2, 3),
> .num_events = 2,
> .evts = {
> @@ -113,6 +120,7 @@ static struct event_group perf_0x26557651 = {
> .feature = FEATURE_PER_RMID_PERF_TELEM,
> .name = "perf",
> .guid = 0x26557651,
> + .num_rmid = 576,
> .mmio_size = XML_MMIO_SIZE(576, 7, 3),
> .num_events = 7,
> .evts = {
> @@ -208,8 +216,25 @@ static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_g
> return usable_regions;
> }
>
> +static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p)
> +{
> + struct telemetry_region *tr;
> + bool ret = true;
> +
> + for (int i = 0; i < p->count; i++) {
> + if (!p->regions[i].addr)
> + continue;
> + tr = &p->regions[i];
> + if (tr->num_rmids < e->num_rmid)
> + ret = false;
e->force_off can be set here to ensure that the event group's state accurately reflects that
it is force disabled.
> + }
> +
> + return ret;
> +}
> +
> static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
> {
> + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl;
> int skipped_events = 0;
>
> if (e->force_off)
> @@ -218,13 +243,44 @@ static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
> if (!group_has_usable_regions(e, p))
> return false;
>
> + /*
> + * Only enable feature with insufficient RMIDs if the user
> + * requested it from the kernel command line.
(line length)
> + */
> + if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) {
> + pr_info("Feature %s:0x%x not enabled due to insufficient RMIDs\n",
The generic kernel parameter terminology of "feature" does not seem appropriate here. This can be
made specific with, for example, "%s %s:0x%x monitoring not enabled ...", (added %s for the resource
name) that matches the terms used in "detected" messages added in patch #29. Open to ideas.
> + e->name, e->guid);
> + return false;
> + }
> +
> + for (int i = 0; i < p->count; i++) {
> + if (!p->regions[i].addr)
> + continue;
> + /*
> + * e->num_rmid only adjusted lower if user (via rdt= kernel
> + * parameter) forces an event group with insufficient RMID
> + * to be enabled.
> + */
> + e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids);
> + }
> +
> for (int j = 0; j < e->num_events; j++) {
> if (!resctrl_enable_mon_event(e->evts[j].id, true,
> e->evts[j].bin_bits, &e->evts[j]))
> skipped_events++;
> }
>
> - return skipped_events < e->num_events;
> + if (e->num_events == skipped_events) {
> + pr_info("No events enabled in %s %s:0x%x\n", r->name, e->name, e->guid);
> + return false;
> + }
Unrelated change.
> +
> + if (r->mon.num_rmid)
> + r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid);
> + else
> + r->mon.num_rmid = e->num_rmid;
> +
> + return true;
> }
>
> /*
> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
> index ac3c6e44b7c5..60ce2390723e 100644
> --- a/fs/resctrl/rdtgroup.c
> +++ b/fs/resctrl/rdtgroup.c
> @@ -1157,7 +1157,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of,
> {
> struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
>
> - seq_printf(seq, "%d\n", r->mon.num_rmid);
> + seq_printf(seq, "%u\n", r->mon.num_rmid);
>
> return 0;
> }
Reinette
Powered by blists - more mailing lists