[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <563b1a69-8e4d-4c49-ac46-f5b845452a6a@linux.intel.com>
Date: Wed, 20 Aug 2025 11:08:27 -0700
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>, peterz@...radead.org,
mingo@...hat.com, acme@...nel.org, namhyung@...nel.org, tglx@...utronix.de,
dave.hansen@...ux.intel.com, irogers@...gle.com, adrian.hunter@...el.com,
jolsa@...nel.org, alexander.shishkin@...ux.intel.com,
linux-kernel@...r.kernel.org
Cc: ak@...ux.intel.com, zide.chen@...el.com, mark.rutland@....com,
broonie@...nel.org, ravi.bangoria@....com, eranian@...gle.com
Subject: Re: [PATCH V3 06/17] perf: Support SIMD registers
On 2025-08-20 2:55 a.m., Mi, Dapeng wrote:
>
> On 8/16/2025 5:34 AM, kan.liang@...ux.intel.com wrote:
>> From: Kan Liang <kan.liang@...ux.intel.com>
>>
>> The users may be interested in the SIMD registers in a sample while
>> profiling. The current sample_regs_XXX doesn't have enough space for all
>> SIMD registers.
>>
>> Add sets of the sample_simd_{pred,vec}_reg_* in the
>> struct perf_event_attr to define a set of SIMD registers to dump on
>> samples.
>> The current X86 supports the XMM registers in sample_regs_XXX. To
>> utilize the new SIMD registers configuration method, the
>> sample_simd_regs_enabled should always be set. If so, the XMM space in
>> the sample_regs_XXX is reserved for other usage.
>>
>> The SIMD registers are wider than 64 bits. A new output format is introduced.
>> The number and width of SIMD registers will be dumped first, following
>> the register values. The number and width are the same as the user's
>> configuration now. If, for some reason (e.g., ARM) they are different,
>> an ARCH-specific perf_output_sample_simd_regs can be implemented later
>> separately.
>> Add a new ABI, PERF_SAMPLE_REGS_ABI_SIMD, to indicate the new format.
>> The enum perf_sample_regs_abi becomes a bitmap now. There should be no
>> impact on the existing tool, since the version and bitmap are the same
>> for 1 and 2.
>>
>> Add three new __weak functions to retrieve the number of available
>> registers, validate the configuration of the SIMD registers, and
>> retrieve the SIMD registers. The ARCH-specific functions will be
>> implemented in the following patches.
>>
>> Add a new flag PERF_PMU_CAP_SIMD_REGS to indicate that the PMU has the
>> capability to support SIMD registers dumping. Error out if the
>> sample_simd_{pred,vec}_reg_* are mistakenly set for a PMU that doesn't have
>> the capability.
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@...radead.org>
>> Signed-off-by: Kan Liang <kan.liang@...ux.intel.com>
>> ---
>> include/linux/perf_event.h | 13 ++++
>> include/linux/perf_regs.h | 9 +++
>> include/uapi/linux/perf_event.h | 47 +++++++++++++--
>> kernel/events/core.c | 101 +++++++++++++++++++++++++++++++-
>> 4 files changed, 162 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 444b162f3f92..205361b7de2e 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>> #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
>> #define PERF_PMU_CAP_AUX_PAUSE 0x0200
>> #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400
>> +#define PERF_PMU_CAP_SIMD_REGS 0x0800
>>
>> /**
>> * pmu::scope
>> @@ -1526,6 +1527,18 @@ perf_event__output_id_sample(struct perf_event *event,
>> extern void
>> perf_log_lost_samples(struct perf_event *event, u64 lost);
>>
>> +static inline bool event_has_simd_regs(struct perf_event *event)
>> +{
>> + struct perf_event_attr *attr = &event->attr;
>> +
>> + return attr->sample_simd_regs_enabled != 0 ||
>> + attr->sample_simd_pred_reg_intr != 0 ||
>> + attr->sample_simd_pred_reg_user != 0 ||
>> + attr->sample_simd_vec_reg_qwords != 0 ||
>> + attr->sample_simd_vec_reg_intr != 0 ||
>> + attr->sample_simd_vec_reg_user != 0;
>> +}
>> +
>> static inline bool event_has_extended_regs(struct perf_event *event)
>> {
>> struct perf_event_attr *attr = &event->attr;
>> diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
>> index f632c5725f16..0172682b18fd 100644
>> --- a/include/linux/perf_regs.h
>> +++ b/include/linux/perf_regs.h
>> @@ -9,6 +9,15 @@ struct perf_regs {
>> struct pt_regs *regs;
>> };
>>
>> +int perf_simd_reg_validate(u16 vec_qwords, u64 vec_mask,
>> + u16 pred_qwords, u32 pred_mask);
>> +u64 perf_simd_reg_value(struct pt_regs *regs, int idx,
>> + u16 qwords_idx, bool pred);
>> +void perf_simd_reg_check(struct pt_regs *regs,
>> + u64 mask, u16 *nr_vectors, u16 *vec_qwords,
>> + u16 pred_mask, u16 *nr_pred, u16 *pred_qwords);
>> +
>> +
>> #ifdef CONFIG_HAVE_PERF_REGS
>> #include <asm/perf_regs.h>
>>
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 78a362b80027..2e9b16acbed6 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -313,9 +313,10 @@ enum {
>> * Values to determine ABI of the registers dump.
>> */
>> enum perf_sample_regs_abi {
>> - PERF_SAMPLE_REGS_ABI_NONE = 0,
>> - PERF_SAMPLE_REGS_ABI_32 = 1,
>> - PERF_SAMPLE_REGS_ABI_64 = 2,
>> + PERF_SAMPLE_REGS_ABI_NONE = 0x00,
>> + PERF_SAMPLE_REGS_ABI_32 = 0x01,
>> + PERF_SAMPLE_REGS_ABI_64 = 0x02,
>> + PERF_SAMPLE_REGS_ABI_SIMD = 0x04,
>
> It would be better to write the values as bit shifts, so that the
> definition clearly indicates the ABI is a bitmap.
>
> enum perf_sample_regs_abi {
> PERF_SAMPLE_REGS_ABI_NONE = 0,
> PERF_SAMPLE_REGS_ABI_32 = 1 << 0,
> PERF_SAMPLE_REGS_ABI_64 = 1 << 1,
> PERF_SAMPLE_REGS_ABI_SIMD = 1 << 2,
> };
>
>
BIT_ULL() should be better.
Thanks,
Kan
>
>> };
>>
>> /*
>> @@ -382,6 +383,7 @@ enum perf_event_read_format {
>> #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */
>> #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */
>> #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */
>> +#define PERF_ATTR_SIZE_VER9 168 /* Add: sample_simd_{pred,vec}_reg_* */
>>
>> /*
>> * 'struct perf_event_attr' contains various attributes that define
>> @@ -543,6 +545,25 @@ struct perf_event_attr {
>> __u64 sig_data;
>>
>> __u64 config3; /* extension of config2 */
>> +
>> +
>> + /*
>> + * Defines the set of SIMD registers to dump on samples.
>> + * sample_simd_regs_enabled != 0 implies that this
>> + * set of fields is used to configure all SIMD registers.
>> + * If !sample_simd_regs_enabled, sample_regs_XXX may be used to
>> + * configure some SIMD registers on X86.
>> + */
>> + union {
>> + __u16 sample_simd_regs_enabled;
>> + __u16 sample_simd_pred_reg_qwords;
>> + };
>> + __u32 sample_simd_pred_reg_intr;
>> + __u32 sample_simd_pred_reg_user;
>> + __u16 sample_simd_vec_reg_qwords;
>> + __u64 sample_simd_vec_reg_intr;
>> + __u64 sample_simd_vec_reg_user;
>> + __u32 __reserved_4;
>> };
>>
>> /*
>> @@ -1016,7 +1037,15 @@ enum perf_event_type {
>> * } && PERF_SAMPLE_BRANCH_STACK
>> *
>> * { u64 abi; # enum perf_sample_regs_abi
>> - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
>> + * u64 regs[weight(mask)];
>> + * struct {
>> + * u16 nr_vectors;
>> + * u16 vector_qwords;
>> + * u16 nr_pred;
>> + * u16 pred_qwords;
>> + * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
>> + * } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
>> + * } && PERF_SAMPLE_REGS_USER
>> *
>> * { u64 size;
>> * char data[size];
>> @@ -1043,7 +1072,15 @@ enum perf_event_type {
>> * { u64 data_src; } && PERF_SAMPLE_DATA_SRC
>> * { u64 transaction; } && PERF_SAMPLE_TRANSACTION
>> * { u64 abi; # enum perf_sample_regs_abi
>> - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>> + * u64 regs[weight(mask)];
>> + * struct {
>> + * u16 nr_vectors;
>> + * u16 vector_qwords;
>> + * u16 nr_pred;
>> + * u16 pred_qwords;
>> + * u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
>> + * } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
>> + * } && PERF_SAMPLE_REGS_INTR
>> * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>> * { u64 cgroup;} && PERF_SAMPLE_CGROUP
>> * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 95a7b6f5af09..dd8cf3c7fb7a 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -7408,6 +7408,47 @@ perf_output_sample_regs(struct perf_output_handle *handle,
>> }
>> }
>>
>> +static void
>> +perf_output_sample_simd_regs(struct perf_output_handle *handle,
>> + struct perf_event *event,
>> + struct pt_regs *regs,
>> + u64 mask, u16 pred_mask)
>> +{
>> + u16 pred_qwords = event->attr.sample_simd_pred_reg_qwords;
>> + u16 vec_qwords = event->attr.sample_simd_vec_reg_qwords;
>> + u16 nr_pred = hweight16(pred_mask);
>> + u16 nr_vectors = hweight64(mask);
>> + int bit;
>> + u64 val;
>> + u16 i;
>> +
>> + /* Get the number of available regs */
>> + perf_simd_reg_check(regs, mask, &nr_vectors, &vec_qwords,
>> + pred_mask, &nr_pred, &pred_qwords);
>> +
>> + perf_output_put(handle, nr_vectors);
>> + perf_output_put(handle, vec_qwords);
>> + perf_output_put(handle, nr_pred);
>> + perf_output_put(handle, pred_qwords);
>> +
>> + if (nr_vectors) {
>> + for_each_set_bit(bit, (unsigned long *)&mask, sizeof(mask) * BITS_PER_BYTE) {
>> + for (i = 0; i < vec_qwords; i++) {
>> + val = perf_simd_reg_value(regs, bit, i, false);
>> + perf_output_put(handle, val);
>> + }
>> + }
>> + }
>> + if (nr_pred) {
>> + for_each_set_bit(bit, (unsigned long *)&pred_mask, sizeof(pred_mask) * BITS_PER_BYTE) {
>> + for (i = 0; i < pred_qwords; i++) {
>> + val = perf_simd_reg_value(regs, bit, i, true);
>> + perf_output_put(handle, val);
>> + }
>> + }
>> + }
>> +}
>> +
>> static void perf_sample_regs_user(struct perf_regs *regs_user,
>> struct pt_regs *regs)
>> {
>> @@ -7429,6 +7470,25 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
>> regs_intr->abi = perf_reg_abi(current);
>> }
>>
>> +int __weak perf_simd_reg_validate(u16 vec_qwords, u64 vec_mask,
>> + u16 pred_qwords, u32 pred_mask)
>> +{
>> + return vec_qwords || vec_mask || pred_qwords || pred_mask ? -ENOSYS : 0;
>> +}
>> +
>> +u64 __weak perf_simd_reg_value(struct pt_regs *regs, int idx,
>> + u16 qwords_idx, bool pred)
>> +{
>> + return 0;
>> +}
>> +
>> +void __weak perf_simd_reg_check(struct pt_regs *regs,
>> + u64 mask, u16 *nr_vectors, u16 *vec_qwords,
>> + u16 pred_mask, u16 *nr_pred, u16 *pred_qwords)
>> +{
>> + *nr_vectors = 0;
>> + *nr_pred = 0;
>> +}
>>
>> /*
>> * Get remaining task size from user stack pointer.
>> @@ -7961,10 +8021,17 @@ void perf_output_sample(struct perf_output_handle *handle,
>> perf_output_put(handle, abi);
>>
>> if (abi) {
>> - u64 mask = event->attr.sample_regs_user;
>> + struct perf_event_attr *attr = &event->attr;
>> + u64 mask = attr->sample_regs_user;
>> perf_output_sample_regs(handle,
>> data->regs_user.regs,
>> mask);
>> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD) {
>> + perf_output_sample_simd_regs(handle, event,
>> + data->regs_user.regs,
>> + attr->sample_simd_vec_reg_user,
>> + attr->sample_simd_pred_reg_user);
>> + }
>> }
>> }
>>
>> @@ -7992,11 +8059,18 @@ void perf_output_sample(struct perf_output_handle *handle,
>> perf_output_put(handle, abi);
>>
>> if (abi) {
>> - u64 mask = event->attr.sample_regs_intr;
>> + struct perf_event_attr *attr = &event->attr;
>> + u64 mask = attr->sample_regs_intr;
>>
>> perf_output_sample_regs(handle,
>> data->regs_intr.regs,
>> mask);
>> + if (abi & PERF_SAMPLE_REGS_ABI_SIMD) {
>> + perf_output_sample_simd_regs(handle, event,
>> + data->regs_intr.regs,
>> + attr->sample_simd_vec_reg_intr,
>> + attr->sample_simd_pred_reg_intr);
>> + }
>> }
>> }
>>
>> @@ -12560,6 +12634,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>> if (ret)
>> goto err_pmu;
>>
>> + if (!(pmu->capabilities & PERF_PMU_CAP_SIMD_REGS) &&
>> + event_has_simd_regs(event)) {
>> + ret = -EOPNOTSUPP;
>> + goto err_destroy;
>> + }
>> +
>> if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
>> event_has_extended_regs(event)) {
>> ret = -EOPNOTSUPP;
>> @@ -13101,6 +13181,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
>> ret = perf_reg_validate(attr->sample_regs_user);
>> if (ret)
>> return ret;
>> + ret = perf_simd_reg_validate(attr->sample_simd_vec_reg_qwords,
>> + attr->sample_simd_vec_reg_user,
>> + attr->sample_simd_pred_reg_qwords,
>> + attr->sample_simd_pred_reg_user);
>> + if (ret)
>> + return ret;
>> }
>>
>> if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
>> @@ -13121,8 +13207,17 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
>> if (!attr->sample_max_stack)
>> attr->sample_max_stack = sysctl_perf_event_max_stack;
>>
>> - if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
>> + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) {
>> ret = perf_reg_validate(attr->sample_regs_intr);
>> + if (ret)
>> + return ret;
>> + ret = perf_simd_reg_validate(attr->sample_simd_vec_reg_qwords,
>> + attr->sample_simd_vec_reg_intr,
>> + attr->sample_simd_pred_reg_qwords,
>> + attr->sample_simd_pred_reg_intr);
>> + if (ret)
>> + return ret;
>> + }
>>
>> #ifndef CONFIG_CGROUP_PERF
>> if (attr->sample_type & PERF_SAMPLE_CGROUP)
Powered by blists - more mailing lists