lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <563b1a69-8e4d-4c49-ac46-f5b845452a6a@linux.intel.com>
Date: Wed, 20 Aug 2025 11:08:27 -0700
From: "Liang, Kan" <kan.liang@...ux.intel.com>
To: "Mi, Dapeng" <dapeng1.mi@...ux.intel.com>, peterz@...radead.org,
 mingo@...hat.com, acme@...nel.org, namhyung@...nel.org, tglx@...utronix.de,
 dave.hansen@...ux.intel.com, irogers@...gle.com, adrian.hunter@...el.com,
 jolsa@...nel.org, alexander.shishkin@...ux.intel.com,
 linux-kernel@...r.kernel.org
Cc: ak@...ux.intel.com, zide.chen@...el.com, mark.rutland@....com,
 broonie@...nel.org, ravi.bangoria@....com, eranian@...gle.com
Subject: Re: [PATCH V3 06/17] perf: Support SIMD registers



On 2025-08-20 2:55 a.m., Mi, Dapeng wrote:
> 
> On 8/16/2025 5:34 AM, kan.liang@...ux.intel.com wrote:
>> From: Kan Liang <kan.liang@...ux.intel.com>
>>
>> The users may be interested in the SIMD registers in a sample while
>> profiling. The current sample_regs_XXX doesn't have enough space for all
>> SIMD registers.
>>
>> Add sets of the sample_simd_{pred,vec}_reg_* in the
>> struct perf_event_attr to define a set of SIMD registers to dump on
>> samples.
>> The current X86 supports the XMM registers in sample_regs_XXX. To
>> utilize the new SIMD registers configuration method, the
>> sample_simd_regs_enabled should always be set. If so, the XMM space in
>> the sample_regs_XXX is reserved for other usage.
>>
>> The SIMD registers are wider than 64 bits. A new output format is introduced.
>> The number and width of SIMD registers will be dumped first, following
>> the register values. The number and width are the same as the user's
>> configuration now. If, for some reason (e.g., ARM) they are different,
>> an ARCH-specific perf_output_sample_simd_regs can be implemented later
>> separately.
>> Add a new ABI, PERF_SAMPLE_REGS_ABI_SIMD, to indicate the new format.
>> The enum perf_sample_regs_abi becomes a bitmap now. There should be no
>> impact on the existing tool, since the version and bitmap are the same
>> for 1 and 2.
>>
>> Add three new __weak functions to retrieve the number of available
>> registers, validate the configuration of the SIMD registers, and
>> retrieve the SIMD registers. The ARCH-specific functions will be
>> implemented in the following patches.
>>
>> Add a new flag PERF_PMU_CAP_SIMD_REGS to indicate that the PMU has the
>> capability to support SIMD registers dumping. Error out if the
>> sample_simd_{pred,vec}_reg_* are mistakenly set for a PMU that doesn't have
>> the capability.
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@...radead.org>
>> Signed-off-by: Kan Liang <kan.liang@...ux.intel.com>
>> ---
>>  include/linux/perf_event.h      |  13 ++++
>>  include/linux/perf_regs.h       |   9 +++
>>  include/uapi/linux/perf_event.h |  47 +++++++++++++--
>>  kernel/events/core.c            | 101 +++++++++++++++++++++++++++++++-
>>  4 files changed, 162 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 444b162f3f92..205361b7de2e 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -305,6 +305,7 @@ struct perf_event_pmu_context;
>>  #define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
>>  #define PERF_PMU_CAP_AUX_PAUSE		0x0200
>>  #define PERF_PMU_CAP_AUX_PREFER_LARGE	0x0400
>> +#define PERF_PMU_CAP_SIMD_REGS		0x0800
>>  
>>  /**
>>   * pmu::scope
>> @@ -1526,6 +1527,18 @@ perf_event__output_id_sample(struct perf_event *event,
>>  extern void
>>  perf_log_lost_samples(struct perf_event *event, u64 lost);
>>  
>> +static inline bool event_has_simd_regs(struct perf_event *event)
>> +{
>> +	struct perf_event_attr *attr = &event->attr;
>> +
>> +	return attr->sample_simd_regs_enabled != 0 ||
>> +	       attr->sample_simd_pred_reg_intr != 0 ||
>> +	       attr->sample_simd_pred_reg_user != 0 ||
>> +	       attr->sample_simd_vec_reg_qwords != 0 ||
>> +	       attr->sample_simd_vec_reg_intr != 0 ||
>> +	       attr->sample_simd_vec_reg_user != 0;
>> +}
>> +
>>  static inline bool event_has_extended_regs(struct perf_event *event)
>>  {
>>  	struct perf_event_attr *attr = &event->attr;
>> diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h
>> index f632c5725f16..0172682b18fd 100644
>> --- a/include/linux/perf_regs.h
>> +++ b/include/linux/perf_regs.h
>> @@ -9,6 +9,15 @@ struct perf_regs {
>>  	struct pt_regs	*regs;
>>  };
>>  
>> +int perf_simd_reg_validate(u16 vec_qwords, u64 vec_mask,
>> +			   u16 pred_qwords, u32 pred_mask);
>> +u64 perf_simd_reg_value(struct pt_regs *regs, int idx,
>> +			u16 qwords_idx, bool pred);
>> +void perf_simd_reg_check(struct pt_regs *regs,
>> +			 u64 mask, u16 *nr_vectors, u16 *vec_qwords,
>> +			 u16 pred_mask, u16 *nr_pred, u16 *pred_qwords);
>> +
>> +
>>  #ifdef CONFIG_HAVE_PERF_REGS
>>  #include <asm/perf_regs.h>
>>  
>> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
>> index 78a362b80027..2e9b16acbed6 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -313,9 +313,10 @@ enum {
>>   * Values to determine ABI of the registers dump.
>>   */
>>  enum perf_sample_regs_abi {
>> -	PERF_SAMPLE_REGS_ABI_NONE		= 0,
>> -	PERF_SAMPLE_REGS_ABI_32			= 1,
>> -	PERF_SAMPLE_REGS_ABI_64			= 2,
>> +	PERF_SAMPLE_REGS_ABI_NONE		= 0x00,
>> +	PERF_SAMPLE_REGS_ABI_32			= 0x01,
>> +	PERF_SAMPLE_REGS_ABI_64			= 0x02,
>> +	PERF_SAMPLE_REGS_ABI_SIMD		= 0x04,
> 
> It would be better to write the definitions in bit-shift form, so that they
> clearly indicate the ABI is a bitmap.
> 
> enum perf_sample_regs_abi {
>     PERF_SAMPLE_REGS_ABI_NONE        = 0,
>     PERF_SAMPLE_REGS_ABI_32            = 1 << 0,
>     PERF_SAMPLE_REGS_ABI_64            = 1 << 1,
>     PERF_SAMPLE_REGS_ABI_SIMD        = 1 << 2,
> };
> 
> 

BIT_ULL() should be better.

Thanks,
Kan

> 
>>  };
>>  
>>  /*
>> @@ -382,6 +383,7 @@ enum perf_event_read_format {
>>  #define PERF_ATTR_SIZE_VER6			120	/* Add: aux_sample_size */
>>  #define PERF_ATTR_SIZE_VER7			128	/* Add: sig_data */
>>  #define PERF_ATTR_SIZE_VER8			136	/* Add: config3 */
>> +#define PERF_ATTR_SIZE_VER9			168	/* Add: sample_simd_{pred,vec}_reg_* */
>>  
>>  /*
>>   * 'struct perf_event_attr' contains various attributes that define
>> @@ -543,6 +545,25 @@ struct perf_event_attr {
>>  	__u64	sig_data;
>>  
>>  	__u64	config3; /* extension of config2 */
>> +
>> +
>> +	/*
>> +	 * Defines the set of SIMD registers to dump on samples.
>> +	 * A non-zero sample_simd_regs_enabled means that the
>> +	 * sample_simd_* fields are used to configure all SIMD registers.
>> +	 * If !sample_simd_regs_enabled, sample_regs_XXX may be used to
>> +	 * configure some SIMD registers on X86.
>> +	 */
>> +	union {
>> +		__u16 sample_simd_regs_enabled;
>> +		__u16 sample_simd_pred_reg_qwords;
>> +	};
>> +	__u32 sample_simd_pred_reg_intr;
>> +	__u32 sample_simd_pred_reg_user;
>> +	__u16 sample_simd_vec_reg_qwords;
>> +	__u64 sample_simd_vec_reg_intr;
>> +	__u64 sample_simd_vec_reg_user;
>> +	__u32 __reserved_4;
>>  };
>>  
>>  /*
>> @@ -1016,7 +1037,15 @@ enum perf_event_type {
>>  	 *      } && PERF_SAMPLE_BRANCH_STACK
>>  	 *
>>  	 *	{ u64			abi; # enum perf_sample_regs_abi
>> -	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
>> +	 *	  u64			regs[weight(mask)];
>> +	 *	  struct {
>> +	 *		u16 nr_vectors;
>> +	 *		u16 vector_qwords;
>> +	 *		u16 nr_pred;
>> +	 *		u16 pred_qwords;
>> +	 *		u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
>> +	 *	  } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
>> +	 *	} && PERF_SAMPLE_REGS_USER
>>  	 *
>>  	 *	{ u64			size;
>>  	 *	  char			data[size];
>> @@ -1043,7 +1072,15 @@ enum perf_event_type {
>>  	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
>>  	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
>>  	 *	{ u64			abi; # enum perf_sample_regs_abi
>> -	 *	  u64			regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
>> +	 *	  u64			regs[weight(mask)];
>> +	 *	  struct {
>> +	 *		u16 nr_vectors;
>> +	 *		u16 vector_qwords;
>> +	 *		u16 nr_pred;
>> +	 *		u16 pred_qwords;
>> +	 *		u64 data[nr_vectors * vector_qwords + nr_pred * pred_qwords];
>> +	 *	  } && (abi & PERF_SAMPLE_REGS_ABI_SIMD)
>> +	 *	} && PERF_SAMPLE_REGS_INTR
>>  	 *	{ u64			phys_addr;} && PERF_SAMPLE_PHYS_ADDR
>>  	 *	{ u64			cgroup;} && PERF_SAMPLE_CGROUP
>>  	 *	{ u64			data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 95a7b6f5af09..dd8cf3c7fb7a 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -7408,6 +7408,47 @@ perf_output_sample_regs(struct perf_output_handle *handle,
>>  	}
>>  }
>>  
>> +static void
>> +perf_output_sample_simd_regs(struct perf_output_handle *handle,
>> +			     struct perf_event *event,
>> +			     struct pt_regs *regs,
>> +			     u64 mask, u16 pred_mask)
>> +{
>> +	u16 pred_qwords = event->attr.sample_simd_pred_reg_qwords;
>> +	u16 vec_qwords = event->attr.sample_simd_vec_reg_qwords;
>> +	u16 nr_pred = hweight16(pred_mask);
>> +	u16 nr_vectors = hweight64(mask);
>> +	int bit;
>> +	u64 val;
>> +	u16 i;
>> +
>> +	/* Get the number of available regs */
>> +	perf_simd_reg_check(regs, mask, &nr_vectors, &vec_qwords,
>> +			    pred_mask, &nr_pred, &pred_qwords);
>> +
>> +	perf_output_put(handle, nr_vectors);
>> +	perf_output_put(handle, vec_qwords);
>> +	perf_output_put(handle, nr_pred);
>> +	perf_output_put(handle, pred_qwords);
>> +
>> +	if (nr_vectors) {
>> +		for_each_set_bit(bit, (unsigned long *)&mask, sizeof(mask) * BITS_PER_BYTE) {
>> +			for (i = 0; i < vec_qwords; i++) {
>> +				val = perf_simd_reg_value(regs, bit, i, false);
>> +				perf_output_put(handle, val);
>> +			}
>> +		}
>> +	}
>> +	if (nr_pred) {
>> +		for_each_set_bit(bit, (unsigned long *)&pred_mask, sizeof(pred_mask) * BITS_PER_BYTE) {
>> +			for (i = 0; i < pred_qwords; i++) {
>> +				val = perf_simd_reg_value(regs, bit, i, true);
>> +				perf_output_put(handle, val);
>> +			}
>> +		}
>> +	}
>> +}
>> +
>>  static void perf_sample_regs_user(struct perf_regs *regs_user,
>>  				  struct pt_regs *regs)
>>  {
>> @@ -7429,6 +7470,25 @@ static void perf_sample_regs_intr(struct perf_regs *regs_intr,
>>  	regs_intr->abi  = perf_reg_abi(current);
>>  }
>>  
>> +int __weak perf_simd_reg_validate(u16 vec_qwords, u64 vec_mask,
>> +				  u16 pred_qwords, u32 pred_mask)
>> +{
>> +	return vec_qwords || vec_mask || pred_qwords || pred_mask ? -ENOSYS : 0;
>> +}
>> +
>> +u64 __weak perf_simd_reg_value(struct pt_regs *regs, int idx,
>> +			       u16 qwords_idx, bool pred)
>> +{
>> +	return 0;
>> +}
>> +
>> +void __weak perf_simd_reg_check(struct pt_regs *regs,
>> +				u64 mask, u16 *nr_vectors, u16 *vec_qwords,
>> +				u16 pred_mask, u16 *nr_pred, u16 *pred_qwords)
>> +{
>> +	*nr_vectors = 0;
>> +	*nr_pred = 0;
>> +}
>>  
>>  /*
>>   * Get remaining task size from user stack pointer.
>> @@ -7961,10 +8021,17 @@ void perf_output_sample(struct perf_output_handle *handle,
>>  		perf_output_put(handle, abi);
>>  
>>  		if (abi) {
>> -			u64 mask = event->attr.sample_regs_user;
>> +			struct perf_event_attr *attr = &event->attr;
>> +			u64 mask = attr->sample_regs_user;
>>  			perf_output_sample_regs(handle,
>>  						data->regs_user.regs,
>>  						mask);
>> +			if (abi & PERF_SAMPLE_REGS_ABI_SIMD) {
>> +				perf_output_sample_simd_regs(handle, event,
>> +							     data->regs_user.regs,
>> +							     attr->sample_simd_vec_reg_user,
>> +							     attr->sample_simd_pred_reg_user);
>> +			}
>>  		}
>>  	}
>>  
>> @@ -7992,11 +8059,18 @@ void perf_output_sample(struct perf_output_handle *handle,
>>  		perf_output_put(handle, abi);
>>  
>>  		if (abi) {
>> -			u64 mask = event->attr.sample_regs_intr;
>> +			struct perf_event_attr *attr = &event->attr;
>> +			u64 mask = attr->sample_regs_intr;
>>  
>>  			perf_output_sample_regs(handle,
>>  						data->regs_intr.regs,
>>  						mask);
>> +			if (abi & PERF_SAMPLE_REGS_ABI_SIMD) {
>> +				perf_output_sample_simd_regs(handle, event,
>> +							     data->regs_intr.regs,
>> +							     attr->sample_simd_vec_reg_intr,
>> +							     attr->sample_simd_pred_reg_intr);
>> +			}
>>  		}
>>  	}
>>  
>> @@ -12560,6 +12634,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
>>  	if (ret)
>>  		goto err_pmu;
>>  
>> +	if (!(pmu->capabilities & PERF_PMU_CAP_SIMD_REGS) &&
>> +	    event_has_simd_regs(event)) {
>> +		ret = -EOPNOTSUPP;
>> +		goto err_destroy;
>> +	}
>> +
>>  	if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
>>  	    event_has_extended_regs(event)) {
>>  		ret = -EOPNOTSUPP;
>> @@ -13101,6 +13181,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
>>  		ret = perf_reg_validate(attr->sample_regs_user);
>>  		if (ret)
>>  			return ret;
>> +		ret = perf_simd_reg_validate(attr->sample_simd_vec_reg_qwords,
>> +					     attr->sample_simd_vec_reg_user,
>> +					     attr->sample_simd_pred_reg_qwords,
>> +					     attr->sample_simd_pred_reg_user);
>> +		if (ret)
>> +			return ret;
>>  	}
>>  
>>  	if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
>> @@ -13121,8 +13207,17 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
>>  	if (!attr->sample_max_stack)
>>  		attr->sample_max_stack = sysctl_perf_event_max_stack;
>>  
>> -	if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
>> +	if (attr->sample_type & PERF_SAMPLE_REGS_INTR) {
>>  		ret = perf_reg_validate(attr->sample_regs_intr);
>> +		if (ret)
>> +			return ret;
>> +		ret = perf_simd_reg_validate(attr->sample_simd_vec_reg_qwords,
>> +					     attr->sample_simd_vec_reg_intr,
>> +					     attr->sample_simd_pred_reg_qwords,
>> +					     attr->sample_simd_pred_reg_intr);
>> +		if (ret)
>> +			return ret;
>> +	}
>>  
>>  #ifndef CONFIG_CGROUP_PERF
>>  	if (attr->sample_type & PERF_SAMPLE_CGROUP)


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ