lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <66e33b64-e8d7-4fe3-a72b-059a11061b21@kernel.org>
Date: Tue, 4 Nov 2025 09:22:46 -0600
From: Mario Limonciello <superm1@...nel.org>
To: Lizhi Hou <lizhi.hou@....com>, ogabbay@...nel.org,
 quic_jhugo@...cinc.com, maciej.falkowski@...ux.intel.com,
 dri-devel@...ts.freedesktop.org
Cc: linux-kernel@...r.kernel.org, max.zhen@....com, sonal.santan@....com
Subject: Re: [PATCH 3/3] accel/amdxdna: Add IOCTL parameter for telemetry data

On 11/4/25 12:25 AM, Lizhi Hou wrote:
> Extend DRM_IOCTL_AMDXDNA_GET_INFO to include additional parameters
> that allow collection of telemetry data.
> 
> Signed-off-by: Lizhi Hou <lizhi.hou@....com>
Reviewed-by: Mario Limonciello (AMD) <superm1@...nel.org>
> ---
>   drivers/accel/amdxdna/aie2_message.c          | 56 ++++++++++++--
>   drivers/accel/amdxdna/aie2_msg_priv.h         | 25 ++++++-
>   drivers/accel/amdxdna/aie2_pci.c              | 73 +++++++++++++++++++
>   drivers/accel/amdxdna/aie2_pci.h              |  3 +
>   .../accel/amdxdna/amdxdna_mailbox_helper.h    |  6 +-
>   drivers/accel/amdxdna/amdxdna_pci_drv.c       |  3 +-
>   include/uapi/drm/amdxdna_accel.h              | 17 +++++
>   7 files changed, 173 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
> index 39214253d804..69cdce9ff208 100644
> --- a/drivers/accel/amdxdna/aie2_message.c
> +++ b/drivers/accel/amdxdna/aie2_message.c
> @@ -47,7 +47,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
>   		ndev->mgmt_chann = NULL;
>   	}
>   
> -	if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) {
> +	if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
>   		XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
>   			 msg->opcode, *hdl->data);
>   		ret = -EINVAL;
> @@ -336,11 +336,6 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
>   		goto fail;
>   	}
>   
> -	if (resp.status != AIE2_STATUS_SUCCESS) {
> -		XDNA_ERR(xdna, "Query NPU status failed, status 0x%x", resp.status);
> -		ret = -EINVAL;
> -		goto fail;
> -	}
>   	XDNA_DBG(xdna, "Query NPU status completed");
>   
>   	if (size < resp.size) {
> @@ -362,6 +357,55 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
>   	return ret;
>   }
>   
> +int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
> +			 char __user *buf, u32 size,
> +			 struct amdxdna_drm_query_telemetry_header *header)
> +{
> +	DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY);
> +	struct amdxdna_dev *xdna = ndev->xdna;
> +	dma_addr_t dma_addr;
> +	u8 *addr;
> +	int ret;
> +
> +	if (header->type >= MAX_TELEMETRY_TYPE)
> +		return -EINVAL;
> +
> +	addr = dma_alloc_noncoherent(xdna->ddev.dev, size, &dma_addr,
> +				     DMA_FROM_DEVICE, GFP_KERNEL);
> +	if (!addr)
> +		return -ENOMEM;
> +
> +	req.buf_addr = dma_addr;
> +	req.buf_size = size;
> +	req.type = header->type;
> +
> +	drm_clflush_virt_range(addr, size); /* device can access */
> +	ret = aie2_send_mgmt_msg_wait(ndev, &msg);
> +	if (ret) {
> +		XDNA_ERR(xdna, "Query telemetry failed, status %d", ret);
> +		goto free_buf;
> +	}
> +
> +	if (size < resp.size) {
> +		ret = -EINVAL;
> +		XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", size, resp.size);
> +		goto free_buf;
> +	}
> +
> +	if (copy_to_user(buf, addr, resp.size)) {
> +		ret = -EFAULT;
> +		XDNA_ERR(xdna, "Failed to copy telemetry to user space");
> +		goto free_buf;
> +	}
> +
> +	header->major = resp.major;
> +	header->minor = resp.minor;
> +
> +free_buf:
> +	dma_free_noncoherent(xdna->ddev.dev, size, addr, dma_addr, DMA_FROM_DEVICE);
> +	return ret;
> +}
> +
>   int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
>   				 void *handle, int (*cb)(void*, void __iomem *, size_t))
>   {
> diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
> index 945140011763..947daa63f064 100644
> --- a/drivers/accel/amdxdna/aie2_msg_priv.h
> +++ b/drivers/accel/amdxdna/aie2_msg_priv.h
> @@ -9,7 +9,8 @@
>   enum aie2_msg_opcode {
>   	MSG_OP_CREATE_CONTEXT              = 0x2,
>   	MSG_OP_DESTROY_CONTEXT             = 0x3,
> -	MSG_OP_SYNC_BO			   = 0x7,
> +	MSG_OP_GET_TELEMETRY               = 0x4,
> +	MSG_OP_SYNC_BO                     = 0x7,
>   	MSG_OP_EXECUTE_BUFFER_CF           = 0xC,
>   	MSG_OP_QUERY_COL_STATUS            = 0xD,
>   	MSG_OP_QUERY_AIE_TILE_INFO         = 0xE,
> @@ -137,6 +138,28 @@ struct destroy_ctx_resp {
>   	enum aie2_msg_status	status;
>   } __packed;
>   
> +enum telemetry_type {
> +	TELEMETRY_TYPE_DISABLED,
> +	TELEMETRY_TYPE_HEALTH,
> +	TELEMETRY_TYPE_ERROR_INFO,
> +	TELEMETRY_TYPE_PROFILING,
> +	TELEMETRY_TYPE_DEBUG,
> +	MAX_TELEMETRY_TYPE
> +};
> +
> +struct get_telemetry_req {
> +	enum telemetry_type	type;
> +	__u64	buf_addr;
> +	__u32	buf_size;
> +} __packed;
> +
> +struct get_telemetry_resp {
> +	__u32	major;
> +	__u32	minor;
> +	__u32	size;
> +	enum aie2_msg_status	status;
> +} __packed;
> +
>   struct execute_buffer_req {
>   	__u32	cu_idx;
>   	__u32	payload[19];
> diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
> index 396dc6e06007..d7ccbdaf47f5 100644
> --- a/drivers/accel/amdxdna/aie2_pci.c
> +++ b/drivers/accel/amdxdna/aie2_pci.c
> @@ -862,6 +862,76 @@ static int aie2_query_resource_info(struct amdxdna_client *client,
>   	return 0;
>   }
>   
> +static int aie2_fill_hwctx_map(struct amdxdna_hwctx *hwctx, void *arg)
> +{
> +	struct amdxdna_dev *xdna = hwctx->client->xdna;
> +	u32 *map = arg;
> +
> +	if (hwctx->fw_ctx_id >= xdna->dev_handle->priv->hwctx_limit) {
> +		XDNA_ERR(xdna, "Invalid fw ctx id %d/%d ", hwctx->fw_ctx_id,
> +			 xdna->dev_handle->priv->hwctx_limit);
> +		return -EINVAL;
> +	}
> +
> +	map[hwctx->fw_ctx_id] = hwctx->id;
> +	return 0;
> +}
> +
> +static int aie2_get_telemetry(struct amdxdna_client *client,
> +			      struct amdxdna_drm_get_info *args)
> +{
> +	struct amdxdna_drm_query_telemetry_header *header __free(kfree) = NULL;
> +	u32 telemetry_data_sz, header_sz, elem_num;
> +	struct amdxdna_dev *xdna = client->xdna;
> +	struct amdxdna_client *tmp_client;
> +	int ret;
> +
> +	elem_num = xdna->dev_handle->priv->hwctx_limit;
> +	header_sz = struct_size(header, map, elem_num);
> +	if (args->buffer_size <= header_sz) {
> +		XDNA_ERR(xdna, "Invalid buffer size");
> +		return -EINVAL;
> +	}
> +
> +	telemetry_data_sz = args->buffer_size - header_sz;
> +	if (telemetry_data_sz > SZ_4M) {
> +		XDNA_ERR(xdna, "Buffer size is too big, %d", telemetry_data_sz);
> +		return -EINVAL;
> +	}
> +
> +	header = kzalloc(header_sz, GFP_KERNEL);
> +	if (!header)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(header, u64_to_user_ptr(args->buffer), sizeof(*header))) {
> +		XDNA_ERR(xdna, "Failed to copy telemetry header from user");
> +		return -EFAULT;
> +	}
> +
> +	header->map_num_elements = elem_num;
> +	list_for_each_entry(tmp_client, &xdna->client_list, node) {
> +		ret = amdxdna_hwctx_walk(tmp_client, &header->map,
> +					 aie2_fill_hwctx_map);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	ret = aie2_query_telemetry(xdna->dev_handle,
> +				   u64_to_user_ptr(args->buffer + header_sz),
> +				   telemetry_data_sz, header);
> +	if (ret) {
> +		XDNA_ERR(xdna, "Query telemetry failed ret %d", ret);
> +		return ret;
> +	}
> +
> +	if (copy_to_user(u64_to_user_ptr(args->buffer), header, header_sz)) {
> +		XDNA_ERR(xdna, "Copy header failed");
> +		return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
>   static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_info *args)
>   {
>   	struct amdxdna_dev *xdna = client->xdna;
> @@ -896,6 +966,9 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i
>   	case DRM_AMDXDNA_GET_POWER_MODE:
>   		ret = aie2_get_power_mode(client, args);
>   		break;
> +	case DRM_AMDXDNA_QUERY_TELEMETRY:
> +		ret = aie2_get_telemetry(client, args);
> +		break;
>   	case DRM_AMDXDNA_QUERY_RESOURCE_INFO:
>   		ret = aie2_query_resource_info(client, args);
>   		break;
> diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
> index a79f4f71ff6b..9793cd1e0c55 100644
> --- a/drivers/accel/amdxdna/aie2_pci.h
> +++ b/drivers/accel/amdxdna/aie2_pci.h
> @@ -305,6 +305,9 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
>   int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
>   int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
>   int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled);
> +int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
> +			 char __user *buf, u32 size,
> +			 struct amdxdna_drm_query_telemetry_header *header);
>   int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
>   				 void *handle, int (*cb)(void*, void __iomem *, size_t));
>   int aie2_config_cu(struct amdxdna_hwctx *hwctx,
> diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h
> index 710ff8873d61..556c712cad0a 100644
> --- a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h
> +++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h
> @@ -16,16 +16,18 @@ struct xdna_notify {
>   	u32			*data;
>   	size_t			size;
>   	int			error;
> +	u32			*status;
>   };
>   
> -#define DECLARE_XDNA_MSG_COMMON(name, op, status)			\
> +#define DECLARE_XDNA_MSG_COMMON(name, op, s)				\
>   	struct name##_req	req = { 0 };				\
> -	struct name##_resp	resp = { status	};			\
> +	struct name##_resp	resp = { .status = s };			\
>   	struct xdna_notify	hdl = {					\
>   		.error = 0,						\
>   		.data = (u32 *)&resp,					\
>   		.size = sizeof(resp),					\
>   		.comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp),	\
> +		.status = (u32 *)&resp.status,				\
>   	};								\
>   	struct xdna_mailbox_msg msg = {					\
>   		.send_data = (u8 *)&req,				\
> diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> index af943a603ad1..7590265d4485 100644
> --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c
> +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
> @@ -30,9 +30,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
>    * 0.2: Support getting last error hardware error
>    * 0.3: Support firmware debug buffer
>    * 0.4: Support getting resource information
> + * 0.5: Support getting telemetry data
>    */
>   #define AMDXDNA_DRIVER_MAJOR		0
> -#define AMDXDNA_DRIVER_MINOR		4
> +#define AMDXDNA_DRIVER_MINOR		5
>   
>   /*
>    * Bind the driver base on (vendor_id, device_id) pair and later use the
> diff --git a/include/uapi/drm/amdxdna_accel.h b/include/uapi/drm/amdxdna_accel.h
> index 8b679c38d308..8ad254bc35a5 100644
> --- a/include/uapi/drm/amdxdna_accel.h
> +++ b/include/uapi/drm/amdxdna_accel.h
> @@ -442,6 +442,7 @@ enum amdxdna_drm_get_param {
>   	DRM_AMDXDNA_QUERY_HW_CONTEXTS,
>   	DRM_AMDXDNA_QUERY_FIRMWARE_VERSION = 8,
>   	DRM_AMDXDNA_GET_POWER_MODE,
> +	DRM_AMDXDNA_QUERY_TELEMETRY,
>   	DRM_AMDXDNA_QUERY_RESOURCE_INFO = 12,
>   };
>   
> @@ -461,6 +462,22 @@ struct amdxdna_drm_get_resource_info {
>   	__u64 npu_task_curr;
>   };
>   
> +/**
> + * struct amdxdna_drm_query_telemetry_header - Telemetry data header
> + */
> +struct amdxdna_drm_query_telemetry_header {
> +	/** @major: Firmware telemetry interface major version number */
> +	__u32 major;
> +	/** @minor: Firmware telemetry interface minor version number */
> +	__u32 minor;
> +	/** @type: Telemetry query type */
> +	__u32 type;
> +	/** @map_num_elements: Total number of elements in the map table */
> +	__u32 map_num_elements;
> +	/** @map: Element map */
> +	__u32 map[];
> +};
> +
>   /**
>    * struct amdxdna_drm_get_info - Get some information from the AIE hardware.
>    * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer.


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ