lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1c16640f-0bc3-2692-910f-09ea5869a5b6@quicinc.com>
Date: Fri, 18 Oct 2024 15:01:45 -0600
From: Jeffrey Hugo <quic_jhugo@...cinc.com>
To: Lizhi Hou <lizhi.hou@....com>, <ogabbay@...nel.org>,
        <dri-devel@...ts.freedesktop.org>
CC: <linux-kernel@...r.kernel.org>, <min.ma@....com>, <max.zhen@....com>,
        <sonal.santan@....com>, <king.tam@....com>
Subject: Re: [PATCH V4 09/10] accel/amdxdna: Add error handling

On 10/11/2024 5:12 PM, Lizhi Hou wrote:
> When there is a hardware error, the NPU firmware notifies the host through
> a mailbox message. The message includes details of the error, such as the
> tile and column indexes where the error occurred.
> 
> The driver starts a thread to handle the NPU error message. The thread
> stops the clients which are using the column where the error occurred.
> Then the driver resets that column.
> 
> Co-developed-by: Min Ma<min.ma@....com>
> Signed-off-by: Min Ma<min.ma@....com>
> Signed-off-by: Lizhi Hou<lizhi.hou@....com>
> ---
>   drivers/accel/amdxdna/Makefile       |   1 +
>   drivers/accel/amdxdna/aie2_error.c   | 356 +++++++++++++++++++++++++++
>   drivers/accel/amdxdna/aie2_message.c |  19 ++
>   drivers/accel/amdxdna/aie2_pci.c     |  32 +++
>   drivers/accel/amdxdna/aie2_pci.h     |   9 +
>   5 files changed, 417 insertions(+)
>   create mode 100644 drivers/accel/amdxdna/aie2_error.c
> 
> diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile
> index a688c378761f..ed6f87910880 100644
> --- a/drivers/accel/amdxdna/Makefile
> +++ b/drivers/accel/amdxdna/Makefile
> @@ -2,6 +2,7 @@
>   
>   amdxdna-y := \
>   	aie2_ctx.o \
> +	aie2_error.o \
>   	aie2_message.o \
>   	aie2_pci.o \
>   	aie2_psp.o \
> diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
> new file mode 100644
> index 000000000000..d2787549f3b7
> --- /dev/null
> +++ b/drivers/accel/amdxdna/aie2_error.c
> @@ -0,0 +1,356 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
> + */
> +
> +#include <drm/drm_cache.h>
> +#include <drm/drm_device.h>
> +#include <drm/drm_print.h>
> +#include <drm/gpu_scheduler.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/kthread.h>
> +#include <linux/kernel.h>
> +
> +#include "aie2_msg_priv.h"
> +#include "aie2_pci.h"
> +#include "amdxdna_mailbox.h"
> +#include "amdxdna_pci_drv.h"
> +
> +struct async_event {
> +	struct amdxdna_dev_hdl		*ndev;
> +	struct async_event_msg_resp	resp;
> +	struct workqueue_struct		*wq;
> +	struct work_struct		work;
> +	u8				*buf;
> +	dma_addr_t			addr;
> +	u32				size;
> +};
> +
> +struct async_events {
> +	struct workqueue_struct		*wq;
> +	u8				*buf;
> +	dma_addr_t			addr;
> +	u32				size;
> +	u32				event_cnt;
> +	struct async_event		event[] __counted_by(event_cnt);
> +};
> +
> +/*
> + * The enums, structs and lookup tables below are ported from the XAIE util
> + * header file.
> + *
> + * The data below is defined by the AIE device and is used to decode error
> + * messages from the device.
> + */
> +
> +enum aie_module_type {
> +	AIE_MEM_MOD = 0,
> +	AIE_CORE_MOD,
> +	AIE_PL_MOD,
> +};
> +
> +enum aie_error_category {
> +	AIE_ERROR_SATURATION = 0,
> +	AIE_ERROR_FP,
> +	AIE_ERROR_STREAM,
> +	AIE_ERROR_ACCESS,
> +	AIE_ERROR_BUS,
> +	AIE_ERROR_INSTRUCTION,
> +	AIE_ERROR_ECC,
> +	AIE_ERROR_LOCK,
> +	AIE_ERROR_DMA,
> +	AIE_ERROR_MEM_PARITY,
> +	/* Unknown is not from XAIE; added for better categorization */
> +	AIE_ERROR_UNKNOWN,
> +};
> +
> +/* Don't pack unless the XAIE side changes */
> +struct aie_error {
> +	u8			row;
> +	u8			col;
> +	u32			mod_type;
> +	u8			event_id;
> +};

This looks like a structure used to decode data from an external 
device.  Assuming that is so, the wrong types are used here.  Shouldn't 
these be the "__" types like "__u8"?  Plain u8, etc. are kernel-internal-only 
types.

> +
> +struct aie_err_info {
> +	u32			err_cnt;
> +	u32			ret_code;
> +	u32			rsvd;
> +	struct aie_error	payload[] __counted_by(err_cnt);
> +};
> +
> +struct aie_event_category {
> +	u8			event_id;
> +	enum aie_error_category category;
> +};


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ