[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <0fe13309-b0bb-6868-e1d2-2fa59f329b7b@amd.com>
Date: Fri, 18 Oct 2024 14:19:05 -0700
From: Lizhi Hou <lizhi.hou@....com>
To: Jeffrey Hugo <quic_jhugo@...cinc.com>, <ogabbay@...nel.org>,
<dri-devel@...ts.freedesktop.org>
CC: <linux-kernel@...r.kernel.org>, <min.ma@....com>, <max.zhen@....com>,
<sonal.santan@....com>, <king.tam@....com>
Subject: Re: [PATCH V4 09/10] accel/amdxdna: Add error handling
On 10/18/24 14:01, Jeffrey Hugo wrote:
> On 10/11/2024 5:12 PM, Lizhi Hou wrote:
>> When there is a hardware error, the NPU firmware notifies the host
>> through a mailbox message. The message includes details of the error,
>> such as the tile and column indexes where the error occurred.
>>
>> The driver starts a thread to handle the NPU error message. The thread
>> stops the clients which are using the column where the error occurred.
>> Then the driver resets that column.
>>
>> Co-developed-by: Min Ma<min.ma@....com>
>> Signed-off-by: Min Ma<min.ma@....com>
>> Signed-off-by: Lizhi Hou<lizhi.hou@....com>
>> ---
>> drivers/accel/amdxdna/Makefile | 1 +
>> drivers/accel/amdxdna/aie2_error.c | 356 +++++++++++++++++++++++++++
>> drivers/accel/amdxdna/aie2_message.c | 19 ++
>> drivers/accel/amdxdna/aie2_pci.c | 32 +++
>> drivers/accel/amdxdna/aie2_pci.h | 9 +
>> 5 files changed, 417 insertions(+)
>> create mode 100644 drivers/accel/amdxdna/aie2_error.c
>>
>> diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile
>> index a688c378761f..ed6f87910880 100644
>> --- a/drivers/accel/amdxdna/Makefile
>> +++ b/drivers/accel/amdxdna/Makefile
>> @@ -2,6 +2,7 @@
>> amdxdna-y := \
>> aie2_ctx.o \
>> + aie2_error.o \
>> aie2_message.o \
>> aie2_pci.o \
>> aie2_psp.o \
>> diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
>> new file mode 100644
>> index 000000000000..d2787549f3b7
>> --- /dev/null
>> +++ b/drivers/accel/amdxdna/aie2_error.c
>> @@ -0,0 +1,356 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
>> + */
>> +
>> +#include <drm/drm_cache.h>
>> +#include <drm/drm_device.h>
>> +#include <drm/drm_print.h>
>> +#include <drm/gpu_scheduler.h>
>> +#include <linux/dma-mapping.h>
>> +#include <linux/kthread.h>
>> +#include <linux/kernel.h>
>> +
>> +#include "aie2_msg_priv.h"
>> +#include "aie2_pci.h"
>> +#include "amdxdna_mailbox.h"
>> +#include "amdxdna_pci_drv.h"
>> +
>> +struct async_event {
>> + struct amdxdna_dev_hdl *ndev;
>> + struct async_event_msg_resp resp;
>> + struct workqueue_struct *wq;
>> + struct work_struct work;
>> + u8 *buf;
>> + dma_addr_t addr;
>> + u32 size;
>> +};
>> +
>> +struct async_events {
>> + struct workqueue_struct *wq;
>> + u8 *buf;
>> + dma_addr_t addr;
>> + u32 size;
>> + u32 event_cnt;
>> + struct async_event event[] __counted_by(event_cnt);
>> +};
>> +
>> +/*
>> + * The enum, struct and lookup tables below are ported from the XAIE
>> + * util header file.
>> + *
>> + * The data below is defined by the AIE device and is used to decode
>> + * error messages from the device.
>> + */
>> +
>> +enum aie_module_type {
>> + AIE_MEM_MOD = 0,
>> + AIE_CORE_MOD,
>> + AIE_PL_MOD,
>> +};
>> +
>> +enum aie_error_category {
>> + AIE_ERROR_SATURATION = 0,
>> + AIE_ERROR_FP,
>> + AIE_ERROR_STREAM,
>> + AIE_ERROR_ACCESS,
>> + AIE_ERROR_BUS,
>> + AIE_ERROR_INSTRUCTION,
>> + AIE_ERROR_ECC,
>> + AIE_ERROR_LOCK,
>> + AIE_ERROR_DMA,
>> + AIE_ERROR_MEM_PARITY,
>> + /* Unknown is not from XAIE, added for better category */
>> + AIE_ERROR_UNKNOWN,
>> +};
>> +
>> +/* Don't pack, unless XAIE side changed */
>> +struct aie_error {
>> + u8 row;
>> + u8 col;
>> + u32 mod_type;
>> + u8 event_id;
>> +};
>
> This looks like it is a structure to decode data from an external
> device. Assuming that is so, the wrong types are used here. Should be
> the "__" types like "__u8", no? Normal u8, etc are kernel internal
> only types.
Yes, you are correct. I will fix this.
Thanks,
Lizhi
>
>> +
>> +struct aie_err_info {
>> + u32 err_cnt;
>> + u32 ret_code;
>> + u32 rsvd;
>> + struct aie_error payload[] __counted_by(err_cnt);
>> +};
>> +
>> +struct aie_event_category {
>> + u8 event_id;
>> + enum aie_error_category category;
>> +};
>
Powered by blists - more mailing lists