[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <86378b86-bf98-6dc5-0f3b-0a92cfa6a5f1@amd.com>
Date: Tue, 2 Dec 2025 08:46:54 -0800
From: Lizhi Hou <lizhi.hou@....com>
To: Mario Limonciello <mario.limonciello@....com>, <ogabbay@...nel.org>,
<quic_jhugo@...cinc.com>, <dri-devel@...ts.freedesktop.org>,
<maciej.falkowski@...ux.intel.com>
CC: <linux-kernel@...r.kernel.org>, <max.zhen@....com>, <sonal.santan@....com>
Subject: Re: [PATCH V1] accel/amdxdna: Poll MPNPU_PWAITMODE after requesting
firmware suspend
On 12/1/25 21:51, Mario Limonciello wrote:
>
>
> On 12/1/2025 11:37 PM, Lizhi Hou wrote:
>> After issuing a firmware suspend request, the driver must ensure that
>> the
>> suspend operation has completed before proceeding. Add polling of the
>> MPNPU_PWAITMODE register to confirm that the firmware has fully entered
>> the suspended state. This prevents race conditions where subsequent
>> operations assume the firmware is idle before it has actually completed
>> its suspend sequence.
>>
>> Signed-off-by: Lizhi Hou <lizhi.hou@....com>
>> ---
>> drivers/accel/amdxdna/aie2_message.c | 9 ++++++++-
>> drivers/accel/amdxdna/aie2_pci.h | 2 ++
>> drivers/accel/amdxdna/aie2_psp.c | 15 +++++++++++++++
>> drivers/accel/amdxdna/npu1_regs.c | 2 ++
>> drivers/accel/amdxdna/npu2_regs.c | 2 ++
>> drivers/accel/amdxdna/npu4_regs.c | 2 ++
>> drivers/accel/amdxdna/npu5_regs.c | 4 +++-
>> drivers/accel/amdxdna/npu6_regs.c | 2 ++
>> 8 files changed, 36 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/accel/amdxdna/aie2_message.c
>> b/drivers/accel/amdxdna/aie2_message.c
>> index d493bb1c3360..fee3b0627aba 100644
>> --- a/drivers/accel/amdxdna/aie2_message.c
>> +++ b/drivers/accel/amdxdna/aie2_message.c
>> @@ -59,8 +59,15 @@ static int aie2_send_mgmt_msg_wait(struct
>> amdxdna_dev_hdl *ndev,
>> int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev)
>> {
>> DECLARE_AIE2_MSG(suspend, MSG_OP_SUSPEND);
>> + int ret;
>> - return aie2_send_mgmt_msg_wait(ndev, &msg);
>> + ret = aie2_send_mgmt_msg_wait(ndev, &msg);
>> + if (ret) {
>> + XDNA_ERR(ndev->xdna, "Failed to suspend fw, ret %d", ret);
>> + return ret;
>> + }
>> +
>> + return aie2_psp_waitmode_poll(ndev->psp_hdl);
>> }
>> int aie2_resume_fw(struct amdxdna_dev_hdl *ndev)
>> diff --git a/drivers/accel/amdxdna/aie2_pci.h
>> b/drivers/accel/amdxdna/aie2_pci.h
>> index a5f9c42155d1..cc9f933f80b2 100644
>> --- a/drivers/accel/amdxdna/aie2_pci.h
>> +++ b/drivers/accel/amdxdna/aie2_pci.h
>> @@ -70,6 +70,7 @@ enum psp_reg_idx {
>> PSP_INTR_REG = PSP_NUM_IN_REGS,
>> PSP_STATUS_REG,
>> PSP_RESP_REG,
>> + PSP_PWAITMODE_REG,
>> PSP_MAX_REGS /* Keep this at the end */
>> };
>> @@ -290,6 +291,7 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl
>> *ndev, enum amdxdna_power_mode_type
>> struct psp_device *aie2m_psp_create(struct drm_device *ddev, struct
>> psp_config *conf);
>> int aie2_psp_start(struct psp_device *psp);
>> void aie2_psp_stop(struct psp_device *psp);
>> +int aie2_psp_waitmode_poll(struct psp_device *psp);
>> /* aie2_error.c */
>> int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
>> diff --git a/drivers/accel/amdxdna/aie2_psp.c
>> b/drivers/accel/amdxdna/aie2_psp.c
>> index f28a060a8810..4bc60f458fd4 100644
>> --- a/drivers/accel/amdxdna/aie2_psp.c
>> +++ b/drivers/accel/amdxdna/aie2_psp.c
>> @@ -76,6 +76,21 @@ static int psp_exec(struct psp_device *psp, u32
>> *reg_vals)
>> return 0;
>> }
>> +int aie2_psp_waitmode_poll(struct psp_device *psp)
>> +{
>> + int mode_reg = -1, ret;
>
> It seems like from the usage mode_reg shouldn't be a signed integer.
Will change it to u32.
>
>> +
>> + ret = readx_poll_timeout(readl, PSP_REG(psp, PSP_PWAITMODE_REG),
>> mode_reg,
>> + (mode_reg & 0x1) == 1,
>> + PSP_POLL_INTERVAL, PSP_POLL_TIMEOUT);
>> + if (ret) {
>> + drm_err(psp->ddev, "fw waitmode reg error, ret 0x%x", ret);
>
> Aren't these return codes going to be negative in the case of an
> error? IE they shouldn't be 0x%x, they should be %d.
Agree. I will fix this.
>
> Also shouldn't you be using XDNA_ERR()?
Sure. I can use XDNA_ERR.
>
>> + return ret;
>> + }
>> +
>> + return 0;
>
> You can probably simplify as:
>
> if (ret)
> drm_err();
> return ret;
Agree.
>
>> +}
>> +
>> void aie2_psp_stop(struct psp_device *psp)
>> {
>> u32 reg_vals[PSP_NUM_IN_REGS] = { PSP_RELEASE_TMR, };
>> diff --git a/drivers/accel/amdxdna/npu1_regs.c
>> b/drivers/accel/amdxdna/npu1_regs.c
>> index ec407f3b48fc..ebc6e2802297 100644
>> --- a/drivers/accel/amdxdna/npu1_regs.c
>> +++ b/drivers/accel/amdxdna/npu1_regs.c
>> @@ -13,6 +13,7 @@
>> #include "amdxdna_pci_drv.h"
>> /* Address definition from NPU1 docs */
>> +#define MPNPU_PWAITMODE 0x3010034
>> #define MPNPU_PUB_SEC_INTR 0x3010090
>> #define MPNPU_PUB_PWRMGMT_INTR 0x3010094
>> #define MPNPU_PUB_SCRATCH2 0x30100A0
>> @@ -92,6 +93,7 @@ static const struct amdxdna_dev_priv npu1_dev_priv = {
>> DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU1_PSP,
>> MPNPU_PUB_SEC_INTR),
>> DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU1_PSP,
>> MPNPU_PUB_SCRATCH2),
>> DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU1_PSP,
>> MPNPU_PUB_SCRATCH3),
>> + DEFINE_BAR_OFFSET(PSP_PWAITMODE_REG, NPU1_PSP,
>> MPNPU_PWAITMODE),
>> },
>> .smu_regs_off = {
>> DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU1_SMU, MPNPU_PUB_SCRATCH5),
>> diff --git a/drivers/accel/amdxdna/npu2_regs.c
>> b/drivers/accel/amdxdna/npu2_regs.c
>> index 86f87d0d1354..ad0743fb06d5 100644
>> --- a/drivers/accel/amdxdna/npu2_regs.c
>> +++ b/drivers/accel/amdxdna/npu2_regs.c
>> @@ -13,6 +13,7 @@
>> #include "amdxdna_pci_drv.h"
>> /* NPU Public Registers on MpNPUAxiXbar (refer to Diag
>> npu_registers.h) */
>> +#define MPNPU_PWAITMODE 0x301003C
>> #define MPNPU_PUB_SEC_INTR 0x3010060
>> #define MPNPU_PUB_PWRMGMT_INTR 0x3010064
>> #define MPNPU_PUB_SCRATCH0 0x301006C
>> @@ -85,6 +86,7 @@ static const struct amdxdna_dev_priv npu2_dev_priv = {
>> DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU2_PSP, MP0_C2PMSG_73),
>> DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU2_PSP, MP0_C2PMSG_123),
>> DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU2_REG,
>> MPNPU_PUB_SCRATCH3),
>> + DEFINE_BAR_OFFSET(PSP_PWAITMODE_REG, NPU2_REG,
>> MPNPU_PWAITMODE),
>> },
>> .smu_regs_off = {
>> DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU2_SMU, MP1_C2PMSG_0),
>> diff --git a/drivers/accel/amdxdna/npu4_regs.c
>> b/drivers/accel/amdxdna/npu4_regs.c
>> index 986a5f28ba24..4ca21db70478 100644
>> --- a/drivers/accel/amdxdna/npu4_regs.c
>> +++ b/drivers/accel/amdxdna/npu4_regs.c
>> @@ -13,6 +13,7 @@
>> #include "amdxdna_pci_drv.h"
>> /* NPU Public Registers on MpNPUAxiXbar (refer to Diag
>> npu_registers.h) */
>> +#define MPNPU_PWAITMODE 0x301003C
>> #define MPNPU_PUB_SEC_INTR 0x3010060
>> #define MPNPU_PUB_PWRMGMT_INTR 0x3010064
>> #define MPNPU_PUB_SCRATCH0 0x301006C
>> @@ -116,6 +117,7 @@ static const struct amdxdna_dev_priv
>> npu4_dev_priv = {
>> DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU4_PSP, MP0_C2PMSG_73),
>> DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, MP0_C2PMSG_123),
>> DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU4_REG,
>> MPNPU_PUB_SCRATCH3),
>> + DEFINE_BAR_OFFSET(PSP_PWAITMODE_REG, NPU4_REG,
>> MPNPU_PWAITMODE),
>> },
>> .smu_regs_off = {
>> DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU4_SMU, MP1_C2PMSG_0),
>> diff --git a/drivers/accel/amdxdna/npu5_regs.c
>> b/drivers/accel/amdxdna/npu5_regs.c
>> index 75ad97f0b937..f761a6661f40 100644
>> --- a/drivers/accel/amdxdna/npu5_regs.c
>> +++ b/drivers/accel/amdxdna/npu5_regs.c
>> @@ -13,6 +13,7 @@
>> #include "amdxdna_pci_drv.h"
>> /* NPU Public Registers on MpNPUAxiXbar (refer to Diag
>> npu_registers.h) */
>> +#define MPNPU_PWAITMODE 0x301003C
>> #define MPNPU_PUB_SEC_INTR 0x3010060
>> #define MPNPU_PUB_PWRMGMT_INTR 0x3010064
>> #define MPNPU_PUB_SCRATCH0 0x301006C
>> @@ -62,7 +63,7 @@
>> #define NPU5_SRAM_BAR_BASE MMNPU_APERTURE1_BASE
>> static const struct amdxdna_dev_priv npu5_dev_priv = {
>> - .fw_path = "amdnpu/17f0_11/npu.sbin",
>> + .fw_path = "amdnpu/17f0_11/npu.dev.sbin",
>
> This seems like an unintended change.
Ah. Yes. It is a merging issue. Sorry about the confusion.
Thanks,
Lizhi
>
>> .protocol_major = 0x6,
>> .protocol_minor = 12,
>> .rt_config = npu4_default_rt_cfg,
>> @@ -85,6 +86,7 @@ static const struct amdxdna_dev_priv npu5_dev_priv = {
>> DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU5_PSP, MP0_C2PMSG_73),
>> DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU5_PSP, MP0_C2PMSG_123),
>> DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU5_REG,
>> MPNPU_PUB_SCRATCH3),
>> + DEFINE_BAR_OFFSET(PSP_PWAITMODE_REG, NPU5_REG,
>> MPNPU_PWAITMODE),
>> },
>> .smu_regs_off = {
>> DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU5_SMU, MP1_C2PMSG_0),
>> diff --git a/drivers/accel/amdxdna/npu6_regs.c
>> b/drivers/accel/amdxdna/npu6_regs.c
>> index 758dc013fe13..1f71285655b2 100644
>> --- a/drivers/accel/amdxdna/npu6_regs.c
>> +++ b/drivers/accel/amdxdna/npu6_regs.c
>> @@ -13,6 +13,7 @@
>> #include "amdxdna_pci_drv.h"
>> /* NPU Public Registers on MpNPUAxiXbar (refer to Diag
>> npu_registers.h) */
>> +#define MPNPU_PWAITMODE 0x301003C
>> #define MPNPU_PUB_SEC_INTR 0x3010060
>> #define MPNPU_PUB_PWRMGMT_INTR 0x3010064
>> #define MPNPU_PUB_SCRATCH0 0x301006C
>> @@ -85,6 +86,7 @@ static const struct amdxdna_dev_priv npu6_dev_priv = {
>> DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU6_PSP, MP0_C2PMSG_73),
>> DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU6_PSP, MP0_C2PMSG_123),
>> DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU6_REG,
>> MPNPU_PUB_SCRATCH3),
>> + DEFINE_BAR_OFFSET(PSP_PWAITMODE_REG, NPU6_REG,
>> MPNPU_PWAITMODE),
>> },
>> .smu_regs_off = {
>> DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU6_SMU, MP1_C2PMSG_0),
>
Powered by blists - more mailing lists