[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a4c586e5-f89c-4a92-b74e-d358e7cf7a2d@amd.com>
Date: Mon, 1 Dec 2025 23:04:52 -0600
From: Tanmay Shah <tanmay.shah@....com>
To: Mathieu Poirier <mathieu.poirier@...aro.org>
CC: <andersson@...nel.org>, <linux-remoteproc@...r.kernel.org>,
<linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2 3/3] remoteproc: xlnx: add crash detection mechanism
On 11/21/25 9:37 AM, Mathieu Poirier wrote:
> On Thu, Nov 13, 2025 at 07:44:04AM -0800, Tanmay Shah wrote:
>> Remote processor will report the crash reason via the resource table
>> and notify the host via kick. The host checks this crash reason on
>> every kick notification from the remote and reports it to the core
>> framework. Then the rproc core framework will start the recovery
>> process.
>
> Please substitute the word "kick" for "mailbox notification". I also have to
> assume "core framework" and "rproc core framework" are the same. Pick one and
> stick with it.
>
Ack.
>>
>> Signed-off-by: Tanmay Shah <tanmay.shah@....com>
>> ---
>>
>> Changes in v2:
>> - clear attach recovery boot flag during detach and stop ops
>>
>> drivers/remoteproc/xlnx_r5_remoteproc.c | 56 +++++++++++++++++++++++++
>> 1 file changed, 56 insertions(+)
>>
>> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> index 8677b732ad14..5d04e8c0dc52 100644
>> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
>> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
>> @@ -108,6 +108,10 @@ struct rsc_tbl_data {
>> const uintptr_t rsc_tbl;
>> } __packed;
>>
>> +enum fw_vendor_rsc {
>> + FW_RSC_VENDOR_CRASH_REASON = RSC_VENDOR_START,
>> +};
>> +
>> /*
>> * Hardcoded TCM bank values. This will stay in driver to maintain backward
>> * compatibility with device-tree that does not have TCM information.
>> @@ -127,9 +131,21 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>> {0xffe30000UL, 0x30000, 0x10000UL, PD_R5_1_BTCM, "btcm1"},
>> };
>>
>> +/**
>> + * struct xlnx_rproc_crash_report - resource to know crash status and reason
>> + *
>> + * @crash_state: if true, the rproc is notifying crash, time to recover
>> + * @crash_reason: reason of crash
>> + */
>> +struct xlnx_rproc_crash_report {
>> + u32 crash_state;
>> + u32 crash_reason;
>> +} __packed;
>> +
>> /**
>> * struct zynqmp_r5_core - remoteproc core's internal data
>> *
>> + * @crash_report: rproc crash state and reason
>> * @rsc_tbl_va: resource table virtual address
>> * @sram: Array of sram memories assigned to this core
>> * @num_sram: number of sram for this core
>> @@ -143,6 +159,7 @@ static const struct mem_bank_data zynqmp_tcm_banks_lockstep[] = {
>> * @ipi: pointer to mailbox information
>> */
>> struct zynqmp_r5_core {
>> + struct xlnx_rproc_crash_report *crash_report;
>> void __iomem *rsc_tbl_va;
>> struct zynqmp_sram_bank *sram;
>> int num_sram;
>> @@ -227,10 +244,14 @@ static void handle_event_notified(struct work_struct *work)
>> static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
>> {
>> struct zynqmp_ipi_message *ipi_msg, *buf_msg;
>> + struct zynqmp_r5_core *r5_core;
>> + struct rproc *rproc;
>> struct mbox_info *ipi;
>> size_t len;
>>
>> ipi = container_of(cl, struct mbox_info, mbox_cl);
>> + r5_core = ipi->r5_core;
>> + rproc = r5_core->rproc;
>>
>> /* copy data from ipi buffer to r5_core */
>> ipi_msg = (struct zynqmp_ipi_message *)msg;
>> @@ -244,6 +265,13 @@ static void zynqmp_r5_mb_rx_cb(struct mbox_client *cl, void *msg)
>> buf_msg->len = len;
>> memcpy(buf_msg->data, ipi_msg->data, len);
>>
>> + /* Check for crash only if rproc crash is expected */
>> + if (rproc->state == RPROC_ATTACHED || rproc->state == RPROC_RUNNING) {
>> + if (r5_core->crash_report->crash_state)
>> + rproc_report_crash(rproc,
>> + r5_core->crash_report->crash_reason);
>
> At this stage ->crash_state indicates that a crash occurred, but how is it reset
> once the crash has been handled? How do we make sure the next mailbox
> notification won't trigger another crash report?
>
I was counting on the remote firmware to reset crash_state after it
reboots and before it sends the next mailbox notification.
If that's not the best idea, I can reset the crash_state field at the
end of the start() or attach() callback. That will indicate that the
remote firmware has started successfully.
>> + }
>> +
>> /* received and processed interrupt ack */
>> if (mbox_send_message(ipi->rx_chan, NULL) < 0)
>> dev_err(cl->dev, "ack failed to mbox rx_chan\n");
>> @@ -397,6 +425,7 @@ static int zynqmp_r5_rproc_start(struct rproc *rproc)
>> if (ret)
>> dev_err(r5_core->dev,
>> "failed to start RPU = 0x%x\n", r5_core->pm_domain_id);
>> +
>
> Spurious change
>
Ack, will remove it.
>> return ret;
>> }
>>
>> @@ -438,6 +467,8 @@ static int zynqmp_r5_rproc_stop(struct rproc *rproc)
>> if (ret)
>> dev_err(r5_core->dev, "core force power down failed\n");
>>
>> + test_and_clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>> +
>> return ret;
>> }
>>
>> @@ -874,6 +905,8 @@ static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
>>
>> static int zynqmp_r5_attach(struct rproc *rproc)
>> {
>> + rproc_set_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY);
>> +
>
> Why can't this be set in probe() and left alone from thereon?
>
No specific reason right now, but I wanted to enable recovery only if
the attach() callback succeeds. If execution fails any time before that,
there is no point in enabling it.
>> dev_dbg(&rproc->dev, "rproc %d attached\n", rproc->index);
>>
>> return 0;
>> @@ -888,6 +921,8 @@ static int zynqmp_r5_detach(struct rproc *rproc)
>> */
>> zynqmp_r5_rproc_kick(rproc, 0);
>>
>> + clear_bit(RPROC_FEAT_ATTACH_ON_RECOVERY, rproc->features);
>> +
>
> I'm not sure why this needs to be done, same comment for zynqmp_r5_rproc_stop().
>
I think for detach() maybe it's not needed. I added it as a cleanup
sequence, i.e., the reverse of what's done in the attach() callback.
For stop() it is needed in the following case:
attach() -> stop() -> load fw() -> start().
In this sequence we need to make sure that if recovery is requested
after start(), we execute "boot recovery" and not "attach recovery".
Thanks,
Tanmay
>> return 0;
>> }
>>
>> @@ -896,6 +931,26 @@ static void zynqmp_r5_coredump(struct rproc *rproc)
>> (void)rproc;
>> }
>>
>> +static int zynqmp_r5_handle_crash_rsc(struct rproc *rproc, void *rsc,
>> + int offset, int avail)
>> +{
>> + struct zynqmp_r5_core *r5_core = rproc->priv;
>> +
>> + r5_core->crash_report =
>> + (struct xlnx_rproc_crash_report *)(r5_core->rsc_tbl_va + offset);
>> +
>
> This function is so simple that I would fold it in zynqmp_r5_handle_rsc() below.
>
Ack.
> Thanks,
> Mathieu
>
>> + return RSC_HANDLED;
>> +}
>> +
>> +static int zynqmp_r5_handle_rsc(struct rproc *rproc, u32 rsc_type, void *rsc,
>> + int offset, int avail)
>> +{
>> + if (rsc_type == FW_RSC_VENDOR_CRASH_REASON)
>> + return zynqmp_r5_handle_crash_rsc(rproc, rsc, offset, avail);
>> +
>> + return RSC_IGNORED;
>> +}
>> +
>> static const struct rproc_ops zynqmp_r5_rproc_ops = {
>> .prepare = zynqmp_r5_rproc_prepare,
>> .unprepare = zynqmp_r5_rproc_unprepare,
>> @@ -911,6 +966,7 @@ static const struct rproc_ops zynqmp_r5_rproc_ops = {
>> .attach = zynqmp_r5_attach,
>> .detach = zynqmp_r5_detach,
>> .coredump = zynqmp_r5_coredump,
>> + .handle_rsc = zynqmp_r5_handle_rsc,
>> };
>>
>> /**
>> --
>> 2.34.1
>>
Powered by blists - more mailing lists