[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <40fd3f7e-dd03-4f6f-99d2-919192b91434@oss.qualcomm.com>
Date: Wed, 10 Dec 2025 14:17:42 +0800
From: Jingyi Wang <jingyi.wang@....qualcomm.com>
To: Bjorn Andersson <andersson@...nel.org>
Cc: Mathieu Poirier <mathieu.poirier@...aro.org>,
Rob Herring <robh@...nel.org>,
Krzysztof Kozlowski <krzk+dt@...nel.org>,
Conor Dooley <conor+dt@...nel.org>,
Manivannan Sadhasivam <mani@...nel.org>, aiqun.yu@....qualcomm.com,
tingwei.zhang@....qualcomm.com, trilok.soni@....qualcomm.com,
yijie.yang@....qualcomm.com, linux-arm-msm@...r.kernel.org,
linux-remoteproc@...r.kernel.org, devicetree@...r.kernel.org,
linux-kernel@...r.kernel.org,
Gokul krishna Krishnakumar <Gokul.krishnakumar@....qualcomm>
Subject: Re: [PATCH v2 4/7] remoteproc: qcom: pas: Add late attach support for
subsystems
On 11/2/2025 2:34 AM, Bjorn Andersson wrote:
> On Wed, Oct 29, 2025 at 01:05:42AM -0700, Jingyi Wang wrote:
>> From: Gokul krishna Krishnakumar <Gokul.krishnakumar@....qualcomm>
>>
>> Subsystems can be brought out of reset by entities such as
>> bootloaders.
>
> Good start of the commit message.
>
>> Before attaching such subsystems, it is important to
>> check the state of the subsystem.
>
> Why?
>
> I see three possible outcomes:
> 1) The subsystem was booted and is still running just fine.
> 2) The subsystem hit fatal error and flagged this in smp2p
> 3) The subsystem hit a wdog and I presume there would be a interrupt
> waiting for us as soon as we register a handler?
>
> Perhaps I'm wrong about the semantics of #3? If so this should be
> clearly documented in the commit message.
>
>
> Also, at this point in the commit message you've established the
> problem, the remainder is a description of how you're addressing the
> problem. A paragraph break would be suitable.
>
will make the commit msg more clean in the next version.
>> This patch adds support to attach
>> to a subsystem by ensuring that the subsystem is in a sane state by
>> reading SMP2P bits and pinging the subsystem.
>>
>> Signed-off-by: Gokul krishna Krishnakumar <Gokul.krishnakumar@....qualcomm>
>
> I would prefer email addresses to be all lowercase, but more
> importantly, you lost the tail end of that address (same with author).
>
will fix
>> Co-developed-by: Jingyi Wang <jingyi.wang@....qualcomm.com>
>> Signed-off-by: Jingyi Wang <jingyi.wang@....qualcomm.com>
>> ---
>> drivers/remoteproc/qcom_q6v5.c | 89 ++++++++++++++++++++++++++++++++++++-
>> drivers/remoteproc/qcom_q6v5.h | 14 +++++-
>> drivers/remoteproc/qcom_q6v5_adsp.c | 2 +-
>> drivers/remoteproc/qcom_q6v5_mss.c | 2 +-
>> drivers/remoteproc/qcom_q6v5_pas.c | 63 +++++++++++++++++++++++++-
>> 5 files changed, 165 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c
>> index 58d5b85e58cd..4ce9e43fc5c7 100644
>> --- a/drivers/remoteproc/qcom_q6v5.c
>> +++ b/drivers/remoteproc/qcom_q6v5.c
>> @@ -94,6 +94,9 @@ static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)
>> size_t len;
>> char *msg;
>>
>> + if (q6v5->early_boot)
>> + complete(&q6v5->subsys_booted);
>> +
>> /* Sometimes the stop triggers a watchdog rather than a stop-ack */
>> if (!q6v5->running) {
>> complete(&q6v5->stop_done);
>> @@ -118,6 +121,9 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void *data)
>> size_t len;
>> char *msg;
>>
>> + if (q6v5->early_boot)
>> + complete(&q6v5->subsys_booted);
>> +
>> if (!q6v5->running)
>> return IRQ_HANDLED;
>>
>> @@ -139,6 +145,9 @@ static irqreturn_t q6v5_ready_interrupt(int irq, void *data)
>>
>> complete(&q6v5->start_done);
>>
>> + if (q6v5->early_boot)
>> + complete(&q6v5->subsys_booted);
>> +
>> return IRQ_HANDLED;
>> }
>>
>> @@ -172,6 +181,9 @@ static irqreturn_t q6v5_handover_interrupt(int irq, void *data)
>> if (q6v5->handover)
>> q6v5->handover(q6v5);
>>
>> + if (q6v5->early_boot)
>> + complete(&q6v5->subsys_booted);
>> +
>> icc_set_bw(q6v5->path, 0, 0);
>>
>> q6v5->handover_issued = true;
>> @@ -234,6 +246,77 @@ unsigned long qcom_q6v5_panic(struct qcom_q6v5 *q6v5)
>> }
>> EXPORT_SYMBOL_GPL(qcom_q6v5_panic);
>>
>> +static irqreturn_t q6v5_pong_interrupt(int irq, void *data)
>> +{
>> + struct qcom_q6v5 *q6v5 = data;
>> +
>> + complete(&q6v5->ping_done);
>> +
>> + return IRQ_HANDLED;
>> +}
>> +
>> +int qcom_q6v5_ping_subsystem(struct qcom_q6v5 *q6v5)
>> +{
>> + int ret;
>> + int ping_failed = 0;
>> +
>> + reinit_completion(&q6v5->ping_done);
>> +
>> + /* Set master kernel Ping bit */
>> + ret = qcom_smem_state_update_bits(q6v5->ping_state,
>> + BIT(q6v5->ping_bit), BIT(q6v5->ping_bit));
>> + if (ret) {
>> + dev_err(q6v5->dev, "Failed to update ping bits\n");
>> + return ret;
>> + }
>> +
>> + ret = wait_for_completion_timeout(&q6v5->ping_done, msecs_to_jiffies(PING_TIMEOUT));
>> + if (!ret) {
>> + ping_failed = -ETIMEDOUT;
>> + dev_err(q6v5->dev, "Failed to get back pong\n");
>> + }
>> +
>> + /* Clear ping bit master kernel */
>> + ret = qcom_smem_state_update_bits(q6v5->ping_state, BIT(q6v5->ping_bit), 0);
>> + if (ret) {
>> + pr_err("Failed to clear master kernel bits\n");
>
> Two dev_err and one pr_err?
>
will fix
>> + return ret;
>> + }
>> +
>> + if (ping_failed)
>> + return ping_failed;
>> +
>> + return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(qcom_q6v5_ping_subsystem);
>> +
>> +int qcom_q6v5_ping_subsystem_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev)
>> +{
>> + int ret = -ENODEV;
>> +
>> + q6v5->ping_state = devm_qcom_smem_state_get(&pdev->dev, "ping", &q6v5->ping_bit);
>> + if (IS_ERR(q6v5->ping_state)) {
>> + dev_err(&pdev->dev, "failed to acquire smem state %ld\n",
>> + PTR_ERR(q6v5->ping_state));
>> + return ret;
>> + }
>> +
>> + q6v5->pong_irq = platform_get_irq_byname(pdev, "pong");
>> + if (q6v5->pong_irq < 0)
>> + return q6v5->pong_irq;
>> +
>> + ret = devm_request_threaded_irq(&pdev->dev, q6v5->pong_irq, NULL,
>> + q6v5_pong_interrupt, IRQF_TRIGGER_RISING | IRQF_ONESHOT,
>> + "q6v5 pong", q6v5);
>> + if (ret)
>> + dev_err(&pdev->dev, "failed to acquire pong IRQ\n");
>> +
>> + init_completion(&q6v5->ping_done);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(qcom_q6v5_ping_subsystem_init);
>> +
>> /**
>> * qcom_q6v5_init() - initializer of the q6v5 common struct
>> * @q6v5: handle to be initialized
>> @@ -247,7 +330,7 @@ EXPORT_SYMBOL_GPL(qcom_q6v5_panic);
>> */
>> int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
>> struct rproc *rproc, int crash_reason, const char *load_state,
>> - void (*handover)(struct qcom_q6v5 *q6v5))
>> + bool early_boot, void (*handover)(struct qcom_q6v5 *q6v5))
>> {
>> int ret;
>>
>> @@ -255,10 +338,14 @@ int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
>> q6v5->dev = &pdev->dev;
>> q6v5->crash_reason = crash_reason;
>> q6v5->handover = handover;
>> + q6v5->early_boot = early_boot;
>>
>> init_completion(&q6v5->start_done);
>> init_completion(&q6v5->stop_done);
>>
>> + if (early_boot)
>> + init_completion(&q6v5->subsys_booted);
>> +
>> q6v5->wdog_irq = platform_get_irq_byname(pdev, "wdog");
>> if (q6v5->wdog_irq < 0)
>> return q6v5->wdog_irq;
>> diff --git a/drivers/remoteproc/qcom_q6v5.h b/drivers/remoteproc/qcom_q6v5.h
>> index 5a859c41896e..8a227bf70d7e 100644
>> --- a/drivers/remoteproc/qcom_q6v5.h
>> +++ b/drivers/remoteproc/qcom_q6v5.h
>> @@ -12,27 +12,35 @@ struct rproc;
>> struct qcom_smem_state;
>> struct qcom_sysmon;
>>
>> +#define PING_TIMEOUT 500 /* in milliseconds */
>> +#define PING_TEST_WAIT 500 /* in milliseconds */
>> +
>> struct qcom_q6v5 {
>> struct device *dev;
>> struct rproc *rproc;
>>
>> struct qcom_smem_state *state;
>> + struct qcom_smem_state *ping_state;
>> struct qmp *qmp;
>>
>> struct icc_path *path;
>>
>> unsigned stop_bit;
>> + unsigned int ping_bit;
>>
>> int wdog_irq;
>> int fatal_irq;
>> int ready_irq;
>> int handover_irq;
>> int stop_irq;
>> + int pong_irq;
>>
>> bool handover_issued;
>>
>> struct completion start_done;
>> struct completion stop_done;
>> + struct completion subsys_booted;
>> + struct completion ping_done;
>>
>> int crash_reason;
>>
>> @@ -40,11 +48,13 @@ struct qcom_q6v5 {
>>
>> const char *load_state;
>> void (*handover)(struct qcom_q6v5 *q6v5);
>> +
>> + bool early_boot;
>> };
>>
>> int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
>> struct rproc *rproc, int crash_reason, const char *load_state,
>> - void (*handover)(struct qcom_q6v5 *q6v5));
>> + bool early_boot, void (*handover)(struct qcom_q6v5 *q6v5));
>> void qcom_q6v5_deinit(struct qcom_q6v5 *q6v5);
>>
>> int qcom_q6v5_prepare(struct qcom_q6v5 *q6v5);
>> @@ -52,5 +62,7 @@ int qcom_q6v5_unprepare(struct qcom_q6v5 *q6v5);
>> int qcom_q6v5_request_stop(struct qcom_q6v5 *q6v5, struct qcom_sysmon *sysmon);
>> int qcom_q6v5_wait_for_start(struct qcom_q6v5 *q6v5, int timeout);
>> unsigned long qcom_q6v5_panic(struct qcom_q6v5 *q6v5);
>> +int qcom_q6v5_ping_subsystem(struct qcom_q6v5 *q6v5);
>> +int qcom_q6v5_ping_subsystem_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev);
>>
>> #endif
>> diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c
>> index e98b7e03162c..1576b435b921 100644
>> --- a/drivers/remoteproc/qcom_q6v5_adsp.c
>> +++ b/drivers/remoteproc/qcom_q6v5_adsp.c
>> @@ -717,7 +717,7 @@ static int adsp_probe(struct platform_device *pdev)
>> goto disable_pm;
>>
>> ret = qcom_q6v5_init(&adsp->q6v5, pdev, rproc, desc->crash_reason_smem,
>> - desc->load_state, qcom_adsp_pil_handover);
>> + desc->load_state, false, qcom_adsp_pil_handover);
>> if (ret)
>> goto disable_pm;
>>
>> diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
>> index 3087d895b87f..ee9bf048820a 100644
>> --- a/drivers/remoteproc/qcom_q6v5_mss.c
>> +++ b/drivers/remoteproc/qcom_q6v5_mss.c
>> @@ -2165,7 +2165,7 @@ static int q6v5_probe(struct platform_device *pdev)
>> qproc->has_mba_logs = desc->has_mba_logs;
>>
>> ret = qcom_q6v5_init(&qproc->q6v5, pdev, rproc, MPSS_CRASH_REASON_SMEM, "modem",
>> - qcom_msa_handover);
>> + false, qcom_msa_handover);
>> if (ret)
>> goto detach_proxy_pds;
>>
>> diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c
>> index 158bcd6cc85c..b667c11aadb5 100644
>> --- a/drivers/remoteproc/qcom_q6v5_pas.c
>> +++ b/drivers/remoteproc/qcom_q6v5_pas.c
>> @@ -35,6 +35,8 @@
>>
>> #define MAX_ASSIGN_COUNT 3
>>
>> +#define EARLY_BOOT_RETRY_INTERVAL_MS 5000
>
> "retry" makes it sounds like we're doing something in a loop, but as far
> as I can tell this is the "attach timeout".
>
will fix
>> +
>> struct qcom_pas_data {
>> int crash_reason_smem;
>> const char *firmware_name;
>> @@ -59,6 +61,7 @@ struct qcom_pas_data {
>> int region_assign_count;
>> bool region_assign_shared;
>> int region_assign_vmid;
>> + bool early_boot;
>> };
>>
>> struct qcom_pas {
>> @@ -409,6 +412,8 @@ static int qcom_pas_stop(struct rproc *rproc)
>> if (pas->smem_host_id)
>> ret = qcom_smem_bust_hwspin_lock_by_host(pas->smem_host_id);
>>
>> + pas->q6v5.early_boot = false;
>
>
>
>> +
>> return ret;
>> }
>>
>> @@ -434,6 +439,51 @@ static unsigned long qcom_pas_panic(struct rproc *rproc)
>> return qcom_q6v5_panic(&pas->q6v5);
>> }
>>
>> +static int qcom_pas_attach(struct rproc *rproc)
>> +{
>> + int ret;
>> + struct qcom_pas *adsp = rproc->priv;
>
> Why is this variable named "adsp"?
>
will fix
>> + bool ready_state;
>> + bool crash_state;
>> +
>> + if (!adsp->q6v5.early_boot)
>
> This would imply that rproc->state == RPROC_DETACHED for a subsystem
> with !early_boot. That shouldn't be possible, I think you should be more
> vocal about this. E.g. by making this:
> if (WARN_ON(!adsp->q6v5.early_boot))
>
> Or just decide that it's too defensive and drop the check.
>
ack, we can drop this.
>> + return -EINVAL;
>> +
>> + ret = irq_get_irqchip_state(adsp->q6v5.fatal_irq,
>> + IRQCHIP_STATE_LINE_LEVEL, &crash_state);
>> +
>> + if (crash_state) {
>> + dev_err(adsp->dev, "Sub system has crashed before driver probe\n");
>> + adsp->rproc->state = RPROC_CRASHED;
>
> We're attaching to a running subsystem, we conclude that it has crashed,
> and we're doing nothing?
>
> Why don't we call rproc_report_crash() here?
>
ack
>> + return -EINVAL;
>> + }
>> +
>> + ret = irq_get_irqchip_state(adsp->q6v5.ready_irq,
>> + IRQCHIP_STATE_LINE_LEVEL, &ready_state);
>> +
>> + if (ready_state) {
>> + dev_info(adsp->dev, "Sub system has boot-up before driver probe\n");
>
> What does this mean? Success?
>
>> + adsp->rproc->state = RPROC_DETACHED;
>
> Why do we just put it back in RPROC_DETACHED? What will happen now?
> Isn't this the point where we should mark it as RPROC_RUNNING?
>
will drop this RPROC_DETACHED, but I think it is not the right time
to set RPROC_RUNNING as we need to ping the subsystem, moreover,
__rproc_attach will set it as RPROC_ATTACHED instead of RPROC_RUNNING
>> + } else {
>
> /*
> * The early-boot subsystem has not crashed, nor signalled that it's
> * ready, wait for 5 more seconds in case it's late.
> */
>
> That said, this begs the questions: can this really happen and why 5
> seconds?
>
>> + ret = wait_for_completion_timeout(&adsp->q6v5.subsys_booted,
>> + msecs_to_jiffies(EARLY_BOOT_RETRY_INTERVAL_MS));
>> + if (!ret) {
>> + dev_err(adsp->dev, "Timeout on waiting for subsystem interrupt\n");
>
> So, we gave the subsystem timeof(kernel-boot) + 5 seconds to show sign
> of life, and it didn't show up.
>
> To me there are two possible reasons:
> 1) The bootloader never started the subsystem
> 2) The bootloader started it, it somehow ended up in a faulty scenario
> and we're failing to detect that.
>
> #1 seems like a perfectly reasonable scenario e.g. during bringup and/or
> development, and the path out of it seems to be to start the subsystem.
>
> #2 would be a faulty scenario that we want to debug, but the user
> probably want their subsystem (re)booted in the meantime.
>
per my understanding, for the 2 reasons above, wait_for_completion_timeout
will get err, I think we can then set the status to RPROC_OFFLINE, so the
user can start it manually and do further debug.
And this wait may get success if the remoteproc is recovering by itself
at the moment/the bootup is delayed, so we set a 5 seconds timeout just
like what adsp start function do.
Actually we didn't see this really happen when we are testing with upstream
kernel, could you share your suggestion, shall we keep it or just remove the
check?
>> + return -ETIMEDOUT;
>> + }
>
> We didn't time out, the subsystem did tell us something happen. There
> are four sources of this event, one of those correspond to our
> "start_done" state during normal startup. Should we really treat all
> four sources the same way here?
>
> For two of the events, we can immediately attempt a recovery, no need to
> ping the subsystem and wait for the timeout.
>
ACK, we can do a status check here, only ping subsystem when we get status
ready.
For wdog/fatal interrupt, we can rely on the interrupt handler to do the
report crash and recovery.
For handover, I noticed it is enabled in the qcom_q6v5_prepare called from
qcom_pas_start, may need further check if we also need to do this prepare
in qcom_pas_attach.
>> + }
>> +
>> + ret = qcom_q6v5_ping_subsystem(&adsp->q6v5);
>> + if (ret) {
>> + dev_err(adsp->dev, "Failed to ping subsystem, assuming device crashed\n");
>> + rproc->state = RPROC_CRASHED;
>
> As above, you mark it as crashed...and then what?
>
will call rproc_report_crash here
>> + return ret;
>> + }
>> +
>> + adsp->q6v5.running = true;
>> + return ret;
>> +}
>> +
>> static const struct rproc_ops qcom_pas_ops = {
>> .unprepare = qcom_pas_unprepare,
>> .start = qcom_pas_start,
>> @@ -442,6 +492,7 @@ static const struct rproc_ops qcom_pas_ops = {
>> .parse_fw = qcom_register_dump_segments,
>> .load = qcom_pas_load,
>> .panic = qcom_pas_panic,
>> + .attach = qcom_pas_attach,
>> };
>>
>> static const struct rproc_ops qcom_pas_minidump_ops = {
>> @@ -765,7 +816,7 @@ static int qcom_pas_probe(struct platform_device *pdev)
>> pas->proxy_pd_count = ret;
>>
>> ret = qcom_q6v5_init(&pas->q6v5, pdev, rproc, desc->crash_reason_smem,
>> - desc->load_state, qcom_pas_handover);
>> + desc->load_state, desc->early_boot, qcom_pas_handover);
>> if (ret)
>> goto detach_proxy_pds;
>>
>> @@ -779,6 +830,16 @@ static int qcom_pas_probe(struct platform_device *pdev)
>> }
>>
>> qcom_add_ssr_subdev(rproc, &pas->ssr_subdev, desc->ssr_name);
>> +
>> + if (pas->q6v5.early_boot) {
>> + ret = qcom_q6v5_ping_subsystem_init(&pas->q6v5, pdev);
>> + if (ret)
>> + dev_err(&pdev->dev,
>> + "Unable to find ping/pong bits, falling back to firmware load\n");
>
> You already printed more specific errors in
> qcom_q6v5_ping_subsystem_init(), no need to print once more here.
>
will fix
> Regards,
> Bjorn
>
>> + else
>> + pas->rproc->state = RPROC_DETACHED;
>> + }
>> +
>> ret = rproc_add(rproc);
>> if (ret)
>> goto remove_ssr_sysmon;
>>
>> --
>> 2.25.1
>>
Thanks,
Jingyi
Powered by blists - more mailing lists