[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <qfls6xlvfppqw7p6rjpmzqesh6sbob4myfc6dz47qh3jywqrjk@5xiutkbybk5d>
Date: Sat, 1 Nov 2025 13:34:56 -0500
From: Bjorn Andersson <andersson@...nel.org>
To: Jingyi Wang <jingyi.wang@....qualcomm.com>
Cc: Mathieu Poirier <mathieu.poirier@...aro.org>,
Rob Herring <robh@...nel.org>, Krzysztof Kozlowski <krzk+dt@...nel.org>,
Conor Dooley <conor+dt@...nel.org>, Manivannan Sadhasivam <mani@...nel.org>,
aiqun.yu@....qualcomm.com, tingwei.zhang@....qualcomm.com, trilok.soni@....qualcomm.com,
yijie.yang@....qualcomm.com, linux-arm-msm@...r.kernel.org, linux-remoteproc@...r.kernel.org,
devicetree@...r.kernel.org, linux-kernel@...r.kernel.org,
Gokul krishna Krishnakumar <Gokul.krishnakumar@....qualcomm>
Subject: Re: [PATCH v2 4/7] remoteproc: qcom: pas: Add late attach support
for subsystems
On Wed, Oct 29, 2025 at 01:05:42AM -0700, Jingyi Wang wrote:
> From: Gokul krishna Krishnakumar <Gokul.krishnakumar@....qualcomm>
>
> Subsystems can be brought out of reset by entities such as
> bootloaders.
Good start of the commit message.
> Before attaching such subsystems, it is important to
> check the state of the subsystem.
Why?
I see three possible outcomes:
1) The subsystem was booted and is still running just fine.
2) The subsystem hit fatal error and flagged this in smp2p
3) The subsystem hit a wdog and I presume there would be a interrupt
waiting for us as soon as we register a handler?
Perhaps I'm wrong about the semantics of #3? If so this should be
clearly documented in the commit message.
Also, at this point in the commit message you've established the
problem, the remainder is a description of how you're addressing the
problem. A paragraph break would be suitable.
> This patch adds support to attach
> to a subsystem by ensuring that the subsystem is in a sane state by
> reading SMP2P bits and pinging the subsystem.
>
> Signed-off-by: Gokul krishna Krishnakumar <Gokul.krishnakumar@....qualcomm>
I would prefer email addresses to be all lowercase, but more
importantly, you lost the tail end of that address (same with author).
> Co-developed-by: Jingyi Wang <jingyi.wang@....qualcomm.com>
> Signed-off-by: Jingyi Wang <jingyi.wang@....qualcomm.com>
> ---
> drivers/remoteproc/qcom_q6v5.c | 89 ++++++++++++++++++++++++++++++++++++-
> drivers/remoteproc/qcom_q6v5.h | 14 +++++-
> drivers/remoteproc/qcom_q6v5_adsp.c | 2 +-
> drivers/remoteproc/qcom_q6v5_mss.c | 2 +-
> drivers/remoteproc/qcom_q6v5_pas.c | 63 +++++++++++++++++++++++++-
> 5 files changed, 165 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c
> index 58d5b85e58cd..4ce9e43fc5c7 100644
> --- a/drivers/remoteproc/qcom_q6v5.c
> +++ b/drivers/remoteproc/qcom_q6v5.c
> @@ -94,6 +94,9 @@ static irqreturn_t q6v5_wdog_interrupt(int irq, void *data)
> size_t len;
> char *msg;
>
> + if (q6v5->early_boot)
> + complete(&q6v5->subsys_booted);
> +
> /* Sometimes the stop triggers a watchdog rather than a stop-ack */
> if (!q6v5->running) {
> complete(&q6v5->stop_done);
> @@ -118,6 +121,9 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void *data)
> size_t len;
> char *msg;
>
> + if (q6v5->early_boot)
> + complete(&q6v5->subsys_booted);
> +
> if (!q6v5->running)
> return IRQ_HANDLED;
>
> @@ -139,6 +145,9 @@ static irqreturn_t q6v5_ready_interrupt(int irq, void *data)
>
> complete(&q6v5->start_done);
>
> + if (q6v5->early_boot)
> + complete(&q6v5->subsys_booted);
> +
> return IRQ_HANDLED;
> }
>
> @@ -172,6 +181,9 @@ static irqreturn_t q6v5_handover_interrupt(int irq, void *data)
> if (q6v5->handover)
> q6v5->handover(q6v5);
>
> + if (q6v5->early_boot)
> + complete(&q6v5->subsys_booted);
> +
> icc_set_bw(q6v5->path, 0, 0);
>
> q6v5->handover_issued = true;
> @@ -234,6 +246,77 @@ unsigned long qcom_q6v5_panic(struct qcom_q6v5 *q6v5)
> }
> EXPORT_SYMBOL_GPL(qcom_q6v5_panic);
>
> +static irqreturn_t q6v5_pong_interrupt(int irq, void *data)
> +{
> + struct qcom_q6v5 *q6v5 = data;
> +
> + complete(&q6v5->ping_done);
> +
> + return IRQ_HANDLED;
> +}
> +
> +int qcom_q6v5_ping_subsystem(struct qcom_q6v5 *q6v5)
> +{
> + int ret;
> + int ping_failed = 0;
> +
> + reinit_completion(&q6v5->ping_done);
> +
> + /* Set master kernel Ping bit */
> + ret = qcom_smem_state_update_bits(q6v5->ping_state,
> + BIT(q6v5->ping_bit), BIT(q6v5->ping_bit));
> + if (ret) {
> + dev_err(q6v5->dev, "Failed to update ping bits\n");
> + return ret;
> + }
> +
> + ret = wait_for_completion_timeout(&q6v5->ping_done, msecs_to_jiffies(PING_TIMEOUT));
> + if (!ret) {
> + ping_failed = -ETIMEDOUT;
> + dev_err(q6v5->dev, "Failed to get back pong\n");
> + }
> +
> + /* Clear ping bit master kernel */
> + ret = qcom_smem_state_update_bits(q6v5->ping_state, BIT(q6v5->ping_bit), 0);
> + if (ret) {
> + pr_err("Failed to clear master kernel bits\n");
Two dev_err and one pr_err?
> + return ret;
> + }
> +
> + if (ping_failed)
> + return ping_failed;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(qcom_q6v5_ping_subsystem);
> +
> +int qcom_q6v5_ping_subsystem_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev)
> +{
> + int ret = -ENODEV;
> +
> + q6v5->ping_state = devm_qcom_smem_state_get(&pdev->dev, "ping", &q6v5->ping_bit);
> + if (IS_ERR(q6v5->ping_state)) {
> + dev_err(&pdev->dev, "failed to acquire smem state %ld\n",
> + PTR_ERR(q6v5->ping_state));
> + return ret;
> + }
> +
> + q6v5->pong_irq = platform_get_irq_byname(pdev, "pong");
> + if (q6v5->pong_irq < 0)
> + return q6v5->pong_irq;
> +
> + ret = devm_request_threaded_irq(&pdev->dev, q6v5->pong_irq, NULL,
> + q6v5_pong_interrupt, IRQF_TRIGGER_RISING | IRQF_ONESHOT,
> + "q6v5 pong", q6v5);
> + if (ret)
> + dev_err(&pdev->dev, "failed to acquire pong IRQ\n");
> +
> + init_completion(&q6v5->ping_done);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(qcom_q6v5_ping_subsystem_init);
> +
> /**
> * qcom_q6v5_init() - initializer of the q6v5 common struct
> * @q6v5: handle to be initialized
> @@ -247,7 +330,7 @@ EXPORT_SYMBOL_GPL(qcom_q6v5_panic);
> */
> int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
> struct rproc *rproc, int crash_reason, const char *load_state,
> - void (*handover)(struct qcom_q6v5 *q6v5))
> + bool early_boot, void (*handover)(struct qcom_q6v5 *q6v5))
> {
> int ret;
>
> @@ -255,10 +338,14 @@ int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
> q6v5->dev = &pdev->dev;
> q6v5->crash_reason = crash_reason;
> q6v5->handover = handover;
> + q6v5->early_boot = early_boot;
>
> init_completion(&q6v5->start_done);
> init_completion(&q6v5->stop_done);
>
> + if (early_boot)
> + init_completion(&q6v5->subsys_booted);
> +
> q6v5->wdog_irq = platform_get_irq_byname(pdev, "wdog");
> if (q6v5->wdog_irq < 0)
> return q6v5->wdog_irq;
> diff --git a/drivers/remoteproc/qcom_q6v5.h b/drivers/remoteproc/qcom_q6v5.h
> index 5a859c41896e..8a227bf70d7e 100644
> --- a/drivers/remoteproc/qcom_q6v5.h
> +++ b/drivers/remoteproc/qcom_q6v5.h
> @@ -12,27 +12,35 @@ struct rproc;
> struct qcom_smem_state;
> struct qcom_sysmon;
>
> +#define PING_TIMEOUT 500 /* in milliseconds */
> +#define PING_TEST_WAIT 500 /* in milliseconds */
> +
> struct qcom_q6v5 {
> struct device *dev;
> struct rproc *rproc;
>
> struct qcom_smem_state *state;
> + struct qcom_smem_state *ping_state;
> struct qmp *qmp;
>
> struct icc_path *path;
>
> unsigned stop_bit;
> + unsigned int ping_bit;
>
> int wdog_irq;
> int fatal_irq;
> int ready_irq;
> int handover_irq;
> int stop_irq;
> + int pong_irq;
>
> bool handover_issued;
>
> struct completion start_done;
> struct completion stop_done;
> + struct completion subsys_booted;
> + struct completion ping_done;
>
> int crash_reason;
>
> @@ -40,11 +48,13 @@ struct qcom_q6v5 {
>
> const char *load_state;
> void (*handover)(struct qcom_q6v5 *q6v5);
> +
> + bool early_boot;
> };
>
> int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev,
> struct rproc *rproc, int crash_reason, const char *load_state,
> - void (*handover)(struct qcom_q6v5 *q6v5));
> + bool early_boot, void (*handover)(struct qcom_q6v5 *q6v5));
> void qcom_q6v5_deinit(struct qcom_q6v5 *q6v5);
>
> int qcom_q6v5_prepare(struct qcom_q6v5 *q6v5);
> @@ -52,5 +62,7 @@ int qcom_q6v5_unprepare(struct qcom_q6v5 *q6v5);
> int qcom_q6v5_request_stop(struct qcom_q6v5 *q6v5, struct qcom_sysmon *sysmon);
> int qcom_q6v5_wait_for_start(struct qcom_q6v5 *q6v5, int timeout);
> unsigned long qcom_q6v5_panic(struct qcom_q6v5 *q6v5);
> +int qcom_q6v5_ping_subsystem(struct qcom_q6v5 *q6v5);
> +int qcom_q6v5_ping_subsystem_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev);
>
> #endif
> diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c
> index e98b7e03162c..1576b435b921 100644
> --- a/drivers/remoteproc/qcom_q6v5_adsp.c
> +++ b/drivers/remoteproc/qcom_q6v5_adsp.c
> @@ -717,7 +717,7 @@ static int adsp_probe(struct platform_device *pdev)
> goto disable_pm;
>
> ret = qcom_q6v5_init(&adsp->q6v5, pdev, rproc, desc->crash_reason_smem,
> - desc->load_state, qcom_adsp_pil_handover);
> + desc->load_state, false, qcom_adsp_pil_handover);
> if (ret)
> goto disable_pm;
>
> diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c
> index 3087d895b87f..ee9bf048820a 100644
> --- a/drivers/remoteproc/qcom_q6v5_mss.c
> +++ b/drivers/remoteproc/qcom_q6v5_mss.c
> @@ -2165,7 +2165,7 @@ static int q6v5_probe(struct platform_device *pdev)
> qproc->has_mba_logs = desc->has_mba_logs;
>
> ret = qcom_q6v5_init(&qproc->q6v5, pdev, rproc, MPSS_CRASH_REASON_SMEM, "modem",
> - qcom_msa_handover);
> + false, qcom_msa_handover);
> if (ret)
> goto detach_proxy_pds;
>
> diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c
> index 158bcd6cc85c..b667c11aadb5 100644
> --- a/drivers/remoteproc/qcom_q6v5_pas.c
> +++ b/drivers/remoteproc/qcom_q6v5_pas.c
> @@ -35,6 +35,8 @@
>
> #define MAX_ASSIGN_COUNT 3
>
> +#define EARLY_BOOT_RETRY_INTERVAL_MS 5000
"retry" makes it sounds like we're doing something in a loop, but as far
as I can tell this is the "attach timeout".
> +
> struct qcom_pas_data {
> int crash_reason_smem;
> const char *firmware_name;
> @@ -59,6 +61,7 @@ struct qcom_pas_data {
> int region_assign_count;
> bool region_assign_shared;
> int region_assign_vmid;
> + bool early_boot;
> };
>
> struct qcom_pas {
> @@ -409,6 +412,8 @@ static int qcom_pas_stop(struct rproc *rproc)
> if (pas->smem_host_id)
> ret = qcom_smem_bust_hwspin_lock_by_host(pas->smem_host_id);
>
> + pas->q6v5.early_boot = false;
> +
> return ret;
> }
>
> @@ -434,6 +439,51 @@ static unsigned long qcom_pas_panic(struct rproc *rproc)
> return qcom_q6v5_panic(&pas->q6v5);
> }
>
> +static int qcom_pas_attach(struct rproc *rproc)
> +{
> + int ret;
> + struct qcom_pas *adsp = rproc->priv;
Why is this variable named "adsp"?
> + bool ready_state;
> + bool crash_state;
> +
> + if (!adsp->q6v5.early_boot)
This would imply that rproc->state == RPROC_DETACHED for a subsystem
with !early_boot. That shouldn't be possible, I think you should be more
vocal about this. E.g. by making this:
if (WARN_ON(!adsp->q6v5.early_boot))
Or just decide that it's too defensive and drop the check.
> + return -EINVAL;
> +
> + ret = irq_get_irqchip_state(adsp->q6v5.fatal_irq,
> + IRQCHIP_STATE_LINE_LEVEL, &crash_state);
> +
> + if (crash_state) {
> + dev_err(adsp->dev, "Sub system has crashed before driver probe\n");
> + adsp->rproc->state = RPROC_CRASHED;
We're attaching to a running subsystem, we conclude that it has crashed,
and we're doing nothing?
Why don't we call rproc_report_crash() here?
> + return -EINVAL;
> + }
> +
> + ret = irq_get_irqchip_state(adsp->q6v5.ready_irq,
> + IRQCHIP_STATE_LINE_LEVEL, &ready_state);
> +
> + if (ready_state) {
> + dev_info(adsp->dev, "Sub system has boot-up before driver probe\n");
What does this mean? Success?
> + adsp->rproc->state = RPROC_DETACHED;
Why do we just put it back in RPROC_DETACHED? What will happen now?
Isn't this the point where we should mark it as RPROC_RUNNING?
> + } else {
/*
* The early-boot subsystem has not crashed, nor signalled that it's
* ready, wait for 5 more seconds in case it's late.
*/
That said, this begs the questions: can this really happen and why 5
seconds?
> + ret = wait_for_completion_timeout(&adsp->q6v5.subsys_booted,
> + msecs_to_jiffies(EARLY_BOOT_RETRY_INTERVAL_MS));
> + if (!ret) {
> + dev_err(adsp->dev, "Timeout on waiting for subsystem interrupt\n");
So, we gave the subsystem timeof(kernel-boot) + 5 seconds to show sign
of life, and it didn't show up.
To me there are two possible reasons:
1) The bootloader never started the subsystem
2) The bootloader started it, it somehow ended up in a faulty scenario
and we're failing to detect that.
#1 seems like a perfectly reasonable scenario e.g. during bringup and/or
development, and the path out of it seems to be to start the subsystem.
#2 would be a faulty scenario that we want to debug, but the user
probably want their subsystem (re)booted in the meantime.
> + return -ETIMEDOUT;
> + }
We didn't time out, the subsystem did tell us something happen. There
are four sources of this event, one of those correspond to our
"start_done" state during normal startup. Should we really treat all
four sources the same way here?
For two of the events, we can immediately attempt a recovery, no need to
ping the subsystem and wait for the timeout.
> + }
> +
> + ret = qcom_q6v5_ping_subsystem(&adsp->q6v5);
> + if (ret) {
> + dev_err(adsp->dev, "Failed to ping subsystem, assuming device crashed\n");
> + rproc->state = RPROC_CRASHED;
As above, you mark it as crashed...and then what?
> + return ret;
> + }
> +
> + adsp->q6v5.running = true;
> + return ret;
> +}
> +
> static const struct rproc_ops qcom_pas_ops = {
> .unprepare = qcom_pas_unprepare,
> .start = qcom_pas_start,
> @@ -442,6 +492,7 @@ static const struct rproc_ops qcom_pas_ops = {
> .parse_fw = qcom_register_dump_segments,
> .load = qcom_pas_load,
> .panic = qcom_pas_panic,
> + .attach = qcom_pas_attach,
> };
>
> static const struct rproc_ops qcom_pas_minidump_ops = {
> @@ -765,7 +816,7 @@ static int qcom_pas_probe(struct platform_device *pdev)
> pas->proxy_pd_count = ret;
>
> ret = qcom_q6v5_init(&pas->q6v5, pdev, rproc, desc->crash_reason_smem,
> - desc->load_state, qcom_pas_handover);
> + desc->load_state, desc->early_boot, qcom_pas_handover);
> if (ret)
> goto detach_proxy_pds;
>
> @@ -779,6 +830,16 @@ static int qcom_pas_probe(struct platform_device *pdev)
> }
>
> qcom_add_ssr_subdev(rproc, &pas->ssr_subdev, desc->ssr_name);
> +
> + if (pas->q6v5.early_boot) {
> + ret = qcom_q6v5_ping_subsystem_init(&pas->q6v5, pdev);
> + if (ret)
> + dev_err(&pdev->dev,
> + "Unable to find ping/pong bits, falling back to firmware load\n");
You already printed more specific errors in
qcom_q6v5_ping_subsystem_init(), no need to print once more here.
Regards,
Bjorn
> + else
> + pas->rproc->state = RPROC_DETACHED;
> + }
> +
> ret = rproc_add(rproc);
> if (ret)
> goto remove_ssr_sysmon;
>
> --
> 2.25.1
>
Powered by blists - more mailing lists