Message-ID: <aF3Accq8A4zm9Dii@intel.com>
Date: Thu, 26 Jun 2025 17:49:37 -0400
From: Rodrigo Vivi <rodrigo.vivi@...el.com>
To: Daniele Ceraolo Spurio <daniele.ceraolospurio@...el.com>
CC: Badal Nilawar <badal.nilawar@...el.com>, <intel-xe@...ts.freedesktop.org>,
<dri-devel@...ts.freedesktop.org>, <linux-kernel@...r.kernel.org>,
<anshuman.gupta@...el.com>, <alexander.usyskin@...el.com>,
<gregkh@...uxfoundation.org>
Subject: Re: [PATCH v4 05/10] drm/xe/xe_late_bind_fw: Load late binding
firmware

On Thu, Jun 26, 2025 at 02:27:50PM -0700, Daniele Ceraolo Spurio wrote:
>
>
> On 6/26/2025 10:24 AM, Rodrigo Vivi wrote:
> > On Wed, Jun 25, 2025 at 10:30:10PM +0530, Badal Nilawar wrote:
> > > Load late binding firmware
> > >
> > > v2:
> > > - s/EAGAIN/EBUSY/
> > > - Flush worker in suspend and driver unload (Daniele)
> > > v3:
> > > - Use retry interval of 6s, in steps of 200ms, to allow
> > >   other OS components to release the MEI CL handle (Sasha)
> > > v4:
> > > - return -ENODEV if component not added (Daniele)
> > > - parse and print status returned by csc
> > > - Use xe_pm_runtime_get_if_active (Daniele)
> > The worker is considered outer bound, so it is safe
> > to use xe_pm_runtime_get, which takes the reference
> > and resumes synchronously.
> >
> > Otherwise, if using get_if_active, you need to reschedule
> > the work or you lose the job.
>
> The issue is that the next patch adds code to re-queue the work from the rpm
> resume path, so if we do a sync resume here the worker will re-queue itself
> immediately when not needed.
Oops, I had forgotten about that case, I'm sorry.
> Also, when the re-queued work runs it might end
> up doing another sync resume and re-queuing itself once more.
I believe it might be worse than that and could even hang. This is indeed
the right case for if_active. But we need to ensure that we always have
an outer bound rpm reference for that.
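
To make the distinction explicit, roughly (semantics paraphrased from
xe_pm.c; just a sketch, not code from this patch):

        /* outer bound variant: always takes the reference and, if needed,
         * resumes the device synchronously before returning */
        xe_pm_runtime_get(xe);
        /* ... push the config ... */
        xe_pm_runtime_put(xe);

        /* if_active variant (what this patch does): only takes the
         * reference if the device is already runtime active, otherwise
         * bail out and rely on the re-queue from the rpm resume path
         * added in the next patch */
        if (!xe_pm_runtime_get_if_active(xe))
                return;
        /* ... push the config ... */
        xe_pm_runtime_put(xe);
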
> However, in
> the next patch we do also have a flush of the work in the rpm_suspend path,
> so maybe the worker running when we are rpm suspended is not actually a
> possible case?
that's the kaboom case!
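
To spell it out, the ordering I'm worried about (a sketch, assuming the
flush of this work in the rpm suspend path that the next patch adds):

        rpm suspend thread                  late bind worker
        ------------------                  ----------------
        runtime suspend starts
          flush_work(&lbfw->work)           xe_pm_runtime_get(xe)
            waits for the worker...           ...sync resume blocks until
                                              the in-flight suspend completes
        => neither side can make progress if the worker uses the sync get.
           With get_if_active the worker simply bails out instead.
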
> Also, thinking about this more, that re-queuing on rpm resume only happens
> if d3cold is allowed, so when d3cold is not allowed we do want to proceed
> here, and we can actually reach here while rpm suspended.
No, when d3cold is not allowed we don't want to re-flash the fw.
We just skip and move forward.
My bad, sorry for the noise, and please keep the if_active variant in here.
>
> >
> > > Signed-off-by: Badal Nilawar <badal.nilawar@...el.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_late_bind_fw.c | 149 ++++++++++++++++++++-
> > > drivers/gpu/drm/xe/xe_late_bind_fw.h | 1 +
> > > drivers/gpu/drm/xe/xe_late_bind_fw_types.h | 7 +
> > > 3 files changed, 156 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c b/drivers/gpu/drm/xe/xe_late_bind_fw.c
> > > index 32d1436e7191..52243063d98a 100644
> > > --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
> > > +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
> > > @@ -16,6 +16,20 @@
> > > #include "xe_late_bind_fw.h"
> > > #include "xe_pcode.h"
> > > #include "xe_pcode_api.h"
> > > +#include "xe_pm.h"
> > > +
> > > +/*
> > > + * The component should load quite quickly in most cases, but it could take
> > > + * a bit. Using a very big timeout just to cover the worst case scenario
> > > + */
> > > +#define LB_INIT_TIMEOUT_MS 20000
> > > +
> > > +/*
> > > + * Retry interval set to 6 seconds, in steps of 200 ms, to allow time for
> > > + * other OS components to release the MEI CL handle
> > > + */
> > > +#define LB_FW_LOAD_RETRY_MAXCOUNT 30
> > > +#define LB_FW_LOAD_RETRY_PAUSE_MS 200
> > > static const u32 fw_id_to_type[] = {
> > > [XE_LB_FW_FAN_CONTROL] = CSC_LATE_BINDING_TYPE_FAN_CONTROL,
> > > @@ -31,6 +45,30 @@ late_bind_to_xe(struct xe_late_bind *late_bind)
> > > return container_of(late_bind, struct xe_device, late_bind);
> > > }
> > > +static const char *xe_late_bind_parse_status(uint32_t status)
> > > +{
> > > + switch (status) {
> > > + case CSC_LATE_BINDING_STATUS_SUCCESS:
> > > + return "success";
> > > + case CSC_LATE_BINDING_STATUS_4ID_MISMATCH:
> > > + return "4Id Mismatch";
> > > + case CSC_LATE_BINDING_STATUS_ARB_FAILURE:
> > > + return "ARB Failure";
> > > + case CSC_LATE_BINDING_STATUS_GENERAL_ERROR:
> > > + return "General Error";
> > > + case CSC_LATE_BINDING_STATUS_INVALID_PARAMS:
> > > + return "Invalid Params";
> > > + case CSC_LATE_BINDING_STATUS_INVALID_SIGNATURE:
> > > + return "Invalid Signature";
> > > + case CSC_LATE_BINDING_STATUS_INVALID_PAYLOAD:
> > > + return "Invalid Payload";
> > > + case CSC_LATE_BINDING_STATUS_TIMEOUT:
> > > + return "Timeout";
> > > + default:
> > > + return "Unknown error";
> > > + }
> > > +}
> > > +
> > > static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
> > > {
> > > struct xe_device *xe = late_bind_to_xe(late_bind);
> > > @@ -44,6 +82,93 @@ static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
> > > return 0;
> > > }
> > > +static void xe_late_bind_wait_for_worker_completion(struct xe_late_bind *late_bind)
> > > +{
> > > + struct xe_device *xe = late_bind_to_xe(late_bind);
> > > + struct xe_late_bind_fw *lbfw;
> > > + int fw_id;
> > > +
> > > + for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
> > > + lbfw = &late_bind->late_bind_fw[fw_id];
> > > + if (lbfw->valid && late_bind->wq) {
> > > + drm_dbg(&xe->drm, "Flush work: load %s firmware\n",
> > > + fw_id_to_name[lbfw->id]);
> > > + flush_work(&lbfw->work);
> > > + }
> > > + }
> > > +}
> > > +
> > > +static void xe_late_bind_work(struct work_struct *work)
> > > +{
> > > + struct xe_late_bind_fw *lbfw = container_of(work, struct xe_late_bind_fw, work);
> > > + struct xe_late_bind *late_bind = container_of(lbfw, struct xe_late_bind,
> > > + late_bind_fw[lbfw->id]);
> > > + struct xe_device *xe = late_bind_to_xe(late_bind);
> > > + int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
> > > + int ret;
> > > + int slept;
> > > +
> > > + /* we can queue this before the component is bound */
> > > + for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
> > > + if (late_bind->component.ops)
> > > + break;
> > > + msleep(100);
> > > + }
> > > +
> > > + if (!xe_pm_runtime_get_if_active(xe))
> > > + return;
> > > +
> > > + mutex_lock(&late_bind->mutex);
> > > +
> > > + if (!late_bind->component.ops) {
> > > + drm_err(&xe->drm, "Late bind component not bound\n");
> > > + goto out;
> > > + }
> > > +
> > > + drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
> > > +
> > > + do {
> > > + ret = late_bind->component.ops->push_config(late_bind->component.mei_dev,
> > > + lbfw->type, lbfw->flags,
> > > + lbfw->payload, lbfw->payload_size);
> > > + if (!ret)
> > > + break;
> > > + msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
> > > + } while (--retry && ret == -EBUSY);
> > > +
> > > + if (!ret) {
> > > + drm_dbg(&xe->drm, "Load %s firmware successful\n",
> > > + fw_id_to_name[lbfw->id]);
> > > + goto out;
> > > + }
> > > +
> > > + if (ret > 0)
>
> nit: here you can just do "else if" and drop the goto.
>
> Daniele
>
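For reference, the restructure being suggested would look roughly like
this (untested sketch; the out: label is still needed for the earlier
"component not bound" error path):

        if (!ret)
                drm_dbg(&xe->drm, "Load %s firmware successful\n",
                        fw_id_to_name[lbfw->id]);
        else if (ret > 0)
                drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
                        fw_id_to_name[lbfw->id], ret,
                        xe_late_bind_parse_status(ret));
        else
                drm_err(&xe->drm, "Load %s firmware failed with err %d\n",
                        fw_id_to_name[lbfw->id], ret);
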
> > > + drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
> > > + fw_id_to_name[lbfw->id], ret, xe_late_bind_parse_status(ret));
> > > + else
> > > + drm_err(&xe->drm, "Load %s firmware failed with err %d",
> > > + fw_id_to_name[lbfw->id], ret);
> > > +out:
> > > + mutex_unlock(&late_bind->mutex);
> > > + xe_pm_runtime_put(xe);
> > > +}
> > > +
> > > +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
> > > +{
> > > + struct xe_late_bind_fw *lbfw;
> > > + int fw_id;
> > > +
> > > + if (!late_bind->component_added)
> > > + return -ENODEV;
> > > +
> > > + for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
> > > + lbfw = &late_bind->late_bind_fw[fw_id];
> > > + if (lbfw->valid)
> > > + queue_work(late_bind->wq, &lbfw->work);
> > > + }
> > > + return 0;
> > > +}
> > > +
> > > static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
> > > {
> > > struct xe_device *xe = late_bind_to_xe(late_bind);
> > > @@ -99,6 +224,7 @@ static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
> > > memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
> > > release_firmware(fw);
> > > + INIT_WORK(&lb_fw->work, xe_late_bind_work);
> > > lb_fw->valid = true;
> > > return 0;
> > > @@ -109,11 +235,16 @@ static int xe_late_bind_fw_init(struct xe_late_bind *late_bind)
> > > int ret;
> > > int fw_id;
> > > + late_bind->wq = alloc_ordered_workqueue("late-bind-ordered-wq", 0);
> > > + if (!late_bind->wq)
> > > + return -ENOMEM;
> > > +
> > > for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
> > > ret = __xe_late_bind_fw_init(late_bind, fw_id);
> > > if (ret)
> > > return ret;
> > > }
> > > +
> > > return 0;
> > > }
> > > @@ -137,6 +268,8 @@ static void xe_late_bind_component_unbind(struct device *xe_kdev,
> > > struct xe_device *xe = kdev_to_xe_device(xe_kdev);
> > > struct xe_late_bind *late_bind = &xe->late_bind;
> > > + xe_late_bind_wait_for_worker_completion(late_bind);
> > > +
> > > mutex_lock(&late_bind->mutex);
> > > late_bind->component.ops = NULL;
> > > mutex_unlock(&late_bind->mutex);
> > > @@ -152,7 +285,15 @@ static void xe_late_bind_remove(void *arg)
> > > struct xe_late_bind *late_bind = arg;
> > > struct xe_device *xe = late_bind_to_xe(late_bind);
> > > + xe_late_bind_wait_for_worker_completion(late_bind);
> > > +
> > > + late_bind->component_added = false;
> > > +
> > > component_del(xe->drm.dev, &xe_late_bind_component_ops);
> > > + if (late_bind->wq) {
> > > + destroy_workqueue(late_bind->wq);
> > > + late_bind->wq = NULL;
> > > + }
> > > mutex_destroy(&late_bind->mutex);
> > > }
> > > @@ -183,9 +324,15 @@ int xe_late_bind_init(struct xe_late_bind *late_bind)
> > > return err;
> > > }
> > > + late_bind->component_added = true;
> > > +
> > > err = devm_add_action_or_reset(xe->drm.dev, xe_late_bind_remove, late_bind);
> > > if (err)
> > > return err;
> > > - return xe_late_bind_fw_init(late_bind);
> > > + err = xe_late_bind_fw_init(late_bind);
> > > + if (err)
> > > + return err;
> > > +
> > > + return xe_late_bind_fw_load(late_bind);
> > > }
> > > diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h b/drivers/gpu/drm/xe/xe_late_bind_fw.h
> > > index 4c73571c3e62..28d56ed2bfdc 100644
> > > --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
> > > +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
> > > @@ -11,5 +11,6 @@
> > > struct xe_late_bind;
> > > int xe_late_bind_init(struct xe_late_bind *late_bind);
> > > +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
> > > #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
> > > index 93abf4c51789..f119a75f4c9c 100644
> > > --- a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
> > > @@ -9,6 +9,7 @@
> > > #include <linux/iosys-map.h>
> > > #include <linux/mutex.h>
> > > #include <linux/types.h>
> > > +#include <linux/workqueue.h>
> > > #define MAX_PAYLOAD_SIZE SZ_4K
> > > @@ -38,6 +39,8 @@ struct xe_late_bind_fw {
> > > u8 *payload;
> > > /** @late_bind_fw.payload_size: late binding blob payload_size */
> > > size_t payload_size;
> > > + /** @late_bind_fw.work: worker to upload latebind blob */
> > > + struct work_struct work;
> > > };
> > > /**
> > > @@ -64,6 +67,10 @@ struct xe_late_bind {
> > > struct mutex mutex;
> > > /** @late_bind.late_bind_fw: late binding firmware array */
> > > struct xe_late_bind_fw late_bind_fw[XE_LB_FW_MAX_ID];
> > > + /** @late_bind.wq: workqueue to submit request to download late bind blob */
> > > + struct workqueue_struct *wq;
> > > + /** @late_bind.component_added: whether the component has been added */
> > > + bool component_added;
> > > };
> > > #endif
> > > --
> > > 2.34.1
> > >
>