Message-ID: <c159aa47-1e7c-4cb9-b544-89955100bfb8@intel.com>
Date: Thu, 26 Jun 2025 15:38:18 -0700
From: Daniele Ceraolo Spurio <daniele.ceraolospurio@...el.com>
To: Rodrigo Vivi <rodrigo.vivi@...el.com>
CC: Badal Nilawar <badal.nilawar@...el.com>, <intel-xe@...ts.freedesktop.org>,
<dri-devel@...ts.freedesktop.org>, <linux-kernel@...r.kernel.org>,
<anshuman.gupta@...el.com>, <alexander.usyskin@...el.com>,
<gregkh@...uxfoundation.org>
Subject: Re: [PATCH v4 05/10] drm/xe/xe_late_bind_fw: Load late binding
firmware
On 6/26/2025 2:49 PM, Rodrigo Vivi wrote:
> On Thu, Jun 26, 2025 at 02:27:50PM -0700, Daniele Ceraolo Spurio wrote:
>>
>> On 6/26/2025 10:24 AM, Rodrigo Vivi wrote:
>>> On Wed, Jun 25, 2025 at 10:30:10PM +0530, Badal Nilawar wrote:
>>>> Load late binding firmware
>>>>
>>>> v2:
>>>> - s/EAGAIN/EBUSY/
>>>> - Flush worker in suspend and driver unload (Daniele)
>>>> v3:
>>>> - Use retry interval of 6s, in steps of 200ms, to allow
>>>> other OS components to release the MEI CL handle (Sasha)
>>>> v4:
>>>> - return -ENODEV if component not added (Daniele)
>>>> - parse and print status returned by csc
>>>> - Use xe_pm_get_if_in_active (Daniele)
>>> The worker is considered outer bound, so it is safe
>>> to use xe_pm_runtime_get, which takes the reference
>>> and resumes synchronously.
>>>
>>> Otherwise, if using get_if_active you need to reschedule
>>> the work or you lose the job.
>> The issue is that the next patch adds code to re-queue the work from the rpm
>> resume path, so if we do a sync resume here the worker will re-queue itself
>> immediately when not needed.
> oops, I had forgotten about that case, I'm sorry.
>
>> Also, when the re-queued work runs it might end
>> up doing another sync resume and re-queuing itself once more.
> I believe it might be worse than that and could even hang. This is
> indeed the right case for if_active. But we need to ensure that we
> always have an outer bound reference for that.
>
>> However, in
>> the next patch we do also have a flush of the work in the rpm_suspend path,
>> so maybe the worker running when we are rpm suspended is not actually a
>> possible case?
> that's the kaboom case!
>
>> Also, thinking about this more, that re-queuing on rpm resume only happens
>> if d3cold is allowed, so when d3cold is not allowed we do want to proceed
>> here, and we can actually reach this point while rpm suspended.
> no, when d3cold is not allowed we don't want to re-flash the fw.
> We just skip and move forward.
My concern was about the first time we attempt the load in the d3cold
disabled scenario. If we've somehow managed to rpm suspend between
queuing the work for the first time and the work actually running,
skipping the flashing would mean the binary is not actually ever loaded.
Not sure if that's a case we can hit though.
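Spelling the sequence out (rough sketch, untested, and assuming the worker
keeps the get_if_active variant from this patch):

        /* 1) probe: xe_late_bind_fw_load() queues the work */
        queue_work(late_bind->wq, &lbfw->work);

        /*
         * 2) with d3cold disabled the device manages to rpm suspend before
         *    the work runs, and IIUC the rpm resume re-queue added in the
         *    next patch only kicks in for the d3cold case
         */

        /* 3) the worker then runs while we are rpm suspended and bails out */
        if (!xe_pm_runtime_get_if_active(xe))
                return; /* -> the blob is never flashed */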
Daniele
>
> My bad, sorry for the noise and please keep the if_active variant in here.
>
>>>> Signed-off-by: Badal Nilawar <badal.nilawar@...el.com>
>>>> ---
>>>> drivers/gpu/drm/xe/xe_late_bind_fw.c | 149 ++++++++++++++++++++-
>>>> drivers/gpu/drm/xe/xe_late_bind_fw.h | 1 +
>>>> drivers/gpu/drm/xe/xe_late_bind_fw_types.h | 7 +
>>>> 3 files changed, 156 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>>>> index 32d1436e7191..52243063d98a 100644
>>>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
>>>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>>>> @@ -16,6 +16,20 @@
>>>> #include "xe_late_bind_fw.h"
>>>> #include "xe_pcode.h"
>>>> #include "xe_pcode_api.h"
>>>> +#include "xe_pm.h"
>>>> +
>>>> +/*
>>>> + * The component should load quite quickly in most cases, but it could take
>>>> + * a bit. Using a very big timeout just to cover the worst case scenario
>>>> + */
>>>> +#define LB_INIT_TIMEOUT_MS 20000
>>>> +
>>>> +/*
>>>> + * Retry interval set to 6 seconds, in steps of 200 ms, to allow time for
>>>> + * other OS components to release the MEI CL handle
>>>> + */
>>>> +#define LB_FW_LOAD_RETRY_MAXCOUNT 30
>>>> +#define LB_FW_LOAD_RETRY_PAUSE_MS 200
>>>> static const u32 fw_id_to_type[] = {
>>>> [XE_LB_FW_FAN_CONTROL] = CSC_LATE_BINDING_TYPE_FAN_CONTROL,
>>>> @@ -31,6 +45,30 @@ late_bind_to_xe(struct xe_late_bind *late_bind)
>>>> return container_of(late_bind, struct xe_device, late_bind);
>>>> }
>>>> +static const char *xe_late_bind_parse_status(uint32_t status)
>>>> +{
>>>> + switch (status) {
>>>> + case CSC_LATE_BINDING_STATUS_SUCCESS:
>>>> + return "success";
>>>> + case CSC_LATE_BINDING_STATUS_4ID_MISMATCH:
>>>> + return "4Id Mismatch";
>>>> + case CSC_LATE_BINDING_STATUS_ARB_FAILURE:
>>>> + return "ARB Failure";
>>>> + case CSC_LATE_BINDING_STATUS_GENERAL_ERROR:
>>>> + return "General Error";
>>>> + case CSC_LATE_BINDING_STATUS_INVALID_PARAMS:
>>>> + return "Invalid Params";
>>>> + case CSC_LATE_BINDING_STATUS_INVALID_SIGNATURE:
>>>> + return "Invalid Signature";
>>>> + case CSC_LATE_BINDING_STATUS_INVALID_PAYLOAD:
>>>> + return "Invalid Payload";
>>>> + case CSC_LATE_BINDING_STATUS_TIMEOUT:
>>>> + return "Timeout";
>>>> + default:
>>>> + return "Unknown error";
>>>> + }
>>>> +}
>>>> +
>>>> static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
>>>> {
>>>> struct xe_device *xe = late_bind_to_xe(late_bind);
>>>> @@ -44,6 +82,93 @@ static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
>>>> return 0;
>>>> }
>>>> +static void xe_late_bind_wait_for_worker_completion(struct xe_late_bind *late_bind)
>>>> +{
>>>> + struct xe_device *xe = late_bind_to_xe(late_bind);
>>>> + struct xe_late_bind_fw *lbfw;
>>>> + int fw_id;
>>>> +
>>>> + for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
>>>> + lbfw = &late_bind->late_bind_fw[fw_id];
>>>> + if (lbfw->valid && late_bind->wq) {
>>>> + drm_dbg(&xe->drm, "Flush work: load %s firmware\n",
>>>> + fw_id_to_name[lbfw->id]);
>>>> + flush_work(&lbfw->work);
>>>> + }
>>>> + }
>>>> +}
>>>> +
>>>> +static void xe_late_bind_work(struct work_struct *work)
>>>> +{
>>>> + struct xe_late_bind_fw *lbfw = container_of(work, struct xe_late_bind_fw, work);
>>>> + struct xe_late_bind *late_bind = container_of(lbfw, struct xe_late_bind,
>>>> + late_bind_fw[lbfw->id]);
>>>> + struct xe_device *xe = late_bind_to_xe(late_bind);
>>>> + int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
>>>> + int ret;
>>>> + int slept;
>>>> +
>>>> + /* we can queue this before the component is bound */
>>>> + for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
>>>> + if (late_bind->component.ops)
>>>> + break;
>>>> + msleep(100);
>>>> + }
>>>> +
>>>> + if (!xe_pm_runtime_get_if_active(xe))
>>>> + return;
>>>> +
>>>> + mutex_lock(&late_bind->mutex);
>>>> +
>>>> + if (!late_bind->component.ops) {
>>>> + drm_err(&xe->drm, "Late bind component not bound\n");
>>>> + goto out;
>>>> + }
>>>> +
>>>> + drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
>>>> +
>>>> + do {
>>>> + ret = late_bind->component.ops->push_config(late_bind->component.mei_dev,
>>>> + lbfw->type, lbfw->flags,
>>>> + lbfw->payload, lbfw->payload_size);
>>>> + if (!ret)
>>>> + break;
>>>> + msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
>>>> + } while (--retry && ret == -EBUSY);
>>>> +
>>>> + if (!ret) {
>>>> + drm_dbg(&xe->drm, "Load %s firmware successful\n",
>>>> + fw_id_to_name[lbfw->id]);
>>>> + goto out;
>>>> + }
>>>> +
>>>> + if (ret > 0)
>> nit: here you can just do "else if" and drop the goto (rough sketch
>> below, after the end of this function).
>>
>> Daniele
>>
>>>> + drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
>>>> + fw_id_to_name[lbfw->id], ret, xe_late_bind_parse_status(ret));
>>>> + else
>>>> + drm_err(&xe->drm, "Load %s firmware failed with err %d",
>>>> + fw_id_to_name[lbfw->id], ret);
>>>> +out:
>>>> + mutex_unlock(&late_bind->mutex);
>>>> + xe_pm_runtime_put(xe);
>>>> +}
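For clarity, the reshuffle I had in mind for the error prints above is
roughly this (untested, just rearranging the code from this patch; the
out: label is still needed for the earlier component.ops check):

        if (!ret)
                drm_dbg(&xe->drm, "Load %s firmware successful\n",
                        fw_id_to_name[lbfw->id]);
        else if (ret > 0)
                drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
                        fw_id_to_name[lbfw->id], ret,
                        xe_late_bind_parse_status(ret));
        else
                drm_err(&xe->drm, "Load %s firmware failed with err %d",
                        fw_id_to_name[lbfw->id], ret);
out:
        mutex_unlock(&late_bind->mutex);
        xe_pm_runtime_put(xe);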
>>>> +
>>>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
>>>> +{
>>>> + struct xe_late_bind_fw *lbfw;
>>>> + int fw_id;
>>>> +
>>>> + if (!late_bind->component_added)
>>>> + return -ENODEV;
>>>> +
>>>> + for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
>>>> + lbfw = &late_bind->late_bind_fw[fw_id];
>>>> + if (lbfw->valid)
>>>> + queue_work(late_bind->wq, &lbfw->work);
>>>> + }
>>>> + return 0;
>>>> +}
>>>> +
>>>> static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
>>>> {
>>>> struct xe_device *xe = late_bind_to_xe(late_bind);
>>>> @@ -99,6 +224,7 @@ static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
>>>> memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
>>>> release_firmware(fw);
>>>> + INIT_WORK(&lb_fw->work, xe_late_bind_work);
>>>> lb_fw->valid = true;
>>>> return 0;
>>>> @@ -109,11 +235,16 @@ static int xe_late_bind_fw_init(struct xe_late_bind *late_bind)
>>>> int ret;
>>>> int fw_id;
>>>> + late_bind->wq = alloc_ordered_workqueue("late-bind-ordered-wq", 0);
>>>> + if (!late_bind->wq)
>>>> + return -ENOMEM;
>>>> +
>>>> for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
>>>> ret = __xe_late_bind_fw_init(late_bind, fw_id);
>>>> if (ret)
>>>> return ret;
>>>> }
>>>> +
>>>> return 0;
>>>> }
>>>> @@ -137,6 +268,8 @@ static void xe_late_bind_component_unbind(struct device *xe_kdev,
>>>> struct xe_device *xe = kdev_to_xe_device(xe_kdev);
>>>> struct xe_late_bind *late_bind = &xe->late_bind;
>>>> + xe_late_bind_wait_for_worker_completion(late_bind);
>>>> +
>>>> mutex_lock(&late_bind->mutex);
>>>> late_bind->component.ops = NULL;
>>>> mutex_unlock(&late_bind->mutex);
>>>> @@ -152,7 +285,15 @@ static void xe_late_bind_remove(void *arg)
>>>> struct xe_late_bind *late_bind = arg;
>>>> struct xe_device *xe = late_bind_to_xe(late_bind);
>>>> + xe_late_bind_wait_for_worker_completion(late_bind);
>>>> +
>>>> + late_bind->component_added = false;
>>>> +
>>>> component_del(xe->drm.dev, &xe_late_bind_component_ops);
>>>> + if (late_bind->wq) {
>>>> + destroy_workqueue(late_bind->wq);
>>>> + late_bind->wq = NULL;
>>>> + }
>>>> mutex_destroy(&late_bind->mutex);
>>>> }
>>>> @@ -183,9 +324,15 @@ int xe_late_bind_init(struct xe_late_bind *late_bind)
>>>> return err;
>>>> }
>>>> + late_bind->component_added = true;
>>>> +
>>>> err = devm_add_action_or_reset(xe->drm.dev, xe_late_bind_remove, late_bind);
>>>> if (err)
>>>> return err;
>>>> - return xe_late_bind_fw_init(late_bind);
>>>> + err = xe_late_bind_fw_init(late_bind);
>>>> + if (err)
>>>> + return err;
>>>> +
>>>> + return xe_late_bind_fw_load(late_bind);
>>>> }
>>>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>>>> index 4c73571c3e62..28d56ed2bfdc 100644
>>>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
>>>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>>>> @@ -11,5 +11,6 @@
>>>> struct xe_late_bind;
>>>> int xe_late_bind_init(struct xe_late_bind *late_bind);
>>>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
>>>> #endif
>>>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
>>>> index 93abf4c51789..f119a75f4c9c 100644
>>>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
>>>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
>>>> @@ -9,6 +9,7 @@
>>>> #include <linux/iosys-map.h>
>>>> #include <linux/mutex.h>
>>>> #include <linux/types.h>
>>>> +#include <linux/workqueue.h>
>>>> #define MAX_PAYLOAD_SIZE SZ_4K
>>>> @@ -38,6 +39,8 @@ struct xe_late_bind_fw {
>>>> u8 *payload;
>>>> /** @late_bind_fw.payload_size: late binding blob payload_size */
>>>> size_t payload_size;
>>>> + /** @late_bind_fw.work: worker to upload latebind blob */
>>>> + struct work_struct work;
>>>> };
>>>> /**
>>>> @@ -64,6 +67,10 @@ struct xe_late_bind {
>>>> struct mutex mutex;
>>>> /** @late_bind.late_bind_fw: late binding firmware array */
>>>> struct xe_late_bind_fw late_bind_fw[XE_LB_FW_MAX_ID];
>>>> + /** @late_bind.wq: workqueue to submit request to download late bind blob */
>>>> + struct workqueue_struct *wq;
>>>> + /** @late_bind.component_added: whether the component has been added */
>>>> + bool component_added;
>>>> };
>>>> #endif
>>>> --
>>>> 2.34.1
>>>>