Message-ID: <e392779f-a205-4085-8663-4cfbbab4bd82@intel.com>
Date: Thu, 26 Jun 2025 14:27:50 -0700
From: Daniele Ceraolo Spurio <daniele.ceraolospurio@...el.com>
To: Rodrigo Vivi <rodrigo.vivi@...el.com>, Badal Nilawar
	<badal.nilawar@...el.com>
CC: <intel-xe@...ts.freedesktop.org>, <dri-devel@...ts.freedesktop.org>,
	<linux-kernel@...r.kernel.org>, <anshuman.gupta@...el.com>,
	<alexander.usyskin@...el.com>, <gregkh@...uxfoundation.org>
Subject: Re: [PATCH v4 05/10] drm/xe/xe_late_bind_fw: Load late binding
 firmware



On 6/26/2025 10:24 AM, Rodrigo Vivi wrote:
> On Wed, Jun 25, 2025 at 10:30:10PM +0530, Badal Nilawar wrote:
>> Load late binding firmware
>>
>> v2:
>>   - s/EAGAIN/EBUSY/
>>   - Flush worker in suspend and driver unload (Daniele)
>> v3:
>>   - Use retry interval of 6s, in steps of 200ms, to allow
>>     other OS components to release the MEI CL handle (Sasha)
>> v4:
>>   - return -ENODEV if component not added (Daniele)
>>   - parse and print status returned by csc
>>   - Use xe_pm_get_if_in_active (Daniele)
> The worker is considered outer bound, so it is safe
> to use xe_pm_runtime_get, which takes the reference
> and resumes synchronously.
>
> Otherwise, if you use get_if_active you need to
> reschedule the work, or you lose your job.

The issue is that the next patch adds code to re-queue the work from the 
rpm resume path, so if we do a sync resume here the worker will re-queue 
itself immediately even when that's not needed. Also, when the re-queued 
work runs it might end up doing another sync resume and re-queuing itself 
once more. However, the next patch also flushes the work in the 
rpm_suspend path, so maybe the worker running while we are rpm suspended 
is not actually a possible case?
Also, thinking about this more, that re-queuing on rpm resume only 
happens if d3cold is allowed, so when d3cold is not allowed we do want 
to proceed here, as we can actually reach this point while rpm 
suspended.
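
Roughly, the two options being weighed look like this (untested sketch, 
just to make sure we are talking about the same thing; only helpers 
already used in this series):

	/* a) treat the worker as outer bound: take the ref and resume
	 *    synchronously before touching the HW
	 */
	xe_pm_runtime_get(xe);
	/* ... push_config() ... */
	xe_pm_runtime_put(xe);

	/* b) only proceed if already resumed; otherwise re-queue so the
	 *    job isn't lost
	 */
	if (!xe_pm_runtime_get_if_active(xe)) {
		queue_work(late_bind->wq, &lbfw->work);
		return;
	}
	/* ... push_config() ... */
	xe_pm_runtime_put(xe);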

>
>> Signed-off-by: Badal Nilawar <badal.nilawar@...el.com>
>> ---
>>   drivers/gpu/drm/xe/xe_late_bind_fw.c       | 149 ++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_late_bind_fw.h       |   1 +
>>   drivers/gpu/drm/xe/xe_late_bind_fw_types.h |   7 +
>>   3 files changed, 156 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> index 32d1436e7191..52243063d98a 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
>> @@ -16,6 +16,20 @@
>>   #include "xe_late_bind_fw.h"
>>   #include "xe_pcode.h"
>>   #include "xe_pcode_api.h"
>> +#include "xe_pm.h"
>> +
>> +/*
>> + * The component should load quite quickly in most cases, but it could take
>> + * a bit. Using a very big timeout just to cover the worst case scenario
>> + */
>> +#define LB_INIT_TIMEOUT_MS 20000
>> +
>> +/*
>> + * Retry interval set to 6 seconds, in steps of 200 ms, to allow time for
>> + * other OS components to release the MEI CL handle
>> + */
>> +#define LB_FW_LOAD_RETRY_MAXCOUNT 30
>> +#define LB_FW_LOAD_RETRY_PAUSE_MS 200
>>   
>>   static const u32 fw_id_to_type[] = {
>>   		[XE_LB_FW_FAN_CONTROL] = CSC_LATE_BINDING_TYPE_FAN_CONTROL,
>> @@ -31,6 +45,30 @@ late_bind_to_xe(struct xe_late_bind *late_bind)
>>   	return container_of(late_bind, struct xe_device, late_bind);
>>   }
>>   
>> +static const char *xe_late_bind_parse_status(uint32_t status)
>> +{
>> +	switch (status) {
>> +	case CSC_LATE_BINDING_STATUS_SUCCESS:
>> +		return "success";
>> +	case CSC_LATE_BINDING_STATUS_4ID_MISMATCH:
>> +		return "4Id Mismatch";
>> +	case CSC_LATE_BINDING_STATUS_ARB_FAILURE:
>> +		return "ARB Failure";
>> +	case CSC_LATE_BINDING_STATUS_GENERAL_ERROR:
>> +		return "General Error";
>> +	case CSC_LATE_BINDING_STATUS_INVALID_PARAMS:
>> +		return "Invalid Params";
>> +	case CSC_LATE_BINDING_STATUS_INVALID_SIGNATURE:
>> +		return "Invalid Signature";
>> +	case CSC_LATE_BINDING_STATUS_INVALID_PAYLOAD:
>> +		return "Invalid Payload";
>> +	case CSC_LATE_BINDING_STATUS_TIMEOUT:
>> +		return "Timeout";
>> +	default:
>> +		return "Unknown error";
>> +	}
>> +}
>> +
>>   static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
>>   {
>>   	struct xe_device *xe = late_bind_to_xe(late_bind);
>> @@ -44,6 +82,93 @@ static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
>>   		return 0;
>>   }
>>   
>> +static void xe_late_bind_wait_for_worker_completion(struct xe_late_bind *late_bind)
>> +{
>> +	struct xe_device *xe = late_bind_to_xe(late_bind);
>> +	struct xe_late_bind_fw *lbfw;
>> +	int fw_id;
>> +
>> +	for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
>> +		lbfw = &late_bind->late_bind_fw[fw_id];
>> +		if (lbfw->valid && late_bind->wq) {
>> +			drm_dbg(&xe->drm, "Flush work: load %s firmware\n",
>> +				fw_id_to_name[lbfw->id]);
>> +			flush_work(&lbfw->work);
>> +		}
>> +	}
>> +}
>> +
>> +static void xe_late_bind_work(struct work_struct *work)
>> +{
>> +	struct xe_late_bind_fw *lbfw = container_of(work, struct xe_late_bind_fw, work);
>> +	struct xe_late_bind *late_bind = container_of(lbfw, struct xe_late_bind,
>> +						      late_bind_fw[lbfw->id]);
>> +	struct xe_device *xe = late_bind_to_xe(late_bind);
>> +	int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
>> +	int ret;
>> +	int slept;
>> +
>> +	/* we can queue this before the component is bound */
>> +	for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
>> +		if (late_bind->component.ops)
>> +			break;
>> +		msleep(100);
>> +	}
>> +
>> +	if (!xe_pm_runtime_get_if_active(xe))
>> +		return;
>> +
>> +	mutex_lock(&late_bind->mutex);
>> +
>> +	if (!late_bind->component.ops) {
>> +		drm_err(&xe->drm, "Late bind component not bound\n");
>> +		goto out;
>> +	}
>> +
>> +	drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
>> +
>> +	do {
>> +		ret = late_bind->component.ops->push_config(late_bind->component.mei_dev,
>> +							    lbfw->type, lbfw->flags,
>> +							    lbfw->payload, lbfw->payload_size);
>> +		if (!ret)
>> +			break;
>> +		msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
>> +	} while (--retry && ret == -EBUSY);
>> +
>> +	if (!ret) {
>> +		drm_dbg(&xe->drm, "Load %s firmware successful\n",
>> +			fw_id_to_name[lbfw->id]);
>> +		goto out;
>> +	}
>> +
>> +	if (ret > 0)

nit: here you can just do "else if" and drop the goto.
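Something like this (rough, untested), reusing the prints already in the 
patch:

	if (!ret) {
		drm_dbg(&xe->drm, "Load %s firmware successful\n",
			fw_id_to_name[lbfw->id]);
	} else if (ret > 0) {
		drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
			fw_id_to_name[lbfw->id], ret,
			xe_late_bind_parse_status(ret));
	} else {
		drm_err(&xe->drm, "Load %s firmware failed with err %d\n",
			fw_id_to_name[lbfw->id], ret);
	}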

Daniele

>> +		drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
>> +			fw_id_to_name[lbfw->id], ret, xe_late_bind_parse_status(ret));
>> +	else
>> +		drm_err(&xe->drm, "Load %s firmware failed with err %d",
>> +			fw_id_to_name[lbfw->id], ret);
>> +out:
>> +	mutex_unlock(&late_bind->mutex);
>> +	xe_pm_runtime_put(xe);
>> +}
>> +
>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
>> +{
>> +	struct xe_late_bind_fw *lbfw;
>> +	int fw_id;
>> +
>> +	if (!late_bind->component_added)
>> +		return -ENODEV;
>> +
>> +	for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
>> +		lbfw = &late_bind->late_bind_fw[fw_id];
>> +		if (lbfw->valid)
>> +			queue_work(late_bind->wq, &lbfw->work);
>> +	}
>> +	return 0;
>> +}
>> +
>>   static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
>>   {
>>   	struct xe_device *xe = late_bind_to_xe(late_bind);
>> @@ -99,6 +224,7 @@ static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
>>   
>>   	memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
>>   	release_firmware(fw);
>> +	INIT_WORK(&lb_fw->work, xe_late_bind_work);
>>   	lb_fw->valid = true;
>>   
>>   	return 0;
>> @@ -109,11 +235,16 @@ static int xe_late_bind_fw_init(struct xe_late_bind *late_bind)
>>   	int ret;
>>   	int fw_id;
>>   
>> +	late_bind->wq = alloc_ordered_workqueue("late-bind-ordered-wq", 0);
>> +	if (!late_bind->wq)
>> +		return -ENOMEM;
>> +
>>   	for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
>>   		ret = __xe_late_bind_fw_init(late_bind, fw_id);
>>   		if (ret)
>>   			return ret;
>>   	}
>> +
>>   	return 0;
>>   }
>>   
>> @@ -137,6 +268,8 @@ static void xe_late_bind_component_unbind(struct device *xe_kdev,
>>   	struct xe_device *xe = kdev_to_xe_device(xe_kdev);
>>   	struct xe_late_bind *late_bind = &xe->late_bind;
>>   
>> +	xe_late_bind_wait_for_worker_completion(late_bind);
>> +
>>   	mutex_lock(&late_bind->mutex);
>>   	late_bind->component.ops = NULL;
>>   	mutex_unlock(&late_bind->mutex);
>> @@ -152,7 +285,15 @@ static void xe_late_bind_remove(void *arg)
>>   	struct xe_late_bind *late_bind = arg;
>>   	struct xe_device *xe = late_bind_to_xe(late_bind);
>>   
>> +	xe_late_bind_wait_for_worker_completion(late_bind);
>> +
>> +	late_bind->component_added = false;
>> +
>>   	component_del(xe->drm.dev, &xe_late_bind_component_ops);
>> +	if (late_bind->wq) {
>> +		destroy_workqueue(late_bind->wq);
>> +		late_bind->wq = NULL;
>> +	}
>>   	mutex_destroy(&late_bind->mutex);
>>   }
>>   
>> @@ -183,9 +324,15 @@ int xe_late_bind_init(struct xe_late_bind *late_bind)
>>   		return err;
>>   	}
>>   
>> +	late_bind->component_added = true;
>> +
>>   	err = devm_add_action_or_reset(xe->drm.dev, xe_late_bind_remove, late_bind);
>>   	if (err)
>>   		return err;
>>   
>> -	return xe_late_bind_fw_init(late_bind);
>> +	err = xe_late_bind_fw_init(late_bind);
>> +	if (err)
>> +		return err;
>> +
>> +	return xe_late_bind_fw_load(late_bind);
>>   }
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> index 4c73571c3e62..28d56ed2bfdc 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
>> @@ -11,5 +11,6 @@
>>   struct xe_late_bind;
>>   
>>   int xe_late_bind_init(struct xe_late_bind *late_bind);
>> +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
>>   
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
>> index 93abf4c51789..f119a75f4c9c 100644
>> --- a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
>> +++ b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
>> @@ -9,6 +9,7 @@
>>   #include <linux/iosys-map.h>
>>   #include <linux/mutex.h>
>>   #include <linux/types.h>
>> +#include <linux/workqueue.h>
>>   
>>   #define MAX_PAYLOAD_SIZE SZ_4K
>>   
>> @@ -38,6 +39,8 @@ struct xe_late_bind_fw {
>>   	u8  *payload;
>>   	/** @late_bind_fw.payload_size: late binding blob payload_size */
>>   	size_t payload_size;
>> +	/** @late_bind_fw.work: worker to upload latebind blob */
>> +	struct work_struct work;
>>   };
>>   
>>   /**
>> @@ -64,6 +67,10 @@ struct xe_late_bind {
>>   	struct mutex mutex;
>>   	/** @late_bind.late_bind_fw: late binding firmware array */
>>   	struct xe_late_bind_fw late_bind_fw[XE_LB_FW_MAX_ID];
>> +	/** @late_bind.wq: workqueue to submit request to download late bind blob */
>> +	struct workqueue_struct *wq;
>> +	/** @late_bind.component_added: whether the component has been added */
>> +	bool component_added;
>>   };
>>   
>>   #endif
>> -- 
>> 2.34.1
>>

