[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220720111523.4069830-5-ogabbay@kernel.org>
Date: Wed, 20 Jul 2022 14:15:19 +0300
From: Oded Gabbay <ogabbay@...nel.org>
To: linux-kernel@...r.kernel.org
Subject: [PATCH 5/9] habanalabs/gaudi: increase default cs timeout to 10 minutes
In order to improve scalability and reduce host overhead, it is better
to increase the default TDR timeout of Gaudi1 from 30 seconds to
10 minutes.
This will allow the DL Framework (e.g. PyTorch, TensorFlow) to remove
the host sync they are using now and improve overall performance on
scaleout training.
Note that one can always set the timeout to a custom value via
a kernel module parameter given during driver load.
Signed-off-by: Oded Gabbay <ogabbay@...nel.org>
---
.../misc/habanalabs/common/habanalabs_drv.c | 23 +++++++++++++++----
1 file changed, 18 insertions(+), 5 deletions(-)
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index f733ead605e7..d59d8cdf33e6 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -27,7 +27,10 @@ static struct class *hl_class;
static DEFINE_IDR(hl_devs_idr);
static DEFINE_MUTEX(hl_devs_idr_lock);
-static int timeout_locked = 30;
+#define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */
+#define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */
+
+static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
static int reset_on_lockup = 1;
static int memory_scrub;
static ulong boot_error_status_mask = ULONG_MAX;
@@ -314,12 +317,22 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev)
hdev->boot_error_status_mask = boot_error_status_mask;
}
-static void fixup_device_params_per_asic(struct hl_device *hdev)
+static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
{
switch (hdev->asic_type) {
- case ASIC_GOYA:
case ASIC_GAUDI:
case ASIC_GAUDI_SEC:
+ /* If user didn't request a different timeout than the default one, we have
+ * a different default timeout for Gaudi
+ */
+ if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
+ hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
+ MSEC_PER_SEC);
+
+ hdev->reset_upon_device_release = 0;
+ break;
+
+ case ASIC_GOYA:
hdev->reset_upon_device_release = 0;
break;
@@ -339,7 +352,7 @@ static int fixup_device_params(struct hl_device *hdev)
hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
if (tmp_timeout)
- hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * 1000);
+ hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
else
hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
@@ -360,7 +373,7 @@ static int fixup_device_params(struct hl_device *hdev)
if (!hdev->cpu_queues_enable)
hdev->heartbeat = 0;
- fixup_device_params_per_asic(hdev);
+ fixup_device_params_per_asic(hdev, tmp_timeout);
return 0;
}
--
2.25.1
Powered by blists - more mailing lists