[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230724012118.2316073-23-sashal@kernel.org>
Date: Sun, 23 Jul 2023 21:20:56 -0400
From: Sasha Levin <sashal@...nel.org>
To: linux-kernel@...r.kernel.org, stable@...r.kernel.org
Cc: Ofir Bitton <obitton@...ana.ai>, Oded Gabbay <ogabbay@...nel.org>,
Sasha Levin <sashal@...nel.org>, gregkh@...uxfoundation.org,
ttayar@...ana.ai, osharabi@...ana.ai, oshpigelman@...ana.ai,
mhaimovski@...ana.ai, dliberman@...ana.ai, fkassabri@...ana.ai,
bjauhari@...ana.ai, talcohen@...ana.ai, dhirschfeld@...ana.ai,
rostedt@...dmis.org
Subject: [PATCH AUTOSEL 6.1 23/41] accel/habanalabs: add pci health check during heartbeat
From: Ofir Bitton <obitton@...ana.ai>
[ Upstream commit d8b9cea584661b30305cf341bf9f675dc0a25471 ]
Currently upon a heartbeat failure, we don't know if the failure
is due to firmware hang or due to a bad PCI link. Hence, we
are reading a PCI config space register with a known value (vendor ID)
so we will know which of the two possibilities caused the heartbeat
failure.
Signed-off-by: Ofir Bitton <obitton@...ana.ai>
Reviewed-by: Oded Gabbay <ogabbay@...nel.org>
Signed-off-by: Oded Gabbay <ogabbay@...nel.org>
Signed-off-by: Sasha Levin <sashal@...nel.org>
---
drivers/misc/habanalabs/common/device.c | 15 ++++++++++++++-
drivers/misc/habanalabs/common/habanalabs.h | 2 ++
drivers/misc/habanalabs/common/habanalabs_drv.c | 2 --
3 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index e0dca445abf14..9ee1b6abd8a05 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -870,6 +870,18 @@ static void device_early_fini(struct hl_device *hdev)
hdev->asic_funcs->early_fini(hdev);
}
+static bool is_pci_link_healthy(struct hl_device *hdev)
+{
+ u16 vendor_id;
+
+ if (!hdev->pdev)
+ return false;
+
+ pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
+
+ return (vendor_id == PCI_VENDOR_ID_HABANALABS);
+}
+
static void hl_device_heartbeat(struct work_struct *work)
{
struct hl_device *hdev = container_of(work, struct hl_device,
@@ -882,7 +894,8 @@ static void hl_device_heartbeat(struct work_struct *work)
goto reschedule;
if (hl_device_operational(hdev, NULL))
- dev_err(hdev->dev, "Device heartbeat failed!\n");
+ dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
+ is_pci_link_healthy(hdev) ? "healthy" : "broken");
hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 58c95b13be69a..257b94cec6248 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -34,6 +34,8 @@
struct hl_device;
struct hl_fpriv;
+#define PCI_VENDOR_ID_HABANALABS 0x1da3
+
/* Use upper bits of mmap offset to store habana driver specific information.
* bits[63:59] - Encode mmap type
* bits[45:0] - mmap offset value
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 112632afe7d53..ae3cab3f4aa55 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -54,8 +54,6 @@ module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask,
"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
-#define PCI_VENDOR_ID_HABANALABS 0x1da3
-
#define PCI_IDS_GOYA 0x0001
#define PCI_IDS_GAUDI 0x1000
#define PCI_IDS_GAUDI_SEC 0x1010
--
2.39.2
Powered by blists - more mailing lists