lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240527114746.1919292-9-obitton@habana.ai>
Date: Mon, 27 May 2024 14:47:46 +0300
From: Ofir Bitton <obitton@...ana.ai>
To: dri-devel@...ts.freedesktop.org, linux-kernel@...r.kernel.org
Cc: Farah Kassabri <fkassabri@...ana.ai>
Subject: [PATCH 9/9] accel/habanalabs: add heartbeat debug info

From: Farah Kassabri <fkassabri@...ana.ai>

It is hard to debug the cause of heartbeat check failures.
To ease this task, print more information when such a failure
happens.
The heartbeat checks the communication with the FW, so printing
the CPU queue pi/ci, the event queue ci and previous index, and the
number of heartbeat events received so far helps in debugging the issue.
In addition, this patch raises HL_HEARTBEAT_PER_USEC from 5 s to 10 s.

Signed-off-by: Farah Kassabri <fkassabri@...ana.ai>
Reviewed-by: Ofir Bitton <obitton@...ana.ai>
---
 drivers/accel/habanalabs/common/device.c     | 12 ++++++++++++
 drivers/accel/habanalabs/common/habanalabs.h | 15 ++++++++++++++-
 drivers/accel/habanalabs/gaudi2/gaudi2.c     |  3 +++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index bb3f44392908..35502e938b5d 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1052,12 +1052,22 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
 static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u32 cpu_q_id;
 
 	if (!prop->cpucp_info.eq_health_check_supported)
 		return true;
 
 	if (!hdev->eq_heartbeat_received) {
+		cpu_q_id = hdev->heartbeat_debug_info.cpu_queue_id;
+
 		dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
+
+		dev_err(hdev->dev, "Heartbeat events counter: %u, Q_PI: %u, Q_CI: %u, EQ CI: %u, EQ prev: %u\n",
+				hdev->heartbeat_debug_info.heartbeat_event_counter,
+				hdev->kernel_queues[cpu_q_id].pi,
+				atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
+				hdev->event_queue.ci,
+				hdev->event_queue.prev_eqe_index);
 		return false;
 	}
 
@@ -1138,6 +1148,8 @@ static int device_late_init(struct hl_device *hdev)
 	hdev->high_pll = hdev->asic_prop.high_pll;
 
 	if (hdev->heartbeat) {
+		hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+
 		/*
 		 * Before scheduling the heartbeat driver will check if eq event has received.
 		 * for the first schedule we need to set the indication as true then for the next
diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h
index 55495861f432..5e9f54ca336a 100644
--- a/drivers/accel/habanalabs/common/habanalabs.h
+++ b/drivers/accel/habanalabs/common/habanalabs.h
@@ -71,7 +71,7 @@ struct hl_fpriv;
 
 #define HL_DEVICE_TIMEOUT_USEC		1000000 /* 1 s */
 
-#define HL_HEARTBEAT_PER_USEC		5000000 /* 5 s */
+#define HL_HEARTBEAT_PER_USEC		10000000 /* 10 s */
 
 #define HL_PLL_LOW_JOB_FREQ_USEC	5000000 /* 5 s */
 
@@ -3174,6 +3174,16 @@ struct hl_reset_info {
 	u8		watchdog_active;
 };
 
+/**
+ * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure.
+ * @heartbeat_event_counter: number of heartbeat events received.
+ * @cpu_queue_id: used to read the queue pi/ci
+ */
+struct eq_heartbeat_debug_info {
+	u32 heartbeat_event_counter;
+	u32 cpu_queue_id;
+};
+
 /**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -3262,6 +3272,7 @@ struct hl_reset_info {
  * @clk_throttling: holds information about current/previous clock throttling events
  * @captured_err_info: holds information about errors.
  * @reset_info: holds current device reset information.
+ * @heartbeat_debug_info: counters used to debug heartbeat failures.
  * @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling.
  * @stream_master_qid_arr: pointer to array with QIDs of master streams.
  * @fw_inner_major_ver: the major of current loaded preboot inner version.
@@ -3452,6 +3463,8 @@ struct hl_device {
 
 	struct hl_reset_info		reset_info;
 
+	struct eq_heartbeat_debug_info	heartbeat_debug_info;
+
 	cpumask_t			irq_affinity_mask;
 
 	u32				*stream_master_qid_arr;
diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c
index 962b7fcd4318..08276f03c80f 100644
--- a/drivers/accel/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c
@@ -3796,6 +3796,8 @@ static int gaudi2_sw_init(struct hl_device *hdev)
 	if (rc)
 		goto special_blocks_free;
 
+	hdev->heartbeat_debug_info.cpu_queue_id = GAUDI2_QUEUE_ID_CPU_PQ;
+
 	return 0;
 
 special_blocks_free:
@@ -9777,6 +9779,7 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type)
 
 static void hl_eq_heartbeat_event_handle(struct hl_device *hdev)
 {
+	hdev->heartbeat_debug_info.heartbeat_event_counter++;
 	hdev->eq_heartbeat_received = true;
 }
 
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ