lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220704092941.2237683-1-ogabbay@kernel.org>
Date:   Mon,  4 Jul 2022 12:29:30 +0300
From:   Oded Gabbay <ogabbay@...nel.org>
To:     linux-kernel@...r.kernel.org
Cc:     Ofir Bitton <obitton@...ana.ai>
Subject: [PATCH 01/12] habanalabs/gaudi2: reset device upon critical ECC event

From: Ofir Bitton <obitton@...ana.ai>

Correctable ECC events are not fatal, but as they accumulate, the f/w
can decide that a hard reset is required. This indication is
propagated to the host using the existing ECC event interface.

Signed-off-by: Ofir Bitton <obitton@...ana.ai>
Reviewed-by: Oded Gabbay <ogabbay@...nel.org>
Signed-off-by: Oded Gabbay <ogabbay@...nel.org>
---
 drivers/misc/habanalabs/gaudi2/gaudi2.c       | 25 +++++++++++--------
 .../misc/habanalabs/include/common/cpucp_if.h |  2 +-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index edcf23b314a7..dbbd08600a56 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -6637,7 +6637,7 @@ static void gaudi2_print_irq_info(struct hl_device *hdev, u16 event_type)
 								event_type, desc);
 }
 
-static void gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
+static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 		struct hl_eq_ecc_data *ecc_data)
 {
 	u64 ecc_address = 0, ecc_syndrom = 0;
@@ -6647,8 +6647,11 @@ static void gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
 	ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
 	memory_wrapper_idx = ecc_data->memory_wrapper_idx;
 
-	dev_err(hdev->dev, "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n",
-		ecc_address, ecc_syndrom, memory_wrapper_idx);
+	dev_err(hdev->dev,
+		"ECC error detected. address: %#llx. Syndrom: %#llx. block id %u. critical %u.\n",
+		ecc_address, ecc_syndrom, memory_wrapper_idx, ecc_data->is_critical);
+
+	return !!ecc_data->is_critical;
 }
 
 /*
@@ -7991,9 +7994,9 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type,
 	}
 
 	dev_err_ratelimited(hdev->dev,
-			"System Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Error cause: %s\n",
-			hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel,
-			hbm_mc_sei_cause[cause_idx]);
+		"System Error Interrupt - HBM(%u) MC(%u) MC_CH(%u) MC_PC(%u). Critical(%u). Error cause: %s\n",
+		hbm_id, mc_id, sei_data->hdr.mc_channel, sei_data->hdr.mc_pseudo_channel,
+		sei_data->hdr.is_critical, hbm_mc_sei_cause[cause_idx]);
 
 	/* Print error-specific info */
 	switch (cause_idx) {
@@ -8032,6 +8035,8 @@ static bool gaudi2_handle_hbm_mc_sei_err(struct hl_device *hdev, u16 event_type,
 		break;
 	};
 
+	require_hard_reset |= !!sei_data->hdr.is_critical;
+
 	return require_hard_reset;
 }
 
@@ -8199,7 +8204,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 {
 	u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY;
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
-	bool hbm_require_reset = false, skip_reset = false;
+	bool reset_required = false, skip_reset = false;
 	int index, sbte_index;
 	u16 event_type;
 
@@ -8222,7 +8227,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 		fallthrough;
 	case GAUDI2_EVENT_ROTATOR0_SERR ... GAUDI2_EVENT_ROTATOR1_DERR:
 		reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-		gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+		reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
 		break;
 
 	case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM:
@@ -8387,7 +8392,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 	case GAUDI2_EVENT_HBM0_MC0_SEI_SEVERE ... GAUDI2_EVENT_HBM5_MC1_SEI_NON_SEVERE:
 		if (gaudi2_handle_hbm_mc_sei_err(hdev, event_type, &eq_entry->sei_data)) {
 			reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
-			hbm_require_reset = true;
+			reset_required = true;
 		}
 		break;
 
@@ -8539,7 +8544,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
 						event_type);
 	}
 
-	if ((gaudi2_irq_map_table[event_type].reset || hbm_require_reset) && !skip_reset)
+	if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset)
 		goto reset_device;
 
 	/* Send unmask irq only for interrupts not classified as MSG */
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index 719b2ff80985..abf40e1c4965 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -192,7 +192,7 @@ struct hl_hbm_sei_header {
 	__u8 sei_cause;		/* enum hl_hbm_sei_cause */
 	__u8 mc_channel;		/* range: 0-3 */
 	__u8 mc_pseudo_channel;	/* range: 0-7 */
-	__u8 pad[1];
+	__u8 is_critical;
 };
 
 #define HBM_RD_ADDR_SID_SHIFT		0
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ