lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:   Wed, 29 Nov 2023 07:35:21 +0000
From:   Muralidhara M K <muralimk@....com>
To:     <linux-edac@...r.kernel.org>
CC:     <linux-kernel@...r.kernel.org>, <bp@...en8.de>,
        <mchehab@...nel.org>, Muralidhara M K <muralidhara.mk@....com>
Subject: [PATCH v2 6/6] RAS: EDAC/amd64: Retire all system physical address from HBM3 row

From: Muralidhara M K <muralidhara.mk@....com>

AMD systems have HBM memory embedded within the chip. The entire memory
is managed by the host OS. Error containment needs to be reliable, because
HBM memory cannot be replaced.

HBM3 memory has 8 columns in each row; the column bits are c2, c3 and c4,
which give 8 possible combinations of addresses in each row.

Identify all of these system physical addresses in an HBM row and retire
each of them to get rid of intermittent or recurrent memory
errors.

Signed-off-by: Muralidhara M K <muralidhara.mk@....com>
---
Changes:
v1 -> v2 : Rename and modify function amd_umc_retire_column_spa_from_row() 

 drivers/edac/amd64_edac.c |  3 ++
 drivers/ras/amd/atl/umc.c | 77 +++++++++++++++++++++++++++++++++++++++
 include/linux/amd-atl.h   |  2 +
 3 files changed, 82 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 623f84c53d2d..9872ede7eca9 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2831,6 +2831,9 @@ static void decode_umc_error(int node_id, struct mce *m)
 
 	error_address_to_page_and_offset(sys_addr, &err);
 
+	if (pvt->fam == 0x19 && (pvt->model >= 0x90 && pvt->model <= 0x9f))
+		amd_umc_retire_column_spa_from_row(m);
+
 log_error:
 	__log_ecc_error(mci, &err, ecc_type);
 }
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 3533db279cec..de51b666b20e 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -255,3 +255,80 @@ int umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(umc_mca_addr_to_sys_addr);
+
+/*
+ * High Bandwidth Memory (HBM v3) has a fixed number of columns in a row.
+ * Specifically, HBM v3 has 8 columns in one row.
+ * Extract the column bits of a row to build every combination of masks and
+ * to retire all the system physical addresses in that particular row.
+ */
+#define MAX_COLUMNS_IN_HBM_ROW	8
+
+/* Column bits 2, 3 and 4 sit at bits 8, 9 and 14 of the Normalized Address */
+#define UMC_NA_C2_BIT	BIT(8)
+#define UMC_NA_C3_BIT	BIT(9)
+#define UMC_NA_C4_BIT	BIT(14)
+
+/* Possible combinations of [c4 c3 c2] column address masks in a HBM v3 row */
+#define C_1_1_1_MASK	(UMC_NA_C4_BIT | UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_1_1_0_MASK	(UMC_NA_C4_BIT | UMC_NA_C3_BIT)
+#define C_1_0_1_MASK	(UMC_NA_C4_BIT | UMC_NA_C2_BIT)
+#define C_1_0_0_MASK	(UMC_NA_C4_BIT)
+#define C_0_1_1_MASK	(UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_0_1_0_MASK	(UMC_NA_C3_BIT)
+#define C_0_0_1_MASK	(UMC_NA_C2_BIT)
+#define C_0_0_0_MASK	(~C_1_1_1_MASK)	/* AND-mask clearing c4, c3 and c2 */
+
+/*
+ * Translate every column of the HBM v3 row that contains the normalized
+ * address of @m to a system physical address. Only successful translations
+ * are stored, packed at the front of @col. Returns the number of valid entries.
+ */
+static unsigned int identify_column_spa_from_row(struct mce *m, u64 *col)
+{
+	static const u64 column_masks[MAX_COLUMNS_IN_HBM_ROW] = {
+		0, C_0_0_1_MASK, C_0_1_0_MASK, C_0_1_1_MASK,
+		C_1_0_0_MASK, C_1_0_1_MASK, C_1_1_0_MASK, C_1_1_1_MASK };
+	u8 cs_inst_id = get_cs_inst_id(m);
+	u8 socket_id = get_socket_id(m);
+	u8 die_id = get_die_id(m);
+	u16 df_acc_id = get_df_acc_id(m);
+	/* Clear [c4 c3 c2] so that each mask below selects exactly one column */
+	u64 norm_addr = get_norm_addr(m) & C_0_0_0_MASK;
+	unsigned int column, valid = 0;
+	u64 retire_addr;
+
+	for (column = 0; column < ARRAY_SIZE(column_masks); column++) {
+		retire_addr = norm_addr | column_masks[column];
+		if (norm_to_sys_addr(df_acc_id, socket_id, die_id, cs_inst_id, &retire_addr))
+			pr_warn("Failed norm_to_sys_addr for column[%u]\n", column);
+		else
+			col[valid++] = retire_addr;
+	}
+
+	return valid;
+}
+
+/* Retire the page behind each distinct system physical address of the row */
+void amd_umc_retire_column_spa_from_row(struct mce *m)
+{
+	u64 col[MAX_COLUMNS_IN_HBM_ROW];
+	unsigned int i, j, count;
+
+	pr_info("Identify SPA of all columns from row for MCE Addr:0x%llx\n", m->addr);
+	count = identify_column_spa_from_row(m, col);
+
+	for (i = 0; i < count; i++) {
+		/* A match at a lower index means this address was handled already */
+		for (j = 0; j < i; j++) {
+			if (col[i] == col[j])
+				break;
+		}
+		if (j < i)
+			continue;
+
+		pr_debug("Retire column spa:0x%llx\n", col[i]);
+		memory_failure(PHYS_PFN(col[i]), 0);
+	}
+}
+EXPORT_SYMBOL_GPL(amd_umc_retire_column_spa_from_row);
diff --git a/include/linux/amd-atl.h b/include/linux/amd-atl.h
index c625ea3ab5d0..6cba39be63ca 100644
--- a/include/linux/amd-atl.h
+++ b/include/linux/amd-atl.h
@@ -25,4 +25,6 @@ static inline int amd_umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
 	return umc_mca_addr_to_sys_addr(m, sys_addr);
 }
 
+void amd_umc_retire_column_spa_from_row(struct mce *m);
+
 #endif /* _AMD_ATL_H */
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ