lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250214002728.6287-1-qiuxu.zhuo@intel.com>
Date: Fri, 14 Feb 2025 08:27:28 +0800
From: Qiuxu Zhuo <qiuxu.zhuo@...el.com>
To: Tony Luck <tony.luck@...el.com>
Cc: Qiuxu Zhuo <qiuxu.zhuo@...el.com>,
	Borislav Petkov <bp@...en8.de>,
	James Morse <james.morse@....com>,
	Mauro Carvalho Chehab <mchehab@...nel.org>,
	Robert Richter <rric@...nel.org>,
	Kevin Chang <kevin1.chang@...el.com>,
	Thomas Chen <Thomas.Chen@...el.com>,
	linux-edac@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH 1/1] EDAC/{skx_common,i10nm}: Fix some missing error reports on Emerald Rapids

When doing error injection to some memory DIMMs on certain Intel Emerald
Rapids servers, the i10nm_edac missed error reports for some memory DIMMs.

Certain BIOS configurations may hide some memory controllers, and the
i10nm_edac doesn't enumerate these hidden memory controllers. However, the
ADXL decodes memory errors using memory controller physical indices even
if there are hidden memory controllers. Therefore, the memory controller
physical indices reported by the ADXL may mismatch the logical indices
enumerated by the i10nm_edac, resulting in missed error reports for some
memory DIMMs.

Fix this issue by creating a mapping table from memory controller physical
indices (used by the ADXL) to logical indices (used by the i10nm_edac) and
using it to convert the physical indices to the logical indices during the
error handling process.

Fixes: c545f5e41225 ("EDAC/i10nm: Skip the absent memory controllers")
Reported-by: Kevin Chang <kevin1.chang@...el.com>
Tested-by: Kevin Chang <kevin1.chang@...el.com>
Reported-by: Thomas Chen <Thomas.Chen@...el.com>
Tested-by: Thomas Chen <Thomas.Chen@...el.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@...el.com>
---
 drivers/edac/i10nm_base.c |  2 ++
 drivers/edac/skx_common.c | 33 +++++++++++++++++++++++++++++++++
 drivers/edac/skx_common.h | 11 +++++++++++
 3 files changed, 46 insertions(+)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index f45d849d3f15..355a977019e9 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -751,6 +751,8 @@ static int i10nm_get_ddr_munits(void)
 				continue;
 			} else {
 				d->imc[lmc].mdev = mdev;
+				if (res_cfg->type == SPR)
+					skx_set_mc_mapping(d, i, lmc);
 				lmc++;
 			}
 		}
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index f7bd930e058f..fa5b442b1844 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -121,6 +121,35 @@ void skx_adxl_put(void)
 }
 EXPORT_SYMBOL_GPL(skx_adxl_put);
 
+static void skx_init_mc_mapping(struct skx_dev *d)
+{
+	/*
+	 * By default, the BIOS presents all memory controllers within each
+	 * socket to the EDAC driver. The physical indices are the same as
+	 * the logical indices of the memory controllers enumerated by the
+	 * EDAC driver.
+	 */
+	for (int i = 0; i < NUM_IMC; i++)
+		d->mc_mapping[i] = i;
+}
+
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc)
+{
+	edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+		 pmc, lmc);
+
+	d->mc_mapping[pmc] = lmc;
+}
+EXPORT_SYMBOL_GPL(skx_set_mc_mapping);
+
+static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc)
+{
+	edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n",
+		 pmc, d->mc_mapping[pmc]);
+
+	return d->mc_mapping[pmc];
+}
+
 static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
 {
 	struct skx_dev *d;
@@ -188,6 +217,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
 		return false;
 	}
 
+	res->imc = skx_get_mc_mapping(d, res->imc);
+
 	for (i = 0; i < adxl_component_count; i++) {
 		if (adxl_values[i] == ~0x0ull)
 			continue;
@@ -326,6 +357,8 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list)
 			 d->bus[0], d->bus[1], d->bus[2], d->bus[3]);
 		list_add_tail(&d->list, &dev_edac_list);
 		prev = pdev;
+
+		skx_init_mc_mapping(d);
 	}
 
 	if (list)
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index b0845bdd4516..ca5408803f87 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -93,6 +93,16 @@ struct skx_dev {
 	struct pci_dev *uracu; /* for i10nm CPU */
 	struct pci_dev *pcu_cr3; /* for HBM memory detection */
 	u32 mcroute;
+	/*
+	 * Some server BIOS may hide certain memory controllers, and the
+	 * EDAC driver skips those hidden memory controllers. However, the
+	 * ADXL still decodes memory error addresses using physical memory
+	 * controller indices. The mapping table is used to convert the
+	 * physical indices (reported by ADXL) to the logical indices
+	 * (used by the EDAC driver) of present memory controllers during the
+	 * error handling process.
+	 */
+	u8 mc_mapping[NUM_IMC];
 	struct skx_imc {
 		struct mem_ctl_info *mci;
 		struct pci_dev *mdev; /* for i10nm CPU */
@@ -242,6 +252,7 @@ void skx_adxl_put(void);
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
 void skx_set_mem_cfg(bool mem_cfg_2lm);
 void skx_set_res_cfg(struct res_config *cfg);
+void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc);
 
 int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
 
-- 
2.17.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ