lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190529084344.28562-20-rrichter@marvell.com>
Date:   Wed, 29 May 2019 08:44:45 +0000
From:   Robert Richter <rrichter@...vell.com>
To:     Borislav Petkov <bp@...en8.de>, Tony Luck <tony.luck@...el.com>,
        "James Morse" <james.morse@....com>,
        Mauro Carvalho Chehab <mchehab@...nel.org>
CC:     "linux-edac@...r.kernel.org" <linux-edac@...r.kernel.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Robert Richter <rrichter@...vell.com>
Subject: [PATCH 19/21] EDAC, ghes: Identify dimm by node, card, module and
 handle

According to UEFI Spec. 2.7 (N.2.5 Memory Error Section), a failing
DIMM (module or rank number) can be identified by its error location,
which consists of node, card and module. A module handle is used to map
the error to the DIMMs listed in the DMI table. Collect all of this data
from the error record and select the DIMM accordingly. Inconsistent
error records are reported; a record is inconsistent if the same DIMM
handle reports errors with a different node, card or module.

This change makes it possible to enable per-layer reporting based on
node, card and module in the next patch.

Signed-off-by: Robert Richter <rrichter@...vell.com>
---
 drivers/edac/ghes_edac.c | 74 +++++++++++++++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 12 deletions(-)

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 4bac643d3404..07c847ed7315 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -83,8 +83,11 @@ struct memarr_dmi_entry {
 
 struct ghes_dimm_info {
 	struct dimm_info dimm_info;
+	struct dimm_info *dimm;
 	int		idx;
 	int		numa_node;
+	int		card;
+	int		module;
 	phys_addr_t	start;
 	phys_addr_t	end;
 	u16		phys_handle;
@@ -119,6 +122,8 @@ static void ghes_dimm_info_init(void)
 	for_each_dimm(dimm) {
 		dimm->idx	= idx;
 		dimm->numa_node	= NUMA_NO_NODE;
+		dimm->card	= -1;
+		dimm->module	= -1;
 		idx++;
 	}
 }
@@ -401,6 +406,13 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
 
 		if (*dmi_dimm->label)
 			strcpy(mci_dimm->label, dmi_dimm->label);
+
+		/*
+		 * From here on do not use any longer &dimm.dimm_info.
+		 * Instead switch to the mci's dimm info which might
+		 * contain updated data, such as the label.
+		 */
+		dimm->dimm = mci_dimm;
 	}
 
 	if (index != mci->tot_dimms)
@@ -408,24 +420,46 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
 			index, mci->tot_dimms);
 }
 
-static struct mem_ctl_info *get_mc_by_node(int nid)
+/* Requires ghes_lock being set. */
+static struct ghes_dimm_info *
+get_and_prepare_dimm_info(int nid, int card, int module, int handle)
 {
-	struct mem_ctl_info *mci = edac_mc_find(nid);
+	static struct ghes_dimm_info *dimm;
+	struct dimm_info *di;
 
-	if (mci)
-		return mci;
+	/*
+	 * We require smbios_handle being set in the error report for
+	 * per layer reporting (SMBIOS handle for the Type 17 Memory
+	 * Device Structure that represents the Memory Module)
+	 */
+	for_each_dimm(dimm) {
+		di = dimm->dimm;
+		if (di->smbios_handle == handle)
+			goto found;
+	}
 
-	if (num_possible_nodes() > 1) {
-		edac_mc_printk(fallback, KERN_WARNING,
-			"Invalid or no node information, falling back to first node: %s",
-			fallback->dev_name);
+	return NULL;
+found:
+	if (dimm->card < 0 && card >= 0)
+		dimm->card = card;
+	if (dimm->module < 0 && module >= 0)
+		dimm->module = module;
+
+	if ((num_possible_nodes() > 1 && di->mci->mc_idx != nid) ||
+		(card >= 0 && card != dimm->card) ||
+		(module >= 0 && module != dimm->module)) {
+		edac_mc_printk(di->mci, KERN_WARNING,
+			"Inconsistent error report (nid/card/module): %d/%d/%d (dimm%d: %d/%d/%d)",
+			nid, card, module, di->idx,
+			di->mci->mc_idx, dimm->card, dimm->module);
 	}
 
-	return fallback;
+	return dimm;
 }
 
 void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 {
+	struct ghes_dimm_info *dimm;
 	struct dimm_info *dimm_info;
 	enum hw_event_mc_err_type type;
 	struct edac_raw_error_desc *e;
@@ -434,6 +468,9 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	unsigned long flags;
 	char *p;
 	int nid = NUMA_NO_NODE;
+	int card = -1;
+	int module = -1;
+	int handle = -1;
 
 	/* We need at least one mc */
 	if (WARN_ON_ONCE(!fallback))
@@ -449,10 +486,23 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 
 	spin_lock_irqsave(&ghes_lock, flags);
 
-	/* select the node's mc device */
 	if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
 		nid = mem_err->node;
-	mci = get_mc_by_node(nid);
+	if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
+		card = mem_err->card;
+	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
+		module = mem_err->module;
+	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)
+		handle = mem_err->mem_dev_handle;
+
+	dimm = get_and_prepare_dimm_info(nid, card, module, handle);
+	if (dimm)
+		mci = dimm->dimm->mci;
+	else
+		mci = edac_mc_find(nid);
+	if (!mci)
+		mci = fallback;
+
 	pvt = mci->pvt_info;
 	e = &mci->error_desc;
 
@@ -670,7 +720,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	if (p > pvt->other_detail)
 		*(p - 1) = '\0';
 
-	dimm_info = edac_get_dimm_by_index(mci, e->top_layer);
+	dimm_info = dimm ? dimm->dimm : NULL;
 
 	edac_raw_mc_handle_error(type, mci, dimm_info, e, -1, -1);
 
-- 
2.20.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ