lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 28 May 2020 12:13:06 +0200
From:   Robert Richter <rrichter@...vell.com>
To:     Borislav Petkov <bp@...en8.de>,
        Mauro Carvalho Chehab <mchehab@...nel.org>,
        Tony Luck <tony.luck@...el.com>
CC:     James Morse <james.morse@....com>,
        Aristeu Rozanski <aris@...hat.com>,
        Robert Richter <rrichter@...vell.com>,
        Matthias Brugger <mbrugger@...e.com>,
        <linux-edac@...r.kernel.org>, <linux-kernel@...r.kernel.org>
Subject: [PATCH v4] EDAC/ghes: Setup DIMM label from DMI and use it in error reports

The ghes driver reports errors with 'unknown label' even if the actual
DIMM label is known, e.g.:

 EDAC MC0: 1 CE Single-bit ECC on unknown label (node:0 card:0
   module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x966a9b3 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

Fix this by using struct dimm_info's label string in error reports:

 EDAC MC0: 1 CE Single-bit ECC on N0 DIMM_A0 (node:0 card:0 module:0
   rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x99223d8 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

The labels are initialized by reading the bank and device strings from
DMI. Now, the label information can also read from sysfs. E.g. a
ThunderX2 system will show the following:

 /sys/devices/system/edac/mc/mc0/dimm0/dimm_label:N0 DIMM_A0
 /sys/devices/system/edac/mc/mc0/dimm1/dimm_label:N0 DIMM_B0
 /sys/devices/system/edac/mc/mc0/dimm2/dimm_label:N0 DIMM_C0
 /sys/devices/system/edac/mc/mc0/dimm3/dimm_label:N0 DIMM_D0
 /sys/devices/system/edac/mc/mc0/dimm4/dimm_label:N0 DIMM_E0
 /sys/devices/system/edac/mc/mc0/dimm5/dimm_label:N0 DIMM_F0
 /sys/devices/system/edac/mc/mc0/dimm6/dimm_label:N0 DIMM_G0
 /sys/devices/system/edac/mc/mc0/dimm7/dimm_label:N0 DIMM_H0
 /sys/devices/system/edac/mc/mc0/dimm8/dimm_label:N1 DIMM_I0
 /sys/devices/system/edac/mc/mc0/dimm9/dimm_label:N1 DIMM_J0
 /sys/devices/system/edac/mc/mc0/dimm10/dimm_label:N1 DIMM_K0
 /sys/devices/system/edac/mc/mc0/dimm11/dimm_label:N1 DIMM_L0
 /sys/devices/system/edac/mc/mc0/dimm12/dimm_label:N1 DIMM_M0
 /sys/devices/system/edac/mc/mc0/dimm13/dimm_label:N1 DIMM_N0
 /sys/devices/system/edac/mc/mc0/dimm14/dimm_label:N1 DIMM_O0
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_label:N1 DIMM_P0

Since dimm_labels can be rewritten, that label will be used in a later
error report:

 # echo foobar >/sys/devices/system/edac/mc/mc0/dimm0/dimm_label
 # # some error injection here
 # dmesg | grep foobar
 [ 751.383533] EDAC MC0: 1 CE Single-bit ECC on foobar (node:0 card:0
 module:0 rank:1 bank:259 col:3 bit_pos:16 DIMM location:N0 DIMM_A0
 page:0x8c8dc74 offset:0x0 grain:1 syndrome:0x0 - APEI location:
 node:0 card:0 module:0 rank:1 bank:259 col:3 bit_pos:16 DIMM
 location:N0 DIMM_A0 status(0x0000000000000400): Storage error in DRAM
 memory)

Signed-off-by: Robert Richter <rrichter@...vell.com>
---
v4:

 * dimm->label: Only update dimm->label in if bank/device is found in
   the SMBIOS table, this keeps current behavior for machines that do
   not provide this information.

 * e->location: Keep current behavior how e->location is written.

 * e->label: Use dimm->label if a DIMM was found by its handle and
   "unknown memory" otherwise. This aligns with the edac_mc
   implementation.

Signed-off-by: Robert Richter <rrichter@...vell.com>
---
 drivers/edac/ghes_edac.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index cb3dab56a875..9a6a055ab624 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -87,16 +87,29 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
 		(*num_dimm)++;
 }
 
-static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle)
+static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
 {
 	struct dimm_info *dimm;
 
 	mci_for_each_dimm(mci, dimm) {
 		if (dimm->smbios_handle == handle)
-			return dimm->idx;
+			return dimm;
 	}
 
-	return -1;
+	return NULL;
+}
+
+static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
+{
+	const char *bank = NULL, *device = NULL;
+
+	dmi_memdev_name(handle, &bank, &device);
+
+	/* both strings must be non-zero */
+	if (bank && *bank && device && *device) {
+		snprintf(dimm->label, sizeof(dimm->label),
+			"%s %s", bank, device);
+	}
 }
 
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
@@ -179,9 +192,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 		dimm->dtype = DEV_UNKNOWN;
 		dimm->grain = 128;		/* Likely, worse case */
 
-		/*
-		 * FIXME: It shouldn't be hard to also fill the DIMM labels
-		 */
+		dimm_setup_label(dimm, entry->handle);
 
 		if (dimm->nr_pages) {
 			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
@@ -228,7 +239,6 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	memset(e, 0, sizeof (*e));
 	e->error_count = 1;
 	e->grain = 1;
-	strcpy(e->label, "unknown label");
 	e->msg = pvt->msg;
 	e->other_detail = pvt->other_detail;
 	e->top_layer = -1;
@@ -345,7 +355,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
 		const char *bank = NULL, *device = NULL;
-		int index = -1;
+		struct dimm_info *dimm;
 
 		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
 		if (bank != NULL && device != NULL)
@@ -354,13 +364,18 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 				     mem_err->mem_dev_handle);
 
-		index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
-		if (index >= 0)
-			e->top_layer = index;
+		dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
+		if (dimm) {
+			e->top_layer = dimm->idx;
+			strcpy(e->label, dimm->label);
+		}
 	}
 	if (p > e->location)
 		*(p - 1) = '\0';
 
+	if (!*e->label)
+		strcpy(e->label, "unknown memory");
+
 	/* All other fields are mapped on e->other_detail */
 	p = pvt->other_detail;
 	p += snprintf(p, sizeof(pvt->other_detail),
-- 
2.20.1

Powered by blists - more mailing lists