lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <1493147772-2721-1-git-send-email-Yazen.Ghannam@amd.com>
Date:   Tue, 25 Apr 2017 14:16:11 -0500
From:   Yazen Ghannam <Yazen.Ghannam@....com>
To:     <linux-edac@...r.kernel.org>
CC:     Yazen Ghannam <Yazen.Ghannam@....com>,
        Tony Luck <tony.luck@...el.com>, Borislav Petkov <bp@...e.de>,
        <x86@...nel.org>, <linux-kernel@...r.kernel.org>
Subject: [PATCH v3 1/2] x86/mce/AMD: Redo logging of errors from APIC LVT interrupts

From: Yazen Ghannam <yazen.ghannam@....com>

Linux supports the new SMCA MCA_DE{STAT,ADDR} registers, so we have been
using them in place of MCA_{STATUS,ADDR} on SMCA systems. However, the
guidance for current implementations of SMCA is to continue using
MCA_{STATUS,ADDR} and to use MCA_DE{STAT,ADDR} only if a Deferred error
was not found in the former registers. If we logged a Deferred error from
MCA_STATUS, then we should also clear MCA_DESTAT. This also means we
shouldn't clear MCA_CONFIG[LogDeferredInMcaStat], because Deferred errors
must continue to be logged in MCA_STATUS.

Don't clear MCA_CONFIG[LogDeferredInMcaStat] during AMD mcheck init.

Don't break after finding the first error in either the Deferred or
Thresholding interrupt handlers.

Rework __log_error() to only log the error it is given. Most of the
MCI_STATUS_VAL validity checks are moved into the __log_error_*() helper
functions.

Write the Deferred error helper function, __log_error_deferred(), to follow
the guidance for current SMCA systems.

Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
---
Link:
https://lkml.kernel.org/r/1491326672-48298-1-git-send-email-Yazen.Ghannam@amd.com

v2->v3:
- Write __log_error_*() helper functions for Deferred and Thresholding errors.
- Redo __log_error() to only log the error and do valid checks in the helpers.
- Log every valid error in the DE handler rather than just Deferred errors.
- Don't break after the first error found in either interrupt handler.

v1->v2:
- Change name of check_deferred_status() to is_deferred_error().
- Rework __log_error() to move out SMCA/Deferred error-specific code.

 arch/x86/kernel/cpu/mcheck/mce_amd.c | 137 +++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 62 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 6e4a047..e6e507d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -472,20 +472,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
 		smca_high |= BIT(0);
 
 		/*
-		 * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
-		 * registers with the option of additionally logging to
-		 * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
-		 *
-		 * This bit is usually set by BIOS to retain the old behavior
-		 * for OSes that don't use the new registers. Linux supports the
-		 * new registers so let's disable that additional logging here.
-		 *
-		 * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
-		 * portion of the MSR).
-		 */
-		smca_high &= ~BIT(2);
-
-		/*
 		 * SMCA sets the Deferred Error Interrupt type per bank.
 		 *
 		 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
@@ -756,36 +742,19 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
 EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
 
 static void
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
+__log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
 {
-	u32 msr_status = msr_ops.status(bank);
-	u32 msr_addr = msr_ops.addr(bank);
 	struct mce m;
-	u64 status;
-
-	WARN_ON_ONCE(deferred_err && threshold_err);
-
-	if (deferred_err && mce_flags.smca) {
-		msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
-		msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
-	}
-
-	rdmsrl(msr_status, status);
-
-	if (!(status & MCI_STATUS_VAL))
-		return;
 
 	mce_setup(&m);
 
 	m.status = status;
+	m.misc   = misc;
 	m.bank   = bank;
 	m.tsc	 = rdtsc();
 
-	if (threshold_err)
-		m.misc = misc;
-
 	if (m.status & MCI_STATUS_ADDRV) {
-		rdmsrl(msr_addr, m.addr);
+		m.addr = addr;
 
 		/*
 		 * Extract [55:<lsb>] where lsb is the least significant
@@ -806,8 +775,6 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 	}
 
 	mce_log(&m);
-
-	wrmsrl(msr_status, 0);
 }
 
 static inline void __smp_deferred_error_interrupt(void)
@@ -832,33 +799,83 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
 	exiting_ack_irq();
 }
 
+/*
+ * We have three scenarios for checking for Deferred errors.
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ *    clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT if no Deferred error was found in
+ *    MCA_STATUS, and log any valid error found there.
+ */
+static void
+__log_error_deferred(unsigned int bank)
+{
+	u64 status, addr;
+	bool logged_deferred = false;
+
+	rdmsrl(msr_ops.status(bank), status);
+
+	/* Log any valid error we find. */
+	if (status & MCI_STATUS_VAL) {
+		rdmsrl(msr_ops.addr(bank), addr);
+
+		__log_error(bank, status, addr, 0);
+
+		wrmsrl(msr_ops.status(bank), 0);
+
+		logged_deferred = !!(status & MCI_STATUS_DEFERRED);
+	}
+
+	if (!mce_flags.smca)
+		return;
+
+	/* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+	if (logged_deferred) {
+		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+		return;
+	}
+
+	rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);
+
+	/*
+	 * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+	 * for a valid error.
+	 */
+	if (status & MCI_STATUS_VAL) {
+		rdmsrl(MSR_AMD64_SMCA_MCx_DEADDR(bank), addr);
+
+		__log_error(bank, status, addr, 0);
+
+		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+	}
+}
+
 /* APIC interrupt handler for deferred errors */
 static void amd_deferred_error_interrupt(void)
 {
 	unsigned int bank;
-	u32 msr_status;
-	u64 status;
 
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
-		msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
-					      : msr_ops.status(bank);
+	for (bank = 0; bank < mca_cfg.banks; ++bank)
+		__log_error_deferred(bank);
+}
+
+static void __log_error_thresholding(unsigned int bank, u64 misc)
+{
+	u64 status, addr;
 
-		rdmsrl(msr_status, status);
+	rdmsrl(msr_ops.status(bank), status);
 
-		if (!(status & MCI_STATUS_VAL) ||
-		    !(status & MCI_STATUS_DEFERRED))
-			continue;
+	if (status & MCI_STATUS_VAL) {
+		rdmsrl(msr_ops.addr(bank), addr);
 
-		__log_error(bank, true, false, 0);
-		break;
+		__log_error(bank, status, addr, misc);
+
+		wrmsrl(msr_ops.status(bank), 0);
 	}
 }
 
 /*
  * APIC Interrupt Handler
- */
-
-/*
  * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
@@ -870,7 +887,6 @@ static void amd_threshold_interrupt(void)
 	unsigned int bank, block, cpu = smp_processor_id();
 	struct thresh_restart tr;
 
-	/* assume first bank caused it */
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
 		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
 			continue;
@@ -897,19 +913,16 @@ static void amd_threshold_interrupt(void)
 			 * Log the machine check that caused the threshold
 			 * event.
 			 */
-			if (high & MASK_OVERFLOW_HI)
-				goto log;
+			if (high & MASK_OVERFLOW_HI) {
+				__log_error_thresholding(bank, ((u64)high << 32) | low);
+
+				/* Reset threshold block after logging error. */
+				memset(&tr, 0, sizeof(tr));
+				tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+				threshold_restart_bank(&tr);
+			}
 		}
 	}
-	return;
-
-log:
-	__log_error(bank, false, true, ((u64)high << 32) | low);
-
-	/* Reset threshold block after logging error. */
-	memset(&tr, 0, sizeof(tr));
-	tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
-	threshold_restart_bank(&tr);
 }
 
 /*
-- 
2.7.4

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ