lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250226015202.36576-12-qiuxu.zhuo@intel.com>
Date: Wed, 26 Feb 2025 09:52:02 +0800
From: Qiuxu Zhuo <qiuxu.zhuo@...el.com>
To: Tony Luck <tony.luck@...el.com>,
	Borislav Petkov <bp@...en8.de>,
	Jason Baron <jbaron@...mai.com>
Cc: Qiuxu Zhuo <qiuxu.zhuo@...el.com>,
	James Morse <james.morse@....com>,
	Mauro Carvalho Chehab <mchehab@...nel.org>,
	Robert Richter <rric@...nel.org>,
	Gary Wang <gary.c.wang@...el.com>,
	Yi Lai <yi1.lai@...el.com>,
	linux-edac@...r.kernel.org,
	linux-kernel@...r.kernel.org
Subject: [PATCH 11/11] EDAC/ie31200: Switch Raptor Lake-S to interrupt mode

Raptor Lake-S SoCs notify memory errors via Machine Check. Add interrupt
mode support and switch Raptor Lake-S EDAC from polling to interrupt mode.

Tested-by: Gary Wang <gary.c.wang@...el.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@...el.com>
---
 drivers/edac/Kconfig        |  2 +-
 drivers/edac/ie31200_edac.c | 83 ++++++++++++++++++++++++++++++++++---
 2 files changed, 78 insertions(+), 7 deletions(-)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 2051a7c944a5..b908a118fb32 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -168,7 +168,7 @@ config EDAC_I3200
 
 config EDAC_IE31200
 	tristate "Intel e312xx"
-	depends on PCI && X86
+	depends on PCI && X86 && X86_MCE_INTEL
 	help
 	  Support for error detection and correction on the Intel
 	  E3-1200 based DRAM controllers.
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index 8c0a2beec537..4abcf7b322c2 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -51,6 +51,7 @@
 #include <linux/edac.h>
 
 #include <linux/io-64-nonatomic-lo-hi.h>
+#include <asm/mce.h>
 #include "edac_module.h"
 
 #define EDAC_MOD_STR "ie31200_edac"
@@ -123,6 +124,7 @@ static int ie31200_registered = 1;
 
 struct res_config {
 	enum mem_type mtype;
+	bool machine_check;
 	int imc_num;
 	/* Host MMIO configuration register */
 	u64 reg_mchbar_mask;
@@ -172,6 +174,7 @@ struct ie31200_error_info {
 	u16 errsts;
 	u16 errsts2;
 	u64 eccerrlog[IE31200_CHANNELS];
+	u64 erraddr;
 };
 
 static const struct ie31200_dev_info ie31200_devs[] = {
@@ -327,13 +330,13 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
 		log = info->eccerrlog[channel];
 		if (log & cfg->reg_eccerrlog_ue_mask) {
 			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-					     0, 0, 0,
+					     info->erraddr >> PAGE_SHIFT, 0, 0,
 					     field_get(cfg->reg_eccerrlog_rank_mask, log),
 					     channel, -1,
 					     "ie31200 UE", "");
 		} else if (log & cfg->reg_eccerrlog_ce_mask) {
 			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-					     0, 0,
+					     info->erraddr >> PAGE_SHIFT, 0,
 					     field_get(cfg->reg_eccerrlog_syndrome_mask, log),
 					     field_get(cfg->reg_eccerrlog_rank_mask, log),
 					     channel, -1,
@@ -342,14 +345,20 @@ static void ie31200_process_error_info(struct mem_ctl_info *mci,
 	}
 }
 
-static void ie31200_check(struct mem_ctl_info *mci)
+static void __ie31200_check(struct mem_ctl_info *mci, struct mce *mce)
 {
 	struct ie31200_error_info info;
 
+	info.erraddr = mce ? mce->addr : 0;
 	ie31200_get_and_clear_error_info(mci, &info);
 	ie31200_process_error_info(mci, &info);
 }
 
+static void ie31200_check(struct mem_ctl_info *mci)
+{
+	__ie31200_check(mci, NULL);
+}
+
 static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev, struct res_config *cfg, int mc)
 {
 	union {
@@ -459,7 +468,7 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
 	mci->mod_name = EDAC_MOD_STR;
 	mci->ctl_name = ie31200_devs[mc].ctl_name;
 	mci->dev_name = pci_name(pdev);
-	mci->edac_check = ie31200_check;
+	mci->edac_check = cfg->machine_check ? NULL : ie31200_check;
 	mci->ctl_page_to_phys = NULL;
 	priv = mci->pvt_info;
 	priv->window = window;
@@ -499,6 +508,58 @@ static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
 	return ret;
 }
 
+static void mce_check(struct mce *mce)
+{
+	struct ie31200_priv *priv;
+	int i;
+
+	for (i = 0; i < IE31200_IMC_NUM; i++) {
+		priv = ie31200_pvt.priv[i];
+		if (!priv)
+			continue;
+
+		__ie31200_check(priv->mci, mce);
+	}
+}
+
+static int mce_handler(struct notifier_block *nb, unsigned long val, void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	char *type;
+
+	if (mce->kflags & MCE_HANDLED_CEC)
+		return NOTIFY_DONE;
+
+	/*
+	 * Ignore unless this is a memory related error.
+	 * Don't check MCI_STATUS_ADDRV since it's not set on some CPUs.
+	 */
+	if ((mce->status & 0xefff) >> 7 != 1)
+		return NOTIFY_DONE;
+
+	type = mce->mcgstatus & MCG_STATUS_MCIP ?  "Exception" : "Event";
+
+	edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n",
+		 mce->extcpu, type, mce->mcgstatus,
+		 mce->bank, mce->status);
+	edac_dbg(0, "TSC 0x%llx\n", mce->tsc);
+	edac_dbg(0, "ADDR 0x%llx\n", mce->addr);
+	edac_dbg(0, "MISC 0x%llx\n", mce->misc);
+	edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n",
+		 mce->cpuvendor, mce->cpuid, mce->time,
+		 mce->socketid, mce->apicid);
+
+	mce_check(mce);
+	mce->kflags |= MCE_HANDLED_EDAC;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ie31200_mce_dec = {
+	.notifier_call	= mce_handler,
+	.priority	= MCE_PRIO_EDAC,
+};
+
 static void ie31200_unregister_mcis(void)
 {
 	struct ie31200_priv *priv;
@@ -534,6 +595,13 @@ static int ie31200_probe1(struct pci_dev *pdev, struct res_config *cfg)
 			goto fail_register;
 	}
 
+	if (cfg->machine_check) {
+		mce_register_decode_chain(&ie31200_mce_dec);
+		edac_op_state = EDAC_OPSTATE_INT;
+	} else {
+		edac_op_state = EDAC_OPSTATE_POLL;
+	}
+
 	/* get this far and it's successful. */
 	edac_dbg(3, "MC: success\n");
 	return 0;
@@ -560,9 +628,13 @@ static int ie31200_init_one(struct pci_dev *pdev,
 
 static void ie31200_remove_one(struct pci_dev *pdev)
 {
+	struct ie31200_priv *priv = ie31200_pvt.priv[0];
+
 	edac_dbg(0, "\n");
 	pci_dev_put(mci_pdev);
 	mci_pdev = NULL;
+	if (priv->cfg->machine_check)
+		mce_unregister_decode_chain(&ie31200_mce_dec);
 	ie31200_unregister_mcis();
 }
 
@@ -612,6 +684,7 @@ static struct res_config skl_cfg = {
 
 struct res_config rpl_s_cfg = {
 	.mtype				= MEM_DDR5,
+	.machine_check			= true,
 	.imc_num			= 2,
 	.reg_mchbar_mask		= GENMASK_ULL(41, 17),
 	.reg_mchbar_window_size		= BIT_ULL(16),
@@ -677,8 +750,6 @@ static int __init ie31200_init(void)
 	int pci_rc, i;
 
 	edac_dbg(3, "MC:\n");
-	/* Ensure that the OPSTATE is set correctly for POLL or NMI */
-	opstate_init();
 
 	pci_rc = pci_register_driver(&ie31200_driver);
 	if (pci_rc < 0)
-- 
2.17.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ