lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Sat,  9 Oct 2010 14:49:46 +0800
From:	Huang Ying <ying.huang@...el.com>
To:	Don Zickus <dzickus@...hat.com>, Ingo Molnar <mingo@...e.hu>,
	"H. Peter Anvin" <hpa@...or.com>
Cc:	linux-kernel@...r.kernel.org, Andi Kleen <andi@...stfloor.org>,
	Robert Richter <robert.richter@....com>,
	Huang Ying <ying.huang@...el.com>
Subject: [PATCH -v3 5/6] x86, NMI, treat unknown NMI as hardware error

In general, unknown NMI is used by hardware and firmware to notify
fatal hardware errors to OS. So the Linux should treat unknown NMI as
hardware error and go panic upon unknown NMI for better error
containment.

But there are some broken hardware, which will generate unknown NMI
not for hardware error. To support these machines, a white list
mechanism is provided to treat unknown NMI as hardware error only on
some known working system.

These systems are identified via the presentation of APEI HEST or
some PCI ID of the host bridge. The PCI ID of host bridge instead of
DMI ID is used, so that the checking can be done based on the platform
type instead of motherboard. This should be simpler and sufficient.

The method to identify the platforms is designed by Andi Kleen.


v3:

- Change document and comments

Signed-off-by: Huang Ying <ying.huang@...el.com>
---
 arch/x86/include/asm/nmi.h |    1 
 arch/x86/kernel/Makefile   |    2 +
 arch/x86/kernel/hwerr.c    |   62 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c    |   11 +++++++
 drivers/acpi/apei/hest.c   |    8 +++++
 5 files changed, 84 insertions(+)
 create mode 100644 arch/x86/kernel/hwerr.c

--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -44,6 +44,7 @@ struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
 			void __user *, size_t *, loff_t *);
 extern int unknown_nmi_panic;
+extern int unknown_nmi_as_hwerr;
 
 void arch_trigger_all_cpu_backtrace(void);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -119,6 +119,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION)
 
 obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 
+obj-y					+= hwerr.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
--- /dev/null
+++ b/arch/x86/kernel/hwerr.c
@@ -0,0 +1,62 @@
+/*
+ * Hardware error architecture dependent processing
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@...el.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/nmi.h>
+
+/*
+ * In general, unknown NMI is used by hardware and firmware to notify
+ * fatal hardware errors to OS. So the Linux should treat unknown NMI
+ * as hardware error and go panic upon unknown NMI for better error
+ * containment.
+ *
+ * But there are some broken hardware, which will generate unknown NMI
+ * not for hardware error. To support these systems, a white list
+ * mechanism is used to treat unknown NMI as hardware error only on
+ * some known working system.
+ *
+ * The PCI ID of host bridge instead of DMI ID is used, so that the
+ * checking can be done based on the platform instead of motherboard.
+ * This should be simpler and sufficient.
+ */
+static const
+struct pci_device_id unknown_nmi_as_hwerr_platform[] __initdata = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3406) },
+	{ 0, }
+};
+
+int __init check_unknown_nmi_as_hwerr(void)
+{
+	struct pci_dev *dev = NULL;
+
+	for_each_pci_dev(dev) {
+		if (pci_match_id(unknown_nmi_as_hwerr_platform, dev)) {
+			pr_info(
+"Host bridge is identified, will treat unknown NMI as hardware error!\n");
+			unknown_nmi_as_hwerr = 1;
+			break;
+		}
+	}
+
+	return 0;
+}
+late_initcall(check_unknown_nmi_as_hwerr);
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors);
 
 static int ignore_nmis;
 
+int unknown_nmi_as_hwerr;
+
 /*
  * Prevent NMI reason port (0x61) being accessed simultaneously, can
  * only be used in NMI handler.
@@ -366,6 +368,15 @@ unknown_nmi_error(unsigned char reason,
 	if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
 			NOTIFY_STOP)
 		return;
+	/*
+	 * On some platforms, hardware errors may be notified via
+	 * unknown NMI
+	 */
+	if (unknown_nmi_as_hwerr)
+		panic(
+		"NMI for hardware error without error record: Not continuing\n"
+		"Please check BIOS/BMC log for further information.");
+
 #ifdef CONFIG_MCA
 	/*
 	 * Might actually be able to figure out what the guilty party
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -35,6 +35,7 @@
 #include <linux/highmem.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/nmi.h>
 #include <acpi/apei.h>
 
 #include "apei-internal.h"
@@ -225,6 +226,13 @@ static int __init hest_init(void)
 	if (rc)
 		goto err;
 
+	/*
+	 * System has proper HEST should treat unknown NMI as fatal
+	 * hardware error notification
+	 */
+	pr_info("HEST is valid, will treat unknown NMI as hardware error!\n");
+	unknown_nmi_as_hwerr = 1;
+
 	rc = hest_ghes_dev_register(ghes_count);
 	if (rc)
 		goto err;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ