lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20241227035459.90602-1-yue.zhao@shopee.com>
Date: Fri, 27 Dec 2024 11:54:59 +0800
From: Yue Zhao <yue.zhao@...pee.com>
To: anthony.l.nguyen@...el.com,
	przemyslaw.kitszel@...el.com,
	andrew+netdev@...n.ch,
	edumazet@...gle.com,
	kuba@...nel.org,
	pabeni@...hat.com,
	intel-wired-lan@...ts.osuosl.org,
	netdev@...r.kernel.org,
	linux-kernel@...r.kernel.org
Cc: chunguang.xu@...pee.com,
	haifeng.xu@...pee.com,
	Yue Zhao <yue.zhao@...pee.com>
Subject: [PATCH] i40e: Disable i40e PCIe AER on system reboot

Disable PCIe AER on the i40e device on system reboot on a limited
list of Dell PowerEdge systems. This prevents a fatal PCIe AER event
on the i40e device during the ACPI _PTS (prepare to sleep) method for
S5 on those systems. The _PTS is invoked by acpi_enter_sleep_state_prep()
as part of the kernel's reboot sequence as a result of commit
38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot").

We first noticed this abnormal reboot issue on the tg3 device, and there
is a similar patch that disables PCIe AER to fix the hardware error
during reboot. The hardware error on the tg3 device went away after we
applied the patch below.

https://lore.kernel.org/lkml/20241129203640.54492-1-lszubowi@redhat.com/T/

So we disable PCIe AER on the i40e device in a similar way.

Hardware crash dmesg log:

ACPI: PM: Preparing to enter system sleep state S5
{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 5
{1}[Hardware Error]: event severity: fatal
{1}[Hardware Error]:  Error 0, type: fatal
{1}[Hardware Error]:   section_type: PCIe error
{1}[Hardware Error]:   port_type: 0, PCIe end point
{1}[Hardware Error]:   version: 3.0
{1}[Hardware Error]:   command: 0x0006, status: 0x0010
{1}[Hardware Error]:   device_id: 0000:05:00.1
{1}[Hardware Error]:   slot: 0
{1}[Hardware Error]:   secondary_bus: 0x00
{1}[Hardware Error]:   vendor_id: 0x8086, device_id: 0x1572
{1}[Hardware Error]:   class_code: 020000
{1}[Hardware Error]:   aer_uncor_status: 0x00100000, aer_uncor_mask: 0x00018000
{1}[Hardware Error]:   aer_uncor_severity: 0x000ef030
{1}[Hardware Error]:   TLP Header: 40000001 0000000f 90028090 00000000
Kernel panic - not syncing: Fatal hardware error!
Hardware name: Dell Inc. PowerEdge C4140/08Y2GR, BIOS 2.21.1 12/12/2023
Call Trace:
 <NMI>
 dump_stack_lvl+0x48/0x70
 dump_stack+0x10/0x20
 panic+0x1b4/0x3a0
 __ghes_panic+0x6c/0x70
 ghes_in_nmi_queue_one_entry.constprop.0+0x1ee/0x2c0
 ghes_notify_nmi+0x5e/0xe0
 nmi_handle+0x62/0x160
 default_do_nmi+0x4c/0x150
 exc_nmi+0x140/0x1f0
 end_repeat_nmi+0x16/0x67
RIP: 0010:intel_idle_irq+0x70/0xf0
 </NMI>
 <TASK>
 cpuidle_enter_state+0x91/0x6f0
 cpuidle_enter+0x2e/0x50
 call_cpuidle+0x23/0x60
 cpuidle_idle_call+0x11d/0x190
 do_idle+0x82/0xf0
 cpu_startup_entry+0x2a/0x30
 rest_init+0xc2/0xf0
 arch_call_rest_init+0xe/0x30
 start_kernel+0x34f/0x440
 x86_64_start_reservations+0x18/0x30
 x86_64_start_kernel+0xbf/0x110
 secondary_startup_64_no_verify+0x18f/0x19b
 </TASK>

Fixes: 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot")
Signed-off-by: Yue Zhao <yue.zhao@...pee.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 64 +++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0e1d9e2fbf38..80e66e4e90f7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <net/pkt_cls.h>
 #include <net/xdp_sock_drv.h>
+#include <linux/dmi.h>
 
 /* Local includes */
 #include "i40e.h"
@@ -16608,6 +16609,56 @@ static void i40e_pci_error_resume(struct pci_dev *pdev)
 	i40e_io_resume(pf);
 }
 
+/* Systems where ACPI _PTS (Prepare To Sleep) S5 will result in a fatal
+ * PCIe AER event on the i40e device if the i40e device is not, or cannot
+ * be, powered down.
+ */
+static const struct dmi_system_id i40e_restart_aer_quirk_table[] = {
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge C4140"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R440"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R540"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R640"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R650"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R740"),
+		},
+	},
+	{
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R750"),
+		},
+	},
+	{}
+};
+
 /**
  * i40e_shutdown - PCI callback for shutting down
  * @pdev: PCI device information struct
@@ -16654,6 +16705,19 @@ static void i40e_shutdown(struct pci_dev *pdev)
 	i40e_clear_interrupt_scheme(pf);
 	rtnl_unlock();
 
+	if (system_state == SYSTEM_RESTART &&
+		dmi_first_match(i40e_restart_aer_quirk_table) &&
+		pdev->current_state <= PCI_D3hot) {
+		/* Disable PCIe AER on the i40e to avoid a fatal
+		 * error during this system restart.
+		 */
+		pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL,
+					   PCI_EXP_DEVCTL_CERE |
+					   PCI_EXP_DEVCTL_NFERE |
+					   PCI_EXP_DEVCTL_FERE |
+					   PCI_EXP_DEVCTL_URRE);
+	}
+
 	if (system_state == SYSTEM_POWER_OFF) {
 		pci_wake_from_d3(pdev, pf->wol_en);
 		pci_set_power_state(pdev, PCI_D3hot);
-- 
2.39.5 (Apple Git-154)


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ