lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211012142910.9688-1-zhangliguang@linux.alibaba.com>
Date:   Tue, 12 Oct 2021 22:29:10 +0800
From:   Liguang Zhang <zhangliguang@...ux.alibaba.com>
To:     "Rafael J. Wysocki" <rafael@...nel.org>,
        Len Brown <lenb@...nel.org>, James Morse <james.morse@....com>,
        Tony Luck <tony.luck@...el.com>, Borislav Petkov <bp@...en8.de>
Cc:     linux-acpi@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-arm-kernel@...ts.infradead.org,
        Liguang Zhang <zhangliguang@...ux.alibaba.com>
Subject: [PATCH V2] ACPI / APEI: restore interrupt before panic in sdei flow

When hest acpi table configure Hardware Error Notification type as
Software Delegated Exception(0x0B) for RAS event, OS RAS interacts with
ATF by SDEI mechanism. On the firmware first system, OS was notified by
ATF sdei call.

The calling flow like as below when fatal RAS error happens:

ATF notify OS flow:
  sdei_dispatch_event()
    ehf_activate_priority()
      call sdei callback  // callback registered by OS
    ehf_deactivate_priority()

OS sdei callback:
  sdei_asm_handler()
    __sdei_handler()
      _sdei_handler()
        sdei_event_handler()
          ghes_sdei_critical_callback()
            ghes_in_nmi_queue_one_entry()
              /* if RAS error is fatal */
              __ghes_panic()
                panic()

If fatal RAS error occured, panic was called in sdei_asm_handle()
without ehf_deactivate_priority executed, which lead interrupt masked.
If interrupt masked, system would be halted in kdump flow like this:

arm-smmu-v3 arm-smmu-v3.3.auto: allocated 65536 entries for cmdq
arm-smmu-v3 arm-smmu-v3.3.auto: allocated 32768 entries for evtq
arm-smmu-v3 arm-smmu-v3.3.auto: allocated 65536 entries for priq
arm-smmu-v3 arm-smmu-v3.3.auto: SMMU currently enabled! Resetting...

After debug, we found accurate halted position is:
arm_smmu_device_probe()
  arm_smmu_device_reset()
    arm_smmu_device_disable()
      arm_smmu_write_reg_sync()
        readl_relaxed_poll_timeout()
          readx_poll_timeout()
            read_poll_timeout()
              usleep_range() // hrtimer is never waked.

So interrupt should be restored before panic otherwise kdump will trigger
error. In the process of sdei, a SDEI_EVENT_COMPLETE_AND_RESUME call
should be called before panic for a completed run of ehf_deactivate_priority().

Signed-off-by: Liguang Zhang <zhangliguang@...ux.alibaba.com>
---
 drivers/acpi/apei/ghes.c    | 25 +++++++++++++++++++++----
 drivers/firmware/arm_sdei.c | 14 ++++++++++++++
 include/linux/arm_sdei.h    |  2 ++
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 0c8330ed1ffd..4f734c60987c 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -141,6 +141,7 @@ static unsigned long ghes_estatus_pool_size_request;
 static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE];
 static atomic_t ghes_estatus_cache_alloced;
 
+static bool ghes_sdei_callback;
 static int ghes_panic_timeout __read_mostly = 30;
 
 static void __iomem *ghes_map(u64 pfn, enum fixed_addresses fixmap_idx)
@@ -837,18 +838,30 @@ static void ghes_estatus_cache_add(
 	rcu_read_unlock();
 }
 
+static void sdei_api_restore_ras(void)
+{
+	/* reboot to log the error! */
+	if (!panic_timeout)
+		panic_timeout = ghes_panic_timeout;
+	panic("Fatal hardware error!");
+}
+
 static void __ghes_panic(struct ghes *ghes,
 			 struct acpi_hest_generic_status *estatus,
 			 u64 buf_paddr, enum fixed_addresses fixmap_idx)
 {
+	int err;
+
 	__ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);
 
 	ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
 
-	/* reboot to log the error! */
-	if (!panic_timeout)
-		panic_timeout = ghes_panic_timeout;
-	panic("Fatal hardware error!");
+	if (ghes_sdei_callback) {
+		err = sdei_api_event_complete_and_resume((unsigned long)sdei_api_restore_ras);
+		if (err)
+			sdei_api_restore_ras();
+	} else
+		sdei_api_restore_ras();
 }
 
 static int ghes_proc(struct ghes *ghes)
@@ -1224,7 +1237,9 @@ static int ghes_sdei_normal_callback(u32 event_num, struct pt_regs *regs,
 	int err;
 
 	raw_spin_lock(&ghes_notify_lock_sdei_normal);
+	ghes_sdei_callback = true;
 	err = __ghes_sdei_callback(ghes, FIX_APEI_GHES_SDEI_NORMAL);
+	ghes_sdei_callback = false;
 	raw_spin_unlock(&ghes_notify_lock_sdei_normal);
 
 	return err;
@@ -1238,7 +1253,9 @@ static int ghes_sdei_critical_callback(u32 event_num, struct pt_regs *regs,
 	int err;
 
 	raw_spin_lock(&ghes_notify_lock_sdei_critical);
+	ghes_sdei_callback = true;
 	err = __ghes_sdei_callback(ghes, FIX_APEI_GHES_SDEI_CRITICAL);
+	ghes_sdei_callback = false;
 	raw_spin_unlock(&ghes_notify_lock_sdei_critical);
 
 	return err;
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index a7e762c352f9..1af6b6b55c57 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -473,6 +473,20 @@ static int sdei_api_event_unregister(u32 event_num)
 			      0, 0, 0, NULL);
 }
 
+int sdei_api_event_complete_and_resume(u64 addr)
+{
+	int err;
+
+	err = invoke_sdei_fn(SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME, addr,
+			      0, 0, 0, 0, NULL);
+	if (err && err != -EIO) {
+		pr_warn_once("failed to restore CPU[%u]: %d\n", smp_processor_id(), err);
+		return err;
+	}
+
+	return 0;
+}
+
 /* Called directly by the hotplug callbacks */
 static void _local_event_unregister(void *data)
 {
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
index 0a241c5c911d..b6d347085834 100644
--- a/include/linux/arm_sdei.h
+++ b/include/linux/arm_sdei.h
@@ -46,9 +46,11 @@ int sdei_unregister_ghes(struct ghes *ghes);
 /* For use by arch code when CPU hotplug notifiers are not appropriate. */
 int sdei_mask_local_cpu(void);
 int sdei_unmask_local_cpu(void);
+int sdei_api_event_complete_and_resume(u64 addr);
 #else
 static inline int sdei_mask_local_cpu(void) { return 0; }
 static inline int sdei_unmask_local_cpu(void) { return 0; }
+int sdei_api_event_complete_and_resume(u64 addr) { return 0; }
 #endif /* CONFIG_ARM_SDE_INTERFACE */
 
 
-- 
2.19.1.6.gb485710b

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ