linux-kernel - [PATCH] perf/arm-cmn: Add shutdown routine

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221215180039.18035-1-blakgeof@amazon.com>
Date:   Thu, 15 Dec 2022 12:00:39 -0600
From:   Geoff Blake <blakgeof@...zon.com>
To:     <Robin.Murphy@....com>
CC:     <blakgeof@...zon.com>, Will Deacon <will@...nel.org>,
        Mark Rutland <mark.rutland@....com>,
        <linux-arm-kernel@...ts.infradead.org>,
        <linux-kernel@...r.kernel.org>
Subject: [PATCH] perf/arm-cmn: Add shutdown routine

Attempt #2 with all the feedback from Robin to do the minimal amount of
shutdown and handle spurious IRQs within the CMN driver but still do
limited logging in the event a spurious IRQ still occurs in the future.
Tested over 100's of kexec's and have no reproduced the spurious IRQs.

The CMN driver does not gracefully handle all
restart cases, such as kexec.  On a kexec if the
arm-cmn driver is in use it can be left in a state
with still active  events that can cause spurious and/or
unhandled interrupts that appear as non-fatal kernel errors
like below, that can be confusing and misleading:

[    3.895093] irq 28: nobody cared (try booting with the "irqpoll" option)
[    3.895170] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-1011-aws #12
[    3.895172] Hardware name: Amazon EC2 c6g.metal/Not Specified, BIOS 1.0 10/16/2017
[    3.895174] Call trace:
[    3.895175]  dump_backtrace+0xe8/0x150
[    3.895181]  show_stack+0x28/0x70
[    3.895183]  dump_stack_lvl+0x68/0x9c
[    3.895188]  dump_stack+0x1c/0x48
[    3.895190]  __report_bad_irq+0x58/0x138
[    3.895193]  note_interrupt+0x23c/0x360
[    3.895196]  handle_irq_event+0x108/0x1a0
[    3.895198]  handle_fasteoi_irq+0xd0/0x24c
[    3.895201]  generic_handle_domain_irq+0x3c/0x70
[    3.895203]  __gic_handle_irq_from_irqson.isra.0+0xcc/0x2c0
[    3.895207]  gic_handle_irq+0x34/0xb0
[    3.895209]  call_on_irq_stack+0x40/0x50
[    3.895211]  do_interrupt_handler+0xb0/0xb4
[    3.895214]  el1_interrupt+0x4c/0xe0
[    3.895217]  el1h_64_irq_handler+0x1c/0x40
[    3.895220]  el1h_64_irq+0x78/0x7c
[    3.895222]  __do_softirq+0xd0/0x450
[    3.895223]  __irq_exit_rcu+0xcc/0x120
[    3.895227]  irq_exit_rcu+0x20/0x40
[    3.895229]  el1_interrupt+0x50/0xe0
[    3.895231]  el1h_64_irq_handler+0x1c/0x40
[    3.895233]  el1h_64_irq+0x78/0x7c
[    3.895235]  arch_cpu_idle+0x1c/0x6c
[    3.895238]  default_idle_call+0x4c/0x19c
[    3.895240]  cpuidle_idle_call+0x18c/0x1f0
[    3.895243]  do_idle+0xb0/0x11c
[    3.895245]  cpu_startup_entry+0x34/0x40
[    3.895248]  rest_init+0xec/0x104
[    3.895250]  arch_post_acpi_subsys_init+0x0/0x30
[    3.895254]  start_kernel+0x4d0/0x534
[    3.895256]  __primary_switched+0xc4/0xcc
[    3.895259] handlers:
[    3.895292] [<000000008f5364c7>] arm_cmn_handle_irq [arm_cmn]
[    3.895369] Disabling IRQ #28

This type of kernel error can be reproduced by running perf with
an arm_cmn event active and then forcing a kexec.  On return from
the kexec, this message can appear semi-regularly.


Signed-off-by: Geoff Blake <blakgeof@...zon.com>
---
 drivers/perf/arm-cmn.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index b80a9b74662b..5e661a9aa0fe 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -112,6 +112,7 @@
 #define CMN_DTM_UNIT_INFO		0x0910
 
 #define CMN_DTM_NUM_COUNTERS		4
+#define CMN_DTM_NUM_WPS			4
 /* Want more local counters? Why not replicate the whole DTM! Ugh... */
 #define CMN_DTM_OFFSET(n)		((n) * 0x200)
 
@@ -1797,6 +1798,7 @@ static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_no
 
 static irqreturn_t arm_cmn_handle_irq(int irq, void *dev_id)
 {
+	static int spurious_count = 100;
 	struct arm_cmn_dtc *dtc = dev_id;
 	irqreturn_t ret = IRQ_NONE;
 
@@ -1825,8 +1827,13 @@ static irqreturn_t arm_cmn_handle_irq(int irq, void *dev_id)
 
 		writel_relaxed(status, dtc->base + CMN_DT_PMOVSR_CLR);
 
-		if (!dtc->irq_friend)
-			return ret;
+		if (!dtc->irq_friend) {
+			if (ret != IRQ_HANDLED && spurious_count > 0) {
+				spurious_count--;
+				WARN_ON(ret != IRQ_HANDLED);
+			}
+			return IRQ_HANDLED;
+		}
 		dtc += dtc->irq_friend;
 	}
 }
@@ -1865,7 +1872,7 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i
 
 	dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx);
 	dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < CMN_DTM_NUM_WPS; i++) {
 		dtm->wp_event[i] = -1;
 		writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i));
 		writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i));
@@ -2312,11 +2319,18 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	return err;
 }
 
-static int arm_cmn_remove(struct platform_device *pdev)
+static void arm_cmn_shutdown(struct platform_device *pdev)
 {
 	struct arm_cmn *cmn = platform_get_drvdata(pdev);
 
 	writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL);
+}
+
+static int arm_cmn_remove(struct platform_device *pdev)
+{
+	struct arm_cmn *cmn = platform_get_drvdata(pdev);
+	
+	arm_cmn_shutdown(pdev);
 
 	perf_pmu_unregister(&cmn->pmu);
 	cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
@@ -2353,6 +2367,7 @@ static struct platform_driver arm_cmn_driver = {
 	},
 	.probe = arm_cmn_probe,
 	.remove = arm_cmn_remove,
+	.shutdown = arm_cmn_shutdown,
 };
 
 static int __init arm_cmn_init(void)
-- 
2.24.3 (Apple Git-128)