Message-Id: <20221011055529.25861-1-zhangxincheng@uniontech.com>
Date:   Tue, 11 Oct 2022 13:55:29 +0800
From:   Zhang Xincheng <zhangxincheng@...ontech.com>
To:     tglx@...utronix.de
Cc:     maz@...nel.org, wsa+renesas@...g-engineering.com,
        hdegoede@...hat.com, bigeasy@...utronix.de, mark.rutland@....com,
        michael@...le.cc, linux-kernel@...r.kernel.org,
        Zhang Xincheng <zhangxincheng@...ontech.com>
Subject: [PATCH v2] interrupt: debug for discovering frequent interrupts

In some cases a peripheral's interrupt is triggered so frequently that the
CPU spends all of its time handling it, and RCU eventually reports an
rcu_sched self-detected stall on that CPU. This patch provides a way to
discover and report which interrupt is causing the problem.
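
With the default thresholds (CONFIG_COUNT_PER_SECOND=2000 and
CONFIG_DURATION_LIMIT=30), an interrupt is reported only once it has fired
at least 2000 times per second for 30 consecutive seconds, i.e. roughly
60000 times in total. The thresholds can also be set on the kernel command
line, e.g. "count_per_second=3000 duration_limit=10" (example values only),
and the offending interrupt can optionally be disabled by passing
disable_frequent_irq=1.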

Signed-off-by: Zhang Xincheng <zhangxincheng@...ontech.com>
---
 include/linux/irqdesc.h |   5 ++
 kernel/irq/Kconfig      |  25 +++++++++
 kernel/irq/spurious.c   | 121 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 144 insertions(+), 7 deletions(-)
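
A note on the accounting (illustration only, not part of the patch):
desc->gap_count packs two counters into a single u32. The low 16 bits
(COUNT_PER_SECOND_MASK) count the interrupts seen in the current one-second
window, and the high 16 bits (above DURATION_LIMIT_OFFSET) count how many
consecutive windows exceeded count_per_second; the interrupt is reported once
that upper counter reaches duration_limit. The stand-alone user-space sketch
below models the packing:

#include <stdio.h>
#include <stdint.h>

#define COUNT_PER_SECOND_MASK	0x0000ffff
#define DURATION_LIMIT_MASK	0xffff0000
#define DURATION_LIMIT_COUNT	0x00010000
#define DURATION_LIMIT_OFFSET	16

int main(void)
{
	uint32_t gap_count = 0;

	/* 2000 interrupts arrive within the current one-second window */
	gap_count += 2000;

	/*
	 * The per-second limit was reached within one second: bump the
	 * consecutive-seconds counter in the high 16 bits and clear the
	 * per-second count in the low 16 bits, as report_frequent_irq() does.
	 */
	gap_count += DURATION_LIMIT_COUNT;
	gap_count &= DURATION_LIMIT_MASK;

	printf("per-second count:   %u\n",
	       (unsigned int)(gap_count & COUNT_PER_SECOND_MASK));   /* 0 */
	printf("seconds over limit: %u\n",
	       (unsigned int)(gap_count >> DURATION_LIMIT_OFFSET));  /* 1 */
	return 0;
}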

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..f82b138c68d6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -102,6 +102,11 @@ struct irq_desc {
 	int			parent_irq;
 	struct module		*owner;
 	const char		*name;
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+	bool have_reported;
+	u32 gap_count;
+	u64 gap_time;
+#endif
 } ____cacheline_internodealigned_in_smp;
 
 #ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..0b666ef51a08 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -137,6 +137,31 @@ config GENERIC_IRQ_DEBUGFS
 
 	  If you don't know what to do here, say N.
 
+config FREQUENT_IRQ_DEBUG
+	bool "Support for finding and reporting frequent interrupt"
+	default n
+	help
+
+	  This enables a mechanism that detects and reports interrupts
+	  which are triggered too frequently.
+
+config COUNT_PER_SECOND
+	int "Interrupt limit per second"
+	depends on FREQUENT_IRQ_DEBUG
+	default "2000"
+	help
+
+	  This is the limit on the number of interrupts triggered per second.
+	  (Max 65535)
+config DURATION_LIMIT
+	int "Duration limit"
+	depends on FREQUENT_IRQ_DEBUG
+	default "30"
+	help
+
+	  The number of consecutive seconds for which the per-second interrupt
+	  count must exceed the limit before the interrupt is reported. (Max 65535)
+
 endmenu
 
 config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..74f3833aedd3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -22,6 +22,16 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
 
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+#define COUNT_PER_SECOND_MASK	0x0000ffff
+#define DURATION_LIMIT_MASK		0xffff0000
+#define DURATION_LIMIT_COUNT	0x00010000
+#define DURATION_LIMIT_OFFSET	16
+static unsigned int count_per_second = CONFIG_COUNT_PER_SECOND;
+static unsigned int duration_limit = CONFIG_DURATION_LIMIT;
+static bool disable_frequent_irq;
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
 /*
  * We wait here for a poller to finish.
  *
@@ -189,18 +199,16 @@ static inline int bad_action_ret(irqreturn_t action_ret)
  * (The other 100-of-100,000 interrupts may have been a correctly
  *  functioning device sharing an IRQ with the failing one)
  */
-static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
 {
 	unsigned int irq = irq_desc_get_irq(desc);
 	struct irqaction *action;
 	unsigned long flags;
 
 	if (bad_action_ret(action_ret)) {
-		printk(KERN_ERR "irq event %d: bogus return value %x\n",
-				irq, action_ret);
+		printk(msg, irq, action_ret);
 	} else {
-		printk(KERN_ERR "irq %d: nobody cared (try booting with "
-				"the \"irqpoll\" option)\n", irq);
+		printk(msg, irq);
 	}
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
@@ -228,7 +236,7 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
 
 	if (count > 0) {
 		count--;
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq event %d: bogus return value %x\n");
 	}
 }
 
@@ -269,6 +277,46 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 
 #define SPURIOUS_DEFERRED	0x80000000
 
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+/*
+ * Some bad hardware triggers interrupts very frequently, which keeps the
+ * CPU busy handling hardware interrupts all the time. Detect and report
+ * such an interrupt here; optionally disable it as well.
+ */
+void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+	if (desc->have_reported)
+		return;
+
+	if ((desc->gap_count & DURATION_LIMIT_MASK) == 0)
+		desc->gap_time = get_jiffies_64();
+
+	desc->gap_count++;
+
+	if ((desc->gap_count & COUNT_PER_SECOND_MASK) >= count_per_second) {
+		if ((get_jiffies_64() - desc->gap_time) < HZ) {
+			desc->gap_count += DURATION_LIMIT_COUNT;
+			desc->gap_count &= DURATION_LIMIT_MASK;
+		} else {
+			desc->gap_count = 0;
+		}
+
+		if ((desc->gap_count >> DURATION_LIMIT_OFFSET) >= duration_limit) {
+			__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too "
+					"frequently\n");
+			desc->have_reported = true;
+			if (disable_frequent_irq)
+				irq_disable(desc);
+		}
+	}
+}
+
+#else
+void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+}
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
 void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 {
 	unsigned int irq;
@@ -282,6 +330,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		return;
 	}
 
+	report_frequent_irq(desc, action_ret);
+
 	/*
 	 * We cannot call note_interrupt from the threaded handler
 	 * because we need to look at the compound of all handlers
@@ -416,7 +466,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		/*
 		 * The interrupt is stuck
 		 */
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: nobody cared (try "
+				"bootingwith the \"irqpoll\" option)\n");
 		/*
 		 * Now kill the IRQ
 		 */
@@ -476,3 +527,59 @@ static int __init irqpoll_setup(char *str)
 }
 
 __setup("irqpoll", irqpoll_setup);
+
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+int __init count_per_second_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtouint(str, 10, &count_per_second);
+	if (ret)
+		return 0;
+
+	printk(KERN_INFO "Interrupt limit per second: %u\n", count_per_second);
+
+	return 1;
+}
+
+__setup("count_per_second=", count_per_second_setup);
+module_param(count_per_second, uint, 0644);
+MODULE_PARM_DESC(count_per_second, "Interrupt limit per second. (Max 65535)");
+
+int __init duration_limit_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtouint(str, 10, &duration_limit);
+	if (ret)
+		return 0;
+
+	printk(KERN_INFO "Duration limit: %u\n", duration_limit);
+
+	return 1;
+}
+
+__setup("duration_limit=", duration_limit_setup);
+module_param(duration_limit, uint, 0644);
+MODULE_PARM_DESC(duration_limit, "Number of consecutive seconds for which the per-second interrupt count must exceed the limit before reporting. (Max 65535)");
+
+int __init disable_frequent_irq_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtobool(str, &disable_frequent_irq);
+	if (ret)
+		return 0;
+
+	if (disable_frequent_irq)
+		printk(KERN_INFO "Disable frequent irq'\n");
+	else
+		printk(KERN_INFO "Don't disable frequent irq'\n");
+
+	return 1;
+}
+
+__setup("disable_frequent_irq=", disable_frequent_irq_setup);
+module_param(disable_frequent_irq, bool, 0644);
+MODULE_PARM_DESC(disable_frequent_irq, "Disable the interrupt when it is reported as too frequent");
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
-- 
2.20.1

