lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <tencent_3B90F136446E9125563169AF@qq.com>
Date:   Sun, 9 Oct 2022 18:02:34 +0800
From:   "Zhang Xincheng" <zhangxincheng@...ontech.com>
To:     "maz" <maz@...nel.org>
Cc:     "tglx" <tglx@...utronix.de>,
        "linux-kernel" <linux-kernel@...r.kernel.org>,
        "oleksandr" <oleksandr@...alenko.name>,
        "Hans de Goede" <hdegoede@...hat.com>,
        "bigeasy" <bigeasy@...utronix.de>,
        "mark.rutland" <mark.rutland@....com>,
        "michael" <michael@...le.cc>
Subject: Re: [PATCH] interrupt: discover and disable very frequent interrupts

> I don't think this is much better. You hardcode values that only make
> sense on your HW, and for nobody else. And what can the user do with
> this message? Nothing at all. The message itself only contributes to
> problem. As it is, this patch is only a nuisance.

Indeed, this patch can only report a possible problem.

> As I said before, this would be much better as a rate-limiter, with
> configurable limits, and behind a debug option.

This is a good suggestion; I have modified my patch as follows:

From 681a864ead6521560a25d089b96b8360bf987f5f Mon Sep 17 00:00:00 2001
From: zhangxincheng <zhangxincheng@...ontech.com>
Date: Thu, 29 Sep 2022 17:22:25 +0800
Subject: [PATCH] interrupt: debug for discovering frequent interrupts

In some cases, a peripheral's interrupt will be triggered so frequently
that the CPU spends all of its time handling it, which eventually causes
RCU to report an rcu_sched self-detected stall on that CPU.

[  838.131628] rcu: INFO: rcu_sched self-detected stall on CPU
[  838.137189] rcu:     0-....: (194839 ticks this GP) idle=f02/1/0x4000000000000004 softirq=9993/9993 fqs=97428
[  838.146912] rcu:      (t=195015 jiffies g=6773 q=0)
[  838.151516] Task dump for CPU 0:
[  838.154730] systemd-sleep   R  running task        0  3445      1 0x0000000a

Signed-off-by: Zhang Xincheng <zhangxincheng@...ontech.com>
---
 include/linux/irqdesc.h |   5 ++
 kernel/irq/Kconfig      |  25 +++++++++
 kernel/irq/spurious.c   | 121 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 1cd4e36890fb..f82b138c68d6 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -102,6 +102,11 @@ struct irq_desc {
 	int			parent_irq;
 	struct module		*owner;
 	const char		*name;
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+	bool have_reported;
+	u32 gap_count;
+	u64 gap_time;
+#endif
 } ____cacheline_internodealigned_in_smp;
 
 #ifdef CONFIG_SPARSE_IRQ
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..ae5c6e45265a 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -137,6 +137,31 @@ config GENERIC_IRQ_DEBUGFS
 
 	  If you don't know what to do here, say N.
 
+config FREQUENT_IRQ_DEBUG
+	bool "Support for finding and reporting frequent interrupt"
+	help
+	  This is a mechanism to detect and report that interrupts
+	  are triggered too frequently.
+
+config COUNT_PER_SECOND
+	int "Interrupt limit per second"
+	depends on FREQUENT_IRQ_DEBUG
+	range 1 65535
+	default "2000"
+	help
+	  This is the limit on the number of interrupts triggered per second.
+	  (Max 0xffff)
+
+config DURATION_LIMIT
+	int "Duration limit"
+	depends on FREQUENT_IRQ_DEBUG
+	range 1 65535
+	default "30"
+	help
+	  Number of consecutive times an interrupt must exceed the per-second
+	  limit before it is reported as triggering too frequently.
+	  (Max 0xffff)
+
 endmenu
 
 config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..a8f606567b21 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -22,6 +22,16 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
 
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+#define COUNT_PER_SECOND_MASK	0x0000ffff	/* gap_count bits 0-15: interrupts counted so far */
+#define DURATION_LIMIT_MASK		0xffff0000	/* gap_count bits 16-31: over-limit rounds seen */
+#define DURATION_LIMIT_COUNT	0x00010000	/* adding this bumps the bits 16-31 field by one */
+#define DURATION_LIMIT_OFFSET	16	/* shift that extracts the bits 16-31 field */
+static unsigned int count_per_second = CONFIG_COUNT_PER_SECOND;	/* compared against bits 0-15 */
+static unsigned int duration_limit = CONFIG_DURATION_LIMIT;	/* compared against bits 16-31 */
+static bool disable_frequent_irq;	/* when true, a reported irq is also passed to irq_disable() */
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
 /*
  * We wait here for a poller to finish.
  *
@@ -189,18 +199,16 @@ static inline int bad_action_ret(irqreturn_t action_ret)
  * (The other 100-of-100,000 interrupts may have been a correctly
  *  functioning device sharing an IRQ with the failing one)
  */
-static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret, const char *msg)
 {
 	unsigned int irq = irq_desc_get_irq(desc);
 	struct irqaction *action;
 	unsigned long flags;
 
 	if (bad_action_ret(action_ret)) {
-		printk(KERN_ERR "irq event %d: bogus return value %x\n",
-				irq, action_ret);
+		printk(msg, irq, action_ret);
 	} else {
-		printk(KERN_ERR "irq %d: nobody cared (try booting with "
-				"the \"irqpoll\" option)\n", irq);
+		printk(msg, irq);
 	}
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
@@ -228,7 +236,7 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
 
 	if (count > 0) {
 		count--;
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq event %d: bogus return value %x\n");
 	}
 }
 
@@ -269,6 +277,46 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
 
 #define SPURIOUS_DEFERRED	0x80000000
 
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+/*
+ * Detect an interrupt that fires at least count_per_second times within one
+ * second for duration_limit consecutive windows.  gap_count packs two fields:
+ * the low 16 bits count interrupts in the current window, the high 16 bits
+ * count consecutive over-limit windows; gap_time is the window start (jiffies).
+ * The interrupt is reported once and disabled only if disable_frequent_irq.
+ */
+static void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{
+	if (desc->have_reported)
+		return;
+
+	/* An empty per-window counter means this interrupt opens a new window. */
+	if ((desc->gap_count & COUNT_PER_SECOND_MASK) == 0)
+		desc->gap_time = get_jiffies_64();
+
+	desc->gap_count++;
+	if ((desc->gap_count & COUNT_PER_SECOND_MASK) < count_per_second)
+		return;
+
+	/* Window full: keep the streak only if it filled in under a second. */
+	if ((get_jiffies_64() - desc->gap_time) < HZ)
+		desc->gap_count = (desc->gap_count & DURATION_LIMIT_MASK) + DURATION_LIMIT_COUNT;
+	else
+		desc->gap_count = 0;
+	if ((desc->gap_count >> DURATION_LIMIT_OFFSET) < duration_limit)
+		return;
+	__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: triggered too frequently\n");
+	desc->have_reported = true;
+	if (disable_frequent_irq)
+		irq_disable(desc);
+}
+
+#else
+/* Built without CONFIG_FREQUENT_IRQ_DEBUG: the call in note_interrupt() does nothing. */
+static inline void report_frequent_irq(struct irq_desc *desc, irqreturn_t action_ret)
+{ }
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
+
 void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 {
 	unsigned int irq;
@@ -282,6 +330,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		return;
 	}
 
+	report_frequent_irq(desc, action_ret);
+
 	/*
 	 * We cannot call note_interrupt from the threaded handler
 	 * because we need to look at the compound of all handlers
@@ -416,7 +466,8 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
 		/*
 		 * The interrupt is stuck
 		 */
-		__report_bad_irq(desc, action_ret);
+		__report_bad_irq(desc, action_ret, KERN_ERR "irq %d: nobody cared (try "
+				"booting with the \"irqpoll\" option)\n");
 		/*
 		 * Now kill the IRQ
 		 */
@@ -476,3 +527,59 @@ static int __init irqpoll_setup(char *str)
 }
 
 __setup("irqpoll", irqpoll_setup);
+
+#ifdef CONFIG_FREQUENT_IRQ_DEBUG
+/* Boot-time handler for "count_per_second=": returns 1 if accepted, 0 if not. */
+static int __init count_per_second_setup(char *str)
+{
+	unsigned int limit;
+
+	/* The per-window counter is only 16 bits wide, so larger limits never trip. */
+	if (kstrtouint(str, 10, &limit) || limit > COUNT_PER_SECOND_MASK)
+		return 0;
+	count_per_second = limit;
+	printk(KERN_INFO "Interrupt limit per second: %u\n", count_per_second);
+	return 1;
+}
+
+__setup("count_per_second=", count_per_second_setup);
+module_param(count_per_second, uint, 0644);
+MODULE_PARM_DESC(count_per_second, "Interrupt limit per second. (Max 0xffff)");
+
+/* Boot-time handler for "duration_limit=": returns 1 if accepted, 0 if not. */
+static int __init duration_limit_setup(char *str)
+{
+	unsigned int limit;
+
+	/* The over-limit counter is the upper 16 bits of gap_count. */
+	if (kstrtouint(str, 10, &limit) || limit > (DURATION_LIMIT_MASK >> DURATION_LIMIT_OFFSET))
+		return 0;
+	duration_limit = limit;
+	printk(KERN_INFO "Duration limit: %u\n", duration_limit);
+	return 1;
+}
+
+__setup("duration_limit=", duration_limit_setup);
+module_param(duration_limit, uint, 0644);
+MODULE_PARM_DESC(duration_limit, "Consecutive over-limit periods before an interrupt is reported. (Max 0xffff)");
+
+/* Boot-time handler for "disable_frequent_irq=": returns 1 if parsed, 0 if not. */
+static int __init disable_frequent_irq_setup(char *str)
+{
+	int ret;
+
+	ret = kstrtobool(str, &disable_frequent_irq);
+	if (ret)
+		return 0;
+
+	if (disable_frequent_irq)
+		printk(KERN_INFO "Disable frequent irq\n");
+	else
+		printk(KERN_INFO "Don't disable frequent irq\n");
+	return 1;
+}
+
+__setup("disable_frequent_irq=", disable_frequent_irq_setup);
+module_param(disable_frequent_irq, bool, 0644);
+MODULE_PARM_DESC(disable_frequent_irq, "Disable frequent irq when true");
+#endif /* CONFIG_FREQUENT_IRQ_DEBUG */
-- 
2.20.1


Thanks,

             Zhang Xincheng

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ