[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <5a547a75b621e9919cd7f4706306c0e0@codeaurora.org>
Date: Sat, 05 Jun 2021 08:02:13 +0530
From: Sai Prakash Ranjan <saiprakash.ranjan@...eaurora.org>
To: tglx@...utronix.de
Cc: Jisheng.Zhang@...aptics.com, afzal.mohd.ma@...il.com,
akpm@...ux-foundation.org, corbet@....net, gpiccoli@...onical.com,
gustavo@...eddedor.com, helgaas@...nel.org, ilina@...eaurora.org,
kernelfans@...il.com, kexec@...ts.infradead.org,
linus.walleij@...aro.org, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org, maz@...nel.org,
mike.kravetz@...cle.com, mkshah@...eaurora.org, oneukum@...e.com,
pawan.kumar.gupta@...ux.intel.com, peterz@...radead.org,
pmladek@...e.com, viro@...iv.linux.org.uk
Subject: Re: [PATCH 0/3] warn and suppress irqflood
Hi Thomas,
On 2021-03-02 13:15, Sai Prakash Ranjan wrote:
> Hi Thomas,
>
>> On Wed, Oct 28, 2020 at 12:58:41PM +0100, Thomas Gleixner wrote:
>>
>
> <snip>...
>
>> Something like the completly untested below should work independent of
>> config options.
>>
>> Thanks,
>>
>> tglx
>> ---
>> include/linux/irqdesc.h | 4 ++
>> kernel/irq/manage.c | 3 +
>> kernel/irq/spurious.c | 74
>> +++++++++++++++++++++++++++++++++++-------------
>> 3 files changed, 61 insertions(+), 20 deletions(-)
>>
>> --- a/include/linux/irqdesc.h
>> +++ b/include/linux/irqdesc.h
>> @@ -30,6 +30,8 @@ struct pt_regs;
>> * @tot_count: stats field for non-percpu irqs
>> * @irq_count: stats field to detect stalled irqs
>> * @last_unhandled: aging timer for unhandled count
>> + * @storm_count: Counter for irq storm detection
>> + * @storm_checked: Timestamp for irq storm detection
>> * @irqs_unhandled: stats field for spurious unhandled interrupts
>> * @threads_handled: stats field for deferred spurious detection of
>> threaded handlers
>> * @threads_handled_last: comparator field for deferred spurious
>> detection of theraded handlers
>> @@ -65,6 +67,8 @@ struct irq_desc {
>> unsigned int tot_count;
>> unsigned int irq_count; /* For detecting broken IRQs */
>> unsigned long last_unhandled; /* Aging timer for unhandled count */
>> + unsigned long storm_count;
>> + unsigned long storm_checked;
>> unsigned int irqs_unhandled;
>> atomic_t threads_handled;
>> int threads_handled_last;
>> --- a/kernel/irq/manage.c
>> +++ b/kernel/irq/manage.c
>> @@ -1581,6 +1581,9 @@ static int
>> if (!shared) {
>> init_waitqueue_head(&desc->wait_for_threads);
>>
>> + /* Take a timestamp for interrupt storm detection */
>> + desc->storm_checked = jiffies;
>> +
>> /* Setup the type (level, edge polarity) if configured: */
>> if (new->flags & IRQF_TRIGGER_MASK) {
>> ret = __irq_set_trigger(desc,
>> --- a/kernel/irq/spurious.c
>> +++ b/kernel/irq/spurious.c
>> @@ -21,6 +21,7 @@ static void poll_spurious_irqs(struct ti
>> static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
>> static int irq_poll_cpu;
>> static atomic_t irq_poll_active;
>> +static unsigned long irqstorm_limit __ro_after_init;
>>
>> /*
>> * We wait here for a poller to finish.
>> @@ -189,18 +190,21 @@ static inline int bad_action_ret(irqretu
>> * (The other 100-of-100,000 interrupts may have been a correctly
>> * functioning device sharing an IRQ with the failing one)
>> */
>> -static void __report_bad_irq(struct irq_desc *desc, irqreturn_t
>> action_ret)
>> +static void __report_bad_irq(struct irq_desc *desc, irqreturn_t
>> action_ret,
>> + bool storm)
>> {
>> unsigned int irq = irq_desc_get_irq(desc);
>> struct irqaction *action;
>> unsigned long flags;
>>
>> - if (bad_action_ret(action_ret)) {
>> - printk(KERN_ERR "irq event %d: bogus return value %x\n",
>> - irq, action_ret);
>> - } else {
>> - printk(KERN_ERR "irq %d: nobody cared (try booting with "
>> + if (!storm) {
>> + if (bad_action_ret(action_ret)) {
>> + pr_err("irq event %d: bogus return value %x\n",
>> + irq, action_ret);
>> + } else {
>> + pr_err("irq %d: nobody cared (try booting with "
>> "the \"irqpoll\" option)\n", irq);
>> + }
>> }
>> dump_stack();
>> printk(KERN_ERR "handlers:\n");
>> @@ -228,7 +232,7 @@ static void report_bad_irq(struct irq_de
>>
>> if (count > 0) {
>> count--;
>> - __report_bad_irq(desc, action_ret);
>> + __report_bad_irq(desc, action_ret, false);
>> }
>> }
>>
>> @@ -267,6 +271,33 @@ try_misrouted_irq(unsigned int irq, stru
>> return action && (action->flags & IRQF_IRQPOLL);
>> }
>>
>> +static void disable_stuck_irq(struct irq_desc *desc, irqreturn_t
>> action_ret,
>> + const char *reason, bool storm)
>> +{
>> + __report_bad_irq(desc, action_ret, storm);
>> + pr_emerg("Disabling %s IRQ #%d\n", reason, irq_desc_get_irq(desc));
>> + desc->istate |= IRQS_SPURIOUS_DISABLED;
>> + desc->depth++;
>> + irq_disable(desc);
>> +}
>> +
>> +/* Interrupt storm detector for runaway interrupts (handled or not).
>> */
>> +static bool irqstorm_detected(struct irq_desc *desc)
>> +{
>> + unsigned long now = jiffies;
>> +
>> + if (++desc->storm_count < irqstorm_limit) {
>> + if (time_after(now, desc->storm_checked + HZ)) {
>> + desc->storm_count = 0;
>> + desc->storm_checked = now;
>> + }
>> + return false;
>> + }
>> +
>> + disable_stuck_irq(desc, IRQ_NONE, "runaway", true);
>> + return true;
>> +}
>> +
>> #define SPURIOUS_DEFERRED 0x80000000
>>
>> void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
>> @@ -403,24 +434,16 @@ void note_interrupt(struct irq_desc *des
>> desc->irqs_unhandled -= ok;
>> }
>>
>> + if (unlikely(irqstorm_limit && irqstorm_detected(desc)))
>> + return;
>> +
>> desc->irq_count++;
>> if (likely(desc->irq_count < 100000))
>> return;
>>
>> desc->irq_count = 0;
>> if (unlikely(desc->irqs_unhandled > 99900)) {
>> - /*
>> - * The interrupt is stuck
>> - */
>> - __report_bad_irq(desc, action_ret);
>> - /*
>> - * Now kill the IRQ
>> - */
>> - printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
>> - desc->istate |= IRQS_SPURIOUS_DISABLED;
>> - desc->depth++;
>> - irq_disable(desc);
>> -
>> + disable_stuck_irq(desc, action_ret, "unhandled", false);
>> mod_timer(&poll_spurious_irq_timer,
>> jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
>> }
>> @@ -462,5 +485,16 @@ static int __init irqpoll_setup(char *st
>> "performance\n");
>> return 1;
>> }
>> -
>> __setup("irqpoll", irqpoll_setup);
>> +
>> +static int __init irqstorm_setup(char *arg)
>> +{
>> + int res = kstrtoul(arg, 0, &irqstorm_limit);
>> +
>> + if (!res) {
>> + pr_info("Interrupt storm detector enabled. Limit=%lu / s\n",
>> + irqstorm_limit);
>> + }
>> + return !!res;
>> +}
>> +__setup("irqstorm_limit", irqstorm_setup);
>
> This irq storm detection feature is very useful, any chance to get this
> merged?
> We will be happy to test. People seem to be having their own copy of
> such feature
> out-of-tree [1].
>
> [1]
> https://elinux.org/images/d/de/Oct28_InterruptStormDetectionFeature_KentoKobayashi.pdf
>
Any chance of having this useful debug feature in upstream kernel?
Thanks,
Sai
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member
of Code Aurora Forum, hosted by The Linux Foundation
Powered by blists - more mailing lists