Date: Fri, 2 Feb 2024 22:22:31 +0800
From: Bitao Hu <yaoma@...ux.alibaba.com>
To: Liu Song <liusong@...ux.alibaba.com>, dianders@...omium.org,
akpm@...ux-foundation.org, pmladek@...e.com, kernelfans@...il.com,
yaoma@...ux.alibaba.com
Cc: linux-kernel@...r.kernel.org
Subject: Re: [PATCHv3 2/2] watchdog/softlockup: report the most frequent
interrupts
On 2024/2/1 09:46, Liu Song wrote:
>
> On 2024/2/1 01:17, Bitao Hu wrote:
>> When the watchdog determines that the current soft lockup is due
>> to an interrupt storm based on CPU utilization, reporting the
>> most frequent interrupts could be good enough for further
>> troubleshooting.
>>
>> Below is an example of an interrupt storm. The call tree does not
>> provide useful information, but we can analyze which interrupt
>> caused the soft lockup by comparing the counts of interrupts.
>>
>> [ 2987.488075] watchdog: BUG: soft lockup - CPU#9 stuck for 23s! [kworker/9:1:214]
>> [ 2987.488607] CPU#9 Utilization every 4s during lockup:
>> [ 2987.488941] #1: 0% system, 0% softirq, 100% hardirq, 0% idle
>> [ 2987.489357] #2: 0% system, 0% softirq, 100% hardirq, 0% idle
>> [ 2987.489771] #3: 0% system, 0% softirq, 100% hardirq, 0% idle
>> [ 2987.490186] #4: 0% system, 0% softirq, 100% hardirq, 0% idle
>> [ 2987.490601] #5: 0% system, 0% softirq, 100% hardirq, 0% idle
>> [ 2987.491034] CPU#9 Detect HardIRQ Time exceeds 50%. Most frequent HardIRQs:
>> [ 2987.491493] #1: 330985 irq#7(IPI)
>> [ 2987.491743] #2: 5000 irq#10(arch_timer)
>> [ 2987.492039] #3: 9 irq#91(nvme0q2)
>> [ 2987.492318] #4: 3 irq#118(virtio1-output.12)
>> ...
>> [ 2987.492728] Call trace:
>> [ 2987.492729] __do_softirq+0xa8/0x364
>>
>> Signed-off-by: Bitao Hu <yaoma@...ux.alibaba.com>
>> ---
>> kernel/watchdog.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 156 insertions(+)
>>
>> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
>> index 046507be4eb5..c4c25f25eae7 100644
>> --- a/kernel/watchdog.c
>> +++ b/kernel/watchdog.c
>> @@ -25,6 +25,9 @@
>> #include <linux/stop_machine.h>
>> #include <linux/kernel_stat.h>
>> #include <linux/math64.h>
>> +#include <linux/irq.h>
>> +#include <linux/bitops.h>
>> +#include <linux/irqdesc.h>
>> #include <asm/irq_regs.h>
>> #include <linux/kvm_para.h>
>> @@ -431,11 +434,15 @@ void touch_softlockup_watchdog_sync(void)
>> __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
>> }
>> +static void set_potential_softlockup(unsigned long now, unsigned long touch_ts);
>> +
>> static int is_softlockup(unsigned long touch_ts,
>> unsigned long period_ts,
>> unsigned long now)
>> {
>> if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) &&
>> watchdog_thresh) {
>> + /* Softlockup may occur in the current period */
>> + set_potential_softlockup(now, period_ts);
>> /* Warn about unreasonable delays. */
>> if (time_after(now, period_ts + get_softlockup_thresh()))
>> return now - touch_ts;
>> @@ -462,6 +469,8 @@ static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
>> static DEFINE_PER_CPU(u8, cpustat_utilization[NUM_STATS_GROUPS][NUM_STATS_PER_GROUP]);
>> static DEFINE_PER_CPU(u8, cpustat_tail);
>> +static void print_hardirq_counts(void);
>> +
>> /*
>> * We don't need nanosecond resolution. A granularity of 16ms is
>> * sufficient for our precision, allowing us to use u16 to store
>> @@ -516,10 +525,156 @@ static void print_cpustat(void)
>> __this_cpu_read(cpustat_utilization[i][STATS_HARDIRQ]),
>> __this_cpu_read(cpustat_utilization[i][STATS_IDLE]));
>> }
>> + print_hardirq_counts();
>> +}
>> +
>> +#define HARDIRQ_PERCENT_THRESH 50
>> +#define NUM_HARDIRQ_REPORT 5
>> +static DECLARE_BITMAP(softlockup_hardirq_cpus, CONFIG_NR_CPUS);
>> +static DEFINE_PER_CPU(u32 *, hardirq_counts);
>> +
>> +struct irq_counts {
>> + int irq;
>> + u32 counts;
>> +};
>> +
>> +static void find_counts_top(struct irq_counts *irq_counts, int irq, u32 counts, int range)
>> +{
>> + unsigned int i, j;
>> +
>> + for (i = 0; i < range; i++) {
>> + if (counts > irq_counts[i].counts) {
>> + for (j = range - 1; j > i; j--) {
>> + irq_counts[j].counts = irq_counts[j - 1].counts;
>> + irq_counts[j].irq = irq_counts[j - 1].irq;
>> + }
>> + irq_counts[j].counts = counts;
>> + irq_counts[j].irq = irq;
>> + break;
>> + }
> The current implementation can trigger a reordering on every iteration.
> It would be better to update the entry in place when the new value is
> larger, and sort the array only once, just before printing at the end.
Sure, I will consider optimizing this.
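Something like the untested sketch below: keep the array unordered,
replace the current minimum in place, and sort it once with sort() from
<linux/sort.h> right before printing.

static int cmp_irq_counts(const void *a, const void *b)
{
	const struct irq_counts *x = a, *y = b;

	/* Sort in descending order of counts */
	return (y->counts > x->counts) - (y->counts < x->counts);
}

static void find_counts_top(struct irq_counts *irq_counts, int irq,
			    u32 counts, int range)
{
	unsigned int i, min = 0;

	/* Locate the entry currently holding the smallest count */
	for (i = 1; i < range; i++) {
		if (irq_counts[i].counts < irq_counts[min].counts)
			min = i;
	}
	/* Replace it in place if the new count is larger */
	if (counts > irq_counts[min].counts) {
		irq_counts[min].counts = counts;
		irq_counts[min].irq = irq;
	}
}

And in print_hardirq_counts(), before the reporting loop:

	sort(hardirq_counts_top, NUM_HARDIRQ_REPORT,
	     sizeof(struct irq_counts), cmp_irq_counts, NULL);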
>> + }
>> +}
>> +
>> +/*
>> + * If the proportion of time spent handling irq exceeds HARDIRQ_PERCENT_THRESH%
>> + * during sample_period, then it is necessary to record the counts of each irq.
>> + */
>> +static inline bool need_record_irq_counts(int type)
>> +{
>> + int tail = __this_cpu_read(cpustat_tail);
>> + u8 utilization;
>> +
>> + if (--tail == -1)
>> + tail = 4;
>> + utilization = __this_cpu_read(cpustat_utilization[tail][type]);
>> + return utilization > HARDIRQ_PERCENT_THRESH;
>> +}
>> +
>> +/*
>> + * Mark softlockup as potentially caused by hardirq
>> + */
>> +static void set_potential_softlockup_hardirq(void)
>> +{
>> + u32 i;
>> + u32 *counts = __this_cpu_read(hardirq_counts);
>> + int cpu = smp_processor_id();
>> + struct irq_desc *desc;
>> +
>> + if (!need_record_irq_counts(STATS_HARDIRQ))
>> + return;
>> +
>> + if (!test_bit(cpu, softlockup_hardirq_cpus)) {
>> + counts = kmalloc_array(nr_irqs, sizeof(u32), GFP_ATOMIC);
>> + if (!counts)
>> + return;
>> + for_each_irq_desc(i, desc) {
>> + if (!desc)
>> + continue;
>> + counts[i] = desc->kstat_irqs ?
>> + *this_cpu_ptr(desc->kstat_irqs) : 0;
>> + }
>> + __this_cpu_write(hardirq_counts, counts);
>> + set_bit(cpu, softlockup_hardirq_cpus);
>> + }
>> +}
>> +
>> +static void clear_potential_softlockup_hardirq(void)
>> +{
>> + u32 *counts = __this_cpu_read(hardirq_counts);
>> + int cpu = smp_processor_id();
>> +
>> + if (test_bit(cpu, softlockup_hardirq_cpus)) {
>> + kfree(counts);
>> + counts = NULL;
>> + __this_cpu_write(hardirq_counts, counts);
>> + clear_bit(cpu, softlockup_hardirq_cpus);
>> + }
>> }
>> +
>> +/*
>> + * Mark that softlockup may occur
>> + */
>> +static void set_potential_softlockup(unsigned long now, unsigned long period_ts)
>> +{
>> + if (time_after_eq(now, period_ts + get_softlockup_thresh() / 5))
>> + set_potential_softlockup_hardirq();
>> +}
>> +
>> +static void clear_potential_softlockup(void)
>> +{
>> + clear_potential_softlockup_hardirq();
>> +}
> Given that clear_potential_softlockup will only call
> clear_potential_softlockup_hardirq, then why is there a need to declare
> clear_potential_softlockup separately?
I had considered adding a "clear_potential_softlockup_softirq" as well,
but ultimately dropped it. Indeed, the leftover wrapper is confusing; I
will remove "clear_potential_softlockup".
>> +
>> +static void print_hardirq_counts(void)
>> +{
>> + u32 i;
>> + struct irq_desc *desc;
>> + u32 counts_diff;
>> + u32 *counts = __this_cpu_read(hardirq_counts);
>> + int cpu = smp_processor_id();
>> + struct irq_counts hardirq_counts_top[NUM_HARDIRQ_REPORT] = {
>> + {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0},
>> + };
>> +
>> + if (test_bit(cpu, softlockup_hardirq_cpus)) {
>> + /* Find the top NUM_HARDIRQ_REPORT most frequent interrupts */
>> + for_each_irq_desc(i, desc) {
>> + if (!desc)
>> + continue;
>> + counts_diff = desc->kstat_irqs ?
>> + *this_cpu_ptr(desc->kstat_irqs) - counts[i] : 0;
>> + find_counts_top(hardirq_counts_top, i, counts_diff,
>> + NUM_HARDIRQ_REPORT);
>> + }
>> + /*
>> + * We do not want the "watchdog: " prefix on every line,
>> + * hence we use "printk" instead of "pr_crit".
>> + */
>> + printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
>> + smp_processor_id(), HARDIRQ_PERCENT_THRESH);
>> + for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
>> + if (hardirq_counts_top[i].irq == -1)
>> + break;
>> + desc = irq_to_desc(hardirq_counts_top[i].irq);
>> + if (desc && desc->action)
>> + printk(KERN_CRIT "\t#%u: %-10u\tirq#%d(%s)\n",
>> + i+1, hardirq_counts_top[i].counts,
>> + hardirq_counts_top[i].irq, desc->action->name);
>> + else
>> + printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
>> + i+1, hardirq_counts_top[i].counts,
>> + hardirq_counts_top[i].irq);
>> + }
>> + if (!need_record_irq_counts(STATS_HARDIRQ))
>> + clear_potential_softlockup_hardirq();
>> + }
>> +}
>> +
>> #else
>> static inline void update_cpustat(void) { }
>> static inline void print_cpustat(void) { }
>> +static inline void set_potential_softlockup(unsigned long now, unsigned long period_ts) { }
>> +static inline void clear_potential_softlockup(void) { }
>> #endif
>> /* watchdog detector functions */
>> @@ -537,6 +692,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
>> static int softlockup_fn(void *data)
>> {
>> update_touch_ts();
>> + clear_potential_softlockup();
>> complete(this_cpu_ptr(&softlockup_completion));
>> return 0;