Message-ID: <48CE6E2E.4090106@users.sourceforge.net>
Date:	Mon, 15 Sep 2008 10:16:14 -0400
From:	Elad Lahav <elad_lahav@...rs.sourceforge.net>
To:	linux-kernel@...r.kernel.org
Subject: Re: Soft IRQ statistics under /proc/stat

> I've been observing some oddities in the statistics produced by mpstat 
> with respect to soft IRQs (for example, considerable soft IRQ time on 
> processors sending UDP packets on dummy NICs).

Here's some data to support my claims.
The first experiment consists of sending UDP packets on a dummy network interface. No 
interrupts are generated, so there should be no soft IRQs. Nevertheless, /proc/stat shows 
that a considerable share of CPU time is taken by soft IRQs:

CPU   %user   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
   0    4.52    0.00   67.84    0.00    0.00   27.64    0.00    0.00    0.00
   0    4.00    0.00   70.00    0.00    0.00   26.00    0.00    0.00    0.00
   0    4.98    0.00   68.16    0.00    0.00   26.87    0.00    0.00    0.00
   0    4.02    0.00   69.85    0.00    0.00   26.13    0.00    0.00    0.00
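
For anyone who wants to reproduce this: the traffic source is nothing more than a tight UDP
send loop aimed at an address routed through the dummy interface. A minimal sketch follows;
the 10.0.0.2 destination and the "modprobe dummy" setup are only illustrative.

/*
 * Minimal UDP blast towards a dummy interface -- rough sketch only.
 * Assumed setup:   modprobe dummy
 *                  ifconfig dummy0 10.0.0.1 netmask 255.255.255.0 up
 * so that packets to 10.0.0.2 are routed out via dummy0 and silently
 * dropped by the dummy driver (no hardware interrupts involved).
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
	char buf[1024];
	struct sockaddr_in dst;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	memset(buf, 0, sizeof(buf));
	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9);			/* discard port */
	dst.sin_addr.s_addr = inet_addr("10.0.0.2");

	for (;;)
		sendto(s, buf, sizeof(buf), 0,
		       (struct sockaddr *)&dst, sizeof(dst));
}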

In a second experiment, UDP packets are sent over a real NIC by a process pinned to CPU 0, 
while the respective network interrupts are pinned to CPU 2. Here, you can see that CPU 0 
is executing soft IRQs, despite the interrupt affinity:

CPU   %user   %nice    %sys %iowait    %irq   %soft  %steal  %guest   %idle
   0    4.02    0.00   63.82    0.00    0.00   32.16    0.00    0.00    0.00
   2    0.00    0.00    0.00    0.00    6.47   40.30    0.00    0.00   53.23
   0    2.48    0.00   67.33    0.00    0.00   30.20    0.00    0.00    0.00
   2    0.00    0.00    0.00    0.00    6.47   41.79    0.00    0.00   51.74

I have verified, from /proc/interrupts, that in both cases the number of interrupts per 
second on CPU 0 is negligible.
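
The pinning itself uses the standard mechanisms: sched_setaffinity() (or taskset) for the
sending process, and /proc/irq/<N>/smp_affinity for the NIC interrupt. Here is a rough
sketch of a tiny launcher that pins itself to CPU 0 and then runs the sender (illustrative
only):

/*
 * Pin to CPU 0, then exec the given command -- like "taskset -c 0 CMD".
 * The interrupt side is pinned separately, e.g.
 *     echo 4 > /proc/irq/<N>/smp_affinity    (mask 0x4 == CPU 2)
 * where <N> is the NIC's interrupt number from /proc/interrupts.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);			/* CPU 0 only */
	if (sched_setaffinity(0, sizeof(set), &set) == 0 && argc > 1)
		execvp(argv[1], &argv[1]);	/* run the sender pinned */
	return 1;
}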

Next, I modified the kernel code to raise a per-CPU flag at the beginning of __do_softirq()
and clear it at the end. Using this flag, account_system_time() can differentiate between a
"true" soft IRQ and code running under local_bh_disable(). The results of the first
experiment (dummy NIC) are as follows:

CPU   %user   %nice    %sys %iowait    %irq   %soft  %steal  %guest %bh_dis   %idle
   0    4.50    0.00   71.00    0.00    0.00    0.00    0.00    0.00   24.50    0.00
   0    4.00    0.00   67.00    0.00    0.00    0.00    0.00    0.00   29.00    0.00
   0    3.98    0.00   69.15    0.00    0.00    0.00    0.00    0.00   26.87    0.00
   0    3.50    0.00   69.00    0.00    0.00    0.00    0.00    0.00   27.50    0.00

where bh_dis refers to code executing with softirq_count() greater than 0, but with the 
soft IRQ flag cleared. The results for the second experiment (real NIC):

CPU   %user   %nice    %sys %iowait    %irq   %soft  %steal  %guest %bh_dis   %idle
   0    3.00    0.00   67.50    0.00    0.00    0.00    0.00    0.00   29.50    0.00
   2    0.00    0.00    0.00    0.00    3.48   40.30    0.00    0.00    0.00   56.22
   0    1.99    0.00   66.67    0.00    0.00    0.00    0.00    0.00   31.34    0.00
   2    0.00    0.00    0.00    0.49    3.45   39.41    0.00    0.00    0.00   56.65

These results make much more sense.

Elad

P.S.

For reference, here is the patch I applied to the kernel in order to differentiate between
"true" and "false" soft IRQs (I am not suggesting it as a permanent patch; it's just for
testing and verification purposes):

diff -u -r --exclude='*.o' --exclude='*.cmd' linux-2.6.26.5/fs/proc/proc_misc.c linux-2.6.26.5-bh/fs/proc/proc_misc.c
--- linux-2.6.26.5/fs/proc/proc_misc.c  2008-09-08 13:40:20.000000000 -0400
+++ linux-2.6.26.5-bh/fs/proc/proc_misc.c       2008-09-11 19:39:52.000000000 -0400
@@ -478,6 +478,7 @@
         unsigned long jif;
         cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
         cputime64_t guest;
+       cputime64_t bh_disabled;
         u64 sum = 0;
         struct timespec boottime;
         unsigned int *per_irq_sum;
@@ -489,6 +490,7 @@
         user = nice = system = idle = iowait =
                 irq = softirq = steal = cputime64_zero;
         guest = cputime64_zero;
+       bh_disabled = cputime64_zero;
         getboottime(&boottime);
         jif = boottime.tv_sec;

@@ -504,6 +506,8 @@
                 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
                 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
                 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+               bh_disabled = cputime64_add(bh_disabled,
+                                           kstat_cpu(i).cpustat.bh_disabled);
                 for (j = 0; j < NR_IRQS; j++) {
                         unsigned int temp = kstat_cpu(i).irqs[j];
                         sum += temp;
@@ -511,7 +515,7 @@
                 }
         }

-       seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+       seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
                 (unsigned long long)cputime64_to_clock_t(user),
                 (unsigned long long)cputime64_to_clock_t(nice),
                 (unsigned long long)cputime64_to_clock_t(system),
@@ -520,7 +524,8 @@
                 (unsigned long long)cputime64_to_clock_t(irq),
                 (unsigned long long)cputime64_to_clock_t(softirq),
                 (unsigned long long)cputime64_to_clock_t(steal),
-               (unsigned long long)cputime64_to_clock_t(guest));
+               (unsigned long long)cputime64_to_clock_t(guest),
+               (unsigned long long)cputime64_to_clock_t(bh_disabled));
         for_each_online_cpu(i) {

                 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -533,8 +538,9 @@
                 softirq = kstat_cpu(i).cpustat.softirq;
                 steal = kstat_cpu(i).cpustat.steal;
                 guest = kstat_cpu(i).cpustat.guest;
+               bh_disabled = kstat_cpu(i).cpustat.bh_disabled;
                 seq_printf(p,
-                       "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+                       "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
                         i,
                         (unsigned long long)cputime64_to_clock_t(user),
                         (unsigned long long)cputime64_to_clock_t(nice),
@@ -544,7 +550,8 @@
                         (unsigned long long)cputime64_to_clock_t(irq),
                         (unsigned long long)cputime64_to_clock_t(softirq),
                         (unsigned long long)cputime64_to_clock_t(steal),
-                       (unsigned long long)cputime64_to_clock_t(guest));
+                       (unsigned long long)cputime64_to_clock_t(guest),
+                       (unsigned long long)cputime64_to_clock_t(bh_disabled));
         }
         seq_printf(p, "intr %llu", (unsigned long long)sum);

diff -u -r --exclude='*.o' --exclude='*.cmd' linux-2.6.26.5/include/linux/kernel_stat.h linux-2.6.26.5-bh/include/linux/kernel_stat.h
--- linux-2.6.26.5/include/linux/kernel_stat.h  2008-09-08 13:40:20.000000000 -0400
+++ linux-2.6.26.5-bh/include/linux/kernel_stat.h       2008-09-11 19:32:00.000000000 -0400
@@ -24,6 +24,7 @@
         cputime64_t iowait;
         cputime64_t steal;
         cputime64_t guest;
+       cputime64_t bh_disabled;
  };

  struct kernel_stat {
@@ -32,6 +33,7 @@
  };

  DECLARE_PER_CPU(struct kernel_stat, kstat);
+DECLARE_PER_CPU(int, in_softirq);

  #define kstat_cpu(cpu) per_cpu(kstat, cpu)
  /* Must have preemption disabled for this to be meaningful. */
diff -u -r --exclude='*.o' --exclude='*.cmd' linux-2.6.26.5/kernel/sched.c linux-2.6.26.5-bh/kernel/sched.c
--- linux-2.6.26.5/kernel/sched.c       2008-09-08 13:40:20.000000000 -0400
+++ linux-2.6.26.5-bh/kernel/sched.c    2008-09-11 20:56:31.000000000 -0400
@@ -3950,8 +3950,12 @@
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+       else if (softirq_count()) {
+               if (__get_cpu_var(in_softirq))
+                       cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+               else
+                       cpustat->bh_disabled = cputime64_add(cpustat->bh_disabled, tmp);
+       }
         else if (p != rq->idle)
                 cpustat->system = cputime64_add(cpustat->system, tmp);
         else if (atomic_read(&rq->nr_iowait) > 0)
diff -u -r --exclude='*.o' --exclude='*.cmd' linux-2.6.26.5/kernel/softirq.c linux-2.6.26.5-bh/kernel/softirq.c
--- linux-2.6.26.5/kernel/softirq.c     2008-09-08 13:40:20.000000000 -0400
+++ linux-2.6.26.5-bh/kernel/softirq.c  2008-09-11 19:34:33.000000000 -0400
@@ -49,6 +49,7 @@
  static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;

  static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(int, in_softirq);

  /*
   * we cannot loop indefinitely here to avoid userspace starvation,
@@ -214,6 +215,8 @@
         int max_restart = MAX_SOFTIRQ_RESTART;
         int cpu;

+       __get_cpu_var(in_softirq) = 1;
+
         pending = local_softirq_pending();
         account_system_vtime(current);

@@ -251,6 +254,8 @@

         account_system_vtime(current);
         _local_bh_enable();
+
+       __get_cpu_var(in_softirq) = 0;
  }

  #ifndef __ARCH_HAS_DO_SOFTIRQ
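
With the patch applied, the new counter is simply the tenth value on each per-CPU line of
/proc/stat (after guest), so it can be read without teaching mpstat about the extra column.
A quick-and-dirty reader, sketch only and not part of the patch:

/*
 * Print the bh_disabled counter (in USER_HZ ticks) for each CPU, as
 * exposed by the patched /proc/stat above.  Per-CPU lines look like:
 *   cpuN user nice system idle iowait irq softirq steal guest bh_disabled
 */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;

	while (fgets(line, sizeof(line), f)) {
		unsigned long long v[10];
		char cpu[16];

		if (sscanf(line,
			   "cpu%15s %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
			   cpu, &v[0], &v[1], &v[2], &v[3], &v[4],
			   &v[5], &v[6], &v[7], &v[8], &v[9]) == 11)
			printf("cpu%s bh_disabled=%llu\n", cpu, v[9]);
	}
	fclose(f);
	return 0;
}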
