linux-kernel - Re: [BUG nohz]: wrong user and system time accounting

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANRm+Cw8g_4J_Mo-P8ZOMST3+Uh9eOn6o2T020BYt5XYBTjxSA@mail.gmail.com>
Date:   Thu, 13 Apr 2017 12:31:12 +0800
From:   Wanpeng Li <kernellwp@...il.com>
To:     Thomas Gleixner <tglx@...utronix.de>
Cc:     Frederic Weisbecker <fweisbec@...il.com>,
        Mike Galbraith <efault@....de>, Rik van Riel <riel@...hat.com>,
        Luiz Capitulino <lcapitulino@...hat.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Peter Zijlstra <peterz@...radead.org>
Subject: Re: [BUG nohz]: wrong user and system time accounting

2017-04-12 22:57 GMT+08:00 Thomas Gleixner <tglx@...utronix.de>:
> On Wed, 12 Apr 2017, Frederic Weisbecker wrote:
>> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote:
>> > It's not different from the current jiffies based stuff at all. Same
>> > failure mode.
>>
>> Yes you're right, I got confused again. So to fix this we could do our snapshots
>> at a frequency lower than HZ but still high enough to avoid overhead.
>>
>> Something like TICK_NSEC / 2 ?
>
> If you are using TSC anyway then you can do proper accumulation for both
> system and user and only account the data when the accumulation is more
> than a jiffie.

So I implement it as below:

- HZ=1000.
  1) two cpu hogs on cpu in nohz_full mode, 100% user time
  2) Luzi's testcase, ~95% user time, ~5% idle time (as we expected)
- HZ=250
   1) two cpu hogs on cpu in nohz_full mode, 100% user time
   2) Luzi's testcase, 100% idle

So the codes below still not work correctly for HZ=250, any suggestions?

-------------------------------------->8-----------------------------------------------------

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..6a11771 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -668,6 +668,8 @@ struct task_struct {
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
     seqcount_t            vtime_seqcount;
     unsigned long long        vtime_snap;
+    u64                vtime_acct_utime;
+    u64                vtime_acct_stime;
     enum {
         /* Task is sleeping or running in a CPU with VTIME inactive: */
         VTIME_INACTIVE = 0,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b..f8e54ba 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -674,20 +674,41 @@ void thread_group_cputime_adjusted(struct
task_struct *p, u64 *ut, u64 *st)
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct task_struct *tsk, bool user)
 {
-    unsigned long now = READ_ONCE(jiffies);
+    u64 delta, ret = 0;

-    if (time_before(now, (unsigned long)tsk->vtime_snap))
-        return 0;
+    delta = sched_clock() - tsk->vtime_snap;

-    return jiffies_to_nsecs(now - tsk->vtime_snap);
+    if (is_idle_task(tsk)) {
+        if (delta >= TICK_NSEC)
+            ret = delta;
+    } else {
+        if (user) {
+            tsk->vtime_acct_utime += delta;
+            if (tsk->vtime_acct_utime >= TICK_NSEC)
+                ret = tsk->vtime_acct_utime;
+        } else {
+            tsk->vtime_acct_stime += delta;
+            if (tsk->vtime_acct_utime >= TICK_NSEC)
+                ret = tsk->vtime_acct_stime;
+        }
+    }
+
+    return ret;
 }

-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct task_struct *tsk, bool user)
 {
-    unsigned long now = READ_ONCE(jiffies);
-    u64 delta, other;
+    u64 delta = vtime_delta(tsk, user);
+    u64 other;
+
+    if (!is_idle_task(tsk)) {
+        if (user)
+            tsk->vtime_acct_utime = 0;
+        else
+            tsk->vtime_acct_stime = 0;
+    }

     /*
      * Unlike tick based timing, vtime based timing never has lost
@@ -696,22 +717,21 @@ static u64 get_vtime_delta(struct task_struct *tsk)
      * elapsed time. Limit account_other_time to prevent rounding
      * errors from causing elapsed vtime to go negative.
      */
-    delta = jiffies_to_nsecs(now - tsk->vtime_snap);
     other = account_other_time(delta);
     WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-    tsk->vtime_snap = now;
+    tsk->vtime_snap += delta;

     return delta - other;
 }

 static void __vtime_account_system(struct task_struct *tsk)
 {
-    account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+    account_system_time(tsk, irq_count(), get_vtime_delta(tsk, false));
 }

 void vtime_account_system(struct task_struct *tsk)
 {
-    if (!vtime_delta(tsk))
+    if (!vtime_delta(tsk, false))
         return;

     write_seqcount_begin(&tsk->vtime_seqcount);
@@ -723,15 +743,15 @@ void vtime_account_user(struct task_struct *tsk)
 {
     write_seqcount_begin(&tsk->vtime_seqcount);
     tsk->vtime_snap_whence = VTIME_SYS;
-    if (vtime_delta(tsk))
-        account_user_time(tsk, get_vtime_delta(tsk));
+    if (vtime_delta(tsk, true))
+        account_user_time(tsk, get_vtime_delta(tsk, true));
     write_seqcount_end(&tsk->vtime_seqcount);
 }

 void vtime_user_enter(struct task_struct *tsk)
 {
     write_seqcount_begin(&tsk->vtime_seqcount);
-    if (vtime_delta(tsk))
+    if (vtime_delta(tsk, false))
         __vtime_account_system(tsk);
     tsk->vtime_snap_whence = VTIME_USER;
     write_seqcount_end(&tsk->vtime_seqcount);
@@ -747,7 +767,7 @@ void vtime_guest_enter(struct task_struct *tsk)
      * that can thus safely catch up with a tickless delta.
      */
     write_seqcount_begin(&tsk->vtime_seqcount);
-    if (vtime_delta(tsk))
+    if (vtime_delta(tsk, false))
         __vtime_account_system(tsk);
     current->flags |= PF_VCPU;
     write_seqcount_end(&tsk->vtime_seqcount);
@@ -765,7 +785,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);

 void vtime_account_idle(struct task_struct *tsk)
 {
-    account_idle_time(get_vtime_delta(tsk));
+    account_idle_time(get_vtime_delta(tsk, false));
 }

 void arch_vtime_task_switch(struct task_struct *prev)
@@ -776,7 +796,7 @@ void arch_vtime_task_switch(struct task_struct *prev)

     write_seqcount_begin(&current->vtime_seqcount);
     current->vtime_snap_whence = VTIME_SYS;
-    current->vtime_snap = jiffies;
+    current->vtime_snap = sched_clock_cpu(smp_processor_id());
     write_seqcount_end(&current->vtime_seqcount);
 }

@@ -787,7 +807,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
     local_irq_save(flags);
     write_seqcount_begin(&t->vtime_seqcount);
     t->vtime_snap_whence = VTIME_SYS;
-    t->vtime_snap = jiffies;
+    t->vtime_snap = sched_clock_cpu(cpu);
     write_seqcount_end(&t->vtime_seqcount);
     local_irq_restore(flags);
 }
@@ -805,7 +825,7 @@ u64 task_gtime(struct task_struct *t)

         gtime = t->gtime;
         if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
-            gtime += vtime_delta(t);
+            gtime += vtime_delta(t, false);

     } while (read_seqcount_retry(&t->vtime_seqcount, seq));

@@ -819,7 +839,6 @@ u64 task_gtime(struct task_struct *t)
  */
 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-    u64 delta;
     unsigned int seq;

     if (!vtime_accounting_enabled()) {
@@ -838,16 +857,14 @@ void task_cputime(struct task_struct *t, u64
*utime, u64 *stime)
         if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
             continue;

-        delta = vtime_delta(t);
-
         /*
          * Task runs either in user or kernel space, add pending nohz time to
          * the right place.
          */
         if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
-            *utime += delta;
+            *utime += vtime_delta(t, true);
         else if (t->vtime_snap_whence == VTIME_SYS)
-            *stime += delta;
+            *stime += vtime_delta(t, false);
     } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */