linux-kernel - Re: [BUG nohz]: wrong user and system time accounting

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANRm+Czqs+Rc=O6wmxnKm2ETu8MT7A2=QZGuEtxJqJh0SLxs2A@mail.gmail.com>
Date:   Mon, 15 May 2017 16:17:10 +0800
From:   Wanpeng Li <kernellwp@...il.com>
To:     Frederic Weisbecker <fweisbec@...il.com>
Cc:     Thomas Gleixner <tglx@...utronix.de>,
        Mike Galbraith <efault@....de>, Rik van Riel <riel@...hat.com>,
        Luiz Capitulino <lcapitulino@...hat.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Paolo Bonzini <pbonzini@...hat.com>
Subject: Re: [BUG nohz]: wrong user and system time accounting

Ping,
2017-05-02 18:01 GMT+08:00 Wanpeng Li <kernellwp@...il.com>:
> Cc Paolo,
> 2017-04-13 21:32 GMT+08:00 Frederic Weisbecker <fweisbec@...il.com>:
>> On Thu, Apr 13, 2017 at 12:31:12PM +0800, Wanpeng Li wrote:
>>> 2017-04-12 22:57 GMT+08:00 Thomas Gleixner <tglx@...utronix.de>:
>>> > On Wed, 12 Apr 2017, Frederic Weisbecker wrote:
>>> >> On Tue, Apr 11, 2017 at 04:22:48PM +0200, Thomas Gleixner wrote:
>>> >> > It's not different from the current jiffies based stuff at all. Same
>>> >> > failure mode.
>>> >>
>>> >> Yes you're right, I got confused again. So to fix this we could do our snapshots
>>> >> at a frequency lower than HZ but still high enough to avoid overhead.
>>> >>
>>> >> Something like TICK_NSEC / 2 ?
>>> >
>>> > If you are using TSC anyway then you can do proper accumulation for both
>>> > system and user and only account the data when the accumulation is more
>>> > than a jiffie.
>>>
>>> So I implement it as below:
>>>
>>> - HZ=1000.
>>>   1) two cpu hogs on cpu in nohz_full mode, 100% user time
>>>   2) Luzi's testcase, ~95% user time, ~5% idle time (as we expected)
>>> - HZ=250
>>>    1) two cpu hogs on cpu in nohz_full mode, 100% user time
>>>    2) Luzi's testcase, 100% idle
>>>
>>> So the codes below still not work correctly for HZ=250, any suggestions?
>>
>> Right, so first lets reorder that code a bit so we can see clear inside :-)
>>
>>>
>>> -------------------------------------->8-----------------------------------------------------
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index d67eee8..6a11771 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -668,6 +668,8 @@ struct task_struct {
>>>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>>>      seqcount_t            vtime_seqcount;
>>>      unsigned long long        vtime_snap;
>>> +    u64                vtime_acct_utime;
>>> +    u64                vtime_acct_stime;
>>
>> You need to accumulate guest and steal time as well.
>>
>
> Hi Frederic,
>
> Sorry for the delay since I'm too busy recently, I just add guest time
> and idle time accumulations as below, the code work as we expected for
> native kernel, however, the testcase fails when it runs in kvm guest.
> Top shows ~99% sys for Luzi's testcase "./acct-bug 1 995" which we
> expect 95% user  and %5 idle. In addition, what's the design idea of
> steal time accumluation in your mind? Pass the tsk parameter in the
> function get_vtime_delta() down to the function
> steal_account_process_time()?
>
> -------------------------------------->8-----------------------------------------------------
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4cf9a59..56815cd 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -672,6 +672,10 @@ struct task_struct {
>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>      seqcount_t            vtime_seqcount;
>      unsigned long long        vtime_snap;
> +    u64                vtime_acct_utime;
> +    u64                vtime_acct_stime;
> +    u64                vtime_acct_idle_time;
> +    u64                vtime_acct_guest_time;
>      enum {
>          /* Task is sleeping or running in a CPU with VTIME inactive: */
>          VTIME_INACTIVE = 0,
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index f3778e2b..2d950c6 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -676,18 +676,19 @@ void thread_group_cputime_adjusted(struct
> task_struct *p, u64 *ut, u64 *st)
>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>  static u64 vtime_delta(struct task_struct *tsk)
>  {
> -    unsigned long now = READ_ONCE(jiffies);
> +    unsigned long long clock;
>
> -    if (time_before(now, (unsigned long)tsk->vtime_snap))
> +    clock = sched_clock();
> +    if (clock < tsk->vtime_snap)
>          return 0;
>
> -    return jiffies_to_nsecs(now - tsk->vtime_snap);
> +    return clock - tsk->vtime_snap;
>  }
>
>  static u64 get_vtime_delta(struct task_struct *tsk)
>  {
> -    unsigned long now = READ_ONCE(jiffies);
> -    u64 delta, other;
> +    u64 delta = vtime_delta(tsk);
> +    u64 other;
>
>      /*
>       * Unlike tick based timing, vtime based timing never has lost
> @@ -696,17 +697,16 @@ static u64 get_vtime_delta(struct task_struct *tsk)
>       * elapsed time. Limit account_other_time to prevent rounding
>       * errors from causing elapsed vtime to go negative.
>       */
> -    delta = jiffies_to_nsecs(now - tsk->vtime_snap);
>      other = account_other_time(delta);
>      WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
> -    tsk->vtime_snap = now;
> +    tsk->vtime_snap += delta;
>
>      return delta - other;
>  }
>
>  static void __vtime_account_system(struct task_struct *tsk)
>  {
> -    account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
> +    account_system_time(tsk, irq_count(), tsk->vtime_acct_stime);
>  }
>
>  void vtime_account_system(struct task_struct *tsk)
> @@ -715,7 +715,11 @@ void vtime_account_system(struct task_struct *tsk)
>          return;
>
>      write_seqcount_begin(&tsk->vtime_seqcount);
> -    __vtime_account_system(tsk);
> +    tsk->vtime_acct_stime += get_vtime_delta(tsk);
> +    if (tsk->vtime_acct_stime >= TICK_NSEC) {
> +        __vtime_account_system(tsk);
> +        tsk->vtime_acct_stime = 0;
> +    }
>      write_seqcount_end(&tsk->vtime_seqcount);
>  }
>
> @@ -723,16 +727,22 @@ void vtime_account_user(struct task_struct *tsk)
>  {
>      write_seqcount_begin(&tsk->vtime_seqcount);
>      tsk->vtime_snap_whence = VTIME_SYS;
> -    if (vtime_delta(tsk))
> -        account_user_time(tsk, get_vtime_delta(tsk));
> +    tsk->vtime_acct_utime += get_vtime_delta(tsk);
> +    if (tsk->vtime_acct_utime >= TICK_NSEC) {
> +        account_user_time(tsk, tsk->vtime_acct_utime);
> +        tsk->vtime_acct_utime = 0;
> +    }
>      write_seqcount_end(&tsk->vtime_seqcount);
>  }
>
>  void vtime_user_enter(struct task_struct *tsk)
>  {
>      write_seqcount_begin(&tsk->vtime_seqcount);
> -    if (vtime_delta(tsk))
> +    tsk->vtime_acct_stime += get_vtime_delta(tsk);
> +    if (tsk->vtime_acct_stime >= TICK_NSEC) {
>          __vtime_account_system(tsk);
> +        tsk->vtime_acct_stime = 0;
> +    }
>      tsk->vtime_snap_whence = VTIME_USER;
>      write_seqcount_end(&tsk->vtime_seqcount);
>  }
> @@ -747,8 +757,11 @@ void vtime_guest_enter(struct task_struct *tsk)
>       * that can thus safely catch up with a tickless delta.
>       */
>      write_seqcount_begin(&tsk->vtime_seqcount);
> -    if (vtime_delta(tsk))
> +    tsk->vtime_acct_stime += get_vtime_delta(tsk);
> +    if (tsk->vtime_acct_stime >= TICK_NSEC) {
>          __vtime_account_system(tsk);
> +        tsk->vtime_acct_stime = 0;
> +    }
>      current->flags |= PF_VCPU;
>      write_seqcount_end(&tsk->vtime_seqcount);
>  }
> @@ -757,7 +770,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_enter);
>  void vtime_guest_exit(struct task_struct *tsk)
>  {
>      write_seqcount_begin(&tsk->vtime_seqcount);
> -    __vtime_account_system(tsk);
> +    tsk->vtime_acct_stime += get_vtime_delta(tsk);
> +    if (tsk->vtime_acct_stime >= TICK_NSEC) {
> +        __vtime_account_system(tsk);
> +        tsk->vtime_acct_stime = 0;
> +    }
>      current->flags &= ~PF_VCPU;
>      write_seqcount_end(&tsk->vtime_seqcount);
>  }
> @@ -765,7 +782,11 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit);
>
>  void vtime_account_idle(struct task_struct *tsk)
>  {
> -    account_idle_time(get_vtime_delta(tsk));
> +    tsk->vtime_acct_idle_time += get_vtime_delta(tsk);
> +    if (tsk->vtime_acct_idle_time >= TICK_NSEC) {
> +        account_idle_time(tsk->vtime_acct_idle_time);
> +        tsk->vtime_acct_idle_time = 0;
> +    }
>  }
>
>  void arch_vtime_task_switch(struct task_struct *prev)
> @@ -776,7 +797,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
>
>      write_seqcount_begin(&current->vtime_seqcount);
>      current->vtime_snap_whence = VTIME_SYS;
> -    current->vtime_snap = jiffies;
> +    current->vtime_snap = sched_clock_cpu(smp_processor_id());
>      write_seqcount_end(&current->vtime_seqcount);
>  }
>
> @@ -787,7 +808,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
>      local_irq_save(flags);
>      write_seqcount_begin(&t->vtime_seqcount);
>      t->vtime_snap_whence = VTIME_SYS;
> -    t->vtime_snap = jiffies;
> +    t->vtime_snap = sched_clock_cpu(cpu);
>      write_seqcount_end(&t->vtime_seqcount);
>      local_irq_restore(flags);
>  }
>
> Regards,
> Wanpeng Li