Message-ID: <CANRm+Cxe=A4D=tevVdzZC4gD5e0+ZV1UnpBBZ9w7LyFuREehvw@mail.gmail.com>
Date: Fri, 30 Jun 2017 09:52:04 +0800
From: Wanpeng Li <kernellwp@...il.com>
To: Frederic Weisbecker <fweisbec@...il.com>
Cc: LKML <linux-kernel@...r.kernel.org>,
Peter Zijlstra <peterz@...radead.org>,
Thomas Gleixner <tglx@...utronix.de>,
Luiz Capitulino <lcapitulino@...hat.com>,
Ingo Molnar <mingo@...nel.org>, Rik van Riel <riel@...hat.com>
Subject: Re: [PATCH 5/5] sched: Accumulate vtime on top of nsec clocksource
2017-06-30 1:15 GMT+08:00 Frederic Weisbecker <fweisbec@...il.com>:
> From: Wanpeng Li <kernellwp@...il.com>
From: Wanpeng Li <wanpeng.li@...mail.com>
>
> Currently the cputime source used by vtime is jiffies. When we cross
> a context boundary and jiffies have changed since the last snapshot, the
> pending cputime is accounted to the switching out context.
>
> This system works OK if the ticks are not aligned across CPUs. If they
> are instead aligned (i.e. all fire at the same time) and the CPUs run in
> userspace, the jiffies change is only observed on tick exit and therefore
> the user cputime is accounted as system cputime. This is because the
> CPU that maintains timekeeping fires its tick at the same time as the
> others. It updates jiffies in the middle of the tick and the other CPUs
> see that update on IRQ exit:
>
>         CPU 0 (timekeeper)               CPU 1
>        -------------------              -------------
>                     jiffies = N
>        ...                              run in userspace for a jiffy
>        tick entry                       tick entry (sees jiffies = N)
>             set jiffies = N + 1
>        tick exit                        tick exit (sees jiffies = N + 1)
>                                             account 1 jiffy as stime
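
For readers following along, here is a tiny user-space sketch of the
misaccounting above. It is illustrative only, not the kernel code; the
jiffy values and the helper name are made up. Because the jiffies
snapshot only advances inside the tick IRQ, the jiffy actually spent in
userspace ends up charged as stime on the non-timekeeping CPU:

#include <stdio.h>

static unsigned long jiffies;      /* global tick counter, bumped by the timekeeping CPU */
static unsigned long snapshot;     /* per-task vtime snapshot (vtime->starttime) */
static unsigned long utime, stime; /* accounted jiffies */

static unsigned long delta_and_snap(void)
{
        unsigned long d = jiffies - snapshot;

        snapshot = jiffies;
        return d;
}

int main(void)
{
        jiffies = 100;
        snapshot = jiffies;        /* task returns to userspace at jiffy 100 */

        /* ... the task runs one full jiffy in userspace ... */

        utime += delta_and_snap(); /* tick entry: jiffies is still 100, user gets 0 */
        jiffies++;                 /* timekeeping CPU bumps jiffies in the middle of the tick */
        stime += delta_and_snap(); /* tick exit: sees 101, the whole jiffy goes to stime */

        printf("utime=%lu stime=%lu\n", utime, stime); /* prints utime=0 stime=1 */
        return 0;
}

Compiled and run, it prints utime=0 stime=1, which is exactly the
scenario in the diagram.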
>
> Fix this by using a nanosec clock source instead of jiffies. The
> cputime is then accumulated and flushed every time the pending delta
> reaches a jiffy, in order to mitigate the accounting overhead.
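
And a matching user-space sketch of that accumulate-and-flush scheme,
again illustrative only: TICK_NSEC is hard-coded for a 250 HZ config and
the mock helper merely stands in for account_system_time(). Deltas from
the nanosecond clock pile up in a pending counter and are only handed to
the accounting code once a full jiffy's worth has built up, which keeps
the per-context-switch overhead low:

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 4000000ULL          /* one jiffy at 250 HZ, for illustration only */

static uint64_t pending_stime;        /* nsecs accumulated but not yet flushed */
static uint64_t accounted_stime;      /* what account_system_time() would have received */

/* stand-in for account_system_time() */
static void account_system_time_mock(uint64_t nsecs)
{
        accounted_stime += nsecs;
}

/* called at every context boundary with the nsec delta since the last snapshot */
static void vtime_account_system_sketch(uint64_t delta)
{
        pending_stime += delta;
        if (pending_stime >= TICK_NSEC) {
                account_system_time_mock(pending_stime);
                pending_stime = 0;
        }
}

int main(void)
{
        int i;

        /* ten 0.5 ms system-mode slices: only one flush, the rest stays pending */
        for (i = 0; i < 10; i++)
                vtime_account_system_sketch(500000);

        printf("accounted=%llu ns, pending=%llu ns\n",
               (unsigned long long)accounted_stime,
               (unsigned long long)pending_stime);
        return 0;
}

With ten 0.5 ms system-mode slices it prints accounted=4000000 ns,
pending=1000000 ns: only one flush happens, and the remainder stays
pending, still visible to readers through the new per-task fields.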
>
> [fweisbec: changelog, rebase on struct vtime, field renames, add delta
> on cputime readers, keep idle vtime as-is (low overhead accounting),
> harmonize clock sources]
>
> Reported-by: Luiz Capitulino <lcapitulino@...hat.com>
> Suggested-by: Thomas Gleixner <tglx@...utronix.de>
> Not-Yet-Signed-off-by: Wanpeng Li <kernellwp@...il.com>
Signed-off-by: Wanpeng Li <wanpeng.li@...mail.com>

Thanks for the patchset, Frederic! :)

Regards,
Wanpeng Li
> Cc: Rik van Riel <riel@...hat.com>
> Cc: Peter Zijlstra <peterz@...radead.org>
> Cc: Thomas Gleixner <tglx@...utronix.de>
> Cc: Wanpeng Li <kernellwp@...il.com>
> Cc: Ingo Molnar <mingo@...nel.org>
> Cc: Luiz Capitulino <lcapitulino@...hat.com>
> Signed-off-by: Frederic Weisbecker <fweisbec@...il.com>
> ---
> include/linux/sched.h | 3 +++
> kernel/sched/cputime.c | 64 +++++++++++++++++++++++++++++++++-----------------
> 2 files changed, 45 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4a48c3f..85c5cb2 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -236,6 +236,9 @@ struct vtime {
>          seqcount_t              seqcount;
>          unsigned long long      starttime;
>          enum vtime_state        state;
> +        u64                     utime;
> +        u64                     stime;
> +        u64                     gtime;
>  };
>
>  struct sched_info {
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index b28d312..3dafea5 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>  static u64 vtime_delta(struct vtime *vtime)
>  {
> -        unsigned long now = READ_ONCE(jiffies);
> +        unsigned long long clock;
>
> -        if (time_before(now, (unsigned long)vtime->starttime))
> +        clock = sched_clock_cpu(smp_processor_id());
> +        if (clock < vtime->starttime)
>                  return 0;
>
> -        return jiffies_to_nsecs(now - vtime->starttime);
> +        return clock - vtime->starttime;
>  }
>
>  static u64 get_vtime_delta(struct vtime *vtime)
>  {
> -        unsigned long now = READ_ONCE(jiffies);
> -        u64 delta, other;
> +        u64 delta = vtime_delta(vtime);
> +        u64 other;
>
>          /*
>           * Unlike tick based timing, vtime based timing never has lost
> @@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime)
>           * elapsed time. Limit account_other_time to prevent rounding
>           * errors from causing elapsed vtime to go negative.
>           */
> -        delta = jiffies_to_nsecs(now - vtime->starttime);
>          other = account_other_time(delta);
>          WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
> -        vtime->starttime = now;
> +        vtime->starttime += delta;
>
>          return delta - other;
>  }
>
> -static void __vtime_account_system(struct task_struct *tsk)
> +static void __vtime_account_system(struct task_struct *tsk,
> +                                   struct vtime *vtime)
>  {
> -        account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
> +        vtime->stime += get_vtime_delta(vtime);
> +        if (vtime->stime >= TICK_NSEC) {
> +                account_system_time(tsk, irq_count(), vtime->stime);
> +                vtime->stime = 0;
> +        }
> +}
> +
> +static void vtime_account_guest(struct task_struct *tsk,
> +                                struct vtime *vtime)
> +{
> +        vtime->gtime += get_vtime_delta(vtime);
> +        if (vtime->gtime >= TICK_NSEC) {
> +                account_guest_time(tsk, vtime->gtime);
> +                vtime->gtime = 0;
> +        }
>  }
>
>  void vtime_account_system(struct task_struct *tsk)
> @@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk)
>                  return;
>
>          write_seqcount_begin(&vtime->seqcount);
> -        __vtime_account_system(tsk);
> +        /* We might have scheduled out from guest path */
> +        if (current->flags & PF_VCPU)
> +                vtime_account_guest(tsk, vtime);
> +        else
> +                __vtime_account_system(tsk, vtime);
>          write_seqcount_end(&vtime->seqcount);
>  }
>
> @@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk)
>          struct vtime *vtime = &tsk->vtime;
>
>          write_seqcount_begin(&vtime->seqcount);
> -        if (vtime_delta(vtime))
> -                __vtime_account_system(tsk);
> +        __vtime_account_system(tsk, vtime);
>          vtime->state = VTIME_USER;
>          write_seqcount_end(&vtime->seqcount);
>  }
> @@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk)
>          struct vtime *vtime = &tsk->vtime;
>
>          write_seqcount_begin(&vtime->seqcount);
> -        if (vtime_delta(vtime))
> -                account_user_time(tsk, get_vtime_delta(vtime));
> +        vtime->utime += get_vtime_delta(vtime);
> +        if (vtime->utime >= TICK_NSEC) {
> +                account_user_time(tsk, vtime->utime);
> +                vtime->utime = 0;
> +        }
>          vtime->state = VTIME_SYS;
>          write_seqcount_end(&vtime->seqcount);
>  }
> @@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk)
>           * that can thus safely catch up with a tickless delta.
>           */
>          write_seqcount_begin(&vtime->seqcount);
> -        if (vtime_delta(vtime))
> -                __vtime_account_system(tsk);
> +        __vtime_account_system(tsk, vtime);
>          current->flags |= PF_VCPU;
>          write_seqcount_end(&vtime->seqcount);
>  }
> @@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk)
>          struct vtime *vtime = &tsk->vtime;
>
>          write_seqcount_begin(&vtime->seqcount);
> -        __vtime_account_system(tsk);
> +        vtime_account_guest(tsk, vtime);
>          current->flags &= ~PF_VCPU;
>          write_seqcount_end(&vtime->seqcount);
>  }
> @@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
>
>          write_seqcount_begin(&vtime->seqcount);
>          vtime->state = VTIME_SYS;
> -        vtime->starttime = jiffies;
> +        vtime->starttime = sched_clock_cpu(smp_processor_id());
>          write_seqcount_end(&vtime->seqcount);
>  }
>
> @@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
>          local_irq_save(flags);
>          write_seqcount_begin(&vtime->seqcount);
>          vtime->state = VTIME_SYS;
> -        vtime->starttime = jiffies;
> +        vtime->starttime = sched_clock_cpu(cpu);
>          write_seqcount_end(&vtime->seqcount);
>          local_irq_restore(flags);
>  }
> @@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t)
>
>                  gtime = t->gtime;
>                  if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
> -                        gtime += vtime_delta(vtime);
> +                        gtime += vtime->gtime + vtime_delta(vtime);
>
>          } while (read_seqcount_retry(&vtime->seqcount, seq));
>
> @@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
>                   * the right place.
>                   */
>                  if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
> -                        *utime += delta;
> +                        *utime += vtime->utime + delta;
>                  else if (vtime->state == VTIME_SYS)
> -                        *stime += delta;
> +                        *stime += vtime->stime + delta;
>          } while (read_seqcount_retry(&vtime->seqcount, seq));
>  }
>  #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
> --
> 2.7.4
>