From: Frederic Weisbecker If we are not running the tick, we no longer account the user/system cputime on every jiffy. To solve this, save a snapshot of the jiffies when we stop the tick and keep track of where we saved it: user or system. On top of this, we account the cputime elapsed when we cross the kernel entry/exit boundaries and when we restart the tick. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- include/linux/tick.h | 12 +++++ kernel/sched/core.c | 1 + kernel/time/tick-sched.c | 129 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 140 insertions(+), 2 deletions(-) diff --git a/include/linux/tick.h b/include/linux/tick.h index 03b6edd..598b492 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -153,11 +153,23 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ #ifdef CONFIG_CPUSETS_NO_HZ +extern void tick_nohz_enter_kernel(void); +extern void tick_nohz_exit_kernel(void); +extern void tick_nohz_enter_exception(struct pt_regs *regs); +extern void tick_nohz_exit_exception(struct pt_regs *regs); extern void tick_nohz_check_adaptive(void); +extern void tick_nohz_pre_schedule(void); extern void tick_nohz_post_schedule(void); +extern bool tick_nohz_account_tick(void); #else /* !CPUSETS_NO_HZ */ +static inline void tick_nohz_enter_kernel(void) { } +static inline void tick_nohz_exit_kernel(void) { } +static inline void tick_nohz_enter_exception(struct pt_regs *regs) { } +static inline void tick_nohz_exit_exception(struct pt_regs *regs) { } static inline void tick_nohz_check_adaptive(void) { } +static inline void 
tick_nohz_pre_schedule(void) { } static inline void tick_nohz_post_schedule(void) { } +static inline bool tick_nohz_account_tick(void) { return false; } #endif /* CPUSETS_NO_HZ */ #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7b35eda..bebea17 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1771,6 +1771,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); + tick_nohz_pre_schedule(); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b8f3757..de8ba59 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -532,7 +532,13 @@ static bool can_stop_adaptive_tick(void) static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts) { + struct pt_regs *regs = get_irq_regs(); int cpu = smp_processor_id(); + int was_stopped; + int user = 0; + + if (regs) + user = user_mode(regs); if (!cpuset_adaptive_nohz() || is_idle_task(current)) return; @@ -543,7 +549,36 @@ static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts) if (!can_stop_adaptive_tick()) return; + /* + * If we stop the tick between the syscall exit hook and the actual + * return to userspace, we'll think we are in system space (due to + * user_mode() thinking so). And since we passed the syscall exit hook + * already we won't realize we are in userspace. So the time spent + * tickless would be spuriously accounted as belonging to system. + * + * To avoid this kind of problem, we only stop the tick from userspace + * (until we find a better solution). + * We can later enter the kernel and keep the tick stopped. But the place + * where we stop the tick must be userspace. + * We make an exception for kernel threads since they always execute in + * kernel space. 
+ */ + if (!user && current->mm) + return; + + was_stopped = ts->tick_stopped; tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + + if (!was_stopped && ts->tick_stopped) { + WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_NONE); + if (user) + ts->saved_jiffies_whence = JIFFIES_SAVED_USER; + else if (!current->mm) + ts->saved_jiffies_whence = JIFFIES_SAVED_SYS; + + ts->saved_jiffies = jiffies; + set_thread_flag(TIF_NOHZ); + } } #else static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts) { } @@ -871,6 +906,68 @@ void tick_check_idle(int cpu) } #ifdef CONFIG_CPUSETS_NO_HZ +void tick_nohz_exit_kernel(void) +{ + unsigned long flags; + struct tick_sched *ts; + unsigned long delta_jiffies; + + if (!test_thread_flag(TIF_NOHZ)) + return; + + local_irq_save(flags); + + ts = &__get_cpu_var(tick_cpu_sched); + + WARN_ON_ONCE(!ts->tick_stopped); + WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_SYS); + + delta_jiffies = jiffies - ts->saved_jiffies; + account_system_ticks(current, delta_jiffies); + + ts->saved_jiffies = jiffies; + ts->saved_jiffies_whence = JIFFIES_SAVED_USER; + + local_irq_restore(flags); +} + +void tick_nohz_enter_kernel(void) +{ + unsigned long flags; + struct tick_sched *ts; + unsigned long delta_jiffies; + + if (!test_thread_flag(TIF_NOHZ)) + return; + + local_irq_save(flags); + + ts = &__get_cpu_var(tick_cpu_sched); + + WARN_ON_ONCE(!ts->tick_stopped); + WARN_ON_ONCE(ts->saved_jiffies_whence != JIFFIES_SAVED_USER); + + delta_jiffies = jiffies - ts->saved_jiffies; + account_user_ticks(current, delta_jiffies); + + ts->saved_jiffies = jiffies; + ts->saved_jiffies_whence = JIFFIES_SAVED_SYS; + + local_irq_restore(flags); +} + +void tick_nohz_enter_exception(struct pt_regs *regs) +{ + if (user_mode(regs)) + tick_nohz_enter_kernel(); +} + +void tick_nohz_exit_exception(struct pt_regs *regs) +{ + if (user_mode(regs)) + tick_nohz_exit_kernel(); +} + /* * Take the timer duty if nobody is taking care of it. 
* If a CPU already does and and it's in a nohz cpuset, @@ -889,6 +986,15 @@ static void tick_do_timer_check_handler(int cpu) } } +static void tick_nohz_restart_adaptive(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + tick_nohz_account_ticks(ts); + tick_nohz_restart_sched_tick(); + clear_thread_flag(TIF_NOHZ); +} + void tick_nohz_check_adaptive(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); @@ -896,7 +1002,7 @@ void tick_nohz_check_adaptive(void) if (cpuset_adaptive_nohz()) { if (ts->tick_stopped && !is_idle_task(current)) { if (!can_stop_adaptive_tick()) - tick_nohz_restart_sched_tick(); + tick_nohz_restart_adaptive(); } } } @@ -909,6 +1015,26 @@ void cpuset_exit_nohz_interrupt(void *unused) tick_nohz_restart_adaptive(); } +/* + * Flush cputime and clear hooks before context switch in case we + * haven't yet received the IPI that should take care of that. + */ +void tick_nohz_pre_schedule(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + /* + * We are holding the rq lock and if we restart the tick now + * we could deadlock by acquiring the lock twice. Instead + * we do that on post schedule time. For now do the cleanups + * on the prev task. + */ + if (test_thread_flag(TIF_NOHZ)) { + tick_nohz_account_ticks(ts); + clear_thread_flag(TIF_NOHZ); + } +} + void tick_nohz_post_schedule(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); @@ -921,7 +1047,6 @@ void tick_nohz_post_schedule(void) if (ts->tick_stopped) tick_nohz_restart_sched_tick(); } - #else static void tick_do_timer_check_handler(int cpu) -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/