Subject: sched: Replace rt_avg
From: Peter Zijlstra
Date: Mon Aug 7 18:26:25 CEST 2017

Now that we have separate IRQ, RT and DL utilization tracking, we can
fully replace the old rt_avg code, reducing the amount of statistics.

Signed-off-by: Peter Zijlstra (Intel)
---
 include/linux/sched/sysctl.h |    1 -
 kernel/sched/core.c          |   40 ++--------------------------------------
 kernel/sched/deadline.c      |    2 --
 kernel/sched/fair.c          |   21 +++++----------------
 kernel/sched/rt.c            |    2 --
 kernel/sched/sched.h         |   26 +-------------------------
 kernel/sysctl.c              |    7 -------
 7 files changed, 8 insertions(+), 91 deletions(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,7 +39,6 @@ extern unsigned int sysctl_numa_balancin
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -62,14 +62,6 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
  * period over which we measure -rt task CPU usage in us.
  * default: 1s
  */
@@ -161,7 +153,7 @@ static void update_rq_clock_task(struct
 {
 /*
  * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
+ * to update_irq_load_avg(). But I don't trust it...
  */
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 	s64 steal = 0, irq_delta = 0;
@@ -206,10 +198,8 @@ static void update_rq_clock_task(struct
 	rq->clock_task += delta;
 
 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
-		sched_rt_avg_update(rq, irq_delta + steal);
+	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq->clock, cpu_of(rq), rq, 1);
-	}
 #endif
 }
 
@@ -673,23 +663,6 @@ bool sched_can_stop_tick(struct rq *rq)
 	return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
-
-void sched_avg_update(struct rq *rq)
-{
-	s64 period = sched_avg_period();
-
-	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (rq->age_stamp));
-		rq->age_stamp += period;
-		rq->rt_avg /= 2;
-	}
-}
-
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -5515,13 +5488,6 @@ void set_rq_offline(struct rq *rq)
 	}
 }
 
-static void set_cpu_rq_start_time(unsigned int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	rq->age_stamp = sched_clock_cpu(cpu);
-}
-
 /*
  * used to mark begin/end of suspend/resume:
  */
@@ -5640,7 +5606,6 @@ static void sched_rq_cpu_starting(unsign
 
 int sched_cpu_starting(unsigned int cpu)
 {
-	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
 	return 0;
 }
@@ -5919,7 +5884,6 @@ void __init sched_init(void)
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
-	set_cpu_rq_start_time(smp_processor_id());
 #endif
 	init_sched_fair_class();
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1147,8 +1147,6 @@ static void update_curr_dl(struct rq *rq
 	curr->se.exec_start = rq_clock_task(rq);
 	cpuacct_charge(curr, delta_exec);
 
-	sched_rt_avg_update(rq, delta_exec);
-
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
 		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
 	dl_se->runtime -= delta_exec;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5151,8 +5151,6 @@ static void cpu_load_update(struct rq *t
 
 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 	}
-
-	sched_avg_update(this_rq);
 }
 
 /* Used instead of source_load when we know the type == 0 */
@@ -7134,23 +7132,14 @@ static inline int get_sd_load_idx(struct
 static unsigned long scale_rt_capacity(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	u64 total, used, age_stamp, avg;
-	s64 delta;
+	unsigned long used;
 
 	/*
-	 * Since we're reading these variables without serialization make sure
-	 * we read them once before doing sanity checks on them.
+	 * Subtract IRQ, RT and DL time as all of those preempt CFS.
 	 */
-	age_stamp = READ_ONCE(rq->age_stamp);
-	avg = READ_ONCE(rq->rt_avg);
-	delta = __rq_clock_broken(rq) - age_stamp;
-
-	if (unlikely(delta < 0))
-		delta = 0;
-
-	total = sched_avg_period() + delta;
-
-	used = div_u64(avg, total);
+	used = READ_ONCE(rq->irq_avg.util_avg) +
+	       READ_ONCE(rq->rt.avg.util_avg) +
+	       READ_ONCE(rq->dl.avg.util_avg);
 
 	if (likely(used < SCHED_CAPACITY_SCALE))
 		return SCHED_CAPACITY_SCALE - used;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -981,8 +981,6 @@ static void update_curr_rt(struct rq *rq
 	curr->se.exec_start = rq_clock_task(rq);
 	cpuacct_charge(curr, delta_exec);
 
-	sched_rt_avg_update(rq, delta_exec);
-
 	if (!rt_bandwidth_enabled())
 		return;
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -750,8 +750,6 @@ struct rq {
 
 	struct sched_avg irq_avg;
 
-	u64 rt_avg;
-	u64 age_stamp;
 	u64 idle_stamp;
 	u64 avg_idle;
 
@@ -843,11 +841,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		raw_cpu_ptr(&runqueues)
 
-static inline u64 __rq_clock_broken(struct rq *rq)
-{
-	return READ_ONCE(rq->clock);
-}
-
 /*
  * rq::clock_update_flags bits
  *
@@ -1621,15 +1614,9 @@ extern void deactivate_task(struct rq *r
 
 extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 
-extern const_debug unsigned int sysctl_sched_time_avg;
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
 
-static inline u64 sched_avg_period(void)
-{
-	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 
 /*
@@ -1658,8 +1645,6 @@ static inline int hrtick_enabled(struct
 #endif /* CONFIG_SCHED_HRTICK */
 
 #ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
 #ifndef arch_scale_freq_capacity
 static __always_inline
 unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
@@ -1678,16 +1663,7 @@ unsigned long arch_scale_cpu_capacity(st
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
-
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
-	sched_avg_update(rq);
-}
-#else
-static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
-static inline void sched_avg_update(struct rq *rq) { }
-#endif
+#endif /* CONFIG_SMP */
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock);
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -362,13 +362,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "sched_time_avg_ms",
-		.data		= &sysctl_sched_time_avg,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
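
For readers not following the whole series, here is a minimal standalone
sketch of the replacement capacity calculation (illustrative only; the
helper name, the sample utilization numbers and the return-1 floor are
stand-ins for what scale_rt_capacity() reads from the rq):

/*
 * Illustrative sketch, not kernel code: capacity left for CFS is the full
 * capacity scale minus the IRQ, RT and DL utilization that preempts it.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

/* stand-ins for rq->irq_avg.util_avg, rq->rt.avg.util_avg, rq->dl.avg.util_avg */
static unsigned long sketch_scale_rt_capacity(unsigned long irq_util,
					      unsigned long rt_util,
					      unsigned long dl_util)
{
	unsigned long used = irq_util + rt_util + dl_util;

	if (used < SCHED_CAPACITY_SCALE)
		return SCHED_CAPACITY_SCALE - used;

	return 1;	/* assumed floor, mirroring the existing fallback */
}

int main(void)
{
	/* ~12% IRQ, 25% RT and ~6% DL of a 1024-unit CPU leaves 576 for CFS */
	printf("capacity left for CFS: %lu\n",
	       sketch_scale_rt_capacity(128, 256, 64));
	return 0;
}

Compared to the old code there is no sched_avg_period()/div_u64() step any
more; the per-class PELT util_avg signals are already scaled against
SCHED_CAPACITY_SCALE, so they can simply be summed and subtracted.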