Steven asked for per-group periods in order to get closer to rate-monotonic (RMA)
or EDF scheduling. Use the fancy new hrtimers to give each group its own period.
(A small userspace sketch of the per-period replenishment scheme follows the
patch.)

Signed-off-by: Peter Zijlstra
---
 include/linux/sched.h    |    2 
 kernel/sched.c           |  225 +++++++++++++++++++++++++++++++++++++++++------
 kernel/sched_rt.c        |   61 ++++++------
 kernel/sysctl.c          |    2 
 kernel/time/tick-sched.c |    5 -
 5 files changed, 232 insertions(+), 63 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
 	struct rt_rq **rt_rq;
 
 	unsigned int rt_ratio;
+	ktime_t rt_period;
 
 	/*
 	 * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
 	int rt_throttled;
 	u64 rt_time;
+	struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
-	u64 rt_period_expire;
-	int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 delta;
-
-	if (!rq->rt_throttled)
-		return 0;
-
-	if (rq->clock > rq->rt_period_expire)
-		return 1;
-
-	delta = rq->rt_period_expire - rq->clock;
-	do_div(delta, NSEC_PER_SEC / HZ);
-
-	return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
 */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT	16
 #define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
@@ -664,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
 * default: 95%
 */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 32768; /* XXX: was 62259 (95%) */
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	static const ktime_t ktime_zero = { .tv64 = 0 };
+	return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	update_sched_rt_period(rq);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,152 @@ static inline void sched_init_granularit
 	sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+	struct rt_rq *rt_rq =
+		container_of(timer, struct rt_rq, rt_period_timer);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+	ktime_t now = ktime_get();
+
+	WARN_ON(smp_processor_id() != cpu_of(rq));
+	WARN_ON(!in_irq());
+
+	spin_lock(&rq->lock);
+	update_sched_rt_period(rt_rq);
+	spin_unlock(&rq->lock);
+
+	hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+	return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+	ktime_t period = sched_rt_period(rt_rq);
+
+	WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+	for (;;) {
+		ktime_t now = ktime_get();
+		hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+		hrtimer_start(&rt_rq->rt_period_timer,
+				rt_rq->rt_period_timer.expires,
+				HRTIMER_MODE_ABS);
+		if (hrtimer_active(&rt_rq->rt_period_timer))
+			break;
+	}
+}
+
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+	hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct rt_rq *rt_rq;
+
+	for_each_leaf_rt_rq(rt_rq, rq)
+		sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+		unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		sched_rt_period_start_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		sched_rt_period_stop_cpu(cpu);
+		return NOTIFY_OK;
+
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+static void __init __sched_rt_period_init(void *arg)
+{
+	int cpu = smp_processor_id();
+	sched_rt_period_start_cpu(cpu);
+}
+
+static void __init sched_rt_period_init(void)
+{
+	on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
+	hotcpu_notifier(sched_rt_period_hotplug, 0);
+}
+
+static void __sched_rt_period_init_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_start(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
+}
+
+static void __sched_rt_period_destroy_tg(void *arg)
+{
+	struct task_group *tg = arg;
+	int cpu = smp_processor_id();
+
+	sched_rt_period_stop(tg->rt_rq[cpu]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
+}
+#else
+static void __init sched_rt_period_init(void)
+{
+	sched_rt_period_start_cpu(0);
+}
+
+static void sched_rt_period_init_tg(struct task_group *tg)
+{
+	sched_rt_period_start(tg->rt_rq[0]);
+}
+
+static void sched_rt_period_destroy_tg(struct task_group *tg)
+{
+	sched_rt_period_stop(tg->rt_rq[0]);
+}
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7068,6 +7202,7 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
+	sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (nr_cpu_ids == 1)
@@ -7088,6 +7223,7 @@ void __init sched_init_smp(void)
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
+	sched_rt_period_init();
 }
 #endif /* CONFIG_SMP */
 
@@ -7131,6 +7267,11 @@ static void init_rt_rq(struct rt_rq *rt_
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 
+	hrtimer_init(&rt_rq->rt_period_timer,
+			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rt_rq->rt_period_timer.function = sched_rt_period_timer;
+	rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	rt_rq->rq = rq;
 #endif
@@ -7201,6 +7342,8 @@ void __init sched_init(void)
 				&per_cpu(init_sched_entity, i), i, 1);
 
 		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		init_task_group.rt_period =
+			ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
@@ -7208,8 +7351,6 @@ void __init sched_init(void)
 		list_add(&init_task_group.list, &task_groups);
 #endif
 
-		rq->rt_period_expire = 0;
-		rq->rt_throttled = 0;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
@@ -7598,6 +7739,7 @@ struct task_group *sched_create_group(vo
 	tg->shares = NICE_0_LOAD;
 	tg->rt_ratio = 0; /* XXX */
+	tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
@@ -7637,6 +7779,8 @@ struct task_group *sched_create_group(vo
 	list_add_rcu(&tg->list, &task_groups);
 	unlock_task_group_list();
 
+	sched_rt_period_init_tg(tg);
+
 	return tg;
 
 err:
@@ -7658,6 +7802,8 @@ void sched_destroy_group(struct task_gro
 	struct rt_rq *rt_rq = NULL;
 	int i;
 
+	sched_rt_period_destroy_tg(tg);
+
 	lock_task_group_list();
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
@@ -7815,6 +7961,19 @@ unsigned long sched_group_rt_ratio(struc
 	return tg->rt_ratio;
 }
 
+int sched_group_set_rt_period(struct task_group *tg, unsigned long rt_period)
+{
+	tg->rt_period = ns_to_ktime((u64)rt_period * NSEC_PER_USEC);
+	return 0;
+}
+
+unsigned long sched_group_rt_period(struct task_group *tg)
+{
+	u64 ns = ktime_to_ns(tg->rt_period);
+	do_div(ns, NSEC_PER_USEC);
+	return ns;
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7903,6 +8062,17 @@ static u64 cpu_rt_ratio_read_uint(struct
 	return (u64) tg->rt_ratio;
 }
 
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_period_val)
+{
+	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_val);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	return (u64) sched_group_rt_period(cgroup_tg(cgrp));
+}
+
 static struct cftype cpu_files[] = {
 	{
 		.name = "shares",
@@ -7914,6 +8084,11 @@ static struct cftype cpu_files[] = {
 		.read_uint = cpu_rt_ratio_read_uint,
 		.write_uint = cpu_rt_ratio_write_uint,
 	},
+	{
+		.name = "rt_period_us",
+		.read_uint = cpu_rt_period_read_uint,
+		.write_uint = cpu_rt_period_write_uint,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
Index: linux-2.6/kernel/sched_rt.c
===================================================================
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -65,6 +65,17 @@ static inline unsigned int sched_rt_rati
 	return rt_rq->tg->rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	BUG_ON(!rt_rq->tg);
+	return rt_rq->tg->rt_period;
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return ktime_to_ns(sched_rt_period(rt_rq));
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
@@ -117,6 +128,16 @@ static inline unsigned int sched_rt_rati
 	return sysctl_sched_rt_ratio;
 }
 
+static inline ktime_t sched_rt_period(struct rt_rq *rt_rq)
+{
+	return ns_to_ktime((u64)sysctl_sched_rt_period * NSEC_PER_USEC);
+}
+
+static inline u64 sched_rt_period_ns(struct rt_rq *rt_rq)
+{
+	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
@@ -174,15 +195,11 @@ static int sched_rt_ratio_exceeded(struc
 	if (rt_rq->rt_throttled)
 		return 1;
 
-	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	period = sched_rt_period_ns(rt_rq);
 	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
 	if (rt_rq->rt_time > ratio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		rq->rt_throttled = 1;
 		rt_rq->rt_throttled = 1;
-		sched_rt_ratio_dequeue(rt_rq);
 		return 1;
 	}
 
@@ -190,27 +207,16 @@ static int sched_rt_ratio_exceeded(struc
 	return 0;
 }
 
-static void update_sched_rt_period(struct rq *rq)
+static void update_sched_rt_period(struct rt_rq *rt_rq)
 {
-	struct rt_rq *rt_rq;
-	u64 period;
-
-	while (rq->clock > rq->rt_period_expire) {
-		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-		rq->rt_period_expire += period;
-
-		for_each_leaf_rt_rq(rt_rq, rq) {
-			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-			if (rt_rq->rt_throttled) {
-				rt_rq->rt_throttled = 0;
-				sched_rt_ratio_enqueue(rt_rq);
-			}
-		}
-
-		rq->rt_throttled = 0;
+	u64 period = sched_rt_period_ns(rt_rq);
+	unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+	u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+	if (rt_rq->rt_throttled) {
+		rt_rq->rt_throttled = 0;
+		sched_rt_ratio_enqueue(rt_rq);
 	}
 }
 
@@ -238,11 +244,6 @@ static void update_curr_rt(struct rq *rq
 	cpuacct_charge(curr, delta_exec);
 
 	rt_rq->rt_time += delta_exec;
-	/*
-	 * might make it a tad more accurate:
-	 *
-	 * update_sched_rt_period(rq);
-	 */
 	if (sched_rt_ratio_exceeded(rt_rq))
 		resched_task(curr);
 }
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -311,7 +311,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "sched_rt_period_ms",
+		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
Index: linux-2.6/kernel/time/tick-sched.c
===================================================================
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,7 +153,6 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -217,10 +216,6 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 
-	rt_jiffies = rt_needs_cpu(cpu);
-	if (rt_jiffies && rt_jiffies < delta_jiffies)
-		delta_jiffies = rt_jiffies;
-
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 	/*
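
---

For anyone who wants to play with the replenishment scheme outside the kernel,
below is a minimal userspace sketch of what the per-rt_rq hrtimer above does: a
periodic CLOCK_MONOTONIC timer fires once per group period, refills the group's
-rt budget and lifts the throttle, the job that sched_rt_period_timer() and
update_sched_rt_period() do in the patch. It is purely illustrative, not kernel
code: timerfd stands in for hrtimers, and struct group, FRAC_SHIFT and
update_period() are made-up analogues of rt_rq, SCHED_RT_FRAC_SHIFT and
update_sched_rt_period(), with the values mirroring the patch defaults.

/* rt_period.c - userspace analogue of the per-group period timer.
 * Build: cc -std=c99 -o rt_period rt_period.c
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/timerfd.h>

#define FRAC_SHIFT 16			/* analogue of SCHED_RT_FRAC_SHIFT */

struct group {				/* analogue of struct rt_rq */
	uint64_t period_ns;		/* tg->rt_period */
	uint64_t ratio;			/* tg->rt_ratio; 1.0 == 1 << FRAC_SHIFT */
	uint64_t rt_time;		/* -rt time consumed this period */
	int throttled;
};

/* What update_sched_rt_period() does: refill the budget, lift the throttle. */
static void update_period(struct group *g)
{
	uint64_t budget = (g->period_ns * g->ratio) >> FRAC_SHIFT;

	g->rt_time -= (g->rt_time < budget) ? g->rt_time : budget;
	if (g->throttled) {
		g->throttled = 0;
		printf("unthrottled, rt_time now %llu ns\n",
				(unsigned long long)g->rt_time);
	}
}

int main(void)
{
	/* 1s period at ratio 32768/65536 (50%); pretend we overran already. */
	struct group g = {
		.period_ns = 1000000000ULL,
		.ratio = 32768,
		.rt_time = 900000000ULL,
		.throttled = 1,
	};
	/* Periodic timer: it_interval plays the role of hrtimer_forward(). */
	struct itimerspec its = {
		.it_value = { .tv_sec = 1 },
		.it_interval = { .tv_sec = 1 },
	};
	int fd = timerfd_create(CLOCK_MONOTONIC, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0)
		return 1;

	for (int i = 0; i < 3; i++) {
		uint64_t expired;

		/* read() blocks until the period expires, like the timer
		 * callback firing from hard irq context in the patch. */
		if (read(fd, &expired, sizeof(expired)) != sizeof(expired))
			return 1;
		update_period(&g);
	}
	return 0;
}

The point of the patch is visible even in this toy: because every group owns
its own timer, every group can run a different period (cpu.rt_period_us per
cgroup, sched_rt_period_us globally), which is exactly the degree of freedom
that RMA/EDF-style admission tests need.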