The current hierarchical RQ group scheduler suffers from a number of problems:

 - yield
 - wakeup preemption
 - sleeper fairness

All these problems are due to the isolation caused by the multiple RQ design.
They are caused by the fact that vruntime becomes a local property.

Solve this by returning to a single RQ model.

Signed-off-by: Peter Zijlstra
---
 kernel/sched.c      |    9 --
 kernel/sched_fair.c |  160 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 105 insertions(+), 64 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1197,6 +1197,9 @@ static void __resched_task(struct task_s
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1219,12 +1222,6 @@ calc_delta_mine(unsigned long delta_exec
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -238,12 +238,22 @@ static inline s64 entity_key(struct cfs_
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct rb_node **link;
 	struct rb_node *parent = NULL;
 	struct sched_entity *entry;
-	s64 key = entity_key(cfs_rq, se);
+	s64 key;
 	int leftmost = 1;
 
+	if (!entity_is_task(se))
+		return;
+
+	if (se == cfs_rq->curr)
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
+	link = &cfs_rq->tasks_timeline.rb_node;
+	key = entity_key(cfs_rq, se);
 	/*
 	 * Find the right place in the rbtree:
 	 */
@@ -275,6 +285,14 @@ static void __enqueue_entity(struct cfs_
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	if (!entity_is_task(se))
+		return;
+
+	if (se == cfs_rq->curr)
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
 	if (cfs_rq->rb_leftmost == &se->run_node)
 		cfs_rq->rb_leftmost = rb_next(&se->run_node);
@@ -358,6 +376,13 @@ static u64 sched_slice(struct cfs_rq *cf
 {
 	u64 slice = __sched_period(cfs_rq->nr_running);
 
+	/*
+	 * FIXME: curious 'hack' to make it boot - when the tick is started we
+	 * hit this with the init_task, which is not actually enqueued.
+	 */
+	if (unlikely(rq_of(cfs_rq)->load.weight <= se->load.weight))
+		goto out;
+
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 
@@ -365,37 +390,66 @@ static u64 sched_slice(struct cfs_rq *cf
 		do_div(slice, cfs_rq->load.weight);
 	}
 
+out:
 	return slice;
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
-	unsigned long weight;
-	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	vslice = __sched_period(nr_running);
+	return __sched_period(nr_running);
+}
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
 
-		weight = cfs_rq->load.weight;
-		if (!se->on_rq)
-			weight += se->load.weight;
+	return delta;
+}
 
-		vslice *= NICE_0_LOAD;
-		do_div(vslice, weight);
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *  -20         |
+ *              |
+ *    0 --------+-------
+ *            .'
+ *   19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
+
+	for_each_sched_entity(se) {
+		struct load_weight *se_lw = &se->load;
+
+		if (se->load.weight < NICE_0_LOAD)
+			se_lw = &lw;
+
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, se_lw);
 	}
 
-	return vslice;
+	return delta;
 }
 
 /*
@@ -413,13 +467,14 @@ __update_curr(struct cfs_rq *cfs_rq, str
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 
+	if (!entity_is_task(curr))
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
 	/*
 	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
 	 * value tracking the leftmost vruntime in the tree.
@@ -599,6 +654,11 @@ place_entity(struct cfs_rq *cfs_rq, stru
 {
 	u64 vruntime;
 
+	if (!entity_is_task(se))
+		return;
+
+	cfs_rq = &rq_of(cfs_rq)->cfs;
+
 	vruntime = cfs_rq->min_vruntime;
 
 	/*
@@ -613,7 +673,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
 		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+			vruntime -= calc_delta_asym(sysctl_sched_latency, se);
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -637,8 +697,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
 	update_stats_enqueue(cfs_rq, se);
 	check_spread(cfs_rq, se);
-	if (se != cfs_rq->curr)
-		__enqueue_entity(cfs_rq, se);
+	__enqueue_entity(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 }
 
@@ -664,8 +723,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
 #endif
 	}
 
-	if (se != cfs_rq->curr)
-		__dequeue_entity(cfs_rq, se);
+	__dequeue_entity(cfs_rq, se);
 	account_entity_dequeue(cfs_rq, se);
 }
 
@@ -694,6 +752,8 @@ set_next_entity(struct cfs_rq *cfs_rq, s
 		 * runqueue.
 		 */
 		update_stats_wait_end(cfs_rq, se);
+		if (WARN_ON_ONCE(cfs_rq->curr))
+			cfs_rq->curr = NULL;
 		__dequeue_entity(cfs_rq, se);
 	}
 
@@ -713,18 +773,6 @@ set_next_entity(struct cfs_rq *cfs_rq, s
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *se = NULL;
-
-	if (first_fair(cfs_rq)) {
-		se = __pick_next_entity(cfs_rq);
-		set_next_entity(cfs_rq, se);
-	}
-
-	return se;
-}
-
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
 	/*
@@ -735,12 +783,12 @@ static void put_prev_entity(struct cfs_r
 		update_curr(cfs_rq);
 
 	check_spread(cfs_rq, prev);
+	cfs_rq->curr = NULL;
 	if (prev->on_rq) {
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 	}
-	cfs_rq->curr = NULL;
 }
 
 static void
@@ -751,6 +799,9 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 	 */
 	update_curr(cfs_rq);
 
+	if (!entity_is_task(curr))
+		return;
+
 #ifdef CONFIG_SCHED_HRTICK
 	/*
 	 * queued ticks are scheduled to match the slice, so don't bother
@@ -766,7 +817,8 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 		return;
 #endif
 
-	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
+	if (rq_of(cfs_rq)->load.weight != curr->load.weight ||
+			!sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 }
 
@@ -891,7 +943,7 @@ static void yield_task_fair(struct rq *r
 	/*
 	 * Are we the only task in the tree?
 	 */
-	if (unlikely(cfs_rq->nr_running == 1))
+	if (unlikely(rq->load.weight == curr->se.load.weight))
 		return;
 
 	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
@@ -906,7 +958,7 @@ static void yield_task_fair(struct rq *r
 	/*
 	 * Find the rightmost entry in the rbtree:
 	 */
-	rightmost = __pick_last_entity(cfs_rq);
+	rightmost = __pick_last_entity(&rq->cfs);
 	/*
 	 * Already in the rightmost position?
 	 */
@@ -1068,7 +1120,6 @@ out_set_cpu:
 }
 #endif /* CONFIG_SMP */
 
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1095,19 +1146,11 @@ static void check_preempt_wakeup(struct
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	while (!is_same_group(se, pse)) {
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
-
-	gran = sysctl_sched_wakeup_granularity;
 	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
 	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
-
+	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
 	if (pse->vruntime + gran < se->vruntime)
 		resched_task(curr);
 }
@@ -1116,17 +1159,18 @@ static struct task_struct *pick_next_tas
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
-	struct sched_entity *se;
+	struct sched_entity *se, *next;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!first_fair(cfs_rq))
 		return NULL;
 
-	do {
-		se = pick_next_entity(cfs_rq);
-		cfs_rq = group_cfs_rq(se);
-	} while (cfs_rq);
+	next = se = __pick_next_entity(cfs_rq);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		set_next_entity(cfs_rq, se);
+	}
 
-	p = task_of(se);
+	p = task_of(next);
 	hrtick_start_fair(rq, p);
 
 	return p;
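A note on the reworked calc_delta_fair(): it no longer scales by NICE_0_LOAD/weight
but walks the entity hierarchy, applying delta *= cfs_rq->load.weight / se->load.weight
at every level, so the resulting vruntime increment reflects the task's effective share
of the one shared runqueue. The userspace sketch below mirrors only that arithmetic;
the helper names (sketch_calc_delta_fair, struct level), the two-level hierarchy and
all weights are made-up illustration values, and the kernel's fixed-point inv_weight
handling is deliberately left out.

#include <stdio.h>

/*
 * Plain-integer sketch of the reworked calc_delta_fair(): at every level of
 * the entity hierarchy the execution delta is scaled by
 *
 *	delta = delta * queue_weight / entity_weight
 *
 * so that the resulting vruntime increment is comparable across the single
 * runqueue.  All names and numbers here are example values, not kernel ones.
 */
struct level {
	unsigned long queue_weight;	/* load.weight of the cfs_rq at this level */
	unsigned long entity_weight;	/* load.weight of the entity queued on it */
};

static unsigned long long
sketch_calc_delta_fair(unsigned long long delta, const struct level *lvl, int n)
{
	int i;

	for (i = 0; i < n; i++)		/* walk from the task up to the root */
		delta = delta * lvl[i].queue_weight / lvl[i].entity_weight;

	return delta;
}

int main(void)
{
	/* a nice-0 task (1024) inside a group entity of weight 2048 ... */
	struct level task_in_group[] = {
		{ .queue_weight = 2048, .entity_weight = 1024 },	/* group's cfs_rq */
		{ .queue_weight = 3072, .entity_weight = 2048 },	/* root cfs_rq    */
	};
	/* ... versus a plain nice-0 task directly on the root runqueue */
	struct level task_on_root[] = {
		{ .queue_weight = 3072, .entity_weight = 1024 },
	};
	unsigned long long delta = 1000000;	/* 1ms of execution time, in ns */

	printf("grouped task: vruntime += %llu\n",
	       sketch_calc_delta_fair(delta, task_in_group, 2));
	printf("root task   : vruntime += %llu\n",
	       sketch_calc_delta_fair(delta, task_on_root, 1));
	return 0;
}

With these example numbers both tasks own a third of the total load (1024/3072
directly, and half of a group that itself owns 2048/3072), so both printfs report
the same weighted delta (3000000). That is the point of the single-RQ model: vruntime
advances at a comparable rate no matter how deep the entity sits, which is what lets
yield, wakeup preemption and sleeper placement compare vruntimes across groups again.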
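Similarly, the asymmetry in calc_delta_asym() comes from clamping the divisor: the
entity's own weight is used when it is at least NICE_0_LOAD, otherwise NICE_0_LOAD is
substituted, so weights above nice-0 shrink the converted delta while weights below it
do not enlarge it. A self-contained sketch of that clamp for a single hierarchy level
follows; the function name sketch_calc_delta_asym and the runqueue weight are invented
for illustration, while 3121 and 335 are roughly the prio_to_weight[] entries for
nice -5 and nice +5.

#include <stdio.h>

#define NICE_0_LOAD	1024ULL

/*
 * Sketch of the clamp that makes calc_delta_asym() asymmetric around
 * NICE_0_LOAD, reduced to a single hierarchy level and plain integers.
 */
static unsigned long long
sketch_calc_delta_asym(unsigned long long delta,
		       unsigned long long queue_weight,
		       unsigned long long se_weight)
{
	unsigned long long divisor = se_weight;

	/* lighter-than-nice-0 entities are treated as if they were nice-0 */
	if (se_weight < NICE_0_LOAD)
		divisor = NICE_0_LOAD;

	return delta * queue_weight / divisor;
}

int main(void)
{
	unsigned long long gran = 10000000;	/* 10ms wakeup granularity, in ns */
	unsigned long long queue_weight = 4480;	/* 3121 + 1024 + 335 */

	printf("nice -5 (w=3121): gran -> %llu\n",
	       sketch_calc_delta_asym(gran, queue_weight, 3121));
	printf("nice  0 (w=1024): gran -> %llu\n",
	       sketch_calc_delta_asym(gran, queue_weight, 1024));
	printf("nice +5 (w= 335): gran -> %llu\n",
	       sketch_calc_delta_asym(gran, queue_weight, 335));
	return 0;
}

The nice -5 entity ends up with a smaller converted granularity than the nice-0 one,
while the nice +5 entity gets exactly the nice-0 value rather than a proportionally
larger one; that is the ">=0 over <0" preference described by the comment block added
in the patch.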