By its very nature we try to converge vruntime between tasks. This makes it
very hard to interleave the groups that have varying latency requirements;
they end up in a single 'lump'. Avoid this by introducing an artificial
vruntime offset:

  A1 |--------------|
    A2 |--------------|
      A3 |--------------|

  New            |--------------|

Because tasks have no stable (dense) enumeration within a group, and we'd
want the tasks evenly spaced within the period in a regular fashion, we use
an ascending iterator (nr_iter).

This ensures that in a steady state, each task will have the same offset.

However, when a new task gets inserted we cannot re-adjust all offsets, hence
we will approximate by inserting the new task at p(1-1/n). This is why
account_entity_enqueue() sets nr_iter to nr_running-1.

Signed-off-by: Peter Zijlstra
---
 include/linux/sched.h |    1 
 kernel/sched.c        |    2 +
 kernel/sched_fair.c   |   56 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 55 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -923,6 +923,7 @@ struct sched_entity {
         u64                     exec_start;
         u64                     sum_exec_runtime;
         u64                     vruntime;
+        u64                     voffset;
         u64                     prev_sum_exec_runtime;
 
 #ifdef CONFIG_SCHEDSTATS
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -220,9 +220,11 @@ static inline u64 min_vruntime(u64 min_v
 
 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        return se->vruntime - cfs_rq->min_vruntime;
+        return se->vruntime + se->voffset - cfs_rq->min_vruntime;
 }
 
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -240,6 +242,8 @@ static void __enqueue_entity(struct cfs_
         if (se == cfs_rq->curr)
                 return;
 
+        se->voffset = sched_voffset(cfs_rq, se);
+
         cfs_rq = &rq_of(cfs_rq)->cfs;
 
         link = &cfs_rq->tasks_timeline.rb_node;
@@ -387,7 +391,7 @@ out:
  *
  * vs = s*rw/w = p
  */
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         unsigned long nr_running = cfs_rq->nr_running;
 
@@ -397,6 +401,46 @@ static u64 sched_vslice_add(struct cfs_r
         return __sched_period(nr_running);
 }
 
+/*
+ * By its very nature we try to converge vruntime between tasks. This makes it
+ * very hard to interleave the groups that have varying latency requirements,
+ * they end up in a single 'lump'. Avoid this by introducing an artificial
+ * vruntime offset:
+ *
+ *   A1 |--------------|
+ *     A2 |--------------|
+ *       A3 |--------------|
+ *
+ * New            |--------------|
+ *
+ * Because tasks have no stable (dense) enumeration within a group [*]
+ * and we'd want the tasks evenly spaced within the period in a regular
+ * fashion, we use an ascending iterator (nr_iter).
+ *
+ * This ensures that in a steady state, each task will have the same offset
+ *
+ * However, when a new task gets inserted we cannot re-adjust all offsets,
+ * hence we will approximate by inserting the new task at p(1-1/n).
+ * This is why account_entity_enqueue() sets nr_iter to nr_running-1.
+ *
+ * [*] it would be possible to arrange for one, but it seems unnecessarily
+ *     complex, esp. as we still can't re-adjust all tasks on insert.
+ */
+static u64 sched_voffset(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+        /*
+         * approximate p/n with min_granularity
+         */
+        u64 frac = sysctl_sched_min_granularity;
+
+        frac *= cfs_rq->nr_iter;
+        cfs_rq->nr_iter++;
+        if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+                cfs_rq->nr_iter = 0;
+
+        return frac;
+}
+
 static inline unsigned long
 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 {
@@ -564,6 +608,7 @@ static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         update_load_add(&cfs_rq->load, se->load.weight);
+        cfs_rq->nr_iter = cfs_rq->nr_running;
         cfs_rq->nr_running++;
         se->on_rq = 1;
         list_add(&se->group_node, &cfs_rq->tasks);
@@ -574,6 +619,8 @@ account_entity_dequeue(struct cfs_rq *cf
 {
         update_load_sub(&cfs_rq->load, se->load.weight);
         cfs_rq->nr_running--;
+        if (cfs_rq->nr_iter >= cfs_rq->nr_running)
+                cfs_rq->nr_iter = 0;
         se->on_rq = 0;
         list_del_init(&se->group_node);
 }
@@ -656,7 +703,7 @@ place_entity(struct cfs_rq *cfs_rq, stru
          * stays open at the end.
          */
         if (initial && sched_feat(START_DEBIT))
-                vruntime += sched_vslice_add(cfs_rq, se);
+                vruntime += sched_vslice(cfs_rq, se);
 
         if (!initial) {
                 /* sleeps upto a single latency don't count. */
@@ -678,6 +725,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
          */
         update_curr(cfs_rq);
 
+        account_entity_enqueue(cfs_rq, se);
+
         if (wakeup) {
                 place_entity(cfs_rq, se, 0);
                 enqueue_sleeper(cfs_rq, se);
@@ -686,7 +735,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
         update_stats_enqueue(cfs_rq, se);
         check_spread(cfs_rq, se);
         __enqueue_entity(cfs_rq, se);
-        account_entity_enqueue(cfs_rq, se);
 }
 
 static void
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -425,6 +425,8 @@ struct cfs_rq {
         struct load_weight load;
         unsigned long nr_running;
 
+        unsigned long nr_iter;
+
         u64 exec_clock;
         u64 min_vruntime;
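
For readers who want to see what the iterator does without booting a patched
kernel, here is a minimal stand-alone C sketch that mimics the nr_iter/voffset
bookkeeping from the patch. Everything in it is made up for illustration
(fake_cfs_rq, fake_voffset, fake_enqueue, NR_TASKS, and the 4 ms stand-in for
sysctl_sched_min_granularity); it is not kernel code and not part of the patch.

/*
 * Stand-alone sketch of the nr_iter offset scheme; illustration only.
 */
#include <stdio.h>

#define NR_TASKS        4
#define GRANULARITY_NS  4000000ULL  /* arbitrary stand-in for sysctl_sched_min_granularity */

struct fake_cfs_rq {
        unsigned long nr_running;
        unsigned long nr_iter;
};

/*
 * Mirrors sched_voffset(): hand out ascending multiples of the granularity
 * and wrap the iterator, so in a steady state each task keeps receiving the
 * same offset and the group stays spread across the period.
 */
static unsigned long long fake_voffset(struct fake_cfs_rq *cfs_rq)
{
        unsigned long long frac = GRANULARITY_NS;

        frac *= cfs_rq->nr_iter;
        cfs_rq->nr_iter++;
        if (cfs_rq->nr_iter >= cfs_rq->nr_running)
                cfs_rq->nr_iter = 0;

        return frac;
}

/*
 * Mirrors the account_entity_enqueue() hunk: a freshly added task points the
 * iterator at nr_running - 1 (relative to the new count), i.e. roughly
 * p(1 - 1/n) into the period.
 */
static void fake_enqueue(struct fake_cfs_rq *cfs_rq)
{
        cfs_rq->nr_iter = cfs_rq->nr_running;
        cfs_rq->nr_running++;
}

int main(void)
{
        struct fake_cfs_rq rq = { 0, 0 };
        int i;

        for (i = 0; i < NR_TASKS; i++)
                fake_enqueue(&rq);

        /* Two rounds of rb-tree insertions: the first offset lands near the
         * end of the period, after which the offsets cycle evenly. */
        for (i = 0; i < 2 * NR_TASKS; i++)
                printf("insert %d: voffset = %llu ns\n", i, fake_voffset(&rq));

        return 0;
}

With four tasks the first value comes out at 3x the granularity, the
approximate p(1-1/n) slot handed to the newest task, and subsequent values
cycle evenly through 0, 1x, 2x and 3x, which is the steady-state spacing the
changelog describes.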