Message-ID: <20230328110354.141543852@infradead.org>
Date: Tue, 28 Mar 2023 11:26:30 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: mingo@...nel.org, vincent.guittot@...aro.org
Cc: linux-kernel@...r.kernel.org, peterz@...radead.org,
juri.lelli@...hat.com, dietmar.eggemann@....com,
rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
bristot@...hat.com, corbet@....net, qyousef@...alina.io,
chris.hyser@...cle.com, patrick.bellasi@...bug.net, pjt@...gle.com,
pavel@....cz, qperret@...gle.com, tim.c.chen@...ux.intel.com,
joshdon@...gle.com, timj@....org, kprateek.nayak@....com,
yu.c.chen@...el.com, youssefesmat@...omium.org,
joel@...lfernandes.org, efault@....de
Subject: [PATCH 08/17] sched/fair: Implement an EEVDF like policy
Where CFS is currently a WFQ based scheduler with only a single knob,
the weight, the addition of a second, latency oriented parameter makes
something like a WF2Q or EEVDF based scheduler a much better fit.
Specifically, EEVDF does EDF-like scheduling in the left half of the
tree -- those entities that are owed service. Except that, because this
is a virtual time scheduler, the deadlines are in virtual time as well,
which is what allows over-subscription.
EEVDF has two parameters:
- weight, or time-slope; which is mapped to nice just as before
- relative deadline; which is related to slice length and mapped
to the new latency nice.
Basically, by setting a smaller slice, the deadline will be earlier
and the task will be more eligible and run earlier.
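As a rough illustration of that mapping -- a minimal userspace sketch,
not the kernel code; NICE_0_WEIGHT and the plain multiply/divide stand
in for NICE_0_LOAD and the fixed-point math of __calc_delta() -- the
virtual deadline is just the vruntime plus the request scaled by the
inverse weight, i.e. vd_i = ve_i + r_i/w_i:

  #include <stdint.h>

  #define NICE_0_WEIGHT 1024ULL  /* assumed stand-in for NICE_0_LOAD */

  /* vd_i = ve_i + r_i/w_i: a heavier entity gets a shorter virtual request */
  static uint64_t virtual_deadline(uint64_t vruntime, uint64_t slice_ns,
                                   uint64_t weight)
  {
          return vruntime + slice_ns * NICE_0_WEIGHT / weight;
  }

With a 3ms slice, a nice-0 task (weight 1024) gets its deadline 3ms of
virtual time out, while a task of twice the weight gets it 1.5ms out and
is therefore picked sooner for the same vruntime.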
Preemption (both tick and wakeup) is driven by testing against a fresh
pick. Because the tree is now effectively an interval tree, and the
selection is no longer 'leftmost', over-scheduling is less of a
problem.
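The pick itself can be illustrated with a flat array instead of the
augmented RB-tree used below; this sketch only demonstrates the two
criteria (eligibility, i.e. v_i <= V with V the weighted average of the
vruntimes, then earliest virtual deadline among the eligible) and every
name in it is made up for the example:

  #include <stddef.h>
  #include <stdint.h>

  struct toy_entity {
          uint64_t vruntime;      /* v_i  */
          uint64_t deadline;      /* vd_i */
          uint64_t weight;        /* w_i  */
  };

  static struct toy_entity *toy_pick_eevdf(struct toy_entity *se, size_t nr)
  {
          /* V = \Sum w_i*v_i / \Sum w_i; 128-bit to avoid overflow */
          __int128 num = 0;
          uint64_t den = 0;
          struct toy_entity *best = NULL;
          size_t i;

          for (i = 0; i < nr; i++) {
                  num += (__int128)se[i].weight * se[i].vruntime;
                  den += se[i].weight;
          }

          for (i = 0; i < nr; i++) {
                  /* eligible: lag = w_i*(V - v_i) >= 0  <=>  v_i*den <= num */
                  if ((__int128)se[i].vruntime * den > num)
                          continue;
                  /* earliest virtual deadline, wrap-safe comparison */
                  if (!best || (int64_t)(se[i].deadline - best->deadline) < 0)
                          best = &se[i];
          }
          return best;
  }

With two entities of equal weight where A's vruntime is ahead of B's, V
sits halfway between them, so A is ineligible and B gets picked even if
A's deadline happens to be earlier -- exactly the "EDF in the left half
of the tree" behaviour described above.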
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
include/linux/sched.h | 4
kernel/sched/debug.c | 6
kernel/sched/fair.c | 324 +++++++++++++++++++++++++++++++++++++++++-------
kernel/sched/features.h | 3
kernel/sched/sched.h | 1
5 files changed, 293 insertions(+), 45 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -548,6 +548,9 @@ struct sched_entity {
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+ u64 deadline;
+ u64 min_deadline;
+
struct list_head group_node;
unsigned int on_rq;
@@ -556,6 +559,7 @@ struct sched_entity {
u64 vruntime;
u64 prev_sum_exec_runtime;
s64 vlag;
+ u64 slice;
u64 nr_migrations;
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -535,9 +535,13 @@ print_task(struct seq_file *m, struct rq
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
+ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+ SPLIT_NS(p->se.deadline),
+ SPLIT_NS(p->se.slice),
+ SPLIT_NS(p->se.sum_exec_runtime),
(long long)(p->nvcsw + p->nivcsw),
p->prio);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
#include <linux/psi.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
#include <asm/switch_to.h>
@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec,
return mul_u64_u32_shr(delta_exec, fact, shift);
}
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+ return delta;
+}
const struct sched_class fair_sched_class;
@@ -691,11 +702,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
/*
* lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to double the slice length, with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ * -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
*/
void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ s64 lag, limit;
+
SCHED_WARN_ON(!se->on_rq);
- se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ *     \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ *          \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ * to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct sched_entity *curr = cfs_rq->curr;
+ s64 avg = cfs_rq->avg_vruntime;
+ long load = cfs_rq->avg_load;
+
+ if (curr && curr->on_rq) {
+ unsigned long weight = scale_load_down(curr->load.weight);
+
+ avg += entity_key(cfs_rq, curr) * weight;
+ load += weight;
+ }
+
+ return avg >= entity_key(cfs_rq, se) * load;
}
static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -714,8 +776,8 @@ static u64 __update_min_vruntime(struct
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
u64 vruntime = cfs_rq->min_vruntime;
@@ -726,9 +788,7 @@ static void update_min_vruntime(struct c
curr = NULL;
}
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se = __node_2_se(leftmost);
-
+ if (se) {
if (!curr)
vruntime = se->vruntime;
else
@@ -745,18 +805,50 @@ static inline bool __entity_less(struct
return entity_before(__node_2_se(a), __node_2_se(b));
}
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (deadline_gt(min_deadline, se, rse))
+ se->min_deadline = rse->min_deadline;
+ }
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+ u64 old_min_deadline = se->min_deadline;
+ struct rb_node *node = &se->run_node;
+
+ se->min_deadline = se->deadline;
+ __update_min_deadline(se, node->rb_right);
+ __update_min_deadline(se, node->rb_left);
+
+ return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+ run_node, min_deadline, min_deadline_update);
+
/*
* Enqueue an entity into the rb-tree:
*/
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+ se->min_deadline = se->deadline;
+ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ __entity_less, &min_deadline_cb);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+ &min_deadline_cb);
avg_vruntime_sub(cfs_rq, se);
}
@@ -780,6 +872,97 @@ static struct sched_entity *__pick_next_
return __node_2_se(next);
}
+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ return left;
+}
+
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ * 1) the task must be eligible (must be owed service)
+ *
+ * 2) from those tasks that meet 1), we select the one
+ * with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+ struct sched_entity *curr = cfs_rq->curr;
+ struct sched_entity *best = NULL;
+
+ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+ curr = NULL;
+
+ while (node) {
+ struct sched_entity *se = __node_2_se(node);
+
+ /*
+ * If this entity is not eligible, try the left subtree.
+ */
+ if (!entity_eligible(cfs_rq, se)) {
+ node = node->rb_left;
+ continue;
+ }
+
+ /*
+ * If this entity has an earlier deadline than the previous
+ * best, take this one. If it also has the earliest deadline
+ * of its subtree, we're done.
+ */
+ if (!best || deadline_gt(deadline, best, se)) {
+ best = se;
+ if (best->deadline == best->min_deadline)
+ break;
+ }
+
+ /*
+ * If the earliest deadline in this subtree is in the fully
+ * eligible left half of our space, go there.
+ */
+ if (node->rb_left &&
+ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+ node = node->rb_left;
+ continue;
+ }
+
+ node = node->rb_right;
+ }
+
+ if (!best || (curr && deadline_gt(deadline, best, curr)))
+ best = curr;
+
+ if (unlikely(!best)) {
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ if (left) {
+ pr_err("EEVDF scheduling fail, picking leftmost\n");
+ return left;
+ }
+ }
+
+ return best;
+}
+
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
@@ -822,17 +1005,6 @@ long calc_latency_offset(int prio)
}
/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
- if (unlikely(se->load.weight != NICE_0_LOAD))
- delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
- return delta;
-}
-
-/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sched_nr_latency) we have to stretch
@@ -897,6 +1069,38 @@ static u64 sched_slice(struct cfs_rq *cf
return slice;
}
+/*
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
+ */
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if ((s64)(se->vruntime - se->deadline) < 0)
+ return;
+
+ if (sched_feat(EEVDF)) {
+ /*
+ * For EEVDF the virtual time slope is determined by w_i (iow.
+ * nice) while the request time r_i is determined by
+ * latency-nice.
+ */
+ se->slice = se->latency_offset;
+ } else {
+ /*
+ * When many tasks blow up the sched_period; it is possible
+ * that sched_slice() reports unusually large results (when
+ * many tasks are very light for example). Therefore impose a
+ * maximum.
+ */
+ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
+ }
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i / w_i
+ */
+ se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+}
+
#include "pelt.h"
#ifdef CONFIG_SMP
@@ -1029,6 +1233,7 @@ static void update_curr(struct cfs_rq *c
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
+ update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
@@ -4796,6 +5001,7 @@ static inline bool entity_is_long_sleepe
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
+ u64 vslice = calc_delta_fair(se->slice, se);
u64 vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
@@ -4834,9 +5040,9 @@ place_entity(struct cfs_rq *cfs_rq, stru
*/
load = cfs_rq->avg_load;
if (curr && curr->on_rq)
- load += curr->load.weight;
+ load += scale_load_down(curr->load.weight);
- lag *= load + se->load.weight;
+ lag *= load + scale_load_down(se->load.weight);
if (WARN_ON_ONCE(!load))
load = 1;
lag = div_s64(lag, load);
@@ -4877,6 +5083,19 @@ place_entity(struct cfs_rq *cfs_rq, stru
}
se->vruntime = vruntime;
+
+ /*
+ * When joining the competition, the existing tasks will be,
+ * on average, halfway through their slice, as such start tasks
+ * off with half a slice to ease into the competition.
+ */
+ if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
+ vslice /= 2;
+
+ /*
+ * EEVDF: vd_i = ve_i + r_i/w_i
+ */
+ se->deadline = se->vruntime + vslice;
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5088,19 +5307,20 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- unsigned long ideal_runtime, delta_exec;
+ unsigned long delta_exec;
struct sched_entity *se;
s64 delta;
- /*
- * When many tasks blow up the sched_period; it is possible that
- * sched_slice() reports unusually large results (when many tasks are
- * very light for example). Therefore impose a maximum.
- */
- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+ if (sched_feat(EEVDF)) {
+ if (pick_eevdf(cfs_rq) != curr)
+ goto preempt;
+
+ return;
+ }
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
+ if (delta_exec > curr->slice) {
+preempt:
resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
@@ -5124,7 +5344,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq
if (delta < 0)
return;
- if (delta > ideal_runtime)
+ if (delta > curr->slice)
resched_curr(rq_of(cfs_rq));
}
@@ -5179,17 +5399,20 @@ wakeup_preempt_entity(struct sched_entit
static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
+ struct sched_entity *left, *se;
- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
+ if (sched_feat(EEVDF)) {
+ /*
+ * Enabling NEXT_BUDDY will affect latency but not fairness.
+ */
+ if (sched_feat(NEXT_BUDDY) &&
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ return cfs_rq->next;
+
+ return pick_eevdf(cfs_rq);
+ }
- se = left; /* ideally we run the leftmost entity */
+ se = left = pick_cfs(cfs_rq, curr);
/*
* Avoid running the skip buddy, if running something else can
@@ -6284,13 +6507,12 @@ static inline void unthrottle_offline_cf
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
SCHED_WARN_ON(task_rq(p) != rq);
if (rq->cfs.h_nr_running > 1) {
- u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
s64 delta = slice - ran;
if (delta < 0) {
@@ -8010,7 +8232,19 @@ static void check_preempt_wakeup(struct
if (cse_is_idle != pse_is_idle)
return;
- update_curr(cfs_rq_of(se));
+ cfs_rq = cfs_rq_of(se);
+ update_curr(cfs_rq);
+
+ if (sched_feat(EEVDF)) {
+ /*
+ * XXX pick_eevdf(cfs_rq) != se ?
+ */
+ if (pick_eevdf(cfs_rq) == pse)
+ goto preempt;
+
+ return;
+ }
+
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
@@ -8256,7 +8490,7 @@ static void yield_task_fair(struct rq *r
clear_buddies(cfs_rq, se);
- if (curr->policy != SCHED_BATCH) {
+ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
* Update run-time statistics of the 'current'.
@@ -8269,6 +8503,8 @@ static void yield_task_fair(struct rq *r
*/
rq_clock_skip_update(rq);
}
+ if (sched_feat(EEVDF))
+ se->deadline += calc_delta_fair(se->slice, se);
set_skip_buddy(se);
}
@@ -12012,8 +12248,8 @@ static void rq_offline_fair(struct rq *r
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
{
- u64 slice = sched_slice(cfs_rq_of(se), se);
u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ u64 slice = se->slice;
return (rtime * min_nr_tasks > slice);
}
@@ -12728,7 +12964,7 @@ static unsigned int get_rr_interval_fair
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+ rr_interval = NS_TO_JIFFIES(se->slice);
return rr_interval;
}
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false)
SCHED_FEAT(ALT_PERIOD, true)
SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(EEVDF, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3316,5 +3316,6 @@ static inline void switch_mm_cid(struct
#endif
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
#endif /* _KERNEL_SCHED_SCHED_H */