From ebd646f8eabb245e5d6e04389febc513b47a5b9a Mon Sep 17 00:00:00 2001
From: Florian Bezdeka
Date: Wed, 16 Apr 2025 12:41:03 +0200
Subject: [PATCH 7/7] sched/fair: alternative way of accounting throttle time

Implement an alternative way of accounting cfs_rq throttle time which:
- starts accounting when a throttled cfs_rq has no tasks enqueued and
  its throttled list is not empty;
- stops accounting when this cfs_rq gets unthrottled or a task gets
  enqueued.

This way, the accounted throttle time is when the cfs_rq has absolutely
no tasks enqueued and has tasks throttled.

[Florian: manual backport to 6.14]

Signed-off-by: Aaron Lu
Signed-off-by: Florian Bezdeka
---
 kernel/sched/fair.c  | 112 ++++++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h |   4 ++
 2 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e9079f2e3a6a..e515a1b43bba8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5309,6 +5309,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq);
 
 static void requeue_delayed_entity(struct sched_entity *se);
 
@@ -5371,10 +5372,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (throttled_hierarchy(cfs_rq)) {
 		struct rq *rq = rq_of(cfs_rq);
 
-		if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
-			cfs_rq->throttled_clock = rq_clock(rq);
-		if (!cfs_rq->throttled_clock_self)
-			cfs_rq->throttled_clock_self = rq_clock(rq);
+		if (cfs_rq->throttled_clock) {
+			cfs_rq->throttled_time +=
+				rq_clock(rq) - cfs_rq->throttled_clock;
+			cfs_rq->throttled_clock = 0;
+		}
+
+		if (cfs_rq->throttled_clock_self)
+			account_cfs_rq_throttle_self(cfs_rq);
 	}
 #endif
 }
@@ -5462,7 +5467,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		 * DELAY_DEQUEUE relies on spurious wakeups, special task
 		 * states must not suffer spurious wakeups, excempt them.
 		 */
-		if (flags & DEQUEUE_SPECIAL)
+		if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
 			delay = false;
 
 		SCHED_WARN_ON(delay && se->sched_delayed);
@@ -5522,8 +5527,24 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
-		if (throttled_hierarchy(cfs_rq))
+
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			list_del_leaf_cfs_rq(cfs_rq);
+
+			if (cfs_rq->h_nr_throttled) {
+				struct rq *rq = rq_of(cfs_rq);
+
+				WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+				cfs_rq->throttled_clock_self = rq_clock(rq);
+
+				if (cfs_rq_throttled(cfs_rq)) {
+					WARN_ON_ONCE(cfs_rq->throttled_clock);
+					cfs_rq->throttled_clock = rq_clock(rq);
+				}
+			}
+		}
+#endif
 	}
 	return true;
 }
@@ -5817,6 +5838,18 @@ static inline bool task_is_throttled(struct task_struct *p)
 	return !list_empty(&p->throttle_node);
 }
 
+static inline void
+cfs_rq_inc_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr)
+{
+	cfs_rq->h_nr_throttled += nr;
+}
+
+static inline void
+cfs_rq_dec_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr)
+{
+	cfs_rq->h_nr_throttled -= nr;
+}
+
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static void throttle_cfs_rq_work(struct callback_head *work)
 {
@@ -5853,7 +5886,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		rq = scope.rq;
 		update_rq_clock(rq);
 		WARN_ON_ONCE(!list_empty(&p->throttle_node));
-		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
 		resched_curr(rq);
 	}
@@ -5871,16 +5904,37 @@ void init_cfs_throttle_work(struct task_struct *p)
 
 static void dequeue_throttled_task(struct task_struct *p, int flags)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * Task is throttled and someone wants to dequeue it again:
 	 * it must be sched/core when core needs to do things like
 	 * task affinity change, task group change, task sched class
 	 * change etc.
 	 */
-	WARN_ON_ONCE(p->se.on_rq);
-	WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
+	WARN_ON_ONCE(se->on_rq);
+	WARN_ON_ONCE(flags & DEQUEUE_THROTTLE);
 
 	list_del_init(&p->throttle_node);
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_throttled--;
+	}
+}
+
+static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq)
+{
+	/* account self time */
+	u64 delta = rq_clock(rq_of(cfs_rq)) - cfs_rq->throttled_clock_self;
+
+	cfs_rq->throttled_clock_self = 0;
+
+	if (WARN_ON_ONCE((s64)delta < 0))
+		delta = 0;
+
+	cfs_rq->throttled_clock_self_time += delta;
 }
 
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5897,27 +5951,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
				     cfs_rq->throttled_clock_pelt;
 
-	if (cfs_rq->throttled_clock_self) {
-		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
-
-		cfs_rq->throttled_clock_self = 0;
-
-		if (SCHED_WARN_ON((s64)delta < 0))
-			delta = 0;
-
-		cfs_rq->throttled_clock_self_time += delta;
-	}
+	if (cfs_rq->throttled_clock_self)
+		account_cfs_rq_throttle_self(cfs_rq);
 
 	/* Re-enqueue the tasks that have been throttled at this level. */
 	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list,
				 throttle_node) {
 		list_del_init(&p->throttle_node);
-		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
 	}
 
 	/* Add cfs_rq with load or one or more already running entities to the list */
 	if (!cfs_rq_is_decayed(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
+	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
+
 	return 0;
 }
@@ -5953,10 +6001,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 
 	/* group is entering throttled state, stop time */
 	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
-	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock_self = rq_clock(rq);
-	else
+	if (!cfs_rq->nr_queued)
 		list_del_leaf_cfs_rq(cfs_rq);
 
 	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
@@ -6000,9 +6045,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 * throttled-list. rq->lock protects completion.
 	 */
 	cfs_rq->throttled = 1;
-	SCHED_WARN_ON(cfs_rq->throttled_clock);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock = rq_clock(rq);
 }
 
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -6033,6 +6075,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 		cfs_rq->throttled_clock = 0;
 	}
+	if (cfs_rq->throttled_time) {
+		cfs_b->throttled_time += cfs_rq->throttled_time;
+		cfs_rq->throttled_time = 0;
+	}
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -6717,6 +6763,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
 static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static void cfs_rq_inc_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr) {}
+static void cfs_rq_dec_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6905,6 +6953,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
+	int h_nr_throttled = (flags & ENQUEUE_THROTTLE) ? 1 : 0;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
@@ -6958,6 +7007,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		cfs_rq_dec_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -6980,6 +7030,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		cfs_rq_dec_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -7034,10 +7085,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
 	bool task_delayed = flags & DEQUEUE_DELAYED;
+	bool task_throttle = flags & DEQUEUE_THROTTLE;
 	struct task_struct *p = NULL;
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
+	int h_nr_throttled = 0;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
 
@@ -7047,6 +7100,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		h_nr_idle = task_has_idle_policy(p);
 		if (task_sleep || task_delayed || !se->sched_delayed)
 			h_nr_runnable = 1;
+
+		if (task_throttle)
+			h_nr_throttled = 1;
 	} else {
 		cfs_rq = group_cfs_rq(se);
 		slice = cfs_rq_min_slice(cfs_rq);
@@ -7065,6 +7121,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		cfs_rq_inc_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
@@ -7102,6 +7159,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		cfs_rq_inc_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a7ae74366c078..f994123c327b5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -727,6 +727,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	int			runtime_enabled;
+	unsigned int		h_nr_throttled;
 	s64			runtime_remaining;
 
 	u64			throttled_pelt_idle;
@@ -738,6 +739,7 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
+	u64			throttled_time;
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
@@ -2381,6 +2383,7 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SPECIAL		0x10
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE	0x800 /* Matches ENQUEUE_THROTTLE */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2398,6 +2401,7 @@ extern const u32 sched_prio_to_wmult[40];
 #define ENQUEUE_MIGRATING	0x100
 #define ENQUEUE_DELAYED		0x200
 #define ENQUEUE_RQ_SELECTED	0x400
+#define ENQUEUE_THROTTLE	0x800
 
 #define RETRY_TASK		((void *)-1UL)
 
-- 
2.39.5
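Note (not part of the patch): a minimal userspace sketch of the accounting rule the
commit message describes: the throttle clock runs only while a cfs_rq has no tasks
enqueued but still has tasks parked on its throttled list, and the elapsed time is
folded into an accumulator when a task is enqueued again or the cfs_rq is unthrottled.
All names below (model_cfs_rq, task_throttled, task_enqueued) are made up for
illustration and only loosely mirror the kernel fields touched by the diff.

/* Standalone model, not kernel code. Build with: cc -o model model.c */
#include <stdio.h>
#include <stdint.h>

struct model_cfs_rq {
	unsigned int nr_queued;      /* tasks currently enqueued */
	unsigned int h_nr_throttled; /* tasks parked on the throttled list */
	uint64_t throttled_clock;    /* non-zero while the clock is running */
	uint64_t throttled_time;     /* accumulated fully-throttled time */
};

/* Stop the clock and fold the elapsed time into throttled_time. */
static void stop_clock(struct model_cfs_rq *rq, uint64_t now)
{
	if (rq->throttled_clock) {
		rq->throttled_time += now - rq->throttled_clock;
		rq->throttled_clock = 0;
	}
}

/* A task was dequeued because it hit the throttle. */
static void task_throttled(struct model_cfs_rq *rq, uint64_t now)
{
	rq->nr_queued--;
	rq->h_nr_throttled++;
	/* Start accounting only once nothing runnable is left. */
	if (!rq->nr_queued && !rq->throttled_clock)
		rq->throttled_clock = now;
}

/* A task became runnable again (wakeup or unthrottle). */
static void task_enqueued(struct model_cfs_rq *rq, uint64_t now)
{
	stop_clock(rq, now);
	rq->nr_queued++;
	if (rq->h_nr_throttled)
		rq->h_nr_throttled--;
}

int main(void)
{
	struct model_cfs_rq rq = { .nr_queued = 2 };

	task_throttled(&rq, 100);   /* one task still queued, clock stays off */
	task_throttled(&rq, 150);   /* queue now empty -> clock starts at 150 */
	task_enqueued(&rq, 400);    /* clock stops -> 250 units accumulated   */
	printf("throttled_time = %llu\n",
	       (unsigned long long)rq.throttled_time);
	return 0;
}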