[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250520104110.3673059-7-ziqianlu@bytedance.com>
Date: Tue, 20 May 2025 18:41:09 +0800
From: Aaron Lu <ziqianlu@...edance.com>
To: Valentin Schneider <vschneid@...hat.com>,
Ben Segall <bsegall@...gle.com>,
K Prateek Nayak <kprateek.nayak@....com>,
Peter Zijlstra <peterz@...radead.org>,
Josh Don <joshdon@...gle.com>,
Ingo Molnar <mingo@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Xi Wang <xii@...gle.com>
Cc: linux-kernel@...r.kernel.org,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Mel Gorman <mgorman@...e.de>,
Chengming Zhou <chengming.zhou@...ux.dev>,
Chuyi Zhou <zhouchuyi@...edance.com>,
Jan Kiszka <jan.kiszka@...mens.com>,
Florian Bezdeka <florian.bezdeka@...mens.com>
Subject: [PATCH 6/7] sched/fair: task based throttle time accounting
With task based throttle model, the previous way to check cfs_rq's
nr_queued to decide if throttled time should be accounted doesn't work
as expected, e.g. when a cfs_rq is throttled, its nr_queued == 1 but
then that task could block in kernel mode instead of being dequeued on
limbo list. Account this as throttled time is not accurate.
Rework throttle time accounting for a cfs_rq as follows:
- start accounting when the first task gets throttled in its hierarchy;
- stop accounting on unthrottle.
Suggested-by: Chengming Zhou <chengming.zhou@...ux.dev> # accounting mechanism
Co-developed-by: K Prateek Nayak <kprateek.nayak@....com> # simplify implementation
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
Signed-off-by: Aaron Lu <ziqianlu@...edance.com>
---
kernel/sched/fair.c | 41 +++++++++++++++++++++++------------------
kernel/sched/sched.h | 1 +
2 files changed, 24 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a968d334e8730..4646d4f8b878d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5360,16 +5360,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
list_add_leaf_cfs_rq(cfs_rq);
-#ifdef CONFIG_CFS_BANDWIDTH
- if (throttled_hierarchy(cfs_rq)) {
- struct rq *rq = rq_of(cfs_rq);
-
- if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
- cfs_rq->throttled_clock = rq_clock(rq);
- if (!cfs_rq->throttled_clock_self)
- cfs_rq->throttled_clock_self = rq_clock(rq);
- }
-#endif
}
}
@@ -5455,7 +5445,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* DELAY_DEQUEUE relies on spurious wakeups, special task
* states must not suffer spurious wakeups, excempt them.
*/
- if (flags & DEQUEUE_SPECIAL)
+ if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
delay = false;
WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5863,7 +5853,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
rq = scope.rq;
update_rq_clock(rq);
WARN_ON_ONCE(!list_empty(&p->throttle_node));
- dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+ dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
/*
* Must not add it to limbo list before dequeue or dequeue will
* mistakenly regard this task as an already throttled one.
@@ -5955,6 +5945,17 @@ static inline void task_throttle_setup_work(struct task_struct *p)
task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
}
+static void record_throttle_clock(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+}
+
static int tg_throttle_down(struct task_group *tg, void *data)
{
struct rq *rq = data;
@@ -5967,12 +5968,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
- WARN_ON_ONCE(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock_self = rq_clock(rq);
- else
+ if (!cfs_rq->nr_queued)
list_del_leaf_cfs_rq(cfs_rq);
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
return 0;
}
@@ -6015,8 +6014,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
*/
cfs_rq->throttled = 1;
WARN_ON_ONCE(cfs_rq->throttled_clock);
- if (cfs_rq->nr_queued)
- cfs_rq->throttled_clock = rq_clock(rq);
return;
}
@@ -6734,6 +6731,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
@@ -7057,6 +7055,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
+ bool task_throttled = flags & DEQUEUE_THROTTLE;
struct task_struct *p = NULL;
int h_nr_idle = 0;
int h_nr_queued = 0;
@@ -7092,6 +7091,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
slice = cfs_rq_min_slice(cfs_rq);
@@ -7128,6 +7130,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (cfs_rq_is_idle(cfs_rq))
h_nr_idle = h_nr_queued;
+
+ if (throttled_hierarchy(cfs_rq) && task_throttled)
+ record_throttle_clock(cfs_rq);
}
sub_nr_running(rq, h_nr_queued);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 83f16fc44884f..9ba4b8f988ebf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2360,6 +2360,7 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SPECIAL 0x10
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE 0x800
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
--
2.39.5
Powered by blists - more mailing lists