Message-Id: <20250409120746.635476-8-ziqianlu@bytedance.com>
Date: Wed,  9 Apr 2025 20:07:46 +0800
From: Aaron Lu <ziqianlu@...edance.com>
To: Valentin Schneider <vschneid@...hat.com>,
	Ben Segall <bsegall@...gle.com>,
	K Prateek Nayak <kprateek.nayak@....com>,
	Peter Zijlstra <peterz@...radead.org>,
	Josh Don <joshdon@...gle.com>,
	Ingo Molnar <mingo@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Xi Wang <xii@...gle.com>
Cc: linux-kernel@...r.kernel.org,
	Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Mel Gorman <mgorman@...e.de>,
	Chengming Zhou <chengming.zhou@...ux.dev>,
	Chuyi Zhou <zhouchuyi@...edance.com>,
	Jan Kiszka <jan.kiszka@...mens.com>
Subject: [RFC PATCH v2 7/7] sched/fair: alternative way of accounting throttle time

Implement an alternative way of accounting cfs_rq throttle time which:
- starts accounting when a throttled cfs_rq has no tasks enqueued and its
  throttled list is not empty;
- stops accounting when this cfs_rq gets unthrottled or a task gets
  enqueued.

This way, throttle time is only accounted while the cfs_rq has absolutely
no tasks enqueued but still has throttled tasks.
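
For reference, a minimal userspace sketch of the accounting rule described
above: start the clock only when the cfs_rq has nothing enqueued but still
holds throttled tasks, and stop and accumulate as soon as a task is enqueued
or the cfs_rq is unthrottled. The struct and helper names below are made up
purely to illustrate the rule; they are not the kernel code in the diff.

/*
 * Illustrative model only. model_cfs_rq, on_dequeue_last,
 * on_enqueue_or_unthrottle and now_ns are hypothetical names,
 * not the kernel's data structures or APIs.
 */
#include <stdint.h>
#include <stdio.h>

struct model_cfs_rq {
	unsigned int nr_queued;       /* tasks currently enqueued */
	unsigned int h_nr_throttled;  /* tasks sitting on the throttled list */
	uint64_t throttled_clock;     /* 0 means "not accounting" */
	uint64_t throttled_time;      /* accumulated fully-idle throttle time */
};

/* Start the clock only when the cfs_rq is empty but still has throttled tasks. */
static void on_dequeue_last(struct model_cfs_rq *cfs_rq, uint64_t now_ns)
{
	if (cfs_rq->nr_queued == 0 && cfs_rq->h_nr_throttled &&
	    !cfs_rq->throttled_clock)
		cfs_rq->throttled_clock = now_ns;
}

/* Stop the clock once a task is enqueued or the cfs_rq is unthrottled. */
static void on_enqueue_or_unthrottle(struct model_cfs_rq *cfs_rq, uint64_t now_ns)
{
	if (cfs_rq->throttled_clock) {
		cfs_rq->throttled_time += now_ns - cfs_rq->throttled_clock;
		cfs_rq->throttled_clock = 0;
	}
}

int main(void)
{
	struct model_cfs_rq rq = { .h_nr_throttled = 1 };

	on_dequeue_last(&rq, 100);          /* last task throttled at t=100 */
	on_enqueue_or_unthrottle(&rq, 250); /* a task is enqueued at t=250 */
	printf("accounted throttle time: %llu\n",
	       (unsigned long long)rq.throttled_time); /* prints 150 */
	return 0;
}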

Signed-off-by: Aaron Lu <ziqianlu@...edance.com>
---
 kernel/sched/fair.c  | 112 ++++++++++++++++++++++++++++++++-----------
 kernel/sched/sched.h |   4 ++
 2 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20471a3aa35e6..70f7de82d1d9d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5300,6 +5300,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
+static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq);
 
 static void
 requeue_delayed_entity(struct sched_entity *se);
@@ -5362,10 +5363,14 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		if (throttled_hierarchy(cfs_rq)) {
 			struct rq *rq = rq_of(cfs_rq);
 
-			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
-				cfs_rq->throttled_clock = rq_clock(rq);
-			if (!cfs_rq->throttled_clock_self)
-				cfs_rq->throttled_clock_self = rq_clock(rq);
+			if (cfs_rq->throttled_clock) {
+				cfs_rq->throttled_time +=
+					rq_clock(rq) - cfs_rq->throttled_clock;
+				cfs_rq->throttled_clock = 0;
+			}
+
+			if (cfs_rq->throttled_clock_self)
+				account_cfs_rq_throttle_self(cfs_rq);
 		}
 #endif
 	}
@@ -5453,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		 * DELAY_DEQUEUE relies on spurious wakeups, special task
 		 * states must not suffer spurious wakeups, excempt them.
 		 */
-		if (flags & DEQUEUE_SPECIAL)
+		if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
 			delay = false;
 
 		WARN_ON_ONCE(delay && se->sched_delayed);
@@ -5513,8 +5518,24 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
-		if (throttled_hierarchy(cfs_rq))
+
+#ifdef CONFIG_CFS_BANDWIDTH
+		if (throttled_hierarchy(cfs_rq)) {
 			list_del_leaf_cfs_rq(cfs_rq);
+
+			if (cfs_rq->h_nr_throttled) {
+				struct rq *rq = rq_of(cfs_rq);
+
+				WARN_ON_ONCE(cfs_rq->throttled_clock_self);
+				cfs_rq->throttled_clock_self = rq_clock(rq);
+
+				if (cfs_rq_throttled(cfs_rq)) {
+					WARN_ON_ONCE(cfs_rq->throttled_clock);
+					cfs_rq->throttled_clock = rq_clock(rq);
+				}
+			}
+		}
+#endif
 	}
 
 	return true;
@@ -5809,6 +5830,18 @@ static inline bool task_is_throttled(struct task_struct *p)
 	return !list_empty(&p->throttle_node);
 }
 
+static inline void
+cfs_rq_inc_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr)
+{
+	cfs_rq->h_nr_throttled += nr;
+}
+
+static inline void
+cfs_rq_dec_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr)
+{
+	cfs_rq->h_nr_throttled -= nr;
+}
+
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static void throttle_cfs_rq_work(struct callback_head *work)
 {
@@ -5845,7 +5878,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
 		rq = scope.rq;
 		update_rq_clock(rq);
 		WARN_ON_ONCE(!list_empty(&p->throttle_node));
-		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+		dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
 		list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
 		resched_curr(rq);
 	}
@@ -5863,16 +5896,37 @@ void init_cfs_throttle_work(struct task_struct *p)
 
 static void dequeue_throttled_task(struct task_struct *p, int flags)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * Task is throttled and someone wants to dequeue it again:
 	 * it must be sched/core when core needs to do things like
 	 * task affinity change, task group change, task sched class
 	 * change etc.
 	 */
-	WARN_ON_ONCE(p->se.on_rq);
-	WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
+	WARN_ON_ONCE(se->on_rq);
+	WARN_ON_ONCE(flags & DEQUEUE_THROTTLE);
 
 	list_del_init(&p->throttle_node);
+
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		cfs_rq->h_nr_throttled--;
+	}
+}
+
+static void account_cfs_rq_throttle_self(struct cfs_rq *cfs_rq)
+{
+	/* account self time */
+	u64 delta = rq_clock(rq_of(cfs_rq)) - cfs_rq->throttled_clock_self;
+
+	cfs_rq->throttled_clock_self = 0;
+
+	if (WARN_ON_ONCE((s64)delta < 0))
+		delta = 0;
+
+	cfs_rq->throttled_clock_self_time += delta;
 }
 
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5889,27 +5943,21 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
 		cfs_rq->throttled_clock_pelt;
 
-	if (cfs_rq->throttled_clock_self) {
-		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
-
-		cfs_rq->throttled_clock_self = 0;
-
-		if (WARN_ON_ONCE((s64)delta < 0))
-			delta = 0;
-
-		cfs_rq->throttled_clock_self_time += delta;
-	}
+	if (cfs_rq->throttled_clock_self)
+		account_cfs_rq_throttle_self(cfs_rq);
 
 	/* Re-enqueue the tasks that have been throttled at this level. */
 	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
 		list_del_init(&p->throttle_node);
-		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
+		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP | ENQUEUE_THROTTLE);
 	}
 
 	/* Add cfs_rq with load or one or more already running entities to the list */
 	if (!cfs_rq_is_decayed(cfs_rq))
 		list_add_leaf_cfs_rq(cfs_rq);
 
+	WARN_ON_ONCE(cfs_rq->h_nr_throttled);
+
 	return 0;
 }
 
@@ -5945,10 +5993,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	/* group is entering throttled state, stop time */
 	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
 
-	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock_self = rq_clock(rq);
-	else
+	if (!cfs_rq->nr_queued)
 		list_del_leaf_cfs_rq(cfs_rq);
 
 	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
@@ -5992,9 +6037,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	 * throttled-list.  rq->lock protects completion.
 	 */
 	cfs_rq->throttled = 1;
-	WARN_ON_ONCE(cfs_rq->throttled_clock);
-	if (cfs_rq->nr_queued)
-		cfs_rq->throttled_clock = rq_clock(rq);
 	return;
 }
 
@@ -6026,6 +6068,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
 		cfs_rq->throttled_clock = 0;
 	}
+	if (cfs_rq->throttled_time) {
+		cfs_b->throttled_time += cfs_rq->throttled_time;
+		cfs_rq->throttled_time = 0;
+	}
 	list_del_rcu(&cfs_rq->throttled_list);
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -6710,6 +6756,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void task_throttle_setup_work(struct task_struct *p) {}
 static bool task_is_throttled(struct task_struct *p) { return false; }
 static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static void cfs_rq_inc_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr) {}
+static void cfs_rq_dec_h_nr_throttled(struct cfs_rq *cfs_rq, unsigned int nr) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
@@ -6898,6 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 	int h_nr_idle = task_has_idle_policy(p);
 	int h_nr_runnable = 1;
+	int h_nr_throttled = (flags & ENQUEUE_THROTTLE) ? 1 : 0;
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	u64 slice = 0;
@@ -6951,6 +7000,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		cfs_rq_dec_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -6973,6 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq->h_nr_runnable += h_nr_runnable;
 		cfs_rq->h_nr_queued++;
 		cfs_rq->h_nr_idle += h_nr_idle;
+		cfs_rq_dec_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = 1;
@@ -7027,10 +7078,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
 	bool task_delayed = flags & DEQUEUE_DELAYED;
+	bool task_throttle = flags & DEQUEUE_THROTTLE;
 	struct task_struct *p = NULL;
 	int h_nr_idle = 0;
 	int h_nr_queued = 0;
 	int h_nr_runnable = 0;
+	int h_nr_throttled = 0;
 	struct cfs_rq *cfs_rq;
 	u64 slice = 0;
 
@@ -7040,6 +7093,9 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		h_nr_idle = task_has_idle_policy(p);
 		if (task_sleep || task_delayed || !se->sched_delayed)
 			h_nr_runnable = 1;
+
+		if (task_throttle)
+			h_nr_throttled = 1;
 	} else {
 		cfs_rq = group_cfs_rq(se);
 		slice = cfs_rq_min_slice(cfs_rq);
@@ -7058,6 +7114,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		cfs_rq_inc_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
@@ -7095,6 +7152,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		cfs_rq->h_nr_runnable -= h_nr_runnable;
 		cfs_rq->h_nr_queued -= h_nr_queued;
 		cfs_rq->h_nr_idle -= h_nr_idle;
+		cfs_rq_inc_h_nr_throttled(cfs_rq, h_nr_throttled);
 
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 97be6a6f53b9c..54cdec21aa5c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -721,6 +721,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
 	int			runtime_enabled;
+	unsigned int		h_nr_throttled;
 	s64			runtime_remaining;
 
 	u64			throttled_pelt_idle;
@@ -732,6 +733,7 @@ struct cfs_rq {
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
+	u64			throttled_time;
 	int			throttled;
 	int			throttle_count;
 	struct list_head	throttled_list;
@@ -2360,6 +2362,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_SPECIAL		0x10
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
 #define DEQUEUE_DELAYED		0x200 /* Matches ENQUEUE_DELAYED */
+#define DEQUEUE_THROTTLE	0x800 /* Matches ENQUEUE_THROTTLE */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2377,6 +2380,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define ENQUEUE_MIGRATING	0x100
 #define ENQUEUE_DELAYED		0x200
 #define ENQUEUE_RQ_SELECTED	0x400
+#define ENQUEUE_THROTTLE	0x800
 
 #define RETRY_TASK		((void *)-1UL)
 
-- 
2.39.5

