Message-ID: <CANCG0GcFF7cnR4rCbU5MmY1Gq3M+r4gPXv39QPXXC=Cdr6sRww@mail.gmail.com>
Date: Thu, 13 Mar 2025 07:21:21 +0000
From: Aaron Lu <ziqianlu@...edance.com>
To: Valentin Schneider <vschneid@...hat.com>, Ben Segall <bsegall@...gle.com>, 
	K Prateek Nayak <kprateek.nayak@....com>, Peter Zijlstra <peterz@...radead.org>, 
	Josh Don <joshdon@...gle.com>, Ingo Molnar <mingo@...hat.com>, 
	Vincent Guittot <vincent.guittot@...aro.org>
Cc: linux-kernel@...r.kernel.org, Juri Lelli <juri.lelli@...hat.com>, 
	Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, 
	Mel Gorman <mgorman@...e.de>, Chengming Zhou <chengming.zhou@...ux.dev>, 
	Chuyi Zhou <zhouchuyi@...edance.com>
Subject: [RFC PATCH 2/7] sched/fair: Handle throttle path for task based throttle

From: Valentin Schneider <vschneid@...hat.com>

Once a cfs_rq gets throttled, add a task work to each task belonging to
this cfs_rq, so that the actual throttle/dequeue happens when those
tasks return to userspace.
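
In condensed form, the flow added by this patch is roughly as below
(names are taken from the hunks that follow; the real
throttle_cfs_rq_work() additionally skips exiting tasks and handles
races with class changes and unthrottle):

	/* throttle path: defer the real dequeue to return-to-user */
	task_work_add(p, &p->sched_throttle_work, TWA_RESUME);

	/*
	 * task work, run with the rq lock held on return to userspace:
	 * park the task on the throttled cfs_rq's limbo list, dequeue
	 * it and force a reschedule.
	 */
	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
	dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
	resched_curr(rq);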

Note that since the throttle/dequeue now always happens per task when
it returns to userspace, there is no longer any need for
check_cfs_rq_runtime() to return a value for pick_task_fair() to act
on, so check_cfs_rq_runtime() is changed to return void.
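
Concretely, the pick_task_fair() hunk at the end of the fair.c diff
below goes from

	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
		goto again;

to a plain

	check_cfs_rq_runtime(cfs_rq);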

[aaronlu: extracted from Valentin's original patches.
 Fixed a problem where curr is not in the timeline tree and has to be
 dealt with explicitly;
 Made check_cfs_rq_runtime() void.]
Signed-off-by: Valentin Schneider <vschneid@...hat.com>
Signed-off-by: Aaron Lu <ziqianlu@...edance.com>
---
 kernel/sched/fair.c  | 201 ++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |   1 +
 2 files changed, 112 insertions(+), 90 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 60eb5329bf526..ab403ff7d53c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5607,7 +5607,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 	return se;
 }

-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
@@ -5832,8 +5832,49 @@ static inline int throttled_lb_pair(struct task_group *tg,
 	       throttled_hierarchy(dest_cfs_rq);
 }

+static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
 static void throttle_cfs_rq_work(struct callback_head *work)
 {
+	struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
+	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
+	struct rq_flags rf;
+
+	WARN_ON_ONCE(p != current);
+	p->sched_throttle_work.next = &p->sched_throttle_work;
+
+	/*
+	 * If task is exiting, then there won't be a return to userspace, so we
+	 * don't have to bother with any of this.
+	 */
+	if ((p->flags & PF_EXITING))
+		return;
+
+	rq = task_rq_lock(p, &rf);
+
+	se = &p->se;
+	cfs_rq = cfs_rq_of(se);
+
+	/* Raced, forget */
+	if (p->sched_class != &fair_sched_class)
+		goto out_unlock;
+
+	/*
+	 * If not in limbo, then either replenish has happened or this task got
+	 * migrated out of the throttled cfs_rq, move along
+	 */
+	if (!cfs_rq->throttle_count)
+		goto out_unlock;
+
+	update_rq_clock(rq);
+	WARN_ON_ONCE(!list_empty(&p->throttle_node));
+	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+	dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL);
+	resched_curr(rq);
+
+out_unlock:
+	task_rq_unlock(rq, p, &rf);
 }

 void init_cfs_throttle_work(struct task_struct *p)
@@ -5873,32 +5914,81 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	return 0;
 }

+static inline bool task_has_throttle_work(struct task_struct *p)
+{
+	return p->sched_throttle_work.next != &p->sched_throttle_work;
+}
+
+static inline void task_throttle_setup_work(struct task_struct *p)
+{
+	/*
+	 * Kthreads and exiting tasks don't return to userspace, so adding the
+	 * work is pointless
+	 */
+	if ((p->flags & (PF_EXITING | PF_KTHREAD)))
+		return;
+
+	if (task_has_throttle_work(p))
+		return;
+
+	task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
+}
+
 static int tg_throttle_down(struct task_group *tg, void *data)
 {
 	struct rq *rq = data;
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+	struct task_struct *p;
+	struct rb_node *node;
+
+	cfs_rq->throttle_count++;
+	if (cfs_rq->throttle_count > 1)
+		return 0;

 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
-		list_del_leaf_cfs_rq(cfs_rq);
+	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
+	list_del_leaf_cfs_rq(cfs_rq);

-		SCHED_WARN_ON(cfs_rq->throttled_clock_self);
-		if (cfs_rq->nr_queued)
-			cfs_rq->throttled_clock_self = rq_clock(rq);
+	SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+	if (cfs_rq->nr_queued)
+		cfs_rq->throttled_clock_self = rq_clock(rq);
+
+	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
+	/*
+	 * rq_lock is held, current is (obviously) executing this in kernelspace.
+	 *
+	 * All other tasks enqueued on this rq have their saved PC at the
+	 * context switch, so they will go through the kernel before returning
+	 * to userspace. Thus, there are no tasks-in-userspace to handle, just
+	 * install the task_work on all of them.
+	 */
+	node = rb_first(&cfs_rq->tasks_timeline.rb_root);
+	while (node) {
+		struct sched_entity *se = __node_2_se(node);
+
+		if (!entity_is_task(se))
+			goto next;
+
+		p = task_of(se);
+		task_throttle_setup_work(p);
+next:
+		node = rb_next(node);
+	}
+
+	/* curr is not in the timeline tree */
+	if (cfs_rq->curr && entity_is_task(cfs_rq->curr)) {
+		p = task_of(cfs_rq->curr);
+		task_throttle_setup_work(p);
 	}
-	cfs_rq->throttle_count++;

 	return 0;
 }

-static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-	struct sched_entity *se;
-	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
+	int dequeue = 1;

 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5919,74 +6009,13 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	raw_spin_unlock(&cfs_b->lock);

 	if (!dequeue)
-		return false;  /* Throttle no longer required. */
-
-	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+		return;  /* Throttle no longer required. */

 	/* freeze hierarchy runnable averages while throttled */
 	rcu_read_lock();
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();

-	queued_delta = cfs_rq->h_nr_queued;
-	runnable_delta = cfs_rq->h_nr_runnable;
-	idle_delta = cfs_rq->h_nr_idle;
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		int flags;
-
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		/*
-		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
-		 * This avoids teaching dequeue_entities() about throttled
-		 * entities and keeps things relatively simple.
-		 */
-		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
-		if (se->sched_delayed)
-			flags |= DEQUEUE_DELAYED;
-		dequeue_entity(qcfs_rq, se, flags);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-
-		if (qcfs_rq->load.weight) {
-			/* Avoid re-evaluating load for this entity: */
-			se = parent_entity(se);
-			break;
-		}
-	}
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-		/* throttled entity or throttle-on-deactivate */
-		if (!se->on_rq)
-			goto done;
-
-		update_load_avg(qcfs_rq, se, 0);
-		se_update_runnable(se);
-
-		if (cfs_rq_is_idle(group_cfs_rq(se)))
-			idle_delta = cfs_rq->h_nr_queued;
-
-		qcfs_rq->h_nr_queued -= queued_delta;
-		qcfs_rq->h_nr_runnable -= runnable_delta;
-		qcfs_rq->h_nr_idle -= idle_delta;
-	}
-
-	/* At this point se is NULL and we are at root level*/
-	sub_nr_running(rq, queued_delta);
-
-	/* Stop the fair server if throttling resulted in no runnable tasks */
-	if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
-		dl_server_stop(&rq->fair_server);
-done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list.  rq->lock protects completion.
@@ -5995,7 +6024,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	SCHED_WARN_ON(cfs_rq->throttled_clock);
 	if (cfs_rq->nr_queued)
 		cfs_rq->throttled_clock = rq_clock(rq);
-	return true;
+	return;
 }

 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
@@ -6471,22 +6500,22 @@ static void sync_throttle(struct task_group *tg, int cpu)
 }

 /* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
-		return false;
+		return;

 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
-		return false;
+		return;

 	/*
 	 * it's possible for a throttled entity to be forced into a running
 	 * state (e.g. set_curr_task), in this case we're finished.
 	 */
 	if (cfs_rq_throttled(cfs_rq))
-		return true;
+		return;

-	return throttle_cfs_rq(cfs_rq);
+	throttle_cfs_rq(cfs_rq);
 }

 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -6582,6 +6611,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+	INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
 }

 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7117,10 +7147,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;

-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
-
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			slice = cfs_rq_min_slice(cfs_rq);
@@ -7157,10 +7183,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)

 		if (cfs_rq_is_idle(cfs_rq))
 			h_nr_idle = h_nr_queued;
-
-		/* end evaluation on encountering a throttled cfs_rq */
-		if (cfs_rq_throttled(cfs_rq))
-			return 0;
 	}

 	sub_nr_running(rq, h_nr_queued);
@@ -8869,8 +8891,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
 		if (cfs_rq->curr && cfs_rq->curr->on_rq)
 			update_curr(cfs_rq);

-		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-			goto again;
+		check_cfs_rq_runtime(cfs_rq);

 		se = pick_next_entity(rq, cfs_rq);
 		if (!se)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c8bfa3d708081..5c2af5a70163c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -742,6 +742,7 @@ struct cfs_rq {
 	int			throttle_count;
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
+	struct list_head	throttled_limbo_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
-- 
2.39.5
