linux-kernel - [RFC PATCH 19/22] sched/fair: Ignore in-kernel indicators for running task outside of schedule()

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250220093257.9380-20-kprateek.nayak@amd.com>
Date: Thu, 20 Feb 2025 09:32:54 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Valentin Schneider <vschneid@...hat.com>, "Ben
 Segall" <bsegall@...gle.com>, Thomas Gleixner <tglx@...utronix.de>, "Andy
 Lutomirski" <luto@...nel.org>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, "Sebastian Andrzej
 Siewior" <bigeasy@...utronix.de>, Clark Williams <clrkwllms@...nel.org>,
	<linux-rt-devel@...ts.linux.dev>, Tejun Heo <tj@...nel.org>, "Frederic
 Weisbecker" <frederic@...nel.org>, Barret Rhoden <brho@...gle.com>, "Petr
 Mladek" <pmladek@...e.com>, Josh Don <joshdon@...gle.com>, Qais Yousef
	<qyousef@...alina.io>, "Paul E. McKenney" <paulmck@...nel.org>, David Vernet
	<dvernet@...a.com>, K Prateek Nayak <kprateek.nayak@....com>, "Gautham R.
 Shenoy" <gautham.shenoy@....com>, Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 19/22] sched/fair: Ignore in-kernel indicators for running task outside of schedule()

A running task can go through a dequeue_task() -> put_prev_task() ->
enqueue_task() -> set_next_task() cycle when on_cpu for a number of
operations.

Since the task is running on a remote CPU, its "kernel_cs_count" is
unstable and looking at it can lead to wrong outcomes (imbalanced
accounting at __enqueue_entity() and __dequeue_entity()).

Use "sched_throttled" to indicate that the task's kernel mode indicators
should be ignored. put_prev_task() is called with next set to NULL only
for save / restore operations, and this is used as an indicator to set
the ignore bit. A subsequent call to set_next_task_fair() will clear
this indicator.

There are cases where the save / restore cycle can fully throttle the
task's hierarchy. One such condition is:

    dequeue_task()
    put_prev_task() # Sets cfs_rq->curr to NULL
    ...
    enqueue_task()
        enqueue_task_fair()
            check_enqueue_throttle()
                # cfs_rq->curr is NULL so goes ahead
                # Full throttle
    set_next_task() # Sets cfs_rq->curr back

If set_next_task_fair() finds a task that is marked to be ignored but
has been forced into running on a throttled hierarchy, request a resched
so that schedule() can partially unthrottle the hierarchy, if required,
before going ahead with the pick.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 kernel/sched/fair.c | 115 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1497b0aed1c2..55e53db8da45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6837,6 +6837,21 @@ bool cfs_task_bw_constrained(struct task_struct *p)
 	return false;
 }
 
+/* se->sched_throttled states: Modifications are serialized by rq_lock */
+enum sched_throttled_states {
+	/* No throttle status */
+	PICK_ON_UNTHROTTLED	= 0x0,
+	/* Task was picked on a throttled hierarchy */
+	PICK_ON_THROTTLED	= 0x1,
+	/*
+	 * Ignore task's "kernel_cs_count". Used for save / restore
+	 * operations on running tasks from a remote CPU. Prevents
+	 * enqueue and set_next_entity() from adjusting the stats and
+	 * puts it off until schedule() is called on the CPU.
+	 */
+	PICK_IGNORE		= 0x2,
+};
+
 __always_inline void sched_notify_critical_section_entry(void)
 {
 	SCHED_WARN_ON(current->se.kernel_cs_count);
@@ -6847,6 +6862,8 @@ __always_inline void sched_notify_critical_section_entry(void)
 	 */
 }
 
+static inline int task_picked_on_throttled(struct task_struct *p);
+
 __always_inline void sched_notify_critical_section_exit(void)
 {
 	lockdep_assert_irqs_disabled();
@@ -6860,7 +6877,7 @@ __always_inline void sched_notify_critical_section_exit(void)
 	 * schedule() soon after enabling interrupts again in
 	 * exit_to_user_mode_loop()?
 	 */
-	if (!current->se.kernel_cs_count && current->se.sched_throttled) {
+	if (!current->se.kernel_cs_count && task_picked_on_throttled(current)) {
 		struct rq *rq = this_rq();
 
 		guard(rq_lock_irqsave)(rq);
@@ -6873,15 +6890,40 @@ static __always_inline int se_in_kernel(struct sched_entity *se)
 	return se->kernel_cs_count;
 }
 
-/* se picked on a partially throttled hierarchy. */
+/* task picked on a partially throttled hierarchy. */
 static inline void task_mark_throttled(struct task_struct *p)
 {
-	p->se.sched_throttled = 1;
+	WRITE_ONCE(p->se.sched_throttled, PICK_ON_THROTTLED);
+}
+
+static inline void task_mark_ignore(struct task_struct *p)
+{
+	WRITE_ONCE(p->se.sched_throttled, READ_ONCE(p->se.sched_throttled) | PICK_IGNORE);
 }
 
 static inline void task_clear_throttled(struct task_struct *p)
 {
-	p->se.sched_throttled = 0;
+	WRITE_ONCE(p->se.sched_throttled, PICK_ON_UNTHROTTLED);
+}
+
+static inline void task_clear_ignore(struct task_struct *p)
+{
+	WRITE_ONCE(p->se.sched_throttled, READ_ONCE(p->se.sched_throttled) & ~PICK_IGNORE);
+}
+
+static inline int __kcs_ignore_entity(struct sched_entity *se)
+{
+	return READ_ONCE(se->sched_throttled) & PICK_IGNORE;
+}
+
+static inline int task_picked_on_throttled(struct task_struct *p)
+{
+	return READ_ONCE(p->se.sched_throttled) & PICK_ON_THROTTLED;
+}
+
+static inline int ignore_task_kcs_stats(struct task_struct *p)
+{
+	return __kcs_ignore_entity(&p->se);
 }
 
 /*
@@ -6893,6 +6935,10 @@ static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct s
 	unsigned long weight;
 	s64 key;
 
+	/* See avg_kcs_vruntime_sub() */
+	if (__kcs_ignore_entity(se))
+		return;
+
 	if (!se_in_kernel(se))
 		return;
 
@@ -6912,6 +6958,15 @@ static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct s
 	unsigned long weight;
 	s64 key;
 
+	/*
+	 * A remote running task is being enqueued for a restore operation.
+	 * Since it has an unstable "kernel_cs_count" being in a running state,
+	 * do not account its stats yet. A set_next_task() -> schedule() will
+	 * follow on the CPU to adjust it.
+	 */
+	if (__kcs_ignore_entity(se))
+		return;
+
 	if (!se_in_kernel(se))
 		return;
 
@@ -7238,6 +7293,20 @@ static __always_inline int se_in_kernel(struct sched_entity *se)
 
 static __always_inline void task_mark_throttled(struct task_struct *p) {}
 static __always_inline void task_clear_throttled(struct task_struct *p) {}
+static __always_inline void task_mark_ignore(struct task_struct *p) {}
+static __always_inline void task_clear_ignore(struct task_struct *p) {}
+
+static inline int task_picked_on_throttled(struct task_struct *p)
+{
+	return 0;
+}
+
+static inline int ignore_task_kcs_stats(struct task_struct *p)
+{
+	return 0;
+}
+
+
 static __always_inline void avg_kcs_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static __always_inline void avg_kcs_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static __always_inline void avg_kcs_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) {}
@@ -7437,6 +7506,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
 		util_est_enqueue(&rq->cfs, p);
 
+	/*
+	 * Running task has just moved to the fair class.
+	 * Ignore "kernel_cs_count" until set_next_task_fair()
+	 */
+	if (task_on_cpu(rq, p) && (flags & ENQUEUE_MOVE))
+		task_mark_ignore(p);
+
 	if (flags & ENQUEUE_DELAYED) {
 		requeue_delayed_entity(se);
 		return;
@@ -9582,9 +9658,23 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t
 	 * Clear the pick on throttled indicator only if
 	 * another task was picked and not for a save /
 	 * restore operation for the task.
+	 *
+	 * For a save / restore operation of a running task,
+	 * mark the task to be ignored for "kernel_cs_stats"
+	 * adjustment. Either set_next_task_fair() or
+	 * switched_from_fair() will clear these indicators.
 	 */
 	if (next)
 		task_clear_throttled(prev);
+	else {
+		/*
+		 * put_prev_task_fair() is only called with next as NULL
+		 * during save / restore operations. Since idle thread
+		 * is always runnable, all other cases will have a valid
+		 * prev task set.
+		 */
+		task_mark_ignore(prev);
+	}
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
@@ -13897,6 +13987,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 {
 	struct sched_entity *se = &p->se;
 	bool task_in_kernel = !task_on_cpu(rq, p) && se_in_kernel(se);
+	bool h_throttled = false;
 
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -13905,6 +13996,22 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 		account_kcs_dequeue(cfs_rq, task_in_kernel);
 		/* ensure bandwidth has been allocated on our new cfs_rq */
 		account_cfs_rq_runtime(cfs_rq, 0);
+		/* Check if task is on a partially throttled hierarchy */
+		h_throttled = h_throttled || cfs_rq_throttled(cfs_rq);
+	}
+
+	/* Mark the end of save / restore operation. */
+	if (ignore_task_kcs_stats(p)) {
+		task_clear_ignore(p);
+
+		/*
+		 * If the hierarchy is throttled but the task was not picked on
+		 * a throttled hierarchy, the hierarchy was throttled during the
+		 * course of a save / restore operation. Request a resched for
+		 * pick_next_task_fair() to reevaluate the throttle status.
+		 */
+		if (h_throttled && !task_picked_on_throttled(p))
+			resched_curr(rq);
 	}
 
 	__set_next_task_fair(rq, p, first);
-- 
2.43.0