Message-ID: <8501f4a8-8480-477e-8ab1-1d7796b978f1@amd.com>
Date: Tue, 27 May 2025 16:49:36 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Aaron Lu <ziqianlu@...edance.com>, Peter Zijlstra <peterz@...radead.org>
Cc: Valentin Schneider <vschneid@...hat.com>, Ben Segall
<bsegall@...gle.com>, Josh Don <joshdon@...gle.com>,
Ingo Molnar <mingo@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Xi Wang <xii@...gle.com>,
linux-kernel@...r.kernel.org, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>,
Chengming Zhou <chengming.zhou@...ux.dev>,
Chuyi Zhou <zhouchuyi@...edance.com>, Jan Kiszka <jan.kiszka@...mens.com>,
Florian Bezdeka <florian.bezdeka@...mens.com>
Subject: Re: [PATCH 4/7] sched/fair: Take care of group/affinity/sched_class
change for throttled task
Hello Aaron,
On 5/27/2025 12:28 PM, Aaron Lu wrote:
> On Mon, May 26, 2025 at 07:36:50PM +0800, Aaron Lu wrote:
>> On Fri, May 23, 2025 at 04:59:42PM +0200, Peter Zijlstra wrote:
>>> On Thu, May 22, 2025 at 08:49:43PM +0800, Aaron Lu wrote:
>>>> On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
>>>
>>>>> This is asymmetric -- dequeue removes it from that throttle list, but
>>>>> the corresponding enqueue will not add it back, what gives?
>>>>>
>>>>> Because now we have:
>>>>>
>>>>> p->on_rq=1
>>>>> p->throttle_node on list
>>>>>
>>>>> move_queued_task()
>>>>> deactivate_task()
>>>>> dequeue_task_fair()
>>>>> list_del_init(throttle_node)
>>>>> p->on_rq = 2
>>>>>
>>>>> activate_task()
>>>>> enqueue_task_fair()
>>>>> // nothing special, makes the thing runnable
>>>>> p->on_rq = 1;
>>>>>
>>>>> and we exit with a task that is on-rq and not throttled ?!?
>>>>>
>>>>> Why is this? Are we relying on pick_task_fair() to dequeue it again and
>>>>> fix up our inconsistencies? If so, that had better have a comment on.
>>>>
>>>> Correct.
>>>
>>> But would it not be better to have enqueue bail when we're trying to
>>> enqueue an already throttled task into a throttled cfs_rq?
>>>
>>> It seems a waste to do the actual enqueue, pick, dequeue when we
>>> could've just avoided all that.
>>>
>>
>> The original idea is to keep the code simple, but surely this can be
>> optimized. I'm working on it and will paste the diff here once I get it
>> working.
>>
>
> I tried below diff on top of this series:
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 055f3782eeaee..1c5d7c4ff6652 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -882,6 +882,7 @@ struct task_struct {
> #ifdef CONFIG_CFS_BANDWIDTH
> struct callback_head sched_throttle_work;
> struct list_head throttle_node;
> + bool throttled;
> #endif
> #endif
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 89afa472299b7..c585a12f2c753 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>
> static inline bool task_is_throttled(struct task_struct *p)
> {
> - return !list_empty(&p->throttle_node);
> + return p->throttled;
> }
>
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> @@ -5842,6 +5842,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> * mistakenly regard this task as an already throttled one.
> */
> list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + p->throttled = true;
> resched_curr(rq);
> }
Since we now have an official per-task throttle indicator, what are your
thoughts on reusing "p->se.group_node" as the throttled_limbo_list link?
A throttled task is fully dequeued, so its group_node is off the cfs_tasks
list for as long as the task sits in limbo and can double as the limbo
list entry. Something like this lightly tested diff based on your
suggestion:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11eb0612e22d..f9fdcf812e81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -578,6 +578,9 @@ struct sched_entity {
unsigned char sched_delayed;
unsigned char rel_deadline;
unsigned char custom_slice;
+#ifdef CONFIG_CFS_BANDWIDTH
+ unsigned char sched_throttled;
+#endif
/* hole */
u64 exec_start;
@@ -881,7 +884,6 @@ struct task_struct {
struct task_group *sched_task_group;
#ifdef CONFIG_CFS_BANDWIDTH
struct callback_head sched_throttle_work;
- struct list_head throttle_node;
#endif
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25e794ea0283..b1cb05baf8d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
static inline bool task_is_throttled(struct task_struct *p)
{
- return !list_empty(&p->throttle_node);
+ return !!p->se.sched_throttled;
}
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5835,13 +5835,14 @@ static void throttle_cfs_rq_work(struct callback_head *work)
return;
rq = scope.rq;
update_rq_clock(rq);
- WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ WARN_ON_ONCE(p->se.sched_throttled);
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
/*
- * Must not add it to limbo list before dequeue or dequeue will
+ * Must not mark throttled before dequeue or dequeue will
* mistakenly regard this task as an already throttled one.
*/
- list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ p->se.sched_throttled = 1;
+ list_add(&p->se.group_node, &cfs_rq->throttled_limbo_list);
resched_curr(rq);
}
@@ -5853,7 +5854,6 @@ void init_cfs_throttle_work(struct task_struct *p)
init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
/* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
p->sched_throttle_work.next = &p->sched_throttle_work;
- INIT_LIST_HEAD(&p->throttle_node);
}
static void dequeue_throttled_task(struct task_struct *p, int flags)
@@ -5864,10 +5864,26 @@ static void dequeue_throttled_task(struct task_struct *p, int flags)
* task affinity change, task group change, task sched class
* change etc.
*/
- WARN_ON_ONCE(p->se.on_rq);
+ WARN_ON_ONCE(!p->se.sched_throttled);
WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
- list_del_init(&p->throttle_node);
+ list_del_init(&p->se.group_node);
+}
+
+/* return true to skip actual enqueue */
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ if (throttled_hierarchy(cfs_rq)) {
+ /* throttled task moving across task groups/rqs */
+ list_add(&p->se.group_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* unthrottle */
+ p->se.sched_throttled = 0;
+ return false;
}
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5896,8 +5912,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
}
/* Re-enqueue the tasks that have been throttled at this level. */
- list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
- list_del_init(&p->throttle_node);
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, se.group_node) {
+ WARN_ON_ONCE(!p->se.sched_throttled);
+
+ p->se.sched_throttled = 0;
+ list_del_init(&p->se.group_node);
enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
@@ -6714,6 +6733,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6907,6 +6927,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -13244,7 +13267,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
- if (task_on_rq_queued(p)) {
+ if (task_on_rq_queued(p) && !task_is_throttled(p)) {
/*
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes MRU one.
--
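For completeness, here is a quick userspace sketch of the intended lifecycle
with the above diff. It is NOT the kernel code: task_struct/cfs_rq and the
list helpers are simplified stand-ins, only meant to show how group_node
moves between cfs_tasks and the limbo list and how the enqueue path
short-circuits for a task that is still throttled:

/*
 * Userspace sketch, NOT the kernel code: simplified stand-ins for
 * task_struct/cfs_rq plus open-coded list helpers, only to show that
 * se.group_node is either on cfs_tasks (queued) or on the limbo list
 * (throttled), never both, and how enqueue short-circuits for a task
 * that is still throttled.
 */
#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next; n->prev = h;
	h->next->prev = n; h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
	INIT_LIST_HEAD(n);
}

static bool list_empty(const struct list_head *h) { return h->next == h; }

struct cfs_rq {
	struct list_head cfs_tasks;            /* queued tasks */
	struct list_head throttled_limbo_list; /* dequeued, throttled tasks */
	bool throttled;                        /* stand-in for throttled_hierarchy() */
};

struct task {
	const char *comm;
	bool sched_throttled;        /* the new per-task indicator */
	struct list_head group_node; /* on cfs_tasks OR on the limbo list */
};

static void init_cfs_rq(struct cfs_rq *cfs_rq, bool throttled)
{
	INIT_LIST_HEAD(&cfs_rq->cfs_tasks);
	INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
	cfs_rq->throttled = throttled;
}

/* Mirrors the enqueue_throttled_task() + enqueue_task_fair() split above. */
static void enqueue_task(struct cfs_rq *cfs_rq, struct task *p)
{
	if (p->sched_throttled && cfs_rq->throttled) {
		/* Throttled task moving to a throttled hierarchy: just park it. */
		list_add(&p->group_node, &cfs_rq->throttled_limbo_list);
		return;
	}
	p->sched_throttled = false; /* unthrottle on a real enqueue */
	list_add(&p->group_node, &cfs_rq->cfs_tasks);
}

/* Works no matter which list group_node currently sits on. */
static void dequeue_task(struct task *p)
{
	list_del_init(&p->group_node);
}

/* Mirrors throttle_cfs_rq_work(): dequeue, mark throttled, park in limbo. */
static void throttle_task(struct cfs_rq *cfs_rq, struct task *p)
{
	dequeue_task(p);
	p->sched_throttled = true;
	list_add(&p->group_node, &cfs_rq->throttled_limbo_list);
}

int main(void)
{
	struct cfs_rq throttled_rq, running_rq;
	struct task p = { .comm = "worker" };

	init_cfs_rq(&throttled_rq, true);
	init_cfs_rq(&running_rq, false);
	INIT_LIST_HEAD(&p.group_node);

	enqueue_task(&throttled_rq, &p);  /* runnable */
	throttle_task(&throttled_rq, &p); /* hits the throttle work */

	/* e.g. affinity change while still inside a throttled hierarchy */
	dequeue_task(&p);
	enqueue_task(&throttled_rq, &p);
	printf("%s in limbo: %d\n", p.comm,
	       !list_empty(&throttled_rq.throttled_limbo_list));

	/* moved to an unthrottled hierarchy -> real enqueue, flag cleared */
	dequeue_task(&p);
	enqueue_task(&running_rq, &p);
	printf("%s throttled=%d queued=%d\n", p.comm,
	       p.sched_throttled, !list_empty(&running_rq.cfs_tasks));
	return 0;
}

The point being that the two users of group_node never overlap: a task is
either queued on cfs_tasks or parked on a limbo list, never both.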
Thanks and Regards,
Prateek