Message-ID: <8501f4a8-8480-477e-8ab1-1d7796b978f1@amd.com>
Date: Tue, 27 May 2025 16:49:36 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Aaron Lu <ziqianlu@...edance.com>, Peter Zijlstra <peterz@...radead.org>
Cc: Valentin Schneider <vschneid@...hat.com>, Ben Segall
<bsegall@...gle.com>, Josh Don <joshdon@...gle.com>,
Ingo Molnar <mingo@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Xi Wang <xii@...gle.com>,
linux-kernel@...r.kernel.org, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>,
Chengming Zhou <chengming.zhou@...ux.dev>,
Chuyi Zhou <zhouchuyi@...edance.com>, Jan Kiszka <jan.kiszka@...mens.com>,
Florian Bezdeka <florian.bezdeka@...mens.com>
Subject: Re: [PATCH 4/7] sched/fair: Take care of group/affinity/sched_class
change for throttled task
Hello Aaron,
On 5/27/2025 12:28 PM, Aaron Lu wrote:
> On Mon, May 26, 2025 at 07:36:50PM +0800, Aaron Lu wrote:
>> On Fri, May 23, 2025 at 04:59:42PM +0200, Peter Zijlstra wrote:
>>> On Thu, May 22, 2025 at 08:49:43PM +0800, Aaron Lu wrote:
>>>> On Thu, May 22, 2025 at 02:03:36PM +0200, Peter Zijlstra wrote:
>>>
>>>>> This is asymmetric -- dequeue removes it from that throttle list, but
>>>>> the corresponding enqueue will not add it back, what gives?
>>>>>
>>>>> Because now we have:
>>>>>
>>>>> p->on_rq=1
>>>>> p->throttle_node on list
>>>>>
>>>>> move_queued_task()
>>>>> deactivate_task()
>>>>> dequeue_task_fair()
>>>>> list_del_init(throttle_node)
>>>>> p->on_rq = 2
>>>>>
>>>>> activate_task()
>>>>> enqueue_task_fair()
>>>>> // nothing special, makes the thing runnable
>>>>> p->on_rq = 1;
>>>>>
>>>>> and we exit with a task that is on-rq and not throttled ?!?
>>>>>
>>>>> Why is this? Are we relying on pick_task_fair() to dequeue it again and
>>>>> fix up our inconsistencies? If so, that had better have a comment on.
>>>>
>>>> Correct.
>>>
>>> But would it not be better to have enqueue bail when we're trying to
>>> enqueue an already throttled task into a throttled cfs_rq?
>>>
>>> It seems a waste to do the actual enqueue, pick, dequeue when we
>>> could've just avoided all that.
>>>
>>
>> The original idea is to keep the code simple, but surely this can be
>> optimized. I'm working on it and will paste the diff here once I get it
>> working.
>>
>
> I tried below diff on top of this series:
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 055f3782eeaee..1c5d7c4ff6652 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -882,6 +882,7 @@ struct task_struct {
> #ifdef CONFIG_CFS_BANDWIDTH
> struct callback_head sched_throttle_work;
> struct list_head throttle_node;
> + bool throttled;
> #endif
> #endif
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 89afa472299b7..c585a12f2c753 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>
> static inline bool task_is_throttled(struct task_struct *p)
> {
> - return !list_empty(&p->throttle_node);
> + return p->throttled;
> }
>
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> @@ -5842,6 +5842,7 @@ static void throttle_cfs_rq_work(struct callback_head *work)
> * mistakenly regard this task as an already throttled one.
> */
> list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
> + p->throttled = true;
> resched_curr(rq);
> }
Since we now have an official per-task throttle indicator, what are your
thoughts on reusing "p->se.group_node" as the throttled_limbo_list link?
A throttled task is fully dequeued, so its group_node is off the cfs_tasks
list for as long as the task sits in limbo and can double as the limbo
list entry. Something like this lightly tested diff based on your
suggestion:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11eb0612e22d..f9fdcf812e81 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -578,6 +578,9 @@ struct sched_entity {
unsigned char sched_delayed;
unsigned char rel_deadline;
unsigned char custom_slice;
+#ifdef CONFIG_CFS_BANDWIDTH
+ unsigned char sched_throttled;
+#endif
/* hole */
u64 exec_start;
@@ -881,7 +884,6 @@ struct task_struct {
struct task_group *sched_task_group;
#ifdef CONFIG_CFS_BANDWIDTH
struct callback_head sched_throttle_work;
- struct list_head throttle_node;
#endif
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25e794ea0283..b1cb05baf8d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5798,7 +5798,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
static inline bool task_is_throttled(struct task_struct *p)
{
- return !list_empty(&p->throttle_node);
+ return !!p->se.sched_throttled;
}
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5835,13 +5835,14 @@ static void throttle_cfs_rq_work(struct callback_head *work)
return;
rq = scope.rq;
update_rq_clock(rq);
- WARN_ON_ONCE(!list_empty(&p->throttle_node));
+ WARN_ON_ONCE(p->se.sched_throttled);
dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
/*
- * Must not add it to limbo list before dequeue or dequeue will
+ * Must not mark throttled before dequeue or dequeue will
* mistakenly regard this task as an already throttled one.
*/
- list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
+ p->se.sched_throttled = 1;
+ list_add(&p->se.group_node, &cfs_rq->throttled_limbo_list);
resched_curr(rq);
}
@@ -5853,7 +5854,6 @@ void init_cfs_throttle_work(struct task_struct *p)
init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
/* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
p->sched_throttle_work.next = &p->sched_throttle_work;
- INIT_LIST_HEAD(&p->throttle_node);
}
static void dequeue_throttled_task(struct task_struct *p, int flags)
@@ -5864,10 +5864,26 @@ static void dequeue_throttled_task(struct task_struct *p, int flags)
* task affinity change, task group change, task sched class
* change etc.
*/
- WARN_ON_ONCE(p->se.on_rq);
+ WARN_ON_ONCE(!p->se.sched_throttled);
WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
- list_del_init(&p->throttle_node);
+ list_del_init(&p->se.group_node);
+}
+
+/* return true to skip actual enqueue */
+static bool enqueue_throttled_task(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+ if (throttled_hierarchy(cfs_rq)) {
+ /* throttled task moving across task groups/rqs */
+ list_add(&p->se.group_node, &cfs_rq->throttled_limbo_list);
+ return true;
+ }
+
+ /* unthrottle */
+ p->se.sched_throttled = 0;
+ return false;
}
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
@@ -5896,8 +5912,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
}
/* Re-enqueue the tasks that have been throttled at this level. */
- list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
- list_del_init(&p->throttle_node);
+ list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, se.group_node) {
+ WARN_ON_ONCE(!p->se.sched_throttled);
+
+ p->se.sched_throttled = 0;
+ list_del_init(&p->se.group_node);
enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
}
@@ -6714,6 +6733,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
static void dequeue_throttled_task(struct task_struct *p, int flags) {}
+static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -6907,6 +6927,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
+ if (unlikely(task_is_throttled(p) && enqueue_throttled_task(p)))
+ return;
+
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -13244,7 +13267,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
- if (task_on_rq_queued(p)) {
+ if (task_on_rq_queued(p) && !task_is_throttled(p)) {
/*
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes MRU one.
--
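For completeness, here is a quick userspace sketch of the intended lifecycle
with the above diff. It is NOT the kernel code: task_struct/cfs_rq and the
list helpers are simplified stand-ins, only meant to show how group_node
moves between cfs_tasks and the limbo list and how the enqueue path
short-circuits for a task that is still throttled:

/*
 * Userspace sketch, NOT the kernel code: simplified stand-ins for
 * task_struct/cfs_rq plus open-coded list helpers, only to show that
 * se.group_node is either on cfs_tasks (queued) or on the limbo list
 * (throttled), never both, and how enqueue short-circuits for a task
 * that is still throttled.
 */
#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next; n->prev = h;
	h->next->prev = n; h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
	INIT_LIST_HEAD(n);
}

static bool list_empty(const struct list_head *h) { return h->next == h; }

struct cfs_rq {
	struct list_head cfs_tasks;            /* queued tasks */
	struct list_head throttled_limbo_list; /* dequeued, throttled tasks */
	bool throttled;                        /* stand-in for throttled_hierarchy() */
};

struct task {
	const char *comm;
	bool sched_throttled;        /* the new per-task indicator */
	struct list_head group_node; /* on cfs_tasks OR on the limbo list */
};

static void init_cfs_rq(struct cfs_rq *cfs_rq, bool throttled)
{
	INIT_LIST_HEAD(&cfs_rq->cfs_tasks);
	INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
	cfs_rq->throttled = throttled;
}

/* Mirrors the enqueue_throttled_task() + enqueue_task_fair() split above. */
static void enqueue_task(struct cfs_rq *cfs_rq, struct task *p)
{
	if (p->sched_throttled && cfs_rq->throttled) {
		/* Throttled task moving to a throttled hierarchy: just park it. */
		list_add(&p->group_node, &cfs_rq->throttled_limbo_list);
		return;
	}
	p->sched_throttled = false; /* unthrottle on a real enqueue */
	list_add(&p->group_node, &cfs_rq->cfs_tasks);
}

/* Works no matter which list group_node currently sits on. */
static void dequeue_task(struct task *p)
{
	list_del_init(&p->group_node);
}

/* Mirrors throttle_cfs_rq_work(): dequeue, mark throttled, park in limbo. */
static void throttle_task(struct cfs_rq *cfs_rq, struct task *p)
{
	dequeue_task(p);
	p->sched_throttled = true;
	list_add(&p->group_node, &cfs_rq->throttled_limbo_list);
}

int main(void)
{
	struct cfs_rq throttled_rq, running_rq;
	struct task p = { .comm = "worker" };

	init_cfs_rq(&throttled_rq, true);
	init_cfs_rq(&running_rq, false);
	INIT_LIST_HEAD(&p.group_node);

	enqueue_task(&throttled_rq, &p);  /* runnable */
	throttle_task(&throttled_rq, &p); /* hits the throttle work */

	/* e.g. affinity change while still inside a throttled hierarchy */
	dequeue_task(&p);
	enqueue_task(&throttled_rq, &p);
	printf("%s in limbo: %d\n", p.comm,
	       !list_empty(&throttled_rq.throttled_limbo_list));

	/* moved to an unthrottled hierarchy -> real enqueue, flag cleared */
	dequeue_task(&p);
	enqueue_task(&running_rq, &p);
	printf("%s throttled=%d queued=%d\n", p.comm,
	       p.sched_throttled, !list_empty(&running_rq.cfs_tasks));
	return 0;
}

The point being that the two users of group_node never overlap: a task is
either queued on cfs_tasks or parked on a limbo list, never both.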
Thanks and Regards,
Prateek