Message-ID: <34521ae6-5dba-4861-a971-8f7c8a37bb04@arm.com>
Date: Thu, 6 Mar 2025 12:24:18 +0000
From: Hongyan Xia <hongyan.xia2@....com>
To: Xuewen Yan <xuewen.yan94@...il.com>
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
Tejun Heo <tj@...nel.org>, David Vernet <void@...ifault.com>,
Andrea Righi <arighi@...dia.com>, Changwoo Min <changwoo@...lia.com>,
linux-kernel@...r.kernel.org, Xuewen Yan <xuewen.yan@...soc.com>
Subject: Re: [PATCH] sched/uclamp: Let each sched_class handle uclamp
On 06/03/2025 12:03, Xuewen Yan wrote:
> Hi Hongyan,
>
> On Thu, Feb 27, 2025 at 9:55 PM Hongyan Xia <hongyan.xia2@....com> wrote:
>>
>> While delayed dequeue issues were being resolved, uclamp fell out of
>> sync with cpufreq, especially in enqueue_task().
>>
>> For example, when a task with uclamp_min goes through enqueue_task()
>> and updates cpufreq, its uclamp_min is not even considered in that
>> cpufreq update. Only after enqueue is the uclamp_min added to the rq
>> buckets, and cpufreq picks it up at the next update. This is very
>> different from the old behavior, where a uclamp value took effect
>> immediately at enqueue. Worse, classes like fair.c issue cpufreq
>> updates on utilization changes, so if utilization does not change for
>> a while, the new uclamp value is delayed even further.
>>
>> So, let each sched_class handle uclamp in its own code, in case
>> delayed dequeue needs further tweaks or similar changes come along in
>> the future, and make sure uclamp is picked up immediately on enqueue.
>> In fair.c, we re-use the guard logic for util_est.
>>
>> Signed-off-by: Hongyan Xia <hongyan.xia2@....com>
>> ---
>> kernel/sched/core.c | 28 ++--------------------------
>> kernel/sched/ext.c | 8 ++++----
>> kernel/sched/fair.c | 12 ++++++------
>> kernel/sched/rt.c | 8 ++++----
>> kernel/sched/sched.h | 9 +++++----
>> 5 files changed, 21 insertions(+), 44 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index b00f884701a6..2d51608a4c46 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -1745,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
>> }
>> }
>>
>> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>> +void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>> {
>> enum uclamp_id clamp_id;
>>
>> @@ -1758,12 +1758,6 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>> if (!static_branch_unlikely(&sched_uclamp_used))
>> return;
>>
>> - if (unlikely(!p->sched_class->uclamp_enabled))
>> - return;
>> -
>> - if (p->se.sched_delayed)
>> - return;
>> -
>> for_each_clamp_id(clamp_id)
>> uclamp_rq_inc_id(rq, p, clamp_id);
>>
>> @@ -1772,7 +1766,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
>> rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
>> }
>>
>> -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>> +void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>> {
>> enum uclamp_id clamp_id;
>>
>> @@ -1785,12 +1779,6 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>> if (!static_branch_unlikely(&sched_uclamp_used))
>> return;
>>
>> - if (unlikely(!p->sched_class->uclamp_enabled))
>> - return;
>> -
>> - if (p->se.sched_delayed)
>> - return;
>> -
>> for_each_clamp_id(clamp_id)
>> uclamp_rq_dec_id(rq, p, clamp_id);
>> }
>> @@ -2029,8 +2017,6 @@ static void __init init_uclamp(void)
>> }
>>
>> #else /* !CONFIG_UCLAMP_TASK */
>> -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
>> -static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
>> static inline void uclamp_fork(struct task_struct *p) { }
>> static inline void uclamp_post_fork(struct task_struct *p) { }
>> static inline void init_uclamp(void) { }
>> @@ -2066,11 +2052,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>> update_rq_clock(rq);
>>
>> p->sched_class->enqueue_task(rq, p, flags);
>> - /*
>> - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
>> - * ->sched_delayed.
>> - */
>> - uclamp_rq_inc(rq, p);
>>
>> psi_enqueue(p, flags);
>>
>> @@ -2097,11 +2078,6 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
>>
>> psi_dequeue(p, flags);
>>
>> - /*
>> - * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
>> - * and mark the task ->sched_delayed.
>> - */
>> - uclamp_rq_dec(rq, p);
>> return p->sched_class->dequeue_task(rq, p, flags);
>> }
>>
>> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
>> index 8857c0709bdd..4521c27f9ab8 100644
>> --- a/kernel/sched/ext.c
>> +++ b/kernel/sched/ext.c
>> @@ -2094,6 +2094,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
>> {
>> int sticky_cpu = p->scx.sticky_cpu;
>>
>> + uclamp_rq_inc(rq, p);
>> +
>> if (enq_flags & ENQUEUE_WAKEUP)
>> rq->scx.flags |= SCX_RQ_IN_WAKEUP;
>>
>> @@ -2181,6 +2183,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
>>
>> static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
>> {
>> + uclamp_rq_dec(rq, p);
>> +
>> if (!(p->scx.flags & SCX_TASK_QUEUED)) {
>> WARN_ON_ONCE(task_runnable(p));
>> return true;
>> @@ -4456,10 +4460,6 @@ DEFINE_SCHED_CLASS(ext) = {
>> .prio_changed = prio_changed_scx,
>>
>> .update_curr = update_curr_scx,
>> -
>> -#ifdef CONFIG_UCLAMP_TASK
>> - .uclamp_enabled = 1,
>> -#endif
>> };
>>
>> static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 857808da23d8..7e5a653811ad 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6941,8 +6941,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> * Let's add the task's estimated utilization to the cfs_rq's
>> * estimated utilization, before we update schedutil.
>> */
>> - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
>> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) {
>> + uclamp_rq_inc(rq, p);
>> util_est_enqueue(&rq->cfs, p);
>> + }
>>
>> if (flags & ENQUEUE_DELAYED) {
>> requeue_delayed_entity(se);
>> @@ -7183,8 +7185,10 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>> */
>> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>> {
>> - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
>> + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) {
>> + uclamp_rq_dec(rq, p);
>> util_est_dequeue(&rq->cfs, p);
>> + }
>>
>> util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>> if (dequeue_entities(rq, &p->se, flags) < 0)
>> @@ -13660,10 +13664,6 @@ DEFINE_SCHED_CLASS(fair) = {
>> #ifdef CONFIG_SCHED_CORE
>> .task_is_throttled = task_is_throttled_fair,
>> #endif
>> -
>> -#ifdef CONFIG_UCLAMP_TASK
>> - .uclamp_enabled = 1,
>> -#endif
>> };
>>
>> #ifdef CONFIG_SCHED_DEBUG
>> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
>> index 4b8e33c615b1..7c0642ea85f2 100644
>> --- a/kernel/sched/rt.c
>> +++ b/kernel/sched/rt.c
>> @@ -1471,6 +1471,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>> {
>> struct sched_rt_entity *rt_se = &p->rt;
>>
>> + uclamp_rq_inc(rq, p);
>> +
>> if (flags & ENQUEUE_WAKEUP)
>> rt_se->timeout = 0;
>>
>> @@ -1487,6 +1489,8 @@ static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
>> {
>> struct sched_rt_entity *rt_se = &p->rt;
>>
>> + uclamp_rq_dec(rq, p);
>> +
>> update_curr_rt(rq);
>> dequeue_rt_entity(rt_se, flags);
>>
>> @@ -2649,10 +2653,6 @@ DEFINE_SCHED_CLASS(rt) = {
>> #ifdef CONFIG_SCHED_CORE
>> .task_is_throttled = task_is_throttled_rt,
>> #endif
>> -
>> -#ifdef CONFIG_UCLAMP_TASK
>> - .uclamp_enabled = 1,
>> -#endif
>> };
>>
>> #ifdef CONFIG_RT_GROUP_SCHED
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index ab16d3d0e51c..990d87e8d8ed 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -2410,10 +2410,6 @@ extern s64 update_curr_common(struct rq *rq);
>>
>> struct sched_class {
>>
>> -#ifdef CONFIG_UCLAMP_TASK
>> - int uclamp_enabled;
>> -#endif
>
> Why delete the uclamp_enabled?
>
After moving the uclamp enqueue/dequeue calls into each sched_class,
core.c no longer checks sched_class->uclamp_enabled, so the field has no
remaining users.
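To make it concrete, here is a rough sketch (illustrative only, not the
actual kernel code) of why no reader of the flag is left: a class now
opts in to uclamp simply by calling the helpers from its own paths.

/* Illustrative sketch: the call itself is the opt-in. */
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	uclamp_rq_inc(rq, p);	/* rt opts in by calling the helper */
	/* ... usual rt enqueue work ... */
}

A class that never set .uclamp_enabled (deadline, for example) simply
never calls the helpers, so neither core.c nor anything else needs a
flag to test.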
>> -
>> void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
>> bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
>> void (*yield_task) (struct rq *rq);
>> @@ -3393,6 +3389,8 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
>> #ifdef CONFIG_UCLAMP_TASK
>>
>> unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
>> +void uclamp_rq_inc(struct rq *rq, struct task_struct *p);
>> +void uclamp_rq_dec(struct rq *rq, struct task_struct *p);
>>
>> static inline unsigned long uclamp_rq_get(struct rq *rq,
>> enum uclamp_id clamp_id)
>> @@ -3470,6 +3468,9 @@ uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
>>
>> #else /* !CONFIG_UCLAMP_TASK: */
>>
>> +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { };
>> +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { };
>> +
>> static inline unsigned long
>> uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
>> {
>> --
>> 2.34.1
>>
>>
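Stepping back, the net effect on ordering can be summarized with a rough
sketch (illustrative pseudo-code, not the exact call chains):

/*
 * Before the patch:
 *
 *   enqueue_task()
 *     p->sched_class->enqueue_task()  // may trigger a cpufreq update,
 *                                     // but p's uclamp is not yet in
 *                                     // the rq buckets
 *     uclamp_rq_inc(rq, p)            // too late for that update
 *
 * After the patch:
 *
 *   enqueue_task()
 *     p->sched_class->enqueue_task()
 *       uclamp_rq_inc(rq, p)          // done inside the class, before
 *                                     // any cpufreq update it issues
 *       ... class-specific enqueue work ...
 */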