Message-ID: <fdde3285-6d40-458d-84bd-3d7badc1e592@arm.com>
Date: Mon, 27 May 2024 00:52:24 +0200
From: Dietmar Eggemann <dietmar.eggemann@....com>
To: Hongyan Xia <hongyan.xia2@....com>, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Vincent Guittot <vincent.guittot@...aro.org>,
Juri Lelli <juri.lelli@...hat.com>, Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
Valentin Schneider <vschneid@...hat.com>
Cc: Qais Yousef <qyousef@...alina.io>,
Morten Rasmussen <morten.rasmussen@....com>,
Lukasz Luba <lukasz.luba@....com>,
Christian Loehle <christian.loehle@....com>, pierre.gondois@....com,
linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH v3 2/6] sched/uclamp: Track a new util_avg_bias signal
On 07/05/2024 14:50, Hongyan Xia wrote:
> Add a util_avg_bias signal in sched_avg, which is obtained by:
>
> util_avg_bias = clamp(util_avg, uclamp_min, uclamp_max) - util_avg
>
> The task utilization after considering uclamp is:
>
> util_avg_uclamp = util_avg + util_avg_bias
>
> We then sum up all biases on the same rq and use the total bias to bias
> the rq utilization. This is the core idea of uclamp sum aggregation. The
> rq utilization will be
>
> rq_util_avg_uclamp = rq_util_avg + total_util_avg_bias
>
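
Just to check my understanding of the math with a made-up example
(numbers are purely illustrative):

  task A: util_avg = 300, uclamp_min = 512, uclamp_max = 1024 (default)
          -> util_avg_bias = clamp(300, 512, 1024) - 300 = +212
  task B: util_avg = 800, uclamp_min =   0, uclamp_max =  256
          -> util_avg_bias = clamp(800, 0, 256) - 800 = -544

  with both runnable on the same rq:
          rq_util_avg_uclamp = rq_util_avg + 212 - 544

Is that the intended aggregation?
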
> Signed-off-by: Hongyan Xia <hongyan.xia2@....com>
> ---
> include/linux/sched.h | 4 +++-
> kernel/sched/debug.c | 2 +-
> kernel/sched/fair.c | 16 ++++++++++++++++
> kernel/sched/pelt.c | 39 +++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 28 ++++++++++++++++++++++++++++
> 5 files changed, 87 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c75fd46506df..4ea4b8b30f54 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -476,8 +476,10 @@ struct sched_avg {
> u32 period_contrib;
> unsigned long load_avg;
> unsigned long runnable_avg;
> - unsigned long util_avg;
> + unsigned int util_avg;
> + int util_avg_bias;
> unsigned int util_est;
> + unsigned int util_est_uclamp;
Looks like you only introduce the uclamped util_est functions in 3/6.
It's easier to grasp when data changes go together with the new
functions that use them. Maybe introduce a dedicated util_est patch
before 3/6 "Use util biases for utilization and frequency"?
> } ____cacheline_aligned;
>
> /*
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 8d5d98a5834d..c4dadb740e96 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -682,7 +682,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> cfs_rq->avg.load_avg);
> SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
> cfs_rq->avg.runnable_avg);
> - SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
> + SEQ_printf(m, " .%-30s: %u\n", "util_avg",
> cfs_rq->avg.util_avg);
> SEQ_printf(m, " .%-30s: %u\n", "util_est",
> cfs_rq->avg.util_est);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ef5bb576ac65..571c8de59508 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1083,6 +1083,7 @@ void post_init_entity_util_avg(struct task_struct *p)
> }
>
> sa->runnable_avg = sa->util_avg;
> + sa->util_avg_bias = 0;
> }
>
> #else /* !CONFIG_SMP */
> @@ -6801,6 +6802,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> /* At this point se is NULL and we are at root level*/
> add_nr_running(rq, 1);
> + util_bias_enqueue(&rq->cfs.avg, p);
> + /* XXX: We should skip the update above and only do it once here. */
By 'above' you're referring to the update in enqueue_entity() ->
update_load_avg(..., DO_ATTACH) -> attach_entity_load_avg() ->
cfs_rq_util_change()?

I assume that with "[PATCH v4] sched: Consolidate cpufreq updates"
https://lkml.kernel.org/r/20240516204802.846520-1-qyousef@layalina.io
you wouldn't have to add this extra cpufreq_update_util(rq, 0) call
after the util_bias enqueue or dequeue anymore?
> + cpufreq_update_util(rq, 0);
>
> /*
> * Since new tasks are assigned an initial util_avg equal to
> @@ -6892,6 +6896,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> /* At this point se is NULL and we are at root level*/
> sub_nr_running(rq, 1);
> + util_bias_dequeue(&rq->cfs.avg, p);
> + /* XXX: We should skip the update above and only do it once here. */
> + cpufreq_update_util(rq, 0);
>
> /* balance early to pull high priority tasks */
> if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
> @@ -6900,6 +6907,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> dequeue_throttle:
> util_est_update(&rq->cfs, p, task_sleep);
> hrtick_update(rq);
> +
> +#ifdef CONFIG_UCLAMP_TASK
> + if (rq->cfs.h_nr_running == 0) {
> + WARN_ONCE(rq->cfs.avg.util_avg_bias,
> + "0 tasks on CFS of CPU %d, but util_avg_bias is %d\n",
> + rq->cpu, rq->cfs.avg.util_avg_bias);
> + WRITE_ONCE(rq->cfs.avg.util_avg_bias, 0);
> + }
> +#endif
> }
>
> #ifdef CONFIG_SMP
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index ef00382de595..56346988182f 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -266,6 +266,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
> WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
> }
>
> +#ifdef CONFIG_UCLAMP_TASK
> +/* avg must belong to the queue this se is on. */
> +static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
Couldn't you pass a `struct rq *rq` here instead? I assume the `struct
sched_avg *` parameter is there so this code can later also cover rt
tasks (via per-entity load tracking in the rt class)?
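Something like the below is what I have in mind (just an untested
sketch):

  static void update_util_bias(struct rq *rq, struct task_struct *p)
  {
          struct sched_avg *avg = rq ? &rq->cfs.avg : NULL;

          /* ... rest as in this patch ... */
  }

with __update_load_avg_se() passing rq_of(cfs_rq) and
__update_load_avg_blocked_se() passing NULL.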
> +{
> + unsigned int util, uclamp_min, uclamp_max;
> + int old, new;
> +
> + /*
> + * We MUST update the rq summed total every time we update the
> + * util_avg_bias of a task. If we are on a rq but we are not given the
> + * rq, then the best thing is to just skip this update.
> + */
> + if (p->se.on_rq && !avg)
> + return;
> +
> + util = READ_ONCE(p->se.avg.util_avg);
> + uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> + uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> + if (uclamp_max == SCHED_CAPACITY_SCALE)
> + uclamp_max = UINT_MAX;
Is this done to avoid capping a task's util_avg > 1024 in case the
task has the default uclamp_max?
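I.e. something like this (made-up number, assuming util_avg can
transiently exceed 1024):

  util_avg = 1100, default uclamp_max = 1024 (SCHED_CAPACITY_SCALE):

          clamp(1100, 0, 1024)     - 1100 = -76   /* unwanted bias   */
          clamp(1100, 0, UINT_MAX) - 1100 =   0   /* no bias applied */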
> + old = READ_ONCE(p->se.avg.util_avg_bias);
> + new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
> +
> + WRITE_ONCE(p->se.avg.util_avg_bias, new);
> + if (!p->se.on_rq)
> + return;
> + WRITE_ONCE(avg->util_avg_bias, READ_ONCE(avg->util_avg_bias) + new - old);
> +}
> +#else /* !CONFIG_UCLAMP_TASK */
> +static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
> +{
> +}
> +#endif
> +
> /*
> * sched_entity:
> *
> @@ -296,6 +330,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
> {
> if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
> ___update_load_avg(&se->avg, se_weight(se));
> + if (entity_is_task(se))
> + update_util_bias(NULL, task_of(se));
IMHO,

  static void update_util_bias(struct sched_avg *avg, struct sched_entity *se)
  {
          if (!entity_is_task(se))
                  return;
          ...
  }

would be easier to read.
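The two call sites would then shrink to something like (sketch):

  update_util_bias(NULL, se);                      /* __update_load_avg_blocked_se() */
  update_util_bias(&rq_of(cfs_rq)->cfs.avg, se);   /* __update_load_avg_se()         */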
> trace_pelt_se_tp(se);
> return 1;
> }
> @@ -310,6 +346,9 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>
> ___update_load_avg(&se->avg, se_weight(se));
> cfs_se_util_change(&se->avg);
> + if (entity_is_task(se))
> + update_util_bias(&rq_of(cfs_rq)->cfs.avg,
> + task_of(se));
> trace_pelt_se_tp(se);
> return 1;
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index cb3792c04eea..aec812e6c6ba 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3095,6 +3095,24 @@ static inline bool uclamp_is_used(void)
> {
> return static_branch_likely(&sched_uclamp_used);
> }
> +
> +static inline void util_bias_enqueue(struct sched_avg *avg,
> + struct task_struct *p)
> +{
> + int avg_val = READ_ONCE(avg->util_avg_bias);
> + int p_val = READ_ONCE(p->se.avg.util_avg_bias);
> +
> + WRITE_ONCE(avg->util_avg_bias, avg_val + p_val);
> +}
> +
> +static inline void util_bias_dequeue(struct sched_avg *avg,
> + struct task_struct *p)
> +{
> + int avg_val = READ_ONCE(avg->util_avg_bias);
> + int p_val = READ_ONCE(p->se.avg.util_avg_bias);
> +
> + WRITE_ONCE(avg->util_avg_bias, avg_val - p_val);
> +}
Maybe call them enqueue_util_bias() and dequeue_util_bias(), to match
the related update_util_bias()? I know there is util_est_enqueue(),
but util_est also has util_est_update().
[...]