linux-kernel - Re: [PATCH v7 6/9] sched/fair: Add sched group latency support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20221101192848.pjns3um7dnrwrk5p@airbuntu>
Date:   Tue, 1 Nov 2022 19:28:48 +0000
From:   Qais Yousef <qyousef@...alina.io>
To:     Vincent Guittot <vincent.guittot@...aro.org>
Cc:     mingo@...hat.com, peterz@...radead.org, juri.lelli@...hat.com,
        dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
        mgorman@...e.de, bristot@...hat.com, vschneid@...hat.com,
        linux-kernel@...r.kernel.org, parth@...ux.ibm.com,
        qais.yousef@....com, chris.hyser@...cle.com,
        patrick.bellasi@...bug.net, David.Laight@...lab.com,
        pjt@...gle.com, pavel@....cz, tj@...nel.org, qperret@...gle.com,
        tim.c.chen@...ux.intel.com, joshdon@...gle.com, timj@....org,
        kprateek.nayak@....com, yu.c.chen@...el.com,
        youssefesmat@...omium.org, joel@...lfernandes.org
Subject: Re: [PATCH v7 6/9] sched/fair: Add sched group latency support

On 10/28/22 11:34, Vincent Guittot wrote:
> Task can set its latency priority with sched_setattr(), which is then used
> to set the latency offset of its sched_enity, but sched group entities
> still have the default latency offset value.
> 
> Add a latency.nice field in cpu cgroup controller to set the latency
> priority of the group similarly to sched_setattr(). The latency priority
> is then used to set the offset of the sched_entities of the group.
> 
> Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
> ---
>  Documentation/admin-guide/cgroup-v2.rst |  8 ++++
>  kernel/sched/core.c                     | 52 +++++++++++++++++++++++++
>  kernel/sched/fair.c                     | 33 ++++++++++++++++
>  kernel/sched/sched.h                    |  4 ++
>  4 files changed, 97 insertions(+)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index be4a77baf784..d8ae7e411f9c 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1095,6 +1095,14 @@ All time durations are in microseconds.
>          values similar to the sched_setattr(2). This maximum utilization
>          value is used to clamp the task specific maximum utilization clamp.
>  
> +  cpu.latency.nice
> +	A read-write single value file which exists on non-root
> +	cgroups.  The default is "0".
> +
> +	The nice value is in the range [-20, 19].
> +
> +	This interface file allows reading and setting latency using the
> +	same values used by sched_setattr(2).

I'm still not sure about this [1].

In some scenarios we'd like to get the effective latency_nice of the task. How
will the task inherit the cgroup value or be impacted by it?

For example if there are tasks that belong to a latency sensitive cgroup; and
I'd like to skip some searches in EAS to improve that latency sensitivity - how
would I extract this info in EAS path given these tasks are using default
latency_nice value? And if should happen if their latency_nice is set to
something else other than default?

[1] https://lore.kernel.org/lkml/20221012160734.hrkb5jcjdq7r23pr@wubuntu/


Thanks

--
Qais Yousef

>  
>  
>  Memory
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index caf54e54a74f..3f42b1f61a7e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10890,6 +10890,47 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
>  {
>  	return sched_group_set_idle(css_tg(css), idle);
>  }
> +
> +static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
> +				    struct cftype *cft)
> +{
> +	int prio, delta, last_delta = INT_MAX;
> +	s64 weight;
> +
> +	weight = css_tg(css)->latency_offset * NICE_LATENCY_WEIGHT_MAX;
> +	weight = div_s64(weight, get_sched_latency(false));
> +
> +	/* Find the closest nice value to the current weight */
> +	for (prio = 0; prio < ARRAY_SIZE(sched_latency_to_weight); prio++) {
> +		delta = abs(sched_latency_to_weight[prio] - weight);
> +		if (delta >= last_delta)
> +			break;
> +		last_delta = delta;
> +	}
> +
> +	return LATENCY_TO_NICE(prio-1);
> +}
> +
> +static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
> +				     struct cftype *cft, s64 nice)
> +{
> +	s64 latency_offset;
> +	long weight;
> +	int idx;
> +
> +	if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE)
> +		return -ERANGE;
> +
> +	idx = NICE_TO_LATENCY(nice);
> +	idx = array_index_nospec(idx, LATENCY_NICE_WIDTH);
> +	weight = sched_latency_to_weight[idx];
> +
> +	latency_offset = weight * get_sched_latency(false);
> +	latency_offset = div_s64(latency_offset, NICE_LATENCY_WEIGHT_MAX);
> +
> +	return sched_group_set_latency(css_tg(css), latency_offset);
> +}
> +
>  #endif
>  
>  static struct cftype cpu_legacy_files[] = {
> @@ -10904,6 +10945,11 @@ static struct cftype cpu_legacy_files[] = {
>  		.read_s64 = cpu_idle_read_s64,
>  		.write_s64 = cpu_idle_write_s64,
>  	},
> +	{
> +		.name = "latency.nice",
> +		.read_s64 = cpu_latency_nice_read_s64,
> +		.write_s64 = cpu_latency_nice_write_s64,
> +	},
>  #endif
>  #ifdef CONFIG_CFS_BANDWIDTH
>  	{
> @@ -11121,6 +11167,12 @@ static struct cftype cpu_files[] = {
>  		.read_s64 = cpu_idle_read_s64,
>  		.write_s64 = cpu_idle_write_s64,
>  	},
> +	{
> +		.name = "latency.nice",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.read_s64 = cpu_latency_nice_read_s64,
> +		.write_s64 = cpu_latency_nice_write_s64,
> +	},
>  #endif
>  #ifdef CONFIG_CFS_BANDWIDTH
>  	{
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4299d5108dc7..9583936ce30c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11764,6 +11764,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
>  		goto err;
>  
>  	tg->shares = NICE_0_LOAD;
> +	tg->latency_offset = 0;
>  
>  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
>  
> @@ -11862,6 +11863,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
>  	}
>  
>  	se->my_q = cfs_rq;
> +
> +	se->latency_offset = tg->latency_offset;
> +
>  	/* guarantee group entities always have weight */
>  	update_load_set(&se->load, NICE_0_LOAD);
>  	se->parent = parent;
> @@ -11992,6 +11996,35 @@ int sched_group_set_idle(struct task_group *tg, long idle)
>  	return 0;
>  }
>  
> +int sched_group_set_latency(struct task_group *tg, s64 latency)
> +{
> +	int i;
> +
> +	if (tg == &root_task_group)
> +		return -EINVAL;
> +
> +	if (abs(latency) > sysctl_sched_latency)
> +		return -EINVAL;
> +
> +	mutex_lock(&shares_mutex);
> +
> +	if (tg->latency_offset == latency) {
> +		mutex_unlock(&shares_mutex);
> +		return 0;
> +	}
> +
> +	tg->latency_offset = latency;
> +
> +	for_each_possible_cpu(i) {
> +		struct sched_entity *se = tg->se[i];
> +
> +		WRITE_ONCE(se->latency_offset, latency);
> +	}
> +
> +	mutex_unlock(&shares_mutex);
> +	return 0;
> +}
> +
>  #else /* CONFIG_FAIR_GROUP_SCHED */
>  
>  void free_fair_sched_group(struct task_group *tg) { }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 99f10b4dc230..95d4be4f3af6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -407,6 +407,8 @@ struct task_group {
>  
>  	/* A positive value indicates that this is a SCHED_IDLE group. */
>  	int			idle;
> +	/* latency constraint of the group. */
> +	int			latency_offset;
>  
>  #ifdef	CONFIG_SMP
>  	/*
> @@ -517,6 +519,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
>  
>  extern int sched_group_set_idle(struct task_group *tg, long idle);
>  
> +extern int sched_group_set_latency(struct task_group *tg, s64 latency);
> +
>  #ifdef CONFIG_SMP
>  extern void set_task_rq_fair(struct sched_entity *se,
>  			     struct cfs_rq *prev, struct cfs_rq *next);
> -- 
> 2.17.1
>