linux-kernel - Re: [PATCH v3 2/4] sched/fair: Make CFS bandwidth controller burstable

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YEjDyADiVjVyt3ur@hirez.programming.kicks-ass.net>
Date:   Wed, 10 Mar 2021 14:04:08 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Huaixin Chang <changhuaixin@...ux.alibaba.com>
Cc:     bsegall@...gle.com, dietmar.eggemann@....com,
        juri.lelli@...hat.com, khlebnikov@...dex-team.ru,
        linux-kernel@...r.kernel.org, mgorman@...e.de, mingo@...hat.com,
        pauld@...head.com, pjt@...gle.com, rostedt@...dmis.org,
        shanpeic@...ux.alibaba.com, vincent.guittot@...aro.org,
        xiyou.wangcong@...il.com
Subject: Re: [PATCH v3 2/4] sched/fair: Make CFS bandwidth controller
 burstable

On Thu, Jan 21, 2021 at 07:04:51PM +0800, Huaixin Chang wrote:
> Accumulate unused quota from previous periods, thus accumulated
> bandwidth runtime can be used in the following periods. During
> accumulation, take care of runtime overflow. Previous non-burstable
> CFS bandwidth controller only assign quota to runtime, that saves a lot.
> 
> A sysctl parameter sysctl_sched_cfs_bw_burst_onset_percent is introduced to
> denote how many percent of burst is given on setting cfs bandwidth. By
> default it is 0, which means on burst is allowed unless accumulated.
> 
> Also, parameter sysctl_sched_cfs_bw_burst_enabled is introduced as a
> switch for burst. It is enabled by default.
> 
> Signed-off-by: Huaixin Chang <changhuaixin@...ux.alibaba.com>
> Signed-off-by: Shanpei Chen <shanpeic@...ux.alibaba.com>

Identical invalid SoB chain.

> Reported-by: kernel test robot <lkp@...el.com>

What exactly did the robot report; the whole patch?

> ---
>  include/linux/sched/sysctl.h |  2 ++
>  kernel/sched/core.c          | 31 +++++++++++++++++++++++++----
>  kernel/sched/fair.c          | 47 ++++++++++++++++++++++++++++++++++++--------
>  kernel/sched/sched.h         |  4 ++--
>  kernel/sysctl.c              | 18 +++++++++++++++++
>  5 files changed, 88 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 3c31ba88aca5..3400828eaf2d 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -72,6 +72,8 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
>  
>  #ifdef CONFIG_CFS_BANDWIDTH
>  extern unsigned int sysctl_sched_cfs_bandwidth_slice;
> +extern unsigned int sysctl_sched_cfs_bw_burst_onset_percent;
> +extern unsigned int sysctl_sched_cfs_bw_burst_enabled;
>  #endif
>  
>  #ifdef CONFIG_SCHED_AUTOGROUP
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 48d3bad12be2..fecf0f05ef0c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -66,6 +66,16 @@ const_debug unsigned int sysctl_sched_features =
>   */
>  const_debug unsigned int sysctl_sched_nr_migrate = 32;
>  
> +#ifdef CONFIG_CFS_BANDWIDTH
> +/*
> + * Percent of burst assigned to cfs_b->runtime on tg_set_cfs_bandwidth,
> + * 0 by default.
> + */
> +unsigned int sysctl_sched_cfs_bw_burst_onset_percent;
> +
> +unsigned int sysctl_sched_cfs_bw_burst_enabled = 1;
> +#endif

There's already an #ifdef block that contains that bandwidth_slice
thing, see the previous hunk, so why create a new #ifdef here?

Also, personally I think percentages are over-represented as members of
Q.

> @@ -7891,7 +7901,7 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
>  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
>  static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
>  /* More than 203 days if BW_SHIFT equals 20. */
> -static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
> +const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
>  
>  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
>  
> @@ -7900,7 +7910,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
>  {
>  	int i, ret = 0, runtime_enabled, runtime_was_enabled;
>  	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
> -	u64 buffer;
> +	u64 buffer, burst_onset;
>  
>  	if (tg == &root_task_group)
>  		return -EINVAL;
> @@ -7961,11 +7971,24 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
>  	cfs_b->burst = burst;
>  	cfs_b->buffer = buffer;
>  
> -	__refill_cfs_bandwidth_runtime(cfs_b);
> +	cfs_b->max_overrun = DIV_ROUND_UP_ULL(max_cfs_runtime, quota);
> +	cfs_b->runtime = cfs_b->quota;
> +
> +	/* burst_onset needed */
> +	if (cfs_b->quota != RUNTIME_INF &&
> +			sysctl_sched_cfs_bw_burst_enabled &&
> +			sysctl_sched_cfs_bw_burst_onset_percent > 0) {

'creative' indentation again...

Also, this gives rise to the question as to why onset_percent is
separate from enabled.

> +
> +		burst_onset = do_div(burst, 100) *
> +			sysctl_sched_cfs_bw_burst_onset_percent;

and again..

> +
> +		cfs_b->runtime += burst_onset;
> +		cfs_b->runtime = min(max_cfs_runtime, cfs_b->runtime);
> +	}
>  
>  	/* Restart the period timer (if active) to handle new period expiry: */
>  	if (runtime_enabled)
> -		start_cfs_bandwidth(cfs_b);
> +		start_cfs_bandwidth(cfs_b, 1);
>  
>  	raw_spin_unlock_irq(&cfs_b->lock);
>  
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6bb4f89259fd..abe6eb05fe09 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4598,10 +4598,23 @@ static inline u64 sched_cfs_bandwidth_slice(void)
>   *
>   * requires cfs_b->lock
>   */
> -void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
> +static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b,
> +		u64 overrun)
>  {
> -	if (cfs_b->quota != RUNTIME_INF)
> -		cfs_b->runtime = cfs_b->quota;
> +	u64 refill;
> +
> +	if (cfs_b->quota != RUNTIME_INF) {
> +
> +		if (!sysctl_sched_cfs_bw_burst_enabled) {
> +			cfs_b->runtime = cfs_b->quota;
> +			return;
> +		}
> +
> +		overrun = min(overrun, cfs_b->max_overrun);
> +		refill = cfs_b->quota * overrun;
> +		cfs_b->runtime += refill;
> +		cfs_b->runtime = min(cfs_b->runtime, cfs_b->buffer);
> +	}
>  }
>  
>  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
> @@ -4623,7 +4636,7 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
>  	if (cfs_b->quota == RUNTIME_INF)
>  		amount = min_amount;
>  	else {
> -		start_cfs_bandwidth(cfs_b);
> +		start_cfs_bandwidth(cfs_b, 0);
>  
>  		if (cfs_b->runtime > 0) {
>  			amount = min(cfs_b->runtime, min_amount);
> @@ -4957,7 +4970,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
>  	if (cfs_b->idle && !throttled)
>  		goto out_deactivate;
>  
> -	__refill_cfs_bandwidth_runtime(cfs_b);
> +	__refill_cfs_bandwidth_runtime(cfs_b, overrun);
>  
>  	if (!throttled) {
>  		/* mark as potentially idle for the upcoming period */
> @@ -5181,6 +5194,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
>  }
>  
>  extern const u64 max_cfs_quota_period;
> +extern const u64 max_cfs_runtime;
>  
>  static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
>  {
> @@ -5210,7 +5224,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
>  			new = old * 2;
>  			if (new < max_cfs_quota_period) {
>  				cfs_b->period = ns_to_ktime(new);
> -				cfs_b->quota *= 2;
> +				cfs_b->quota = min(cfs_b->quota * 2,
> +						max_cfs_runtime);

again, broken indent

> +
> +				cfs_b->buffer = min(max_cfs_runtime,
> +						cfs_b->quota + cfs_b->burst);

and again..

> +				/* Add 1 in case max_overrun becomes 0. */

A better comment would explain *why* 0 is a problem; and possibly
include a reference to the code that cares
(__refill_cfs_bandiwdth_runtime() afaict).

> +				cfs_b->max_overrun >>= 1;
> +				cfs_b->max_overrun++;
>  
>  				pr_warn_ratelimited(
>  	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",