[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKfTPtC_uEqP8y9j-Njk2mHJXEqo02PHhM9koAtUTfYbc+KQEA@mail.gmail.com>
Date: Mon, 23 Aug 2021 12:08:20 +0200
From: Vincent Guittot <vincent.guittot@...aro.org>
To: Josh Don <joshdon@...gle.com>
Cc: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
Paul Turner <pjt@...gle.com>,
Oleg Rombakh <olegrom@...gle.com>,
Viresh Kumar <viresh.kumar@...aro.org>,
Steve Sistare <steven.sistare@...cle.com>,
Tejun Heo <tj@...nel.org>, Rik van Riel <riel@...riel.com>,
linux-kernel <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
On Fri, 20 Aug 2021 at 03:04, Josh Don <joshdon@...gle.com> wrote:
>
> Use a small, non-scaled min granularity for SCHED_IDLE entities, when
> competing with normal entities. This reduces the latency of getting
> a normal entity back on cpu, at the expense of increased context
> switch frequency of SCHED_IDLE entities.
>
> The benefit of this change is to reduce the round-robin latency for
> normal entities when competing with a SCHED_IDLE entity.
>
> Example: on a machine with HZ=1000, spawned two threads, one of which is
> SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE
> thread runs for 4ms then waits for 1.4s. With this patch, it runs for
> 1ms and waits 340ms (as it round-robins with the other thread).
>
> Signed-off-by: Josh Don <joshdon@...gle.com>
> ---
> kernel/sched/debug.c | 2 ++
> kernel/sched/fair.c | 29 ++++++++++++++++++++++++-----
> kernel/sched/sched.h | 1 +
> 3 files changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 33538579db9a..317ef560aa63 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -305,6 +305,7 @@ static __init int sched_init_debug(void)
>
> debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
> debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
> + debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
> debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
>
> debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
> @@ -806,6 +807,7 @@ static void sched_debug_header(struct seq_file *m)
> SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
> PN(sysctl_sched_latency);
> PN(sysctl_sched_min_granularity);
> + PN(sysctl_sched_idle_min_granularity);
> PN(sysctl_sched_wakeup_granularity);
> P(sysctl_sched_child_runs_first);
> P(sysctl_sched_features);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 19a9244c140f..31f40aa005b9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -59,6 +59,14 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
> unsigned int sysctl_sched_min_granularity = 750000ULL;
> static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
>
> +/*
> + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
> + * Applies only when SCHED_IDLE tasks compete with normal tasks.
> + *
> + * (default: 0.75 msec)
> + */
> +unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
> +
> /*
> * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
> */
> @@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
> return sysctl_sched_latency;
> }
>
> +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
> +
> /*
> * We calculate the wall-time slice from the period by taking a part
> * proportional to the weight.
> @@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
> static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> {
> unsigned int nr_running = cfs_rq->nr_running;
> + struct sched_entity *init_se = se;
> + unsigned int min_gran;
> u64 slice;
>
> if (sched_feat(ALT_PERIOD))
> @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> for_each_sched_entity(se) {
> struct load_weight *load;
> struct load_weight lw;
> + struct cfs_rq *qcfs_rq;
>
> - cfs_rq = cfs_rq_of(se);
> - load = &cfs_rq->load;
> + qcfs_rq = cfs_rq_of(se);
> + load = &qcfs_rq->load;
>
> if (unlikely(!se->on_rq)) {
> - lw = cfs_rq->load;
> + lw = qcfs_rq->load;
>
> update_load_add(&lw, se->load.weight);
> load = &lw;
> @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> slice = __calc_delta(slice, se->load.weight, load);
> }
>
> - if (sched_feat(BASE_SLICE))
> - slice = max(slice, (u64)sysctl_sched_min_granularity);
> + if (sched_feat(BASE_SLICE)) {
> + if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
As with place_entity, we should probably not switch dynamically
between the 2 values below depending on whether non-SCHED_IDLE tasks
are present; we should always use sysctl_sched_idle_min_granularity.
> + min_gran = sysctl_sched_idle_min_granularity;
> + else
> + min_gran = sysctl_sched_min_granularity;
> +
> + slice = max_t(u64, slice, min_gran);
> + }
>
> return slice;
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6af039e433fb..29846da35861 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2399,6 +2399,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
> #ifdef CONFIG_SCHED_DEBUG
> extern unsigned int sysctl_sched_latency;
> extern unsigned int sysctl_sched_min_granularity;
> +extern unsigned int sysctl_sched_idle_min_granularity;
> extern unsigned int sysctl_sched_wakeup_granularity;
> extern int sysctl_resched_latency_warn_ms;
> extern int sysctl_resched_latency_warn_once;
> --
> 2.33.0.rc2.250.ged5fa647cd-goog
>
Powered by blists - more mailing lists