Message-ID: <20200413144457.GA2517@lorien.usersys.redhat.com>
Date: Mon, 13 Apr 2020 10:44:57 -0400
From: Phil Auld <pauld@...hat.com>
To: Josh Don <joshdon@...gle.com>
Cc: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Paul Turner <pjt@...gle.com>,
Huaixin Chang <changhuaixin@...ux.alibaba.com>,
linux-kernel <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 1/2] sched: eliminate bandwidth race between throttling and distribution

Hi Josh,

On Sat, Apr 11, 2020 at 07:00:17PM -0700 Josh Don wrote:
> +[1]pauld@...hat.com
>
> On Fri, Apr 10, 2020 at 3:52 PM Josh Don <[2]joshdon@...gle.com> wrote:
>
> From: Paul Turner <[3]pjt@...gle.com>
>
> There is a race window in which an entity begins throttling before quota
> is added to the pool, but does not finish throttling until after we have
> finished with distribute_cfs_runtime(). This entity is not observed by
> distribute_cfs_runtime() because it was not on the throttled list at the
> time that distribution was running. This race manifests as rare
> period-length stalls for such entities.
>
> Rather than heavy-weight the synchronization with the progress of
> distribution, we can fix this by aborting throttling if bandwidth has
> become available. Otherwise, we immediately add the entity to the
> throttled list so that it can be observed by a subsequent distribution.
>
> Additionally, we can remove the case of adding the throttled entity to
> the head of the throttled list, and simply always add to the tail.
> Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto
> its own pool of runtime. This means that if we do hit the !assign and
> distribute_running case, we know that distribution is about to end.
>
> Signed-off-by: Paul Turner <[4]pjt@...gle.com>
> Co-developed-by: Ben Segall <[5]bsegall@...gle.com>
> Signed-off-by: Ben Segall <[6]bsegall@...gle.com>
> Signed-off-by: Josh Don <[7]joshdon@...gle.com>

This looks good to me, thanks for the cc.

Reviewed-by: Phil Auld <pauld@...hat.com>
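
For anyone skimming the thread, the new throttle path below boils down to:
take cfs_b->lock, try to grab at least a token of runtime, and only put the
cfs_rq on the throttled list (before dropping the lock) if that fails. Here
is a rough standalone sketch of just that ordering, in plain C with a
pthread mutex standing in for cfs_b->lock and made-up names (bw_pool,
try_assign, throttle); it only illustrates the race-avoidance idea and is
not the kernel code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct bw_pool {
        pthread_mutex_t lock;   /* stands in for cfs_b->lock        */
        long runtime;           /* stands in for cfs_b->runtime     */
        int  nr_throttled;      /* stands in for the throttled list */
};

/* Much-simplified stand-in for __assign_cfs_rq_runtime(); caller holds lock. */
static bool try_assign(struct bw_pool *bw, long want)
{
        if (bw->runtime >= want) {
                bw->runtime -= want;
                return true;
        }
        return false;
}

/*
 * Loosely mirrors the shape of the reworked throttle path: returns false
 * when the throttle is aborted because bandwidth showed up in the meantime.
 */
static bool throttle(struct bw_pool *bw)
{
        bool dequeue;

        pthread_mutex_lock(&bw->lock);
        if (try_assign(bw, 1)) {
                /* Raced with bandwidth becoming available: do not throttle. */
                dequeue = false;
        } else {
                /*
                 * Publish ourselves before dropping the lock, so a later
                 * distribution pass cannot miss us.
                 */
                bw->nr_throttled++;
                dequeue = true;
        }
        pthread_mutex_unlock(&bw->lock);

        return dequeue;
}

int main(void)
{
        struct bw_pool bw = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        printf("throttled: %d\n", throttle(&bw));  /* 1: no runtime left */
        bw.runtime = 5;
        printf("throttled: %d\n", throttle(&bw));  /* 0: quota came back */
        return 0;
}
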
> ---
> kernel/sched/fair.c | 78 ++++++++++++++++++++++++++-------------------
> 1 file changed, 46 insertions(+), 32 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 02f323b85b6d..3fb986c52825 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4587,17 +4587,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
>  	return &tg->cfs_bandwidth;
>  }
> 
> -/* returns 0 on failure to allocate runtime */
> -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> +/* returns 0 on failure to allocate runtime, called with cfs_b->lock held */
> +static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
> +				   struct cfs_rq *cfs_rq, u64 target_runtime)
>  {
> -	struct task_group *tg = cfs_rq->tg;
> -	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
>  	u64 amount = 0, min_amount;
> 
>  	/* note: this is a positive sum as runtime_remaining <= 0 */
> -	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
> +	min_amount = target_runtime - cfs_rq->runtime_remaining;
> 
> -	raw_spin_lock(&cfs_b->lock);
>  	if (cfs_b->quota == RUNTIME_INF)
>  		amount = min_amount;
>  	else {
> @@ -4609,13 +4607,26 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  			cfs_b->idle = 0;
>  		}
>  	}
> -	raw_spin_unlock(&cfs_b->lock);
> 
>  	cfs_rq->runtime_remaining += amount;
> 
>  	return cfs_rq->runtime_remaining > 0;
>  }
> 
> +/* returns 0 on failure to allocate runtime */
> +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
> +{
> +	int ret;
> +	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> +
> +	raw_spin_lock(&cfs_b->lock);
> +	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq,
> +				      sched_cfs_bandwidth_slice());
> +	raw_spin_unlock(&cfs_b->lock);
> +
> +	return ret;
> +}
> +
>  static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
>  {
>  	/* dock delta_exec before expiring quota (as it could span periods) */
> @@ -4704,13 +4715,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>  	return 0;
>  }
> 
> -static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
> +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  {
>  	struct rq *rq = rq_of(cfs_rq);
>  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
>  	struct sched_entity *se;
>  	long task_delta, idle_task_delta, dequeue = 1;
> -	bool empty;
> +
> +	raw_spin_lock(&cfs_b->lock);
> +	/* This will start the period timer if necessary */
> +	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
> +		/*
> +		 * We have raced with bandwidth becoming available, and if we
> +		 * actually throttled the timer might not unthrottle us for an
> +		 * entire period. We additionally needed to make sure that any
> +		 * subsequent check_cfs_rq_runtime calls agree not to throttle
> +		 * us, as we may commit to do cfs put_prev+pick_next, so we ask
> +		 * for 1ns of runtime rather than just check cfs_b.
> +		 */
> +		dequeue = 0;
> +	} else {
> +		list_add_tail_rcu(&cfs_rq->throttled_list,
> +				  &cfs_b->throttled_cfs_rq);
> +	}
> +	raw_spin_unlock(&cfs_b->lock);
> +
> +	if (!dequeue)
> +		return false;  /* Throttle no longer required. */
> 
>  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
> 
> @@ -4744,29 +4775,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  	if (!se)
>  		sub_nr_running(rq, task_delta);
> 
> -	cfs_rq->throttled = 1;
> -	cfs_rq->throttled_clock = rq_clock(rq);
> -	raw_spin_lock(&cfs_b->lock);
> -	empty = list_empty(&cfs_b->throttled_cfs_rq);
> -
>  	/*
> -	 * Add to the _head_ of the list, so that an already-started
> -	 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
> -	 * not running add to the tail so that later runqueues don't get starved.
> +	 * Note: distribution will already see us throttled via the
> +	 * throttled-list. rq->lock protects completion.
>  	 */
> -	if (cfs_b->distribute_running)
> -		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -	else
> -		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
> -
> -	/*
> -	 * If we're the first throttled task, make sure the bandwidth
> -	 * timer is running.
> -	 */
> -	if (empty)
> -		start_cfs_bandwidth(cfs_b);
> -
> -	raw_spin_unlock(&cfs_b->lock);
> +	cfs_rq->throttled = 1;
> +	cfs_rq->throttled_clock = rq_clock(rq);
> +	return true;
>  }
> 
>  void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> @@ -5121,8 +5136,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
>  	if (cfs_rq_throttled(cfs_rq))
>  		return true;
> 
> -	throttle_cfs_rq(cfs_rq);
> -	return true;
> +	return throttle_cfs_rq(cfs_rq);
>  }
> 
>  static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
> --
> 2.26.0.110.g2183baf09c-goog
>
>
>
> References:
>
> [1] mailto:pauld@...hat.com
> [2] mailto:joshdon@...gle.com
> [3] mailto:pjt@...gle.com
> [4] mailto:pjt@...gle.com
> [5] mailto:bsegall@...gle.com
> [6] mailto:bsegall@...gle.com
> [7] mailto:joshdon@...gle.com
--