Message-ID: <1b4b96ce-c08c-b10f-85eb-d28dbe6042b5@yandex-team.ru>
Date: Sun, 5 Aug 2018 15:41:09 +0300
From: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
To: Josef Bacik <josef@...icpanda.com>, axboe@...nel.dk,
linux-kernel@...r.kernel.org, akpm@...ux-foundation.org,
hannes@...xchg.org, tj@...nel.org, linux-fsdevel@...r.kernel.org,
linux-block@...r.kernel.org, kernel-team@...com
Cc: Josef Bacik <jbacik@...com>
Subject: Re: [PATCH 06/14] blkcg: add generic throttling mechanism
On 03.07.2018 18:14, Josef Bacik wrote:
> From: Josef Bacik <jbacik@...com>
>
> Since IO can be issued from literally anywhere it's almost impossible to
> do throttling without having some sort of adverse effect somewhere else
> in the system because of locking or other dependencies. The best way to
> solve this is to do the throttling when we know we aren't holding any
> other kernel resources. Do this by tracking throttling on a per-blkg
> basis, and if we require throttling, flag the task so that it checks
> before it returns to user space and possibly sleeps there.
>
> This is to address the case where a process is doing work that is
> generating IO that can't be throttled, whether that is directly with a
> lot of REQ_META IO, or indirectly by allocating so much memory that it
> is swamping the disk with REQ_SWAP. We can't use task_work_add() as we
> don't want to induce a memory allocation in the IO path, so simply
> saving the request queue in the task and flagging it to do the
> notify_resume thing achieves the same result without the overhead of a
> memory allocation.
Technically speaking, task_work_add() could be used for this job without a memory allocation.
I did the same thing some years ago by embedding a callback_head into task_struct.
https://lore.kernel.org/patchwork/patch/533621/
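
Roughly what I have in mind: an alternative blkcg_schedule_throttle() built on
task_work (untested sketch against the task_work API of that era; the
throttle_work callback_head embedded in task_struct is hypothetical here, the
rest mirrors this patch):

#include <linux/task_work.h>
#include <linux/blk-cgroup.h>

/* Runs via task_work on the way back to user space, with no locks held. */
static void blkcg_throttle_work_fn(struct callback_head *cb)
{
	blkcg_maybe_throttle_current();
}

void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
{
	struct task_struct *tsk = current;

	if (unlikely(tsk->flags & PF_KTHREAD))
		return;

	/* Already queued for this return to user space. */
	if (tsk->throttle_queue)
		return;

	if (!blk_get_queue(q))
		return;

	tsk->throttle_queue = q;
	if (use_memdelay)
		tsk->use_memdelay = use_memdelay;

	/*
	 * throttle_work would be a callback_head embedded in task_struct
	 * (hypothetical field), so nothing is allocated on the IO path.
	 */
	init_task_work(&tsk->throttle_work, blkcg_throttle_work_fn);
	if (task_work_add(tsk, &tsk->throttle_work, true)) {
		/* Only fails if the task is exiting; drop our queue reference. */
		tsk->throttle_queue = NULL;
		blk_put_queue(q);
	}
}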
>
> Signed-off-by: Josef Bacik <jbacik@...com>
> Acked-by: Tejun Heo <tj@...nel.org>
> ---
> block/blk-cgroup.c | 220 ++++++++++++++++++++++++++++++++++++++++++++
> include/linux/blk-cgroup.h | 99 ++++++++++++++++++++
> include/linux/cgroup-defs.h | 3 +
> include/linux/sched.h | 8 ++
> include/linux/tracehook.h | 2 +
> 5 files changed, 332 insertions(+)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 7dc6f05cc44b..d3310ec96c2a 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -27,6 +27,7 @@
> #include <linux/atomic.h>
> #include <linux/ctype.h>
> #include <linux/blk-cgroup.h>
> +#include <linux/tracehook.h>
> #include "blk.h"
>
> #define MAX_KEY_LEN 100
> @@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
> if (!blkcg_debug_stats)
> goto next;
>
> + if (atomic_read(&blkg->use_delay)) {
> + has_stats = true;
> + off += scnprintf(buf+off, size-off,
> + " use_delay=%d delay_nsec=%llu",
> + atomic_read(&blkg->use_delay),
> + (unsigned long long)atomic64_read(&blkg->delay_nsec));
> + }
> +
> for (i = 0; i < BLKCG_MAX_POLS; i++) {
> struct blkcg_policy *pol = blkcg_policy[i];
> size_t written;
> @@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
> mutex_unlock(&blkcg_pol_mutex);
> }
>
> +static void blkcg_exit(struct task_struct *tsk)
> +{
> + if (tsk->throttle_queue)
> + blk_put_queue(tsk->throttle_queue);
> + tsk->throttle_queue = NULL;
> +}
> +
> struct cgroup_subsys io_cgrp_subsys = {
> .css_alloc = blkcg_css_alloc,
> .css_offline = blkcg_css_offline,
> @@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
> .dfl_cftypes = blkcg_files,
> .legacy_cftypes = blkcg_legacy_files,
> .legacy_name = "blkio",
> + .exit = blkcg_exit,
> #ifdef CONFIG_MEMCG
> /*
> * This ensures that, if available, memcg is automatically enabled
> @@ -1586,5 +1603,208 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
> }
> EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
>
> +/*
> + * Scale the accumulated delay based on how long it has been since we updated
> + * the delay. We only call this when we are adding delay, in case it's been a
> + * while since we added delay, and when we are checking to see if we need to
> + * delay a task, to account for any delays that may have occurred.
> + */
> +static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
> +{
> + u64 old = atomic64_read(&blkg->delay_start);
> +
> + /*
> + * We only want to scale down every second. The idea here is that we
> + * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
> + * time window. We only want to throttle tasks for recent delay that
> + * has occurred, in 1 second time windows since that's the maximum
> + * things can be throttled. We save the current delay window in
> + * blkg->last_delay so we know what amount is still left to be charged
> + * to the blkg from this point onward. blkg->last_use keeps track of
> + * the use_delay counter. The idea is if we're unthrottling the blkg we
> + * are ok with whatever is happening now, and we can take away more of
> + * the accumulated delay as we've already throttled enough that
> + * everybody is happy with their IO latencies.
> + */
> + if (time_before64(old + NSEC_PER_SEC, now) &&
> + atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
> + u64 cur = atomic64_read(&blkg->delay_nsec);
> + u64 sub = min_t(u64, blkg->last_delay, now - old);
> + int cur_use = atomic_read(&blkg->use_delay);
> +
> + /*
> + * We've been unthrottled, subtract a larger chunk of our
> + * accumulated delay.
> + */
> + if (cur_use < blkg->last_use)
> + sub = max_t(u64, sub, blkg->last_delay >> 1);
> +
> + /*
> + * This shouldn't happen, but handle it anyway. Our delay_nsec
> + * should only ever be growing except here where we subtract out
> + * min(last_delay, 1 second), but lord knows bugs happen and I'd
> + * rather not end up with negative numbers.
> + */
> + if (unlikely(cur < sub)) {
> + atomic64_set(&blkg->delay_nsec, 0);
> + blkg->last_delay = 0;
> + } else {
> + atomic64_sub(sub, &blkg->delay_nsec);
> + blkg->last_delay = cur - sub;
> + }
> + blkg->last_use = cur_use;
> + }
> +}
> +
> +/*
> + * This is called when we want to actually walk up the hierarchy and check to
> + * see if we need to throttle, and then actually throttle if there is some
> + * accumulated delay. This should only be called upon return to user space so
> + * we're not holding some lock that would induce a priority inversion.
> + */
> +static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
> +{
> + u64 now = ktime_to_ns(ktime_get());
> + u64 exp;
> + u64 delay_nsec = 0;
> + int tok;
> +
> + while (blkg->parent) {
> + if (atomic_read(&blkg->use_delay)) {
> + blkcg_scale_delay(blkg, now);
> + delay_nsec = max_t(u64, delay_nsec,
> + atomic64_read(&blkg->delay_nsec));
> + }
> + blkg = blkg->parent;
> + }
> +
> + if (!delay_nsec)
> + return;
> +
> + /*
> + * Let's not sleep for all eternity if we've amassed a huge delay.
> + * Swapping or metadata IO can accumulate 10's of seconds worth of
> + * delay, and we want userspace to be able to do _something_ so cap the
> + * delays at 0.25s. If there's 10's of seconds worth of delay then
> + * the tasks will be delayed for 0.25 seconds for every syscall.
> + */
> + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
> +
> + /*
> + * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
> + * that hasn't landed upstream yet. Once that stuff is in place we need
> + * to do a psi_memstall_enter/leave if memdelay is set.
> + */
> +
> + exp = ktime_add_ns(now, delay_nsec);
> + tok = io_schedule_prepare();
> + do {
> + __set_current_state(TASK_KILLABLE);
> + if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
> + break;
> + } while (!fatal_signal_pending(current));
> + io_schedule_finish(tok);
> +}
> +
> +/**
> + * blkcg_maybe_throttle_current - throttle the current task if it has been marked
> + *
> + * This is only called if we've been marked with set_notify_resume(). Obviously
> + * we can be set_notify_resume() for reasons other than blkcg throttling, so we
> + * check to see if current->throttle_queue is set and if not this doesn't do
> + * anything. This should only ever be called by the resume code, it's not meant
> + * to be called by people willy-nilly as it will actually do the work to
> + * throttle the task if it is setup for throttling.
> + */
> +void blkcg_maybe_throttle_current(void)
> +{
> + struct request_queue *q = current->throttle_queue;
> + struct cgroup_subsys_state *css;
> + struct blkcg *blkcg;
> + struct blkcg_gq *blkg;
> + bool use_memdelay = current->use_memdelay;
> +
> + if (!q)
> + return;
> +
> + current->throttle_queue = NULL;
> + current->use_memdelay = false;
> +
> + rcu_read_lock();
> + css = kthread_blkcg();
> + if (css)
> + blkcg = css_to_blkcg(css);
> + else
> + blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
> +
> + if (!blkcg)
> + goto out;
> + blkg = blkg_lookup(blkcg, q);
> + if (!blkg)
> + goto out;
> + blkg = blkg_try_get(blkg);
> + if (!blkg)
> + goto out;
> + rcu_read_unlock();
> + blk_put_queue(q);
> +
> + blkcg_maybe_throttle_blkg(blkg, use_memdelay);
> + blkg_put(blkg);
> + return;
> +out:
> + rcu_read_unlock();
> + blk_put_queue(q);
> +}
> +EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
> +
> +/**
> + * blkcg_schedule_throttle - this task needs to check for throttling
> + * @q - the request queue IO was submitted on
> + * @use_memdelay - do we charge this to memory delay for PSI
> + *
> + * This is called by the IO controller when we know there's delay accumulated
> + * for the blkg for this task. We do not pass the blkg because there are places
> + * we call this that may not have that information, the swapping code for
> + * instance will only have a request_queue at that point. This sets the
> + * notify_resume for the task to check and see if it requires throttling before
> + * returning to user space.
> + *
> + * We will only schedule once per syscall. You can call this over and over
> + * again and it will only do the check once upon return to user space, and only
> + * throttle once. If the task needs to be throttled again it'll need to be
> + * re-set at the next time we see the task.
> + */
> +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
> +{
> + if (unlikely(current->flags & PF_KTHREAD))
> + return;
> +
> + if (!blk_get_queue(q))
> + return;
> +
> + if (current->throttle_queue)
> + blk_put_queue(current->throttle_queue);
> + current->throttle_queue = q;
> + if (use_memdelay)
> + current->use_memdelay = use_memdelay;
> + set_notify_resume(current);
> +}
> +EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
> +
> +/**
> + * blkcg_add_delay - add delay to this blkg
> + * @now - the current time in nanoseconds
> + * @delta - how many nanoseconds of delay to add
> + *
> + * Charge @delta to the blkg's current delay accumulation. This is used to
> + * throttle tasks if an IO controller thinks we need more throttling.
> + */
> +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
> +{
> + blkcg_scale_delay(blkg, now);
> + atomic64_add(delta, &blkg->delay_nsec);
> +}
> +EXPORT_SYMBOL_GPL(blkcg_add_delay);
> +
> module_param(blkcg_debug_stats, bool, 0644);
> MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
> diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
> index a8f9ba8f33a4..de57de4831d5 100644
> --- a/include/linux/blk-cgroup.h
> +++ b/include/linux/blk-cgroup.h
> @@ -136,6 +136,12 @@ struct blkcg_gq {
> struct blkg_policy_data *pd[BLKCG_MAX_POLS];
>
> struct rcu_head rcu_head;
> +
> + atomic_t use_delay;
> + atomic64_t delay_nsec;
> + atomic64_t delay_start;
> + u64 last_delay;
> + int last_use;
> };
>
> typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
> @@ -241,6 +247,26 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
> return css_to_blkcg(task_css(current, io_cgrp_id));
> }
>
> +static inline bool blk_cgroup_congested(void)
> +{
> + struct cgroup_subsys_state *css;
> + bool ret = false;
> +
> + rcu_read_lock();
> + css = kthread_blkcg();
> + if (!css)
> + css = task_css(current, io_cgrp_id);
> + while (css) {
> + if (atomic_read(&css->cgroup->congestion_count)) {
> + ret = true;
> + break;
> + }
> + css = css->parent;
> + }
> + rcu_read_unlock();
> + return ret;
> +}
> +
> /**
> * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
> * @return: true if this bio needs to be submitted with the root blkg context.
> @@ -374,6 +400,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
> atomic_inc(&blkg->refcnt);
> }
>
> +/**
> + * blkg_try_get - try and get a blkg reference
> + * @blkg: blkg to get
> + *
> + * This is for use when doing an RCU lookup of the blkg. We may be in the midst
> + * of freeing this blkg, so we can only use it if the refcnt is not zero.
> + */
> +static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
> +{
> + if (atomic_inc_not_zero(&blkg->refcnt))
> + return blkg;
> + return NULL;
> +}
> +
> +
> void __blkg_release_rcu(struct rcu_head *rcu);
>
> /**
> @@ -734,6 +775,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
> return !throtl;
> }
>
> +static inline void blkcg_use_delay(struct blkcg_gq *blkg)
> +{
> + if (atomic_add_return(1, &blkg->use_delay) == 1)
> + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
> +}
> +
> +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
> +{
> + int old = atomic_read(&blkg->use_delay);
> +
> + if (old == 0)
> + return 0;
> +
> + /*
> + * We do this song and dance because we can race with somebody else
> + * adding or removing delay. If we just did an atomic_dec we'd end up
> + * negative and we'd already be in trouble. We need to subtract 1 and
> + * then check to see if we were the last delay so we can drop the
> + * congestion count on the cgroup.
> + */
> + while (old) {
> + int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
> + if (cur == old)
> + break;
> + old = cur;
> + }
> +
> + if (old == 0)
> + return 0;
> + if (old == 1)
> + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
> + return 1;
> +}
> +
> +static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
> +{
> + int old = atomic_read(&blkg->use_delay);
> + if (!old)
> + return;
> + /* We only want 1 person clearing the congestion count for this blkg. */
> + while (old) {
> + int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
> + if (cur == old) {
> + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
> + break;
> + }
> + old = cur;
> + }
> +}
> +
> +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
> +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
> +void blkcg_maybe_throttle_current(void);
> #else /* CONFIG_BLK_CGROUP */
>
> struct blkcg {
> @@ -753,8 +847,13 @@ struct blkcg_policy {
>
> #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
>
> +static inline void blkcg_maybe_throttle_current(void) { }
> +static inline bool blk_cgroup_congested(void) { return false; }
> +
> #ifdef CONFIG_BLOCK
>
> +static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
> +
> static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
> static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
> static inline void blkcg_drain_queue(struct request_queue *q) { }
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index c0e68f903011..ff20b677fb9f 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -438,6 +438,9 @@ struct cgroup {
> /* used to store eBPF programs */
> struct cgroup_bpf bpf;
>
> + /* If there is block congestion on this cgroup. */
> + atomic_t congestion_count;
> +
> /* ids of the ancestors at each level including self */
> int ancestor_ids[];
> };
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 3aa4fcb74e76..1c59d0573e4d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -734,6 +734,10 @@ struct task_struct {
> /* disallow userland-initiated cgroup migration */
> unsigned no_cgroup_migration:1;
> #endif
> +#ifdef CONFIG_BLK_CGROUP
> + /* to be used once the psi infrastructure lands upstream. */
> + unsigned use_memdelay:1;
> +#endif
>
> unsigned long atomic_flags; /* Flags requiring atomic access. */
>
> @@ -1151,6 +1155,10 @@ struct task_struct {
> unsigned int memcg_nr_pages_over_high;
> #endif
>
> +#ifdef CONFIG_BLK_CGROUP
> + struct request_queue *throttle_queue;
> +#endif
> +
> #ifdef CONFIG_UPROBES
> struct uprobe_task *utask;
> #endif
> diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
> index 4a8841963c2e..05589a3e37f4 100644
> --- a/include/linux/tracehook.h
> +++ b/include/linux/tracehook.h
> @@ -51,6 +51,7 @@
> #include <linux/security.h>
> #include <linux/task_work.h>
> #include <linux/memcontrol.h>
> +#include <linux/blk-cgroup.h>
> struct linux_binprm;
>
> /*
> @@ -192,6 +193,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
> task_work_run();
>
> mem_cgroup_handle_over_high();
> + blkcg_maybe_throttle_current();
> }
>
> #endif /* <linux/tracehook.h> */
>
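For completeness, a policy consuming this interface would look roughly like the
sketch below once it has measured that a blkg exceeded its latency target.
Only the blkcg_*() calls are from this patch; the iolat_* wrappers and the
overage accounting are made up for illustration:

#include <linux/blk-cgroup.h>

/* Called when the policy scales a blkg down because latencies are too high. */
static void iolat_scale_down(struct blkcg_gq *blkg)
{
	/*
	 * One blkcg_use_delay() per scaling step, to be balanced by
	 * blkcg_unuse_delay() when the blkg recovers.
	 */
	blkcg_use_delay(blkg);
}

/* Called per window with the measured overage above the latency target. */
static void iolat_account_overage(struct blkcg_gq *blkg, struct request_queue *q,
				  u64 now, u64 overage_nsec)
{
	/* Charge the overage; blkcg_scale_delay() decays it again over time. */
	blkcg_add_delay(blkg, now, overage_nsec);

	/*
	 * Flag the submitting task: it will sleep for the accumulated delay
	 * in blkcg_maybe_throttle_current() when it returns to user space.
	 */
	blkcg_schedule_throttle(q, false);
}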