[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220520160305.GA17335@blackbody.suse.cz>
Date: Fri, 20 May 2022 18:03:05 +0200
From: Michal Koutný <mkoutny@...e.com>
To: "yukuai (C)" <yukuai3@...wei.com>
Cc: tj@...nel.org, axboe@...nel.dk, ming.lei@...hat.com,
geert@...ux-m68k.org, cgroups@...r.kernel.org,
linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
yi.zhang@...wei.com
Subject: Re: [PATCH -next v3 2/2] blk-throttle: fix io hung due to
configuration updates
On Fri, May 20, 2022 at 09:36:11AM +0800, "yukuai (C)" <yukuai3@...wei.com> wrote:
> Just to simplify explanation (assum that throtl_slice is greater than
> 0.5s):
> Without this patch:
> wait time is caculated based on issuing 9k from now(3s) without any
> bytes aready dispatched.
I acknowledge that pre-patch state is incorrect because it erases
already passed wait-time from the previous slice.
> With this patch:
> wait time is caculated based on issuing 9k from 0s with 0.5 bytes
> aready dispatched.
Thanks for your further hint. Hopefully, I'm getting closer to real
understanding. Now, I calculate the wait times as durations between
current moment and timepoint when a bio can be dispatched.
IIUC, after config change the ideal wait time of a bio is
wait_ideal := (disp + bio - Δt*l_old) / l_new
where Δt is the elapsed time of the current slice.
You maintain the slice but scale disp, so you get
wait_kuai := ((l_new/l_old)*disp + bio - Δt*l_lew) / l_new
= disp / l_old + bio / l_new - Δt
Please confirm we're on the same page here.
Then I look at
error := wait_kuai - wait_ideal
...
= (Δt * l_old - disp) * (1/l_new - 1/l_old)
= (Δt * l_old - disp) * (1 - α) / (α * l_old)
where
α = l_new / l_old
The leftmost term is a unconsumed IO of the slice. Say it's positive,
while the bigger bio is throttled at the moment of a config change.
If the config change increases throttling (α < 1), the error grows very
high (i.e. over-throttling similar to the existing behavior).
If the config change relieves throttling (α > 1), the wait time's
slightly shorter (under-throttling) wrt the ideal.
If I was to propose a correction, it'd be like the patch at the bottom
derived from your but not finished (the XXX part). It's for potential
further discussion.
I had myself carried a way with the formulas. If I go back to the
beginning:
> Then io hung can be triggered by always submmiting new configuration
> before the throttled bio is dispatched.
How big is this a problem actually? Is it only shooting oneself in the leg
or can there be a user who's privileged enough to modify throttling
configuration yet not privileged enough to justify the hung's
consequences (like some global FS locks).
Thanks,
Michal
--- 8< ---
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 469c483719be..3fd458d16f31 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1274,7 +1274,62 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
return 0;
}
-static void tg_conf_updated(struct throtl_grp *tg, bool global)
+static u64 throtl_update_slice_scale(unsigned int slice_start, u64 new_limit,
+ u64 old_limit)
+{
+ if (new_limit == old_limit)
+ return slice_start;
+
+ /* This shouldn't really matter but semantically we want to extend the
+ * slice from the earliest possible point of time. */
+ if (WARN_ON(new_limit == 0))
+ return 0;
+
+ return jiffies - div64_u64((jiffies - slice_start) * old_limit, new_limit);
+}
+
+static void throtl_update_slice(struct throtl_grp *tg, u64 *old_limits)
+{
+ /*
+ * How does this work? We're going to calculate new wait time in
+ * tg_with_in_bps_limit(). Ideal wait time after config change is
+ *
+ * wait_ideal := (disp + bio - Δt*l_old) / l_new
+ *
+ * where Δt = jiffies - tg->slice_start (elapsed time of slice).
+ * In reality, the function has no idea about l_old so it calculates
+ *
+ * wait_skewed := (disp + bio - Δt*l_new) / l_new
+ *
+ * So we modify slice_start to get correct number
+ *
+ * wait_fixed := (disp + bio - Δt'*l_new) / l_new == wait_ideal
+ *
+ * from that
+ * Δt' = Δt * l_old / l_new
+ * or
+ * jiffies - slice_start' = (jiffies - slice_start) * l_old / l_new
+ * .
+ */
+ tg->slice_start[READ] = throtl_update_slice_scale(tg->slice_start[READ],
+ tg_bps_limit(tg, READ),
+ old_limits[0]);
+ tg->slice_start[WRITE] = throtl_update_slice_scale(tg->slice_start[WRITE],
+ tg_bps_limit(tg, WRITE),
+ old_limits[1]);
+
+ // XXX This looks like OK since we should not change BPS and IOPS limit
+ // at the same time but it is not actually OK because scaling
+ // slice_start for one limit breaks the other anyway.
+ tg->slice_start[READ] = throtl_update_slice_scale(tg->slice_start[READ],
+ tg_iops_limit(tg, READ),
+ old_limits[2]);
+ tg->slice_start[WRITE] = throtl_update_slice_scale(tg->slice_start[WRITE],
+ tg_iops_limit(tg, WRITE),
+ old_limits[3]);
+}
+
+static void tg_conf_updated(struct throtl_grp *tg, u64 *old_limits, bool global)
{
struct throtl_service_queue *sq = &tg->service_queue;
struct cgroup_subsys_state *pos_css;
@@ -1313,16 +1368,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
parent_tg->latency_target);
}
- /*
- * We're already holding queue_lock and know @tg is valid. Let's
- * apply the new config directly.
- *
- * Restart the slices for both READ and WRITES. It might happen
- * that a group's limit are dropped suddenly and we don't want to
- * account recently dispatched IO with new low rate.
- */
- throtl_start_new_slice(tg, READ);
- throtl_start_new_slice(tg, WRITE);
+ throtl_update_slice(tg, old_limits);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
@@ -1330,6 +1376,14 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
}
}
+static void tg_get_limits(struct throtl_grp *tg, u64 *limits)
+{
+ limits[0] = tg_bps_limit(tg, READ);
+ limits[1] = tg_bps_limit(tg, WRITE);
+ limits[2] = tg_iops_limit(tg, READ);
+ limits[3] = tg_iops_limit(tg, WRITE);
+}
+
static ssize_t tg_set_conf(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off, bool is_u64)
{
@@ -1338,6 +1392,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
struct throtl_grp *tg;
int ret;
u64 v;
+ u64 old_limits[4];
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
if (ret)
@@ -1350,13 +1405,14 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
+ tg_get_limits(tg, old_limits);
if (is_u64)
*(u64 *)((void *)tg + of_cft(of)->private) = v;
else
*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
- tg_conf_updated(tg, false);
+ tg_conf_updated(tg, old_limits, false);
ret = 0;
out_finish:
blkg_conf_finish(&ctx);
@@ -1526,6 +1582,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
+ u64 old_limits[4];
unsigned long idle_time;
unsigned long latency_time;
int ret;
@@ -1536,6 +1593,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
return ret;
tg = blkg_to_tg(ctx.blkg);
+ tg_get_limits(tg, old_limits);
v[0] = tg->bps_conf[READ][index];
v[1] = tg->bps_conf[WRITE][index];
@@ -1627,7 +1685,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg->td->limit_index = LIMIT_LOW;
} else
tg->td->limit_index = LIMIT_MAX;
- tg_conf_updated(tg, index == LIMIT_LOW &&
+ tg_conf_updated(tg, old_limits, index == LIMIT_LOW &&
tg->td->limit_valid[LIMIT_LOW]);
ret = 0;
out_finish:
Powered by blists - more mailing lists