[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250222092823.210318-3-yukuai1@huaweicloud.com>
Date: Sat, 22 Feb 2025 17:28:23 +0800
From: Yu Kuai <yukuai1@...weicloud.com>
To: ming.lei@...hat.com,
tj@...nel.org,
josef@...icpanda.com,
axboe@...nel.dk
Cc: cgroups@...r.kernel.org,
linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org,
yukuai3@...wei.com,
yukuai1@...weicloud.com,
yi.zhang@...wei.com,
yangerkun@...wei.com
Subject: [PATCH 2/2] blk-throttle: fix off-by-one jiffies wait_time
From: Yu Kuai <yukuai3@...wei.com>
wait_time is based on jiffies, and the calculation is:
wait_time = extra_bytes * HZ / bps_limit;
If wait time is not zero, the remainder is ignored, means wait time is
short by at most 1 jiffes; On the other hand, if wait time is zero, it
will be used as 1 jiffies, means it's excessive by at most 1 jiffes.
This way causes blktests throtl/001 failure in case of CONFIG_HZ_100=y,
fix the problem by recording remainder as debt, and repay the debt
later.
Reported-and-tested-by: Ming Lei <ming.lei@...hat.com>
Closes: https://lore.kernel.org/all/20250220111735.1187999-1-ming.lei@redhat.com/
Signed-off-by: Yu Kuai <yukuai3@...wei.com>
---
block/blk-throttle.c | 24 +++++++++++++++++-------
block/blk-throttle.h | 12 ++++++++----
2 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0096e486b1e3..3828c6535605 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -703,9 +703,10 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
}
static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
- u64 bps_limit)
+ u64 bps_limit, bool *has_debt)
{
bool rw = bio_data_dir(bio);
+ long long carryover_bytes;
long long bytes_allowed;
u64 extra_bytes;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
@@ -730,10 +731,16 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
-
- if (!jiffy_wait)
- jiffy_wait = 1;
+ jiffy_wait = div64_u64_rem(extra_bytes * HZ, bps_limit, &carryover_bytes);
+ if (carryover_bytes) {
+ /*
+ * If extra_bytes is not divisible, the remainder is recorded as
+ * debt. Caller must ensure the current slice has at least 1
+ * more jiffies to repay the debt.
+ */
+ *has_debt = true;
+ tg->carryover_bytes[rw] -= div64_u64(carryover_bytes, HZ);
+ }
/*
* This wait time is without taking into consideration the rounding
@@ -754,6 +761,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
+ bool has_debt = false;
/*
* Currently whole state machine of group depends on first bio
@@ -784,18 +792,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
else
throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
- bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit, &has_debt);
iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
if (bps_wait + iops_wait == 0) {
if (wait)
*wait = 0;
+ if (has_debt)
+ throtl_extend_slice(tg, rw, jiffies + 1);
return true;
}
max_wait = max(bps_wait, iops_wait);
if (wait)
*wait = max_wait;
- throtl_extend_slice(tg, rw, jiffies + max_wait);
+ throtl_extend_slice(tg, rw, jiffies + max_wait + has_debt);
return false;
}
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 1a36d1278eea..56dcb5ce412f 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -110,10 +110,14 @@ struct throtl_grp {
unsigned int last_io_disp[2];
/*
- * The following two fields are updated when new configuration is
- * submitted while some bios are still throttled, they record how many
- * bytes/ios are waited already in previous configuration, and they will
- * be used to calculate wait time under new configuration.
+ * The following two fields are updated when:
+ * 1) new configuration is submitted while some bios are still
+ * throttled, they record how many bytes/ios are waited already in
+ * previous configuration;
+ * 2) IOs which may cause priority inversions are dispatched while tg is
+ * over limit, these IOs will be dispatched directly;
+ * 3) While calculating wait_time for IO, extra_bytes * HZ is not
+ * divisible by bps_limit, the remainder will be recorded;
*/
long long carryover_bytes[2];
int carryover_ios[2];
--
2.39.2
Powered by blists - more mailing lists