lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250222092823.210318-3-yukuai1@huaweicloud.com>
Date: Sat, 22 Feb 2025 17:28:23 +0800
From: Yu Kuai <yukuai1@...weicloud.com>
To: ming.lei@...hat.com,
	tj@...nel.org,
	josef@...icpanda.com,
	axboe@...nel.dk
Cc: cgroups@...r.kernel.org,
	linux-block@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	yukuai3@...wei.com,
	yukuai1@...weicloud.com,
	yi.zhang@...wei.com,
	yangerkun@...wei.com
Subject: [PATCH 2/2] blk-throttle: fix off-by-one jiffies wait_time

From: Yu Kuai <yukuai3@...wei.com>

wait_time is based on jiffies, and the calculation is:

wait_time = extra_bytes * HZ / bps_limit;

If wait time is not zero, the remainder is ignored, means wait time is
short by at most 1 jiffes; On the other hand, if wait time is zero, it
will be used as 1 jiffies, means it's excessive by at most 1 jiffes.

This way causes blktests throtl/001 failure in case of CONFIG_HZ_100=y,
fix the problem by recording remainder as debt, and repay the debt
later.

Reported-and-tested-by: Ming Lei <ming.lei@...hat.com>
Closes: https://lore.kernel.org/all/20250220111735.1187999-1-ming.lei@redhat.com/
Signed-off-by: Yu Kuai <yukuai3@...wei.com>
---
 block/blk-throttle.c | 24 +++++++++++++++++-------
 block/blk-throttle.h | 12 ++++++++----
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0096e486b1e3..3828c6535605 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -703,9 +703,10 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
 }
 
 static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
-				u64 bps_limit)
+				u64 bps_limit, bool *has_debt)
 {
 	bool rw = bio_data_dir(bio);
+	long long carryover_bytes;
 	long long bytes_allowed;
 	u64 extra_bytes;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
@@ -730,10 +731,16 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
 
 	/* Calc approx time to dispatch */
 	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
-	jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
-
-	if (!jiffy_wait)
-		jiffy_wait = 1;
+	jiffy_wait = div64_u64_rem(extra_bytes * HZ, bps_limit, &carryover_bytes);
+	if (carryover_bytes) {
+		/*
+		 * If extra_bytes is not divisible, the remainder is recorded as
+		 * debt. Caller must ensure the current slice has at least 1
+		 * more jiffies to repay the debt.
+		 */
+		*has_debt = true;
+		tg->carryover_bytes[rw] -= div64_u64(carryover_bytes, HZ);
+	}
 
 	/*
 	 * This wait time is without taking into consideration the rounding
@@ -754,6 +761,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
 	u64 bps_limit = tg_bps_limit(tg, rw);
 	u32 iops_limit = tg_iops_limit(tg, rw);
+	bool has_debt = false;
 
 	/*
  	 * Currently whole state machine of group depends on first bio
@@ -784,18 +792,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	else
 		throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
 
-	bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+	bps_wait = tg_within_bps_limit(tg, bio, bps_limit, &has_debt);
 	iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
 	if (bps_wait + iops_wait == 0) {
 		if (wait)
 			*wait = 0;
+		if (has_debt)
+			throtl_extend_slice(tg, rw, jiffies + 1);
 		return true;
 	}
 
 	max_wait = max(bps_wait, iops_wait);
 	if (wait)
 		*wait = max_wait;
-	throtl_extend_slice(tg, rw, jiffies + max_wait);
+	throtl_extend_slice(tg, rw, jiffies + max_wait + has_debt);
 
 	return false;
 }
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 1a36d1278eea..56dcb5ce412f 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -110,10 +110,14 @@ struct throtl_grp {
 	unsigned int last_io_disp[2];
 
 	/*
-	 * The following two fields are updated when new configuration is
-	 * submitted while some bios are still throttled, they record how many
-	 * bytes/ios are waited already in previous configuration, and they will
-	 * be used to calculate wait time under new configuration.
+	 * The following two fields are updated when:
+	 * 1) new configuration is submitted while some bios are still
+	 * throttled, they record how many bytes/ios are waited already in
+	 * previous configuration;
+	 * 2) IOs which may cause priority inversions are dispatched while tg is
+	 * over limit, these IOs will be dispatched directly;
+	 * 3) While calculating wait_time for IO, extra_bytes * HZ is not
+	 * divisible by bps_limit, the remainder will be recorded;
 	 */
 	long long carryover_bytes[2];
 	int carryover_ios[2];
-- 
2.39.2


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ