Message-ID: <028fe809-3902-059f-586e-a3119ea63891@huaweicloud.com>
Date:   Fri, 1 Jul 2022 15:39:35 +0800
From:   Yu Kuai <yukuai1@...weicloud.com>
To:     Yu Kuai <yukuai3@...wei.com>, tj@...nel.org, mkoutny@...e.com,
        axboe@...nel.dk, ming.lei@...hat.com
Cc:     cgroups@...r.kernel.org, linux-block@...r.kernel.org,
        linux-kernel@...r.kernel.org, yi.zhang@...wei.com
Subject: Re: [PATCH] blk-throttle: fix io hung due to config updates

On 2022/07/01 15:49, Yu Kuai wrote:

Please ignore this patch; it was sent by mistake.

Kuai

> If a new configuration is submitted while a bio is throttled, the
> waiting time is recalculated from scratch, ignoring the time the bio
> might already have waited:
>
> tg_conf_updated
>   throtl_start_new_slice
>    tg_update_disptime
>    throtl_schedule_next_dispatch
>
> Thus an io hang can be triggered by repeatedly submitting a new
> configuration before the throttled bio is dispatched.
>
> Fix the problem by respecting the time the throttled bio has already
> waited. In order to do that, add new fields to record how many
> bytes/ios the bio has effectively already waited for, and use them to
> calculate the wait time for the throttled bio under the new
> configuration.
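>
> In other words (a rough sketch of the idea rather than the exact code,
> with illustrative names): when the configuration changes, the bytes a
> queued bio could already have dispatched under the old limit are
> credited,
>
>   bytes_skipped[rw] += calculate_bytes_allowed(old_bps_limit, elapsed)
>                        - bytes_disp[rw];
>
> and the check against the new limit then counts that credit:
>
>   bytes_allowed = calculate_bytes_allowed(new_bps_limit, elapsed_rnd)
>                   + bytes_skipped[rw];
>
> The iops limit is handled the same way.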
>
> Some simple tests:
> 1)
> cd /sys/fs/cgroup/blkio/
> echo $$ > cgroup.procs
> echo "8:0 2048" > blkio.throttle.write_bps_device
> {
>          sleep 2
>          echo "8:0 1024" > blkio.throttle.write_bps_device
> } &
> dd if=/dev/zero of=/dev/sda bs=8k count=1 oflag=direct
>
> 2)
> cd /sys/fs/cgroup/blkio/
> echo $$ > cgroup.procs
> echo "8:0 1024" > blkio.throttle.write_bps_device
> {
>          sleep 4
>          echo "8:0 2048" > blkio.throttle.write_bps_device
> } &
> dd if=/dev/zero of=/dev/sda bs=8k count=1 oflag=direct
>
> Test results (io finish time):
> 	before this patch	with this patch
> 1)	10s			6s
> 2)	8s			6s
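>
> These numbers follow from the limits (a back-of-the-envelope check):
> in test 1, dd issues 8192 bytes at 2048 bytes/s. After 2s, 2 * 2048 =
> 4096 bytes of credit have accrued; with this patch the remaining
> 4096 bytes at the new 1024 bytes/s limit take 4 more seconds, 6s in
> total, while without it the full 8192 bytes are recharged at
> 1024 bytes/s, 8 more seconds, 10s in total. Test 2 is symmetric:
> 4 * 1024 = 4096 bytes of credit after 4s, then the remaining
> 4096 bytes at 2048 bytes/s take 2 more seconds (6s total), versus
> recharging all 8192 bytes at 2048 bytes/s for 4 more seconds
> (8s total).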
>
> Signed-off-by: Yu Kuai <yukuai3@...wei.com>
> ---
>   block/blk-throttle.c | 51 ++++++++++++++++++++++++++++++++++++++------
>   block/blk-throttle.h |  9 ++++++++
>   2 files changed, 54 insertions(+), 6 deletions(-)
>
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 8612a071305e..8178ef278e7a 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -639,6 +639,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
>   {
>   	tg->bytes_disp[rw] = 0;
>   	tg->io_disp[rw] = 0;
> +	tg->bytes_skipped[rw] = 0;
> +	tg->io_skipped[rw] = 0;
>   
>   	/*
>   	 * Previous slice has expired. We must have trimmed it after last
> @@ -656,12 +658,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
>   		   tg->slice_end[rw], jiffies);
>   }
>   
> -static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
> +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
> +					  bool clear_skipped)
>   {
>   	tg->bytes_disp[rw] = 0;
>   	tg->io_disp[rw] = 0;
>   	tg->slice_start[rw] = jiffies;
>   	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
> +	if (clear_skipped) {
> +		tg->bytes_skipped[rw] = 0;
> +		tg->io_skipped[rw] = 0;
> +	}
>   
>   	throtl_log(&tg->service_queue,
>   		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
> @@ -783,6 +790,34 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
>   	return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
>   }
>   
> +static void __tg_update_skipped(struct throtl_grp *tg, bool rw)
> +{
> +	unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
> +	u64 bps_limit = tg_bps_limit(tg, rw);
> +	u32 iops_limit = tg_iops_limit(tg, rw);
> +
> +	if (bps_limit != U64_MAX)
> +		tg->bytes_skipped[rw] +=
> +			calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
> +			tg->bytes_disp[rw];
> +	if (iops_limit != UINT_MAX)
> +		tg->io_skipped[rw] +=
> +			calculate_io_allowed(iops_limit, jiffy_elapsed) -
> +			tg->io_disp[rw];
> +}
> +
> +static void tg_update_skipped(struct throtl_grp *tg)
> +{
> +	if (tg->service_queue.nr_queued[READ])
> +		__tg_update_skipped(tg, READ);
> +	if (tg->service_queue.nr_queued[WRITE])
> +		__tg_update_skipped(tg, WRITE);
> +
> +	throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__,
> +		   tg->bytes_skipped[READ], tg->bytes_skipped[WRITE],
> +		   tg->io_skipped[READ], tg->io_skipped[WRITE]);
> +}
> +
>   static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
>   				  u32 iops_limit, unsigned long *wait)
>   {
> @@ -800,7 +835,8 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
>   
>   	/* Round up to the next throttle slice, wait time must be nonzero */
>   	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
> -	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
> +	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
> +		     tg->io_skipped[rw];
>   	if (tg->io_disp[rw] + 1 <= io_allowed) {
>   		if (wait)
>   			*wait = 0;
> @@ -837,7 +873,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
>   		jiffy_elapsed_rnd = tg->td->throtl_slice;
>   
>   	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
> -	bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
> +	bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
> +			tg->bytes_skipped[rw];
>   	if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
>   		if (wait)
>   			*wait = 0;
> @@ -898,7 +935,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
>   	 * slice and it should be extended instead.
>   	 */
>   	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
> -		throtl_start_new_slice(tg, rw);
> +		throtl_start_new_slice(tg, rw, true);
>   	else {
>   		if (time_before(tg->slice_end[rw],
>   		    jiffies + tg->td->throtl_slice))
> @@ -1327,8 +1364,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
>   	 * that a group's limit are dropped suddenly and we don't want to
>   	 * account recently dispatched IO with new low rate.
>   	 */
> -	throtl_start_new_slice(tg, READ);
> -	throtl_start_new_slice(tg, WRITE);
> +	throtl_start_new_slice(tg, READ, false);
> +	throtl_start_new_slice(tg, WRITE, false);
>   
>   	if (tg->flags & THROTL_TG_PENDING) {
>   		tg_update_disptime(tg);
> @@ -1356,6 +1393,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
>   		v = U64_MAX;
>   
>   	tg = blkg_to_tg(ctx.blkg);
> +	tg_update_skipped(tg);
>   
>   	if (is_u64)
>   		*(u64 *)((void *)tg + of_cft(of)->private) = v;
> @@ -1542,6 +1580,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
>   		return ret;
>   
>   	tg = blkg_to_tg(ctx.blkg);
> +	tg_update_skipped(tg);
>   
>   	v[0] = tg->bps_conf[READ][index];
>   	v[1] = tg->bps_conf[WRITE][index];
> diff --git a/block/blk-throttle.h b/block/blk-throttle.h
> index c1b602996127..371d624af845 100644
> --- a/block/blk-throttle.h
> +++ b/block/blk-throttle.h
> @@ -115,6 +115,15 @@ struct throtl_grp {
>   	uint64_t bytes_disp[2];
>   	/* Number of bio's dispatched in current slice */
>   	unsigned int io_disp[2];
> +	/*
> +	 * The following two fields are used to calculate the new wait time
> +	 * for a throttled bio when a new configuration is submitted.
> +	 *
> +	 * Number of bytes that will be skipped in the current slice
> +	 */
> +	uint64_t bytes_skipped[2];
> +	/* Number of bios that will be skipped in the current slice */
> +	unsigned int io_skipped[2];
>   
>   	unsigned long last_low_overflow_time[2];
>   
