Message-ID: <3140b37c-a2b5-3421-da32-748fc2a54e6f@bytedance.com>
Date:   Sat, 28 Jan 2023 23:11:12 +0800
From:   hanjinke <hanjinke.666@...edance.com>
To:     tj@...nel.org, josef@...icpanda.com, axboe@...nel.dk
Cc:     cgroups@...r.kernel.org, linux-block@...r.kernel.org,
        linux-kernel@...r.kernel.org, yinxin.x@...edance.com
Subject: Re: [PATCH v6] blk-throtl: Introduce sync and async queues for
 blk-throtl

Hi
pinging...

On 2023/1/12 11:25 PM, Jinke Han wrote:
> From: Jinke Han <hanjinke.666@...edance.com>
> 
> Currently, blk-throtl does not distinguish sync write ios from normal
> buffered write ios. A bio tagged with REQ_SYNC usually means that it
> will be waited upon for completion soon after it is submitted, so it is
> reasonable for sync io to complete as soon as possible.
> 
> In our test, fio writes a 100g file sequentially with a 4k block size
> in a container configured with a low bps limit (wbps=10M). More than
> 1200 ios were throttled in the blk-throtl queue and the average
> throttle time of each io was 140s. At the same time, saving a small
> file with vim was blocked for almost 140s: vim issues an fsync, and the
> sync ios of that fsync were stuck behind the huge number of buffered
> write ios already queued ahead of them. This is a priority inversion
> problem within one cgroup. In database scenarios, things get really bad
> with blk-throtl enabled, as fsync is called very often.
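> 
> As a rough illustration (not part of this patch), a small user-space
> probe along these lines can be used to watch the fsync stall from
> inside the throttled cgroup; the file name and write size here are
> arbitrary:
> 
>     /* write a small file, then time how long the fsync() takes */
>     #include <fcntl.h>
>     #include <stdio.h>
>     #include <string.h>
>     #include <time.h>
>     #include <unistd.h>
> 
>     int main(void)
>     {
>             char buf[4096];
>             struct timespec t0, t1;
>             int fd = open("small.txt", O_CREAT | O_WRONLY | O_TRUNC, 0644);
> 
>             if (fd < 0)
>                     return 1;
>             memset(buf, 'a', sizeof(buf));
>             write(fd, buf, sizeof(buf));
>             clock_gettime(CLOCK_MONOTONIC, &t0);
>             fsync(fd);      /* stalls behind the queued buffered writes */
>             clock_gettime(CLOCK_MONOTONIC, &t1);
>             printf("fsync took %.3f s\n", (t1.tv_sec - t0.tv_sec) +
>                    (t1.tv_nsec - t0.tv_nsec) / 1e9);
>             close(fd);
>             return 0;
>     }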
> 
> This patch splits the bio queue into sync and async queues for
> blk-throtl and gives a much higher priority to sync write ios. The sync
> queue only makes sense for write ios, as all read ios are treated as
> sync ios; I think this matches the semantics of REQ_SYNC well. Bios
> with REQ_META or REQ_PRIO get the same priority, as they are important
> to the filesystem. This may also avoid some potential priority
> inversion problems.
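> 
> As a sketch of the intended ordering (a user-space model only, not the
> kernel code; the queue depths below are made up), the dispatch loop
> behaves roughly like this:
> 
>     /* model of the 4:1 sync/async dispatch interleave */
>     #include <stdio.h>
> 
>     #define SYNC_FACTOR 4
> 
>     int main(void)
>     {
>             int sync_left = 10, async_left = 3;     /* pretend queue depths */
>             unsigned int disp_sync_cnt = 0;
> 
>             while (sync_left || async_left) {
>                     /* prefer SYNC until SYNC_FACTOR in a row, then one ASYNC */
>                     int from_sync = disp_sync_cnt < SYNC_FACTOR;
> 
>                     /* fall back to the other queue if the chosen one is empty */
>                     if (from_sync && !sync_left)
>                             from_sync = 0;
>                     else if (!from_sync && !async_left)
>                             from_sync = 1;
> 
>                     printf("dispatch %s\n", from_sync ? "SYNC" : "ASYNC");
>                     if (from_sync)
>                             sync_left--;
>                     else
>                             async_left--;
> 
>                     if (from_sync && disp_sync_cnt < SYNC_FACTOR)
>                             disp_sync_cnt++;
>                     else
>                             disp_sync_cnt = 0;
>             }
>             return 0;
>     }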
> 
> With this patch, running the same test as above, the duration of the
> fsync issued by vim drops to a few hundred milliseconds.
> 
> Acked-by: Tejun Heo <tj@...nel.org>
> Signed-off-by: Jinke Han <hanjinke.666@...edance.com>
> Signed-off-by: Xin Yin <yinxin.x@...edance.com>
> ---
> 
> Changes in v2
> - Make code more simple.
> Changes in v3
> - Fix mismatch of waiting bio and the next dispatched bio.
> - Rename dispatch_sync_cnt to disp_sync_cnt.
> - Add more comments.
> Changes in v4
> - Improve patch in code style and code comments.
> Changes in v5
> - Add more comments for next_to_disp.
> Changes in v6
> - Adjust the comments for next_to_disp.
>    
>   block/blk-throttle.c | 144 ++++++++++++++++++++++++++++++++++++++++---
>   block/blk-throttle.h |  13 +++-
>   2 files changed, 148 insertions(+), 9 deletions(-)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 6fb5a2f9e1ee..966b7ee48fd8 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -21,6 +21,13 @@
>   /* Total max dispatch from all groups in one round */
>   #define THROTL_QUANTUM 32
>   
> +/* For write ios, dispatch 4 sync ios and 1 normal io in one loop */
> +#define THROTL_SYNC_FACTOR 4
> +
> +/* Only makes sense for write ios; all read ios are treated as SYNC */
> +#define SYNC	0
> +#define ASYNC	1
> +
>   /* Throttling is performed over a slice and after that slice is renewed */
>   #define DFL_THROTL_SLICE_HD (HZ / 10)
>   #define DFL_THROTL_SLICE_SSD (HZ / 50)
> @@ -241,11 +248,30 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
>   	return bio->bi_iter.bi_size;
>   }
>   
> -static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
> +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg,
> +			      bool rw)
>   {
>   	INIT_LIST_HEAD(&qn->node);
> -	bio_list_init(&qn->bios);
> +	bio_list_init(&qn->bios[SYNC]);
> +	bio_list_init(&qn->bios[ASYNC]);
>   	qn->tg = tg;
> +	qn->disp_sync_cnt = rw == READ ? UINT_MAX : 0;
> +	qn->next_to_disp = NULL;
> +}
> +
> +static inline void throtl_qnode_add_bio_list(struct throtl_qnode *qn,
> +					     struct bio *bio)
> +{
> +	bool rw = bio_data_dir(bio);
> +
> +	/*
> +	 * All read bios and those write bios with REQ_SYNC, REQ_META or
> +	 * REQ_PRIO flags set are treated as SYNC io.
> +	 */
> +	if (rw == READ || bio->bi_opf & (REQ_SYNC | REQ_META | REQ_PRIO))
> +		bio_list_add(&qn->bios[SYNC], bio);
> +	else
> +		bio_list_add(&qn->bios[ASYNC], bio);
>   }
>   
>   /**
> @@ -261,13 +287,84 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
>   static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
>   				 struct list_head *queued)
>   {
> -	bio_list_add(&qn->bios, bio);
> +	throtl_qnode_add_bio_list(qn, bio);
>   	if (list_empty(&qn->node)) {
>   		list_add_tail(&qn->node, queued);
>   		blkg_get(tg_to_blkg(qn->tg));
>   	}
>   }
>   
> +/**
> + * throtl_qnode_bio_list_pop - pop a bio from the sync/async queues
> + * @qn: the qnode to pop a bio from
> + *
> + * For writes, target SYNC or ASYNC queue based on disp_sync_cnt. If empty,
> + * try the other queue.
> + */
> +static struct bio *throtl_qnode_bio_list_pop(struct throtl_qnode *qn)
> +{
> +	struct bio *bio;
> +	int from = qn->disp_sync_cnt == THROTL_SYNC_FACTOR ? ASYNC : SYNC;
> +
> +	bio = bio_list_pop(&qn->bios[from]);
> +	if (!bio) {
> +		from = 1 - from;
> +		bio = bio_list_pop(&qn->bios[from]);
> +	}
> +
> +	if (qn->disp_sync_cnt < THROTL_SYNC_FACTOR && from == SYNC)
> +		qn->disp_sync_cnt++;
> +	else
> +		qn->disp_sync_cnt = 0;
> +
> +	return bio;
> +}
> +
> +/**
> + * throtl_qnode_bio_peek - peek a bio from a qn
> + * @qn: the qnode to peek from
> + *
> + * For read, always peek bio from the SYNC queue.
> + *
> + * For write, we first peek the bio pinned in next_to_disp. If it is
> + * NULL, a bio will be popped from the SYNC or ASYNC queue to fill it.
> + * next_to_disp is used to make sure that the previously peeked bio and
> + * the next popped bio are always the same, even if the queue spin lock
> + * was released and re-grabbed in between. Consider the following case:
> + *
> + * Assume that only the ASYNC queue has bios queued (the SYNC queue is
> + * empty), all ASYNC bios are 1M in size and the bps limit is 1M/s. The
> + * throtl_slice is 100ms. The current slice is [jiffies1, jiffies1+100]
> + * and bytes_disp[w] is 0.
> + *
> + * The disp_sync_cnt is 0, as it is reset to 0 after each dispatch of an
> + * ASYNC bio. An ASYNC bio will be peeked and checked in tg_may_dispatch.
> + * Obviously, it can't be dispatched in the current slice and the wait
> + * time is 1000ms. The slice will be extended to [jiffies1, jiffies1+1000]
> + * in tg_may_dispatch. The queue lock is released and a 4k SYNC bio gets
> + * queued during the 1000ms wait. After 1000ms, when it's time to dispatch
> + * the tg, the SYNC bio will be popped and dispatched, as disp_sync_cnt is
> + * 0 and the SYNC queue is non-empty. The slice will be extended to
> + * [jiffies1, jiffies1+1100] in tg_may_dispatch, then trimmed to
> + * [jiffies1+1000, jiffies1+1100] after the SYNC bio is dispatched. This
> + * means that only 4k was dispatched in the last 1000ms and the budget for
> + * that 1000ms has been voided by the trim. If this happens often, it is
> + * hard to fill up the configured bandwidth, considering that the bps
> + * limit is 1M/s.
> + */
> +static inline struct bio *throtl_qnode_bio_peek(struct throtl_qnode *qn)
> +{
> +	/* qn for read ios */
> +	if (qn->disp_sync_cnt == UINT_MAX)
> +		return bio_list_peek(&qn->bios[SYNC]);
> +
> +	/* qn for write ios */
> +	if (!qn->next_to_disp)
> +		qn->next_to_disp = throtl_qnode_bio_list_pop(qn);
> +
> +	return qn->next_to_disp;
> +}
> +
>   /**
>    * throtl_peek_queued - peek the first bio on a qnode list
>    * @queued: the qnode list to peek
> @@ -281,11 +378,42 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
>   		return NULL;
>   
>   	qn = list_first_entry(queued, struct throtl_qnode, node);
> -	bio = bio_list_peek(&qn->bios);
> +	bio = throtl_qnode_bio_peek(qn);
>   	WARN_ON_ONCE(!bio);
>   	return bio;
>   }
>   
> +/**
> + * throtl_qnode_bio_pop - pop a bio from a qnode
> + * @qn: the qnode to pop a bio from
> + */
> +static inline struct bio *throtl_qnode_bio_pop(struct throtl_qnode *qn)
> +{
> +	struct bio *bio;
> +
> +	/* qn for read ios */
> +	if (qn->disp_sync_cnt == UINT_MAX)
> +		return bio_list_pop(&qn->bios[SYNC]);
> +
> +	/* qn for write ios */
> +	if (qn->next_to_disp) {
> +		bio = qn->next_to_disp;
> +		qn->next_to_disp = NULL;
> +		return bio;
> +	}
> +
> +	return throtl_qnode_bio_list_pop(qn);
> +}
> +
> +static inline bool throtl_qnode_empty(struct throtl_qnode *qn)
> +{
> +	if (!qn->next_to_disp &&
> +	    bio_list_empty(&qn->bios[SYNC]) &&
> +	    bio_list_empty(&qn->bios[ASYNC]))
> +		return true;
> +	return false;
> +}
> +
>   /**
>    * throtl_pop_queued - pop the first bio form a qnode list
>    * @queued: the qnode list to pop a bio from
> @@ -310,10 +438,10 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
>   		return NULL;
>   
>   	qn = list_first_entry(queued, struct throtl_qnode, node);
> -	bio = bio_list_pop(&qn->bios);
> +	bio = throtl_qnode_bio_pop(qn);
>   	WARN_ON_ONCE(!bio);
>   
> -	if (bio_list_empty(&qn->bios)) {
> +	if (throtl_qnode_empty(qn)) {
>   		list_del_init(&qn->node);
>   		if (tg_to_put)
>   			*tg_to_put = qn->tg;
> @@ -355,8 +483,8 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
>   	throtl_service_queue_init(&tg->service_queue);
>   
>   	for (rw = READ; rw <= WRITE; rw++) {
> -		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
> -		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
> +		throtl_qnode_init(&tg->qnode_on_self[rw], tg, rw);
> +		throtl_qnode_init(&tg->qnode_on_parent[rw], tg, rw);
>   	}
>   
>   	RB_CLEAR_NODE(&tg->rb_node);
> diff --git a/block/blk-throttle.h b/block/blk-throttle.h
> index ef4b7a4de987..bd4fff65b6d9 100644
> --- a/block/blk-throttle.h
> +++ b/block/blk-throttle.h
> @@ -28,8 +28,19 @@
>    */
>   struct throtl_qnode {
>   	struct list_head	node;		/* service_queue->queued[] */
> -	struct bio_list		bios;		/* queued bios */
> +	struct bio_list		bios[2];	/* queued bios */
>   	struct throtl_grp	*tg;		/* tg this qnode belongs to */
> +
> +	struct bio		*next_to_disp;	/* pinned for next to dispatch */
> +	/*
> +	 * 1) for write throtl_qnode:
> +	 * [0, THROTL_SYNC_FACTOR-1]: dispatch sync io
> +	 * [THROTL_SYNC_FACTOR]: dispatch async io
> +	 *
> +	 * 2) for read throtl_qnode:
> +	 * UINT_MAX
> +	 */
> +	unsigned int disp_sync_cnt;         /* sync io dispatch counter */
>   };
>   
>   struct throtl_service_queue {
