[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <1430d56b-ed99-6c30-85e2-11d0a8a40a12@huawei.com>
Date: Fri, 7 Jan 2022 12:05:33 +0000
From: John Garry <john.garry@...wei.com>
To: qiulaibin <qiulaibin@...wei.com>,
"axboe@...nel.dk" <axboe@...nel.dk>,
"ming.lei@...hat.com" <ming.lei@...hat.com>
CC: "martin.petersen@...cle.com" <martin.petersen@...cle.com>,
"hare@...e.de" <hare@...e.de>,
"andriy.shevchenko@...ux.intel.com"
<andriy.shevchenko@...ux.intel.com>,
"bvanassche@....org" <bvanassche@....org>,
"linux-block@...r.kernel.org" <linux-block@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH -next v3] blk-mq: fix tag_get wait task can't be awakened
On 06/01/2022 13:34, qiulaibin wrote:
> In case of shared tags, there might be more than one hctx which
> allocates tag from single tags,
how about "allocates from the same tags"?
> and each hctx is limited to allocate at
> most:
> hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
>
> tag idle detection is lazy, and may be delayed for 30sec, so there
> could be just one real active hctx(queue) but all others are actually
> idle and still accounted as active because of the lazy idle detection.
> Then if wake_batch is > hctx_max_depth, driver tag allocation may wait
> forever on this real active hctx.
>
> Fix this by recalculating wake_batch when inc or dec active_queues.
>
> Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
> Suggested-by: Ming Lei <ming.lei@...hat.com>
> Signed-off-by: Laibin Qiu <qiulaibin@...wei.com>
> ---
> block/blk-mq-tag.c | 32 ++++++++++++++++++++++++++++++--
> include/linux/sbitmap.h | 11 +++++++++++
> lib/sbitmap.c | 22 +++++++++++++++++++---
> 3 files changed, 60 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index e55a6834c9a6..e59ebf89c1bf 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -16,6 +16,21 @@
> #include "blk-mq-sched.h"
> #include "blk-mq-tag.h"
>
> +/*
> + * Recalculate wakeup batch when tag is shared by hctx.
> + */
> +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
> + unsigned int users)
> +{
> + if (!users)
> + return;
> +
> + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
> + users);
> + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
> + users);
> +}
> +
> /*
> * If a previously inactive queue goes active, bump the active user count.
> * We need to do this before try to allocate driver tag, then even if fail
> @@ -24,16 +39,25 @@
> */
> bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
> {
> + unsigned int users;
> if (blk_mq_is_shared_tags(hctx->flags)) {
> struct request_queue *q = hctx->queue;
>
> if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
> - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
> + !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
> atomic_inc(&hctx->tags->active_queues);
> +
> + users = atomic_read(&hctx->tags->active_queues);
> + blk_mq_update_wake_batch(hctx->tags, users);
> + }
> } else {
> if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
> atomic_inc(&hctx->tags->active_queues);
> +
> + users = atomic_read(&hctx->tags->active_queues);
atomic_inc_return() can do inc and read together
> + blk_mq_update_wake_batch(hctx->tags, users);
> + }
there seems to be more duplicated code here now, as we do the same in
both legs of the if-else statement.
> }
>
> return true;
> @@ -56,6 +80,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
> void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
> {
> struct blk_mq_tags *tags = hctx->tags;
> + unsigned int users;
>
> if (blk_mq_is_shared_tags(hctx->flags)) {
> struct request_queue *q = hctx->queue;
> @@ -70,6 +95,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
>
> atomic_dec(&tags->active_queues);
>
> + users = atomic_read(&hctx->tags->active_queues);
as above, atomic_dec_return()
> + blk_mq_update_wake_batch(hctx->tags, users);
> +
> blk_mq_tag_wakeup_all(tags, false);
> }
>
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index fc0357a6e19b..e1fced98dfca 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
> sbitmap_free(&sbq->sb);
> }
>
> +/**
> + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
> + * @sbq: Bitmap queue to Recalculate wake batch.
s/Recalculate/recalculate/
> + * @users: Number of shares.
> + *
> + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
> + * by depth. This interface is for sharing tags.
we have the concept of shared tags (flag BLK_MQ_F_TAG_HCTX_SHARED) and queue
shared tags (flag BLK_MQ_F_TAG_QUEUE_SHARED) - please be careful with the
terminology
> + */
> +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int users);
> +
> /**
> * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
> * @sbq: Bitmap queue to resize.
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 2709ab825499..94b3272effd8 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
> }
> EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
>
> -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> - unsigned int depth)
> +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int wake_batch)
> {
> - unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
> int i;
>
> if (sbq->wake_batch != wake_batch) {
> @@ -476,6 +475,23 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> }
> }
>
> +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int depth)
> +{
> + unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
> +
> + __sbitmap_queue_update_wake_batch(sbq, wake_batch);
> +}
> +
> +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int users)
> +{
> + unsigned int wake_batch = clamp_t(unsigned int,
> + (sbq->sb.depth + users - 1) / users, 4U, SBQ_WAKE_BATCH);
> + __sbitmap_queue_update_wake_batch(sbq, wake_batch);
> +}
> +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
> +
> void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
> {
> sbitmap_queue_update_wake_batch(sbq, depth);
>
Powered by blists - more mailing lists