linux-kernel - Re: [PATCH v2] blk-mq: Use request queue-wide tags for tagset-wide sbitmap

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YJs2KWMCn2kpyryT@T590>
Date:   Wed, 12 May 2021 09:58:01 +0800
From:   Ming Lei <ming.lei@...hat.com>
To:     John Garry <john.garry@...wei.com>
Cc:     axboe@...nel.dk, linux-block@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-scsi@...r.kernel.org,
        kashyap.desai@...adcom.com, chenxiang66@...ilicon.com,
        yama@...hat.com, dgilbert@...erlog.com
Subject: Re: [PATCH v2] blk-mq: Use request queue-wide tags for tagset-wide
 sbitmap

On Wed, May 12, 2021 at 12:15:43AM +0800, John Garry wrote:
> The tags used for an IO scheduler are currently per hctx.
> 
> As such, when q->nr_hw_queues grows, so does the request queue total IO
> scheduler tag depth.
> 
> This may cause problems for SCSI MQ HBAs whose total driver depth is
> fixed.
> 
> Ming and Yanhui report higher CPU usage and lower throughput in scenarios
> where the fixed total driver tag depth is appreciably lower than the total
> scheduler tag depth:
> https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b
> 
> In that scenario, since the scheduler tag is got first, much contention
> is introduced since a driver tag may not be available after we have got
> the sched tag.
> 
> Improve this scenario by introducing request queue-wide tags for when
> a tagset-wide sbitmap is used. The static sched requests are still
> allocated per hctx, as requests are initialised per hctx, as in
> blk_mq_init_request(..., hctx_idx, ...) ->
> set->ops->init_request(.., hctx_idx, ...).
> 
> For simplicity of resizing the request queue sbitmap when updating the
> request queue depth, just init at the max possible size, so we don't need
> to deal with the possibly with swapping out a new sbitmap for old if
> we need to grow.
> 
> Signed-off-by: John Garry <john.garry@...wei.com>
> ---
> 
> Please retest, thanks! For some reason I could not recreate the original
> issue, but I am using qemu...
> 
> Changes since v1:
> - Embed sbitmaps in request_queue struct
> - Relocate IO sched functions to blk-mq-sched.c
> - Fix error path code
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 42a365b1b9c0..9a012e0818cb 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -507,11 +507,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
>  				   struct blk_mq_hw_ctx *hctx,
>  				   unsigned int hctx_idx)
>  {
> -	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
>  	if (hctx->sched_tags) {
>  		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
> -		blk_mq_free_rq_map(hctx->sched_tags, flags);
> +		blk_mq_free_rq_map(hctx->sched_tags, set->flags);
>  		hctx->sched_tags = NULL;
>  	}
>  }
> @@ -521,12 +519,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
>  				   unsigned int hctx_idx)
>  {
>  	struct blk_mq_tag_set *set = q->tag_set;
> -	/* Clear HCTX_SHARED so tags are init'ed */
> -	unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
>  	int ret;
>  
>  	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
> -					       set->reserved_tags, flags);
> +					       set->reserved_tags, set->flags);
>  	if (!hctx->sched_tags)
>  		return -ENOMEM;
>  
> @@ -544,16 +540,40 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
>  	int i;
>  
>  	queue_for_each_hw_ctx(q, hctx, i) {
> -		/* Clear HCTX_SHARED so tags are freed */
> -		unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
>  		if (hctx->sched_tags) {
> -			blk_mq_free_rq_map(hctx->sched_tags, flags);
> +			blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
>  			hctx->sched_tags = NULL;
>  		}
>  	}
>  }
>  
> +static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
> +{
> +	struct blk_mq_tag_set *set = queue->tag_set;
> +	int ret;
> +
> +	/*
> +	 * Set initial depth at max so that we don't need to reallocate for
> +	 * updating nr_requests.
> +	 */
> +	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
> +				  &queue->sched_breserved_tags,
> +				  set, MAX_SCHED_RQ, set->reserved_tags);
> +	if (ret)
> +		return ret;
> +
> +	sbitmap_queue_resize(&queue->sched_bitmap_tags,
> +			     queue->nr_requests - set->reserved_tags);
> +
> +	return 0;
> +}
> +
> +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
> +{
> +	sbitmap_queue_free(&queue->sched_bitmap_tags);
> +	sbitmap_queue_free(&queue->sched_breserved_tags);
> +}
> +
>  int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  {
>  	struct blk_mq_hw_ctx *hctx;
> @@ -578,12 +598,25 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  	queue_for_each_hw_ctx(q, hctx, i) {
>  		ret = blk_mq_sched_alloc_tags(q, hctx, i);
>  		if (ret)
> -			goto err;
> +			goto err_free_tags;
> +	}
> +
> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> +		ret = blk_mq_init_sched_shared_sbitmap(q);
> +		if (ret)
> +			goto err_free_tags;
> +
> +		queue_for_each_hw_ctx(q, hctx, i) {
> +			hctx->sched_tags->bitmap_tags =
> +						&q->sched_bitmap_tags;
> +			hctx->sched_tags->breserved_tags =
> +						&q->sched_breserved_tags;
> +		}

The above assignment can be folded into blk_mq_init_sched_shared_sbitmap().

>  	}
>  
>  	ret = e->ops.init_sched(q, e);
>  	if (ret)
> -		goto err;
> +		goto err_free_sbitmap;
>  
>  	blk_mq_debugfs_register_sched(q);
>  
> @@ -603,7 +636,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  
>  	return 0;
>  
> -err:
> +err_free_sbitmap:
> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> +		blk_mq_exit_sched_shared_sbitmap(q);
> +err_free_tags:
>  	blk_mq_sched_free_requests(q);
>  	blk_mq_sched_tags_teardown(q);
>  	q->elevator = NULL;
> @@ -641,5 +677,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
>  	if (e->type->ops.exit_sched)
>  		e->type->ops.exit_sched(e);
>  	blk_mq_sched_tags_teardown(q);
> +	if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> +		blk_mq_exit_sched_shared_sbitmap(q);

The above two lines can be moved to blk_mq_sched_tags_teardown().

>  	q->elevator = NULL;
>  }
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 5b18ab915c65..aff037cfd8e7 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -5,6 +5,8 @@
>  #include "blk-mq.h"
>  #include "blk-mq-tag.h"
>  
> +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
> +
>  void blk_mq_sched_assign_ioc(struct request *rq);
>  
>  bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 2a37731e8244..e3ab8631be22 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -13,6 +13,7 @@
>  #include <linux/delay.h>
>  #include "blk.h"
>  #include "blk-mq.h"
> +#include "blk-mq-sched.h"
>  #include "blk-mq-tag.h"
>  
>  /*
> @@ -466,19 +467,39 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
>  	return -ENOMEM;
>  }
>  
> -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
> +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
> +			struct sbitmap_queue *breserved_tags,
> +			struct blk_mq_tag_set *set,

The 'set' parameter can be killed, meantime pass 'node' & 'alloc_policy',
just like blk_mq_init_bitmap_tags()'s type, then blk_mq_init_bitmaps()
can be re-used by blk_mq_init_bitmap_tags() for avoiding to duplicate
bitmap allocation code.

> +			unsigned int queue_depth, unsigned int reserved)

>  {
> -	unsigned int depth = set->queue_depth - set->reserved_tags;
> +	unsigned int depth = queue_depth - reserved;
>  	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
>  	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
> -	int i, node = set->numa_node;
>  
> -	if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
> +	if (bt_alloc(bitmap_tags, depth, round_robin, set->numa_node))
>  		return -ENOMEM;
> -	if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
> -		     round_robin, node))
> +	if (bt_alloc(breserved_tags, set->reserved_tags,

s/set->reserved_tags/reserved/


Thanks, 
Ming