Message-ID: <c93e88cd-ade4-4264-8d80-a6669e361c32@suse.de>
Date: Fri, 21 Nov 2025 08:18:14 +0100
From: Hannes Reinecke <hare@...e.de>
To: Yu Kuai <yukuai@...as.com>, axboe@...nel.dk, nilay@...ux.ibm.com,
bvanassche@....org, linux-block@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v6 6/8] mq-deadline: convert to use
request_queue->async_depth
On 11/21/25 06:28, Yu Kuai wrote:
> In our downstream kernel we tested mq-deadline with many fio workloads and
> found a performance regression after commit 39823b47bbd4
> ("block/mq-deadline: Fix the tag reservation code") with the following test:
>
> [global]
> rw=randread
> direct=1
> ramp_time=1
> ioengine=libaio
> iodepth=1024
> numjobs=24
> bs=1024k
> group_reporting=1
> runtime=60
>
> [job1]
> filename=/dev/sda
>
> The root cause is that mq-deadline now supports configuring async_depth.
> Although the default value is nr_requests, the minimal value is 1, so
> min_shallow_depth is set to 1, which in turn forces wake_batch to 1. As a
> consequence, the sbitmap_queue is woken up after every IO instead of
> after every 8 IOs.
>
> In this test case sda is an HDD and max_sectors is 128k, so each
> submitted 1M IO is split into 8 sequential 128k requests. However,
> because there are 24 jobs and all tags are exhausted, those 8 requests
> are unlikely to be dispatched sequentially, and changing wake_batch to 1
> makes this much worse: measuring the blktrace D stage, the percentage
> of sequential IO drops from 8% to 0.8%.
>
> Fix this problem by converting to request_queue->async_depth, where
> min_shallow_depth is set each time async_depth is updated.
>
> Note that the elevator attribute async_depth is removed; the queue
> attribute with the same name is used instead.
>
> Fixes: 39823b47bbd4 ("block/mq-deadline: Fix the tag reservation code")
> Signed-off-by: Yu Kuai <yukuai@...as.com>
> Reviewed-by: Nilay Shroff <nilay@...ux.ibm.com>
> Reviewed-by: Bart Van Assche <bvanassche@....org>
> ---
> block/mq-deadline.c | 39 +++++----------------------------------
> 1 file changed, 5 insertions(+), 34 deletions(-)
>
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> index 29d00221fbea..95917a88976f 100644
> --- a/block/mq-deadline.c
> +++ b/block/mq-deadline.c
> @@ -98,7 +98,6 @@ struct deadline_data {
> int fifo_batch;
> int writes_starved;
> int front_merges;
> - u32 async_depth;
> int prio_aging_expire;
>
> spinlock_t lock;
> @@ -486,32 +485,16 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
> return rq;
> }
>
> -/*
> - * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
> - * function is used by __blk_mq_get_tag().
> - */
> static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
> {
> - struct deadline_data *dd = data->q->elevator->elevator_data;
> -
> - /* Do not throttle synchronous reads. */
> - if (blk_mq_is_sync_read(opf))
> - return;
> -
> - /*
> - * Throttle asynchronous requests and writes such that these requests
> - * do not block the allocation of synchronous requests.
> - */
> - data->shallow_depth = dd->async_depth;
> + if (!blk_mq_is_sync_read(opf))
> + data->shallow_depth = data->q->async_depth;
> }
>
> -/* Called by blk_mq_update_nr_requests(). */
> +/* Called by blk_mq_init_sched() and blk_mq_update_nr_requests(). */
> static void dd_depth_updated(struct request_queue *q)
> {
> - struct deadline_data *dd = q->elevator->elevator_data;
> -
> - dd->async_depth = q->nr_requests;
> - blk_mq_set_min_shallow_depth(q, 1);
> + blk_mq_set_min_shallow_depth(q, q->async_depth);
> }
>
> static void dd_exit_sched(struct elevator_queue *e)
> @@ -576,6 +559,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
> blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
>
> q->elevator = eq;
> + q->async_depth = q->nr_requests;
> dd_depth_updated(q);
> return 0;
> }
> @@ -763,7 +747,6 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
> SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
> SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
> SHOW_INT(deadline_front_merges_show, dd->front_merges);
> -SHOW_INT(deadline_async_depth_show, dd->async_depth);
> SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
> #undef SHOW_INT
> #undef SHOW_JIFFIES
> @@ -793,7 +776,6 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA
> STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
> STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
> STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
> -STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
> STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
> #undef STORE_FUNCTION
> #undef STORE_INT
> @@ -807,7 +789,6 @@ static const struct elv_fs_entry deadline_attrs[] = {
> DD_ATTR(write_expire),
> DD_ATTR(writes_starved),
> DD_ATTR(front_merges),
> - DD_ATTR(async_depth),
> DD_ATTR(fifo_batch),
> DD_ATTR(prio_aging_expire),
> __ATTR_NULL
> @@ -894,15 +875,6 @@ static int deadline_starved_show(void *data, struct seq_file *m)
> return 0;
> }
>
> -static int dd_async_depth_show(void *data, struct seq_file *m)
> -{
> - struct request_queue *q = data;
> - struct deadline_data *dd = q->elevator->elevator_data;
> -
> - seq_printf(m, "%u\n", dd->async_depth);
> - return 0;
> -}
> -
> static int dd_queued_show(void *data, struct seq_file *m)
> {
> struct request_queue *q = data;
> @@ -1002,7 +974,6 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
> DEADLINE_NEXT_RQ_ATTR(write2),
> {"batching", 0400, deadline_batching_show},
> {"starved", 0400, deadline_starved_show},
> - {"async_depth", 0400, dd_async_depth_show},
> {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops},
> {"owned_by_driver", 0400, dd_owned_by_driver_show},
> {"queued", 0400, dd_queued_show},
... and this is now the opposite. We are removing an existing attribute
(raising questions about userland ABI stability).
Wouldn't it be better to use the generic infrastructure (i.e. have a
per-request-queue async_depth variable) but keep the per-elevator
sysfs attributes, which would then just display the per-queue variable?
That way we won't break userland, and we get around the awkward issue
of presenting a dummy attribute when no elevator is loaded.
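Something along these lines (rough, untested sketch; it assumes
deadline_data caches a request_queue back-pointer, say dd->q, set up in
dd_init_sched(), so the elevator sysfs helpers can reach the new
per-queue field):

static ssize_t deadline_async_depth_show(struct elevator_queue *e,
					 char *page)
{
	struct deadline_data *dd = e->elevator_data;

	/* dd->q: assumed back-pointer to the queue; just mirror its value */
	return sysfs_emit(page, "%u\n", dd->q->async_depth);
}

static ssize_t deadline_async_depth_store(struct elevator_queue *e,
					  const char *page, size_t count)
{
	struct deadline_data *dd = e->elevator_data;
	unsigned int val;
	int ret;

	ret = kstrtouint(page, 10, &val);
	if (ret)
		return ret;
	if (!val)
		return -EINVAL;

	/* same effect as writing the queue attribute */
	dd->q->async_depth = min_t(unsigned int, val, dd->q->nr_requests);
	blk_mq_set_min_shallow_depth(dd->q, dd->q->async_depth);
	return count;
}

and keep DD_ATTR(async_depth) in deadline_attrs[], so the old sysfs path
stays around while the actual state lives in the request_queue.
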
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@...e.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich