Message-ID: <aTI2L6j50VWjp7aW@kbusch-mbp>
Date: Thu, 4 Dec 2025 18:32:31 -0700
From: Keith Busch <kbusch@...nel.org>
To: Bart Van Assche <bvanassche@....org>
Cc: Mohamed Khalfella <mkhalfella@...estorage.com>,
Chaitanya Kulkarni <kch@...dia.com>, Christoph Hellwig <hch@....de>,
Jens Axboe <axboe@...nel.dk>, Sagi Grimberg <sagi@...mberg.me>,
Casey Chen <cachen@...estorage.com>,
Yuanyuan Zhong <yzhong@...estorage.com>,
Hannes Reinecke <hare@...e.de>, Ming Lei <ming.lei@...hat.com>,
Waiman Long <llong@...hat.com>, Hillf Danton <hdanton@...a.com>,
linux-nvme@...ts.infradead.org, linux-block@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/1] block: Use RCU in blk_mq_[un]quiesce_tagset()
instead of set->tag_list_lock

On Thu, Dec 04, 2025 at 01:22:49PM -1000, Bart Van Assche wrote:
>
> On 12/4/25 11:26 AM, Keith Busch wrote:
> > On Thu, Dec 04, 2025 at 10:24:03AM -1000, Bart Van Assche wrote:
> > > Hence, the deadlock can be
> > > solved by removing the blk_mq_quiesce_tagset() call from nvme_timeout()
> > > and by failing I/O from inside nvme_timeout(). If nvme_timeout() fails
> > > I/O and does not call blk_mq_quiesce_tagset() then the
> > > blk_mq_freeze_queue_wait() call will finish instead of triggering a
> > > deadlock. However, I do not know whether this proposal seems acceptable
> > > to the NVMe maintainers.
> >
> > You periodically make this suggestion, but there's never a reason
> > offered to introduce yet another work queue for the driver to
> > synchronize with at various points. The whole point of making blk-mq
> > timeout handler in a work queue (it used to be a timer) was so that we
> > could do blocking actions like this.
> Hi Keith,
>
> The blk_mq_quiesce_tagset() call from the NVMe timeout handler is
> unfortunate because it triggers a deadlock with
> blk_mq_update_tag_set_shared().

So in this scenario, the driver is shrinking a tagset's queue list from two
queues to one, which causes blk-mq to clear the "shared" flags. The
remaining queue just so happens to have hit a timeout at the same time, and
its timeout handler runs in a context with an elevated "q_usage_counter".
The current rule, then, is that you cannot take the tag_list_lock from any
context that is using any queue in the tag list.
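
Boiled down, the dependency looks something like the userspace sketch
below. To be clear, this is only an illustration of the two competing
paths; none of the names here are real kernel APIs, and the "freeze" is
just a counter spin:

/* cc -pthread sketch.c; the "timeout" thread reports the deadlock after 3s */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t tag_list_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int q_usage_counter = 1;	/* the timeout work already entered the queue */

/* stands in for the tag set update: freeze while holding the lock */
static void *updater(void *arg)
{
	pthread_mutex_lock(&tag_list_lock);
	while (atomic_load(&q_usage_counter))	/* "freeze": wait for the queue to drain */
		usleep(1000);
	pthread_mutex_unlock(&tag_list_lock);
	return NULL;
}

/* stands in for a timeout handler that wants to quiesce the tagset */
static void *timeout(void *arg)
{
	struct timespec ts;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += 3;
	if (pthread_mutex_timedlock(&tag_list_lock, &ts)) {
		puts("deadlock: lock holder is waiting on our q_usage_counter");
		return NULL;
	}
	pthread_mutex_unlock(&tag_list_lock);
	atomic_store(&q_usage_counter, 0);	/* never reached once the updater won the lock */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, updater, NULL);
	usleep(100000);		/* let the updater take tag_list_lock first */
	pthread_create(&b, NULL, timeout, NULL);
	pthread_join(b, NULL);	/* the updater never finishes, so don't join it */
	return 0;
}

Obviously the kernel side is a freeze on a percpu-ref rather than a
spinning loop, but the circular wait has the same shape.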
> I proposed to modify the NVMe driver because I think that's a better
> approach than introducing a new synchronize_rcu() call in the block
> layer core.

I'm not interested in introducing rcu synchronization here either. I guess
I would make it so that you can quiesce a tagset from a context that has
entered the queue. So here's a quick shot at that:
---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e96bb2462475..20450017b9512 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4262,11 +4262,16 @@ static void blk_mq_map_swqueue(struct request_queue *q)
  * Caller needs to ensure that we're either frozen/quiesced, or that
  * the queue isn't live yet.
  */
-static void queue_set_hctx_shared(struct request_queue *q, bool shared)
+static void queue_set_hctx_shared_locked(struct request_queue *q)
 {
+	struct blk_mq_tag_set *set = q->tag_set;
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
+	bool shared;
+
+	lockdep_assert_held(&set->tag_list_lock);
+	shared = set->flags & BLK_MQ_F_TAG_QUEUE_SHARED;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (shared) {
 			hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
@@ -4277,24 +4282,22 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 	}
 }
 
-static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
-					 bool shared)
+static void queue_set_hctx_shared(struct request_queue *q)
 {
-	struct request_queue *q;
+	struct blk_mq_tag_set *set = q->tag_set;
 	unsigned int memflags;
 
-	lockdep_assert_held(&set->tag_list_lock);
-
-	list_for_each_entry(q, &set->tag_list, tag_set_list) {
-		memflags = blk_mq_freeze_queue(q);
-		queue_set_hctx_shared(q, shared);
-		blk_mq_unfreeze_queue(q, memflags);
-	}
+	memflags = blk_mq_freeze_queue(q);
+	mutex_lock(&set->tag_list_lock);
+	queue_set_hctx_shared_locked(q);
+	mutex_unlock(&set->tag_list_lock);
+	blk_mq_unfreeze_queue(q, memflags);
 }
 
 static void blk_mq_del_queue_tag_set(struct request_queue *q)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
+	struct request_queue *shared = NULL;
 
 	mutex_lock(&set->tag_list_lock);
 	list_del(&q->tag_set_list);
@@ -4302,15 +4305,25 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 		/* just transitioned to unshared */
 		set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
 		/* update existing queue */
-		blk_mq_update_tag_set_shared(set, false);
+		shared = list_first_entry(&set->tag_list, struct request_queue,
+					  tag_set_list);
+		if (!blk_get_queue(shared))
+			shared = NULL;
 	}
 	mutex_unlock(&set->tag_list_lock);
 	INIT_LIST_HEAD(&q->tag_set_list);
+
+	if (shared) {
+		queue_set_hctx_shared(shared);
+		blk_put_queue(shared);
+	}
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
				     struct request_queue *q)
 {
+	struct request_queue *shared = NULL;
+
 	mutex_lock(&set->tag_list_lock);
 
 	/*
@@ -4318,15 +4331,24 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
	 */
 	if (!list_empty(&set->tag_list) &&
	    !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
-		set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
 		/* update existing queue */
-		blk_mq_update_tag_set_shared(set, true);
+		shared = list_first_entry(&set->tag_list, struct request_queue,
+					  tag_set_list);
+		if (!blk_get_queue(shared))
+			shared = NULL;
+		else
+			set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
 	}
 	if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-		queue_set_hctx_shared(q, true);
+		queue_set_hctx_shared_locked(q);
 	list_add_tail(&q->tag_set_list, &set->tag_list);
 	mutex_unlock(&set->tag_list_lock);
+
+	if (shared) {
+		queue_set_hctx_shared(shared);
+		blk_put_queue(shared);
+	}
 }
 
 /* All allocations will be freed in release handler of q->mq_kobj */
--
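
And for completeness, the calling pattern this is meant to keep legal is
roughly the one below. It's a sketch of a hypothetical driver's timeout
handler (foo_timeout() is made up, not the actual nvme code); the point is
only that once the tag set update no longer holds tag_list_lock across a
queue freeze, taking that lock from a context that holds a queue usage
reference stops being a deadlock:

/* hypothetical driver timeout handler, for illustration only */
static enum blk_eh_timer_return foo_timeout(struct request *rq)
{
	/*
	 * Runs from the blk-mq timeout work with rq's q_usage_counter
	 * elevated.  With the change above, the tag set update no longer
	 * waits for a freeze while holding tag_list_lock, so the lock
	 * taken inside blk_mq_quiesce_tagset() can be acquired from here.
	 */
	blk_mq_quiesce_tagset(rq->q->tag_set);

	/* ... kick off recovery, then complete or retry rq ... */
	return BLK_EH_RESET_TIMER;
}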