Message-ID: <20250418175232.7mxlokunrackcjbn@purestorage.com>
Date: Fri, 18 Apr 2025 11:52:32 -0600
From: Michael Liang <mliang@...estorage.com>
To: Sagi Grimberg <sagi@...mberg.me>
Cc: Keith Busch <kbusch@...nel.org>, Jens Axboe <axboe@...nel.dk>,
Christoph Hellwig <hch@....de>,
Mohamed Khalfella <mkhalfella@...estorage.com>,
Randy Jennings <randyj@...estorage.com>,
linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/1] nvme-tcp: wait socket wmem to drain in queue stop
On Fri, Apr 18, 2025 at 02:49:25PM +0300, Sagi Grimberg wrote:
>
>
> On 4/18/25 14:30, Sagi Grimberg wrote:
> >
> >
> > On 4/17/25 10:13, Michael Liang wrote:
> > > This patch addresses a data corruption issue observed in nvme-tcp during
> > > testing.
> > >
> > > Issue description:
> > > In an NVMe native multipath setup, when an I/O timeout occurs, all
> > > inflight I/Os are canceled almost immediately after the kernel socket
> > > is shut down. These canceled I/Os are reported as host path errors,
> > > triggering a failover that succeeds on a different path.
> > >
> > > However, at this point, the original I/O may still be outstanding in
> > > the host's network transmission path (e.g., the NIC's TX queue). From
> > > the user-space application's perspective, the buffer associated with
> > > the I/O is considered completed, since the I/O has been acknowledged
> > > on the other path, and may be reused for new I/O requests.
> > >
> > > Because nvme-tcp enables zero-copy by default in the transmission
> > > path, this can lead to corrupted data being sent to the original
> > > target, ultimately causing data corruption.
> > >
> > > We can reproduce this data corruption by injecting delay on one path
> > > and triggering an I/O timeout.
> > >
> > > To prevent this issue, this change ensures that all inflight
> > > transmissions are fully completed from the host's perspective before
> > > returning from queue stop. To handle concurrent I/O timeouts from
> > > multiple namespaces under the same controller, always wait in queue
> > > stop regardless of the queue's state.
> > >
> > > This aligns with the behavior of queue stopping in other NVMe fabric
> > > transports.
> > >
> > > Reviewed-by: Mohamed Khalfella <mkhalfella@...estorage.com>
> > > Reviewed-by: Randy Jennings <randyj@...estorage.com>
> > > Signed-off-by: Michael Liang <mliang@...estorage.com>
> > > ---
> > > drivers/nvme/host/tcp.c | 16 ++++++++++++++++
> > > 1 file changed, 16 insertions(+)
> > >
> > > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > > index 26c459f0198d..62d73684e61e 100644
> > > --- a/drivers/nvme/host/tcp.c
> > > +++ b/drivers/nvme/host/tcp.c
> > > @@ -1944,6 +1944,21 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> > >          cancel_work_sync(&queue->io_work);
> > >  }
> > >
> > > +static void nvme_tcp_stop_queue_wait(struct nvme_tcp_queue *queue)
> > > +{
> > > +        int timeout = 100;
> > > +
> > > +        while (timeout > 0) {
> > > +                if (!sk_wmem_alloc_get(queue->sock->sk))
> > > +                        return;
> > > +                msleep(2);
> > > +                timeout -= 2;
> > > +        }
> > > +        dev_warn(queue->ctrl->ctrl.device,
> > > +                 "qid %d: wait draining sock wmem allocation timeout\n",
> > > +                 nvme_tcp_queue_id(queue));
> > > +}
> > > +
> > >  static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> > >  {
> > >          struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> > > @@ -1961,6 +1976,7 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> > >          /* Stopping the queue will disable TLS */
> > >          queue->tls_enabled = false;
> > >          mutex_unlock(&queue->queue_lock);
> > > +        nvme_tcp_stop_queue_wait(queue);
> > >  }
> > >
> > >  static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
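
For readers not steeped in the transmit path: the zero-copy behavior the commit
message points at is the crux of the bug, and a rough user-space analogue may
help. The sketch below is hypothetical and not part of the patch (it uses
MSG_ZEROCOPY, which needs Linux 4.14+ and a libc that exposes SO_ZEROCOPY;
nvme-tcp achieves zero-copy differently in-kernel), but the buffer-lifetime
hazard is the same: the pages backing a send remain referenced by the network
stack after the submitting call returns.

#include <errno.h>
#include <sys/socket.h>

/* Hypothetical illustration only: 'fd' is any connected TCP socket. */
static int send_zerocopy(int fd, const char *buf, size_t len)
{
        int one = 1;

        if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
                return -errno;

        /* send() can return before the NIC has read 'buf': with MSG_ZEROCOPY
         * the kernel pins the caller's pages instead of copying them. */
        if (send(fd, buf, len, MSG_ZEROCOPY) < 0)
                return -errno;

        /*
         * HAZARD: reusing 'buf' now (e.g. for the same I/O retried on another
         * path) can change the bytes that later go out on this connection --
         * the corruption mode the commit message describes. Safe reuse means
         * waiting for the zero-copy completion on the socket error queue
         * (MSG_ERRQUEUE / SO_EE_ORIGIN_ZEROCOPY); the kernel-side equivalent
         * in this patch is waiting for sk_wmem_alloc to drop to zero.
         */
        return 0;
}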
> >
> > This makes sense. But I do not want to pay this price serially.
> > As the concern is just failover, lets do something like:
> >
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index 5041cbfd8272..d482a8fe2c4b 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -2031,6 +2031,8 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
> >          for (i = 1; i < ctrl->queue_count; i++)
> >                  nvme_tcp_stop_queue(ctrl, i);
> > +        for (i = 1; i < ctrl->queue_count; i++)
> > +                nvme_tcp_stop_queue_wait(&ctrl->queues[i]);
> >  }
> >
> >  static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
> > @@ -2628,8 +2630,10 @@ static void nvme_tcp_complete_timed_out(struct request *rq)
> >  {
> >          struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> >          struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
> > +        int idx = nvme_tcp_queue_id(req->queue);
> >
> > -        nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
> > +        nvme_tcp_stop_queue(ctrl, idx);
> > +        nvme_tcp_stop_queue_wait(&ctrl->queues[idx]);
> >          nvmf_complete_timed_out_request(rq);
> >  }
>
> Or perhaps something like:
> --
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 5041cbfd8272..3e206a2cbbf3 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1944,7 +1944,7 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>          cancel_work_sync(&queue->io_work);
>  }
>
> -static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> +static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid)
>  {
>          struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
>          struct nvme_tcp_queue *queue = &ctrl->queues[qid];
> @@ -1963,6 +1963,29 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
>          mutex_unlock(&queue->queue_lock);
>  }
>
> +static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid)
> +{
> +        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> +        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
> +        int timeout = 100;
> +
> +        while (timeout > 0) {
> +                if (!sk_wmem_alloc_get(queue->sock->sk))
> +                        return;
> +                msleep(2);
> +                timeout -= 2;
> +        }
> +        dev_warn(queue->ctrl->ctrl.device,
> +                 "qid %d: timeout draining sock wmem allocation expired\n",
> +                 nvme_tcp_queue_id(queue));
> +}
> +
> +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> +{
> +        nvme_tcp_stop_queue_nowait(nctrl, qid);
> +        nvme_tcp_wait_queue(nctrl, qid);
> +}
> +
>  static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
>  {
>          write_lock_bh(&queue->sock->sk->sk_callback_lock);
> @@ -2030,7 +2053,9 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
>          int i;
>
>          for (i = 1; i < ctrl->queue_count; i++)
> -                nvme_tcp_stop_queue(ctrl, i);
> +                nvme_tcp_stop_queue_nowait(ctrl, i);
> +        for (i = 1; i < ctrl->queue_count; i++)
> +                nvme_tcp_wait_queue(ctrl, i);
>  }
>
>  static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
> --
Yes, good idea to stop all the queues first and then wait for them all to drain. Will verify this patch.
Thanks,
Michael Liang
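
As a closing illustration of the scheme settled on above: user space has no
sk_wmem_alloc_get(), but the same "wait for the send side to drain before
reusing buffers" loop can be sketched with SIOCOUTQ, which reports bytes still
queued or unacknowledged in a TCP send queue. This is a hypothetical,
approximate analogue only (sk_wmem_alloc also accounts for skbs still held by
the qdisc or NIC, which SIOCOUTQ does not see). It also shows why the two-pass
version matters: shutting down every queue's socket first and only then
polling lets the per-queue drains overlap, keeping the worst case near one
~100ms window rather than scaling with the number of queues.

#include <errno.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sockios.h>      /* SIOCOUTQ */

/* Poll until the socket's send queue is empty, or give up after ~100ms.
 * Hypothetical user-space sketch of the kernel-side wait in this thread. */
static int wait_send_queue_drained(int fd)
{
        int timeout_ms = 100;

        while (timeout_ms > 0) {
                int pending = 0;

                if (ioctl(fd, SIOCOUTQ, &pending) < 0)
                        return -errno;
                if (!pending)
                        return 0;       /* nothing left queued or unacked */
                usleep(2000);           /* same 2ms polling step as the patch */
                timeout_ms -= 2;
        }
        return -ETIMEDOUT;
}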