[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <akxwnhjya5pgnujvfcvb5r2nj2qo65lji5euriy3ghuvt22klt@vitzpdphcufg>
Date: Fri, 14 Feb 2025 03:54:47 +0000
From: Shinichiro Kawasaki <shinichiro.kawasaki@....com>
To: Daniel Wagner <dwagner@...e.de>
CC: Daniel Wagner <wagi@...nel.org>, James Smart <james.smart@...adcom.com>,
Keith Busch <kbusch@...nel.org>, hch <hch@....de>, Sagi Grimberg
<sagi@...mberg.me>, Hannes Reinecke <hare@...e.de>, Paul Ely
<paul.ely@...adcom.com>, "linux-nvme@...ts.infradead.org"
<linux-nvme@...ts.infradead.org>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v4 3/3] nvme-fc: do not ignore connectivity loss during
connecting
On Feb 13, 2025 / 15:22, Daniel Wagner wrote:
> On Thu, Feb 13, 2025 at 10:14:46AM +0100, Daniel Wagner wrote:
> > That means the idea to synchronize the state change with the
> > ASSOC_FAILED bit under the lock is not going to work. I am trying to
> > figure out a solution.
>
> I found a possible solution. The only 'catch' is to make the state
> machine a bit more restrictive. The only valid transition to LIVE would
> be from CONNECTING. We can do this because even the PCI driver is doing
> all the state transitions NEW -> CONNECTING -> LIVE. It's important that
> we can't enter LIVE from RESETTING to get the patch below working.
>
> We don't have to rely on another variable to figure in which state the
> ctrl is. The nvme_fc_ctrl_connectivity_loss callback needs always to
> trigger a reset. If the ctrl is not in LIVE it is a no-op. This makes it
> possible to remove the lock around the ASSOC_FAILED and the state read
> operation.
>
> In nvme_fc_create_association we only have to check if we can enter the
> LIVE state (thus we were in CONNECTING) and if this fails we entered the
> RESETTING state and should return an error.
>
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 818d4e49aab5..f028913e2e62 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -564,8 +564,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
> switch (new_state) {
> case NVME_CTRL_LIVE:
> switch (old_state) {
> - case NVME_CTRL_NEW:
> - case NVME_CTRL_RESETTING:
> case NVME_CTRL_CONNECTING:
> changed = true;
> fallthrough;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index f4f1866fbd5b..e740814fd1ea 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -781,61 +781,12 @@ nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
> static void
> nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
> {
> - enum nvme_ctrl_state state;
> - unsigned long flags;
> -
> dev_info(ctrl->ctrl.device,
> "NVME-FC{%d}: controller connectivity lost. Awaiting "
> "Reconnect", ctrl->cnum);
>
> - spin_lock_irqsave(&ctrl->lock, flags);
> set_bit(ASSOC_FAILED, &ctrl->flags);
> - state = nvme_ctrl_state(&ctrl->ctrl);
> - spin_unlock_irqrestore(&ctrl->lock, flags);
> -
> - switch (state) {
> - case NVME_CTRL_NEW:
> - case NVME_CTRL_LIVE:
> - /*
> - * Schedule a controller reset. The reset will terminate the
> - * association and schedule the reconnect timer. Reconnects
> - * will be attempted until either the ctlr_loss_tmo
> - * (max_retries * connect_delay) expires or the remoteport's
> - * dev_loss_tmo expires.
> - */
> - if (nvme_reset_ctrl(&ctrl->ctrl)) {
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: Couldn't schedule reset.\n",
> - ctrl->cnum);
> - nvme_delete_ctrl(&ctrl->ctrl);
> - }
> - break;
> -
> - case NVME_CTRL_CONNECTING:
> - /*
> - * The association has already been terminated and the
> - * controller is attempting reconnects. No need to do anything
> - * futher. Reconnects will be attempted until either the
> - * ctlr_loss_tmo (max_retries * connect_delay) expires or the
> - * remoteport's dev_loss_tmo expires.
> - */
> - break;
> -
> - case NVME_CTRL_RESETTING:
> - /*
> - * Controller is already in the process of terminating the
> - * association. No need to do anything further. The reconnect
> - * step will kick in naturally after the association is
> - * terminated.
> - */
> - break;
> -
> - case NVME_CTRL_DELETING:
> - case NVME_CTRL_DELETING_NOIO:
> - default:
> - /* no action to take - let it delete */
> - break;
> - }
> + nvme_reset_ctrl(&ctrl->ctrl);
> }
>
> /**
> @@ -3177,23 +3128,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
> else
> ret = nvme_fc_recreate_io_queues(ctrl);
> }
> + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags))
> + ret = -EIO;
> if (ret)
> goto out_term_aen_ops;
>
> - spin_lock_irqsave(&ctrl->lock, flags);
> - if (!test_bit(ASSOC_FAILED, &ctrl->flags))
> - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
> - else
> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) {
> ret = -EIO;
> - spin_unlock_irqrestore(&ctrl->lock, flags);
> -
> - if (ret)
> goto out_term_aen_ops;
> + }
>
> ctrl->ctrl.nr_reconnects = 0;
> -
> - if (changed)
> - nvme_start_ctrl(&ctrl->ctrl);
> + nvme_start_ctrl(&ctrl->ctrl);
>
> return 0; /* Success */
Thanks Daniel. I applied the patch on top of v6.14-rc2, and it avoided the
blktests failures. I ran the nvme test group with other transports (loop, rdma
and tcp), and observed no regression. It looks good from test run point of view.
Of note is that I observed a compiler warning at kenerl build:
drivers/nvme/host/fc.c: In function ‘nvme_fc_create_association’:
drivers/nvme/host/fc.c:3025:14: warning: unused variable ‘changed’ [-Wunused-variable]
3025 | bool changed;
| ^~~~~~~
Powered by blists - more mailing lists