[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <7cb33ebe-2ff6-4c3a-82f0-c4ed547e8a25@linux.ibm.com>
Date: Tue, 1 Apr 2025 19:02:11 +0530
From: Nilay Shroff <nilay@...ux.ibm.com>
To: Daniel Wagner <wagi@...nel.org>, Christoph Hellwig <hch@....de>,
Sagi Grimberg <sagi@...mberg.me>, Keith Busch <kbusch@...nel.org>,
Hannes Reinecke <hare@...e.de>, John Meneghini <jmeneghi@...hat.com>,
randyj@...estorage.com, Mohamed Khalfella <mkhalfella@...estorage.com>
Cc: linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
On 3/24/25 5:37 PM, Daniel Wagner wrote:
> The TP4129 mandates that the failover should be delayed by CQT. Thus when
> nvme_decide_disposition returns FAILOVER, do not immediately re-queue the
> request on the namespace level; instead queue it on the ctrl's failover_list
> and move it later to the namespace's requeue_list.
>
> Signed-off-by: Daniel Wagner <wagi@...nel.org>
> ---
> drivers/nvme/host/core.c | 19 ++++++++++++++++
> drivers/nvme/host/fc.c | 4 ++++
> drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
> drivers/nvme/host/nvme.h | 15 +++++++++++++
> drivers/nvme/host/rdma.c | 2 ++
> drivers/nvme/host/tcp.c | 1 +
> 6 files changed, 90 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>
> flush_work(&ctrl->reset_work);
> nvme_stop_ctrl(ctrl);
> + nvme_flush_failover(ctrl);
> nvme_remove_namespaces(ctrl);
> ctrl->ops->delete_ctrl(ctrl);
> nvme_uninit_ctrl(ctrl);
> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
> queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
> }
>
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> + unsigned long delay;
> +
> + if (ctrl->cqt)
> + delay = msecs_to_jiffies(ctrl->cqt);
> + else
> + delay = ctrl->kato * HZ;
> +
> + queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
> +}
> +EXPORT_SYMBOL_GPL(nvme_schedule_failover);
> +
> static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
> blk_status_t status)
> {
> @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
> dev_err(ctrl->device,
> "failed nvme_keep_alive_end_io error=%d\n",
> status);
> +
> + nvme_schedule_failover(ctrl);
> return RQ_END_IO_NONE;
> }
>
> @@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
>
> void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
> {
> + nvme_schedule_failover(ctrl);
> nvme_mpath_stop(ctrl);
> nvme_auth_stop(ctrl);
> nvme_stop_failfast_work(ctrl);
> @@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>
> INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
> INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
> + INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
> + INIT_LIST_HEAD(&ctrl->failover_list);
> memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
> ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
> ctrl->ka_last_check_time = jiffies;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> {
> enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>
> + nvme_schedule_failover(&ctrl->ctrl);
> +
> /*
> * if an error (io timeout, etc) while (re)connecting, the remote
> * port requested terminating of the association (disconnect_ls)
> @@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
> /* will block will waiting for io to terminate */
> nvme_fc_delete_association(ctrl);
>
> + nvme_schedule_failover(&ctrl->ctrl);
> +
> if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
> dev_err(ctrl->ctrl.device,
> "NVME-FC{%d}: error_recovery: Couldn't change state "
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> void nvme_failover_req(struct request *req)
> {
> struct nvme_ns *ns = req->q->queuedata;
> + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
> u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
> unsigned long flags;
> struct bio *bio;
> + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
>
> nvme_mpath_clear_current_path(ns);
>
> @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
> blk_steal_bios(&ns->head->requeue_list, req);
> spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>
> - nvme_req(req)->status = 0;
> - nvme_end_req(req);
> - kblockd_schedule_work(&ns->head->requeue_work);
> + spin_lock_irqsave(&ctrl->lock, flags);
> + list_add_tail(&req->queuelist, &ctrl->failover_list);
> + spin_unlock_irqrestore(&ctrl->lock, flags);
> +
Why do we need to wait until error_recovery to schedule the failover?
Can't we schedule the failover as soon as we get a path error? Also, can't
we avoid the failover_list?
Thanks,
--Nilay
Powered by blists - more mailing lists