Message-ID: <20260130223531.2478849-12-mkhalfella@purestorage.com>
Date: Fri, 30 Jan 2026 14:34:15 -0800
From: Mohamed Khalfella <mkhalfella@...estorage.com>
To: Justin Tee <justin.tee@...adcom.com>,
Naresh Gottumukkala <nareshgottumukkala83@...il.com>,
Paul Ely <paul.ely@...adcom.com>,
Chaitanya Kulkarni <kch@...dia.com>,
Christoph Hellwig <hch@....de>,
Jens Axboe <axboe@...nel.dk>,
Keith Busch <kbusch@...nel.org>,
Sagi Grimberg <sagi@...mberg.me>
Cc: Aaron Dailey <adailey@...estorage.com>,
Randy Jennings <randyj@...estorage.com>,
Dhaval Giani <dgiani@...estorage.com>,
Hannes Reinecke <hare@...e.de>,
linux-nvme@...ts.infradead.org,
linux-kernel@...r.kernel.org,
Mohamed Khalfella <mkhalfella@...estorage.com>
Subject: [PATCH v2 11/14] nvme-rdma: Use CCR to recover controller that hits an error

A live nvme controller that hits an error now moves to the FENCING
state instead of the RESETTING state. ctrl->fencing_work attempts CCR
to terminate inflight IOs. If CCR succeeds, transition FENCED ->
RESETTING and continue error recovery as usual. If CCR fails, the
behavior depends on whether the subsystem supports CQT. If CQT is not
supported, reset the controller immediately as if CCR had succeeded,
preserving the current behavior. If CQT is supported, switch to
time-based recovery and schedule ctrl->fenced_work to reset the
controller when the time-based recovery window expires.

Either ctrl->err_work or ctrl->reset_work can run after a controller is
fenced. Flush the fencing work when either of them runs.

Signed-off-by: Mohamed Khalfella <mkhalfella@...estorage.com>
---
drivers/nvme/host/rdma.c | 62 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 61 insertions(+), 1 deletion(-)
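
Note for reviewers: the sketch below is illustrative only and is not
part of the patch. It spells out the return-value contract this patch
assumes for nvme_fence_ctrl(), which is introduced earlier in this
series; the contract is inferred from how fencing_work consumes the
return value in the diff that follows.

	/*
	 * Illustrative sketch, not part of the patch. Assumed contract of
	 * nvme_fence_ctrl(): 0 means CCR terminated inflight IOs, a
	 * non-zero value is the remaining time-based recovery window in
	 * jiffies.
	 */
	unsigned long rem = nvme_fence_ctrl(ctrl);

	if (!rem) {
		/* CCR succeeded: the controller is fenced, reset now. */
	} else if (!ctrl->cqt) {
		/*
		 * CCR failed and CQT is not supported: reset immediately,
		 * matching the pre-CCR behavior.
		 */
	} else {
		/*
		 * CCR failed but CQT is supported: delay the reset by rem
		 * jiffies via fenced_work (time-based recovery).
		 */
	}
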
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 35c0822edb2d..da45c9ea4f32 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -106,6 +106,8 @@ struct nvme_rdma_ctrl {
/* other member variables */
struct blk_mq_tag_set tag_set;
+ struct work_struct fencing_work;
+ struct delayed_work fenced_work;
struct work_struct err_work;
struct nvme_rdma_qe async_event_sqe;
@@ -1120,11 +1122,58 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
nvme_rdma_reconnect_or_remove(ctrl, ret);
}
+static void nvme_rdma_fenced_work(struct work_struct *work)
+{
+ struct nvme_rdma_ctrl *rdma_ctrl = container_of(to_delayed_work(work),
+ struct nvme_rdma_ctrl, fenced_work);
+ struct nvme_ctrl *ctrl = &rdma_ctrl->ctrl;
+
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &rdma_ctrl->err_work);
+}
+
+static void nvme_rdma_fencing_work(struct work_struct *work)
+{
+ struct nvme_rdma_ctrl *rdma_ctrl = container_of(work,
+ struct nvme_rdma_ctrl, fencing_work);
+ struct nvme_ctrl *ctrl = &rdma_ctrl->ctrl;
+ unsigned long rem;
+
+ rem = nvme_fence_ctrl(ctrl);
+ if (!rem)
+ goto done;
+
+ if (!ctrl->cqt) {
+ dev_info(ctrl->device,
+ "CCR failed, CQT not supported, skip time-based recovery\n");
+ goto done;
+ }
+
+ dev_info(ctrl->device,
+ "CCR failed, switch to time-based recovery, timeout = %ums\n",
+ jiffies_to_msecs(rem));
+ queue_delayed_work(nvme_wq, &rdma_ctrl->fenced_work, rem);
+ return;
+
+done:
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &rdma_ctrl->err_work);
+}
+
+static void nvme_rdma_flush_fencing_work(struct nvme_rdma_ctrl *ctrl)
+{
+ flush_work(&ctrl->fencing_work);
+ flush_delayed_work(&ctrl->fenced_work);
+}
+
static void nvme_rdma_error_recovery_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, err_work);
+ nvme_rdma_flush_fencing_work(ctrl);
nvme_stop_keep_alive(&ctrl->ctrl);
flush_work(&ctrl->ctrl.async_event_work);
nvme_rdma_teardown_io_queues(ctrl, false);
@@ -1147,6 +1196,12 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
{
+ if (nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_FENCING)) {
+ dev_warn(ctrl->ctrl.device, "starting controller fencing\n");
+ queue_work(nvme_wq, &ctrl->fencing_work);
+ return;
+ }
+
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return;
@@ -1957,13 +2012,15 @@ static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq)
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
struct nvme_command *cmd = req->req.cmd;
int qid = nvme_rdma_queue_idx(queue);
+ enum nvme_ctrl_state state;
dev_warn(ctrl->ctrl.device,
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n",
rq->tag, nvme_cid(rq), cmd->common.opcode,
nvme_fabrics_opcode_str(qid, cmd), qid);
- if (nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_LIVE) {
+ state = nvme_ctrl_state(&ctrl->ctrl);
+ if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
/*
* If we are resetting, connecting or deleting we should
* complete immediately because we may block controller
@@ -2169,6 +2226,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
int ret;
+ nvme_rdma_flush_fencing_work(ctrl);
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -2281,6 +2339,8 @@ static struct nvme_rdma_ctrl *nvme_rdma_alloc_ctrl(struct device *dev,
INIT_DELAYED_WORK(&ctrl->reconnect_work,
nvme_rdma_reconnect_ctrl_work);
+ INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_rdma_fenced_work);
+ INIT_WORK(&ctrl->fencing_work, nvme_rdma_fencing_work);
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
--
2.52.0