Message-ID: <20260203200048.GE3729-mkhalfella@purestorage.com>
Date: Tue, 3 Feb 2026 12:00:48 -0800
From: Mohamed Khalfella <mkhalfella@...estorage.com>
To: Hannes Reinecke <hare@...e.de>
Cc: Justin Tee <justin.tee@...adcom.com>,
	Naresh Gottumukkala <nareshgottumukkala83@...il.com>,
	Paul Ely <paul.ely@...adcom.com>,
	Chaitanya Kulkarni <kch@...dia.com>, Christoph Hellwig <hch@....de>,
	Jens Axboe <axboe@...nel.dk>, Keith Busch <kbusch@...nel.org>,
	Sagi Grimberg <sagi@...mberg.me>,
	Aaron Dailey <adailey@...estorage.com>,
	Randy Jennings <randyj@...estorage.com>,
	Dhaval Giani <dgiani@...estorage.com>,
	linux-nvme@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 08/14] nvme: Implement cross-controller reset recovery

On Tue 2026-02-03 06:19:51 +0100, Hannes Reinecke wrote:
> On 1/30/26 23:34, Mohamed Khalfella wrote:
> > A host that has more than one path connecting to an nvme subsystem
> > typically has an nvme controller associated with every path. This is
> > mostly applicable to nvmeof. If one path goes down, inflight IOs on that
> > path should not be retried immediately on another path because this
> > could lead to data corruption as described in TP4129. TP8028 defines
> > cross-controller reset mechanism that can be used by host to terminate
> > IOs on the failed path using one of the remaining healthy paths. Only
> > after IOs are terminated, or long enough time passes as defined by
> > TP4129, inflight IOs should be retried on another path. Implement core
> > cross-controller reset shared logic to be used by the transports.
> > 
> > Signed-off-by: Mohamed Khalfella <mkhalfella@...estorage.com>
> > ---
> >   drivers/nvme/host/constants.c |   1 +
> >   drivers/nvme/host/core.c      | 129 ++++++++++++++++++++++++++++++++++
> >   drivers/nvme/host/nvme.h      |   9 +++
> >   3 files changed, 139 insertions(+)
> > 
> > diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
> > index dc90df9e13a2..f679efd5110e 100644
> > --- a/drivers/nvme/host/constants.c
> > +++ b/drivers/nvme/host/constants.c
> > @@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
> >   	[nvme_admin_virtual_mgmt] = "Virtual Management",
> >   	[nvme_admin_nvme_mi_send] = "NVMe Send MI",
> >   	[nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
> > +	[nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
> >   	[nvme_admin_dbbuf] = "Doorbell Buffer Config",
> >   	[nvme_admin_format_nvm] = "Format NVM",
> >   	[nvme_admin_security_send] = "Security Send",
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 3e1e02822dd4..13e0775d56b4 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -554,6 +554,134 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
> >   }
> >   EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
> >   
> > +static struct nvme_ctrl *nvme_find_ctrl_ccr(struct nvme_ctrl *ictrl,
> > +					    u32 min_cntlid)
> > +{
> > +	struct nvme_subsystem *subsys = ictrl->subsys;
> > +	struct nvme_ctrl *sctrl;
> > +	unsigned long flags;
> > +
> > +	mutex_lock(&nvme_subsystems_lock);
> > +	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> > +		if (sctrl->cntlid < min_cntlid)
> > +			continue;
> > +
> > +		if (atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
> > +			continue;
> > +
> > +		spin_lock_irqsave(&sctrl->lock, flags);
> > +		if (sctrl->state != NVME_CTRL_LIVE) {
> > +			spin_unlock_irqrestore(&sctrl->lock, flags);
> > +			atomic_inc(&sctrl->ccr_limit);
> > +			continue;
> > +		}
> > +
> > +		/*
> > +		 * We got a good candidate source controller that is locked and
> > +		 * LIVE. However, no guarantee sctrl will not be deleted after
> > +		 * sctrl->lock is released. Get a ref of both sctrl and admin_q
> > +		 * so they do not disappear until we are done with them.
> > +		 */
> > +		WARN_ON_ONCE(!blk_get_queue(sctrl->admin_q));
> > +		nvme_get_ctrl(sctrl);
> > +		spin_unlock_irqrestore(&sctrl->lock, flags);
> > +		goto found;
> > +	}
> > +	sctrl = NULL;
> > +found:
> 
> Normally one would be using a temporary loop variable and assign 'sctrl' 
> to that one if found. Then you can just call 'break' and drop the 'goto'.

Got it. I did it as you suggested. It does look cleaner this way.
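
For reference, this is roughly what the reworked lookup looks like with
a temporary loop variable and a break instead of the goto (a sketch
based on the v2 hunk above, not necessarily the exact v3 code):

static struct nvme_ctrl *nvme_find_ctrl_ccr(struct nvme_ctrl *ictrl,
					    u32 min_cntlid)
{
	struct nvme_subsystem *subsys = ictrl->subsys;
	struct nvme_ctrl *sctrl = NULL, *tmp;
	unsigned long flags;

	mutex_lock(&nvme_subsystems_lock);
	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
		if (tmp->cntlid < min_cntlid)
			continue;

		if (atomic_dec_if_positive(&tmp->ccr_limit) < 0)
			continue;

		spin_lock_irqsave(&tmp->lock, flags);
		if (tmp->state != NVME_CTRL_LIVE) {
			spin_unlock_irqrestore(&tmp->lock, flags);
			atomic_inc(&tmp->ccr_limit);
			continue;
		}

		/*
		 * Good candidate found. Take refs on the controller and
		 * its admin_q so they do not disappear once the lock is
		 * released, then stop scanning.
		 */
		WARN_ON_ONCE(!blk_get_queue(tmp->admin_q));
		nvme_get_ctrl(tmp);
		spin_unlock_irqrestore(&tmp->lock, flags);
		sctrl = tmp;
		break;
	}
	mutex_unlock(&nvme_subsystems_lock);
	return sctrl;
}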

> 
> > +	mutex_unlock(&nvme_subsystems_lock);
> > +	return sctrl;
> > +}
> > +
> > +static void nvme_put_ctrl_ccr(struct nvme_ctrl *sctrl)
> > +{
> > +	atomic_inc(&sctrl->ccr_limit);
> > +	blk_put_queue(sctrl->admin_q);
> > +	nvme_put_ctrl(sctrl);
> > +}
> > +
> > +static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
> > +{
> > +	struct nvme_ccr_entry ccr = { };
> > +	union nvme_result res = { 0 };
> > +	struct nvme_command c = { };
> > +	unsigned long flags, tmo;
> > +	int ret = 0;
> > +	u32 result;
> > +
> > +	init_completion(&ccr.complete);
> > +	ccr.ictrl = ictrl;
> > +
> > +	spin_lock_irqsave(&sctrl->lock, flags);
> > +	list_add_tail(&ccr.list, &sctrl->ccr_list);
> > +	spin_unlock_irqrestore(&sctrl->lock, flags);
> > +
> > +	c.ccr.opcode = nvme_admin_cross_ctrl_reset;
> > +	c.ccr.ciu = ictrl->ciu;
> > +	c.ccr.icid = cpu_to_le16(ictrl->cntlid);
> > +	c.ccr.cirn = cpu_to_le64(ictrl->cirn);
> > +	ret = __nvme_submit_sync_cmd(sctrl->admin_q, &c, &res,
> > +				     NULL, 0, NVME_QID_ANY, 0);
> > +	if (ret)
> > +		goto out;
> > +
> > +	result = le32_to_cpu(res.u32);
> > +	if (result & 0x01) /* Immediate Reset Successful */
> > +		goto out;
> > +
> > +	tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
> > +	if (!wait_for_completion_timeout(&ccr.complete, tmo)) {
> > +		ret = -ETIMEDOUT;
> > +		goto out;
> > +	}
> > +
> > +	if (ccr.ccrs != NVME_CCR_STATUS_SUCCESS)
> > +		ret = -EREMOTEIO;
> > +out:
> > +	spin_lock_irqsave(&sctrl->lock, flags);
> > +	list_del(&ccr.list);
> > +	spin_unlock_irqrestore(&sctrl->lock, flags);
> > +	return ret;
> > +}
> > +
> > +unsigned long nvme_fence_ctrl(struct nvme_ctrl *ictrl)
> > +{
> > +	unsigned long deadline, now, timeout;
> > +	struct nvme_ctrl *sctrl;
> > +	u32 min_cntlid = 0;
> > +	int ret;
> > +
> > +	timeout = nvme_fence_timeout_ms(ictrl);
> > +	dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout);
> > +
> > +	now = jiffies;
> > +	deadline = now + msecs_to_jiffies(timeout);
> > +	while (time_before(now, deadline)) {
> > +		sctrl = nvme_find_ctrl_ccr(ictrl, min_cntlid);
> > +		if (!sctrl) {
> > +			/* CCR failed, switch to time-based recovery */
> > +			return deadline - now;
> > +		}
> > +
> > +		ret = nvme_issue_wait_ccr(sctrl, ictrl);
> > +		if (!ret) {
> > +			dev_info(ictrl->device, "CCR succeeded using %s\n",
> > +				 dev_name(sctrl->device));
> > +			nvme_put_ctrl_ccr(sctrl);
> > +			return 0;
> > +		}
> > +
> > +		/* CCR failed, try another path */
> > +		min_cntlid = sctrl->cntlid + 1;
> > +		nvme_put_ctrl_ccr(sctrl);
> > +		now = jiffies;
> > +	}
> 
> That will spin until 'deadline' is reached if 'nvme_issue_wait_ccr()' 
> returns an error. _And_ if the CCR itself runs into a timeout we would
> never have tried another path (which could have succeeded).

True. We can only do one thing at a time within the CCR time budget:
either wait for the CCR to succeed, or give up on it early and try
another path. It is a trade-off.

> 
> I'd rather rework this loop to open-code 'issue_and_wait()' in the loop,
> and only switch to the next controller if the submission of CCR failed.
> Once that is done we can 'just' wait for completion, as a failure there
> will be after KATO timeout anyway and any subsequent CCR would be pointless.

If I understood this correctly, we would stick with the first sctrl
that accepts the CCR command, wait for the CCR to complete, and give up
on fencing ictrl if the CCR operation fails or times out. Did I get
that right?

If so, why is this better than the current logic?

Currently nvme_issue_wait_ccr() waits max(cqt, kato) for the CCR to
complete. If we change the logic, should it wait for "deadline - now"
instead? Or keep it as it is?

The spec does not say how long to wait for the CCR operation to
complete. My impression is that max(cqt, kato) is a reasonable amount
of time to wait. If we do not hear back from sctrl by then, we should
switch to the next path.

> 
> > +
> > +	dev_info(ictrl->device, "CCR reached timeout, call it done\n");
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(nvme_fence_ctrl);
> > +
> >   bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
> >   		enum nvme_ctrl_state new_state)
> >   {
> > @@ -5119,6 +5247,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
> >   
> >   	mutex_init(&ctrl->scan_lock);
> >   	INIT_LIST_HEAD(&ctrl->namespaces);
> > +	INIT_LIST_HEAD(&ctrl->ccr_list);
> >   	xa_init(&ctrl->cels);
> >   	ctrl->dev = dev;
> >   	ctrl->ops = ops;
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index 00866bbc66f3..fa18f580d76a 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -279,6 +279,13 @@ enum nvme_ctrl_flags {
> >   	NVME_CTRL_FROZEN		= 6,
> >   };
> >   
> > +struct nvme_ccr_entry {
> > +	struct list_head list;
> > +	struct completion complete;
> > +	struct nvme_ctrl *ictrl;
> > +	u8 ccrs;
> > +};
> > +
> >   struct nvme_ctrl {
> >   	bool comp_seen;
> >   	bool identified;
> > @@ -296,6 +303,7 @@ struct nvme_ctrl {
> >   	struct blk_mq_tag_set *tagset;
> >   	struct blk_mq_tag_set *admin_tagset;
> >   	struct list_head namespaces;
> > +	struct list_head ccr_list;
> >   	struct mutex namespaces_lock;
> >   	struct srcu_struct srcu;
> >   	struct device ctrl_device;
> > @@ -814,6 +822,7 @@ blk_status_t nvme_host_path_error(struct request *req);
> >   bool nvme_cancel_request(struct request *req, void *data);
> >   void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
> >   void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
> > +unsigned long nvme_fence_ctrl(struct nvme_ctrl *ctrl);
> >   bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
> >   		enum nvme_ctrl_state new_state);
> >   int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
> 
> Cheers,
> 
> Hannes
> -- 
> Dr. Hannes Reinecke                  Kernel Storage Architect
> hare@...e.de                                +49 911 74053 688
> SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
> HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
