Date:   Fri, 7 May 2021 13:13:07 +0000
From:   Shai Malin <smalin@...vell.com>
To:     Hannes Reinecke <hare@...e.com>
CC:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "davem@...emloft.net" <davem@...emloft.net>,
        "kuba@...nel.org" <kuba@...nel.org>,
        "linux-nvme@...ts.infradead.org" <linux-nvme@...ts.infradead.org>,
        Sagi Grimberg <sagi@...mberg.me>,
        Christoph Hellwig <hch@....de>,
        Keith Busch <kbusch@...nel.org>, "axboe@...com" <axboe@...com>,
        Ariel Elior <aelior@...vell.com>,
        Michal Kalderon <mkalderon@...vell.com>,
        Omkar Kulkarni <okulkarni@...vell.com>,
        Prabhakar Kushwaha <pkushwaha@...vell.com>,
        "malin1024@...il.com" <malin1024@...il.com>
Subject: Re: [RFC PATCH v4 11/27] nvme-tcp-offload: Add controller level
 implementation


On 5/1/21 7:27 PM, Hannes Reinecke wrote:
> On 4/29/21 9:09 PM, Shai Malin wrote:
> > From: Arie Gershberg <agershberg@...vell.com>
> >
> > In this patch we implement controller level functionality including:
> > - create_ctrl.
> > - delete_ctrl.
> > - free_ctrl.
> >
> > The implementation is similar to other nvme fabrics modules, the main
> > difference being that the nvme-tcp-offload ULP calls the vendor specific
> > claim_dev() op with the given TCP/IP parameters to determine which device
> > will be used for this controller.
> > Once found, the vendor specific device and controller will be paired and
> > kept in a controller list managed by the ULP.
> >
> > Acked-by: Igor Russkikh <irusskikh@...vell.com>
> > Signed-off-by: Arie Gershberg <agershberg@...vell.com>
> > Signed-off-by: Prabhakar Kushwaha <pkushwaha@...vell.com>
> > Signed-off-by: Omkar Kulkarni <okulkarni@...vell.com>
> > Signed-off-by: Michal Kalderon <mkalderon@...vell.com>
> > Signed-off-by: Ariel Elior <aelior@...vell.com>
> > Signed-off-by: Shai Malin <smalin@...vell.com>
> > ---
> >   drivers/nvme/host/tcp-offload.c | 467 +++++++++++++++++++++++++++++++-
> >   1 file changed, 459 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c
> > index aa7cc239abf2..59e1955e02ec 100644
> > --- a/drivers/nvme/host/tcp-offload.c
> > +++ b/drivers/nvme/host/tcp-offload.c
> > @@ -12,6 +12,10 @@
> >
> >   static LIST_HEAD(nvme_tcp_ofld_devices);
> >   static DECLARE_RWSEM(nvme_tcp_ofld_devices_rwsem);
> > +static LIST_HEAD(nvme_tcp_ofld_ctrl_list);
> > +static DECLARE_RWSEM(nvme_tcp_ofld_ctrl_rwsem);
> > +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;
> > +static struct blk_mq_ops nvme_tcp_ofld_mq_ops;
> >
> >   static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)
> >   {
> > @@ -128,28 +132,430 @@ nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl)
> >   	return dev;
> >   }
> >
> > +static struct blk_mq_tag_set *
> > +nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin)
> > +{
> > +	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
> > +	struct blk_mq_tag_set *set;
> > +	int rc;
> > +
> > +	if (admin) {
> > +		set = &ctrl->admin_tag_set;
> > +		memset(set, 0, sizeof(*set));
> > +		set->ops = &nvme_tcp_ofld_admin_mq_ops;
> > +		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
> > +		set->reserved_tags = NVMF_RESERVED_TAGS;
> > +		set->numa_node = nctrl->numa_node;
> > +		set->flags = BLK_MQ_F_BLOCKING;
> > +		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);
> > +		set->driver_data = ctrl;
> > +		set->nr_hw_queues = 1;
> > +		set->timeout = NVME_ADMIN_TIMEOUT;
> > +	} else {
> > +		set = &ctrl->tag_set;
> > +		memset(set, 0, sizeof(*set));
> > +		set->ops = &nvme_tcp_ofld_mq_ops;
> > +		set->queue_depth = nctrl->sqsize + 1;
> > +		set->reserved_tags = NVMF_RESERVED_TAGS;
> > +		set->numa_node = nctrl->numa_node;
> > +		set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
> > +		set->cmd_size = sizeof(struct nvme_tcp_ofld_req);
> > +		set->driver_data = ctrl;
> > +		set->nr_hw_queues = nctrl->queue_count - 1;
> > +		set->timeout = NVME_IO_TIMEOUT;
> > +		set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
> > +	}
> > +
> > +	rc = blk_mq_alloc_tag_set(set);
> > +	if (rc)
> > +		return ERR_PTR(rc);
> > +
> > +	return set;
> > +}
> > +
> > +static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl,
> > +					       bool new)
> > +{
> > +	int rc;
> > +
> > +	/* Placeholder - alloc_admin_queue */
> > +	if (new) {
> > +		nctrl->admin_tagset =
> > +				nvme_tcp_ofld_alloc_tagset(nctrl, true);
> > +		if (IS_ERR(nctrl->admin_tagset)) {
> > +			rc = PTR_ERR(nctrl->admin_tagset);
> > +			nctrl->admin_tagset = NULL;
> > +			goto out_free_queue;
> > +		}
> > +
> > +		nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset);
> > +		if (IS_ERR(nctrl->fabrics_q)) {
> > +			rc = PTR_ERR(nctrl->fabrics_q);
> > +			nctrl->fabrics_q = NULL;
> > +			goto out_free_tagset;
> > +		}
> > +
> > +		nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset);
> > +		if (IS_ERR(nctrl->admin_q)) {
> > +			rc = PTR_ERR(nctrl->admin_q);
> > +			nctrl->admin_q = NULL;
> > +			goto out_cleanup_fabrics_q;
> > +		}
> > +	}
> > +
> > +	/* Placeholder - nvme_tcp_ofld_start_queue */
> > +
> > +	rc = nvme_enable_ctrl(nctrl);
> > +	if (rc)
> > +		goto out_stop_queue;
> > +
> > +	blk_mq_unquiesce_queue(nctrl->admin_q);
> > +
> > +	rc = nvme_init_identify(nctrl);
> > +	if (rc)
> > +		goto out_quiesce_queue;
> > +
> > +	return 0;
> > +
> > +out_quiesce_queue:
> > +	blk_mq_quiesce_queue(nctrl->admin_q);
> > +	blk_sync_queue(nctrl->admin_q);
> > +
> > +out_stop_queue:
> > +	/* Placeholder - stop offload queue */
> > +	nvme_cancel_admin_tagset(nctrl);
> > +
> > +out_cleanup_fabrics_q:
> > +	if (new)
> > +		blk_cleanup_queue(nctrl->fabrics_q);
> > +out_free_tagset:
> > +	if (new)
> > +		blk_mq_free_tag_set(nctrl->admin_tagset);
> > +out_free_queue:
> > +	/* Placeholder - free admin queue */
> > +
> > +	return rc;
> > +}
> > +
> > +static int
> > +nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)
> > +{
> > +	int rc;
> > +
> > +	/* Placeholder - alloc_io_queues */
> > +
> > +	if (new) {
> > +		nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false);
> > +		if (IS_ERR(nctrl->tagset)) {
> > +			rc = PTR_ERR(nctrl->tagset);
> > +			nctrl->tagset = NULL;
> > +			goto out_free_io_queues;
> > +		}
> > +
> > +		nctrl->connect_q = blk_mq_init_queue(nctrl->tagset);
> > +		if (IS_ERR(nctrl->connect_q)) {
> > +			rc = PTR_ERR(nctrl->connect_q);
> > +			nctrl->connect_q = NULL;
> > +			goto out_free_tag_set;
> > +		}
> > +	}
> > +
> > +	/* Placeholder - start_io_queues */
> > +
> > +	if (!new) {
> > +		nvme_start_queues(nctrl);
> > +		if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) {
> > +			/*
> > +			 * If we timed out waiting for freeze we are likely to
> > +			 * be stuck.  Fail the controller initialization just
> > +			 * to be safe.
> > +			 */
> > +			rc = -ENODEV;
> > +			goto out_wait_freeze_timed_out;
> > +		}
> > +		blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);
> > +		nvme_unfreeze(nctrl);
> > +	}
> > +
> > +	return 0;
> > +
> > +out_wait_freeze_timed_out:
> > +	nvme_stop_queues(nctrl);
> > +	nvme_sync_io_queues(nctrl);
> > +
> > +	/* Placeholder - Stop IO queues */
> > +
> > +	if (new)
> > +		blk_cleanup_queue(nctrl->connect_q);
> > +out_free_tag_set:
> > +	if (new)
> > +		blk_mq_free_tag_set(nctrl->tagset);
> > +out_free_io_queues:
> > +	/* Placeholder - free_io_queues */
> > +
> > +	return rc;
> > +}
> > +
> >   static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)
> >   {
> > -	/* Placeholder - validates inputs and creates admin and IO queues */
> > +	struct nvmf_ctrl_options *opts = nctrl->opts;
> > +	int rc;
> > +
> > +	rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);
> > +	if (rc)
> > +		return rc;
> > +
> > +	if (nctrl->icdoff) {
> > +		dev_err(nctrl->device, "icdoff is not supported!\n");
> > +		rc = -EINVAL;
> > +		goto destroy_admin;
> > +	}
> > +
> > +	if (opts->queue_size > nctrl->sqsize + 1)
> > +		dev_warn(nctrl->device,
> > +			 "queue_size %zu > ctrl sqsize %u, clamping down\n",
> > +			 opts->queue_size, nctrl->sqsize + 1);
> > +
> > +	if (nctrl->sqsize + 1 > nctrl->maxcmd) {
> > +		dev_warn(nctrl->device,
> > +			 "sqsize %u > ctrl maxcmd %u, clamping down\n",
> > +			 nctrl->sqsize + 1, nctrl->maxcmd);
> > +		nctrl->sqsize = nctrl->maxcmd - 1;
> > +	}
> > +
> > +	if (nctrl->queue_count > 1) {
> > +		rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);
> > +		if (rc)
> > +			goto destroy_admin;
> > +	}
> > +
> > +	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {
> > +		/*
> > +		 * state change failure is ok if we started ctrl delete,
> > +		 * unless we're during creation of a new controller to
> > +		 * avoid races with teardown flow.
> > +		 */
> > +		WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&
> > +			     nctrl->state != NVME_CTRL_DELETING_NOIO);
> > +		WARN_ON_ONCE(new);
> > +		rc = -EINVAL;
> > +		goto destroy_io;
> > +	}
> > +
> > +	nvme_start_ctrl(nctrl);
> > +
> > +	return 0;
> > +
> > +destroy_io:
> > +	/* Placeholder - stop and destroy io queues*/
> > +destroy_admin:
> > +	/* Placeholder - stop and destroy admin queue*/
> > +
> > +	return rc;
> > +}
> > +
> > +static int
> > +nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts,
> > +			     struct nvme_tcp_ofld_ops *ofld_ops)
> > +{
> > +	unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS |
> > +			ofld_ops->allowed_opts | ofld_ops->required_opts;
> > +	if (opts->mask & ~nvme_tcp_ofld_opt_mask) {
> > +		pr_warn("One or more of the nvmf options isn't supported by %s.\n",
> > +			ofld_ops->name);
> > +
> > +		return -EINVAL;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> 
> I'd rather specify the options directly; that's how all the other
> transports do it.

As mentioned in patch 9 - "nvme-fabrics: Move NVMF_ALLOWED_OPTS and
NVMF_REQUIRED_OPTS definitions":

Different HW devices offloading NVMeTCP might have different limitations
on the allowed options; for example, a device might not support all the
queue types. With tcp and rdma, only the nvme-tcp and nvme-rdma layers
handle those attributes, and the HW devices impose no limitations on the
allowed options.

An alternative design could be to add separate fields to nvme_tcp_ofld_ops,
similar to the max_hw_sectors and max_segments fields we already have in
this series.

Which would you prefer?
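
For illustration, such an alternative could look roughly like this (a
sketch only; every field below beyond name/module/max_hw_sectors/
max_segments is hypothetical and not part of the series):

struct nvme_tcp_ofld_ops {
	const char *name;
	struct module *module;

	/* Capability fields already present in this series */
	u32 max_hw_sectors;
	u32 max_segments;

	/* Hypothetical per-device limits replacing the opts masks */
	u32 max_io_queues;	/* 0 == no vendor-specific limit */
	u32 max_poll_queues;	/* 0 == polling not supported */
	bool write_queues_supported;

	/* claim_dev(), setup_ctrl(), release_ctrl(), ... unchanged */
};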

> 
> > +static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl)
> > +{
> > +	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
> > +	struct nvme_tcp_ofld_dev *dev = ctrl->dev;
> > +
> > +	if (list_empty(&ctrl->list))
> > +		goto free_ctrl;
> > +
> > +	down_write(&nvme_tcp_ofld_ctrl_rwsem);
> > +	ctrl->dev->ops->release_ctrl(ctrl);
> > +	list_del(&ctrl->list);
> > +	up_write(&nvme_tcp_ofld_ctrl_rwsem);
> > +
> > +	nvmf_free_options(nctrl->opts);
> > +free_ctrl:
> > +	module_put(dev->ops->module);
> > +	kfree(ctrl->queues);
> > +	kfree(ctrl);
> > +}
> > +
> > +static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg)
> > +{
> > +	/* Placeholder - submit_async_event */
> > +}
> > +
> > +static void
> > +nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove)
> > +{
> > +	/* Placeholder - teardown_admin_queue */
> > +}
> > +
> > +static void
> > +nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)
> > +{
> > +	/* Placeholder - teardown_io_queues */
> > +}
> > +
> > +static void
> > +nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)
> > +{
> > +	/* Placeholder - err_work and connect_work */
> > +	nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);
> > +	blk_mq_quiesce_queue(nctrl->admin_q);
> > +	if (shutdown)
> > +		nvme_shutdown_ctrl(nctrl);
> > +	else
> > +		nvme_disable_ctrl(nctrl);
> > +	nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown);
> > +}
> > +
> > +static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)
> > +{
> > +	nvme_tcp_ofld_teardown_ctrl(nctrl, true);
> > +}
> > +
> > +static int
> > +nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,
> > +			   struct request *rq,
> > +			   unsigned int hctx_idx,
> > +			   unsigned int numa_node)
> > +{
> > +	struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq);
> > +	struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data;
> > +
> > +	/* Placeholder - init request */
> > +
> > +	req->done = nvme_tcp_ofld_req_done;
> > +	ctrl->dev->ops->init_req(req);
> >
> >   	return 0;
> >   }
> >
> > +static blk_status_t
> > +nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx,
> > +		       const struct blk_mq_queue_data *bd)
> > +{
> > +	/* Call nvme_setup_cmd(...) */
> > +
> > +	/* Call ops->send_req(...) */
> > +
> > +	return BLK_STS_OK;
> > +}
> > +
> > +static struct blk_mq_ops nvme_tcp_ofld_mq_ops = {
> > +	.queue_rq	= nvme_tcp_ofld_queue_rq,
> > +	.init_request	= nvme_tcp_ofld_init_request,
> > +	/*
> > +	 * All additional ops will be also implemented and registered similar to
> > +	 * tcp.c
> > +	 */
> > +};
> > +
> > +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = {
> > +	.queue_rq	= nvme_tcp_ofld_queue_rq,
> > +	.init_request	= nvme_tcp_ofld_init_request,
> > +	/*
> > +	 * All additional ops will be also implemented and registered similar to
> > +	 * tcp.c
> > +	 */
> > +};
> > +
> > +static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = {
> > +	.name			= "tcp_offload",
> > +	.module			= THIS_MODULE,
> > +	.flags			= NVME_F_FABRICS,
> > +	.reg_read32		= nvmf_reg_read32,
> > +	.reg_read64		= nvmf_reg_read64,
> > +	.reg_write32		= nvmf_reg_write32,
> > +	.free_ctrl		= nvme_tcp_ofld_free_ctrl,
> > +	.submit_async_event	= nvme_tcp_ofld_submit_async_event,
> > +	.delete_ctrl		= nvme_tcp_ofld_delete_ctrl,
> > +	.get_address		= nvmf_get_address,
> > +};
> > +
> > +static bool
> > +nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts)
> > +{
> > +	struct nvme_tcp_ofld_ctrl *ctrl;
> > +	bool found = false;
> > +
> > +	down_read(&nvme_tcp_ofld_ctrl_rwsem);
> > +	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) {
> > +		found = nvmf_ip_options_match(&ctrl->nctrl, opts);
> > +		if (found)
> > +			break;
> > +	}
> > +	up_read(&nvme_tcp_ofld_ctrl_rwsem);
> > +
> > +	return found;
> > +}
> > +
> >   static struct nvme_ctrl *
> >   nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
> >   {
> > +	struct nvme_tcp_ofld_queue *queue;
> >   	struct nvme_tcp_ofld_ctrl *ctrl;
> >   	struct nvme_tcp_ofld_dev *dev;
> >   	struct nvme_ctrl *nctrl;
> > -	int rc = 0;
> > +	int i, rc = 0;
> >
> >   	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
> >   	if (!ctrl)
> >   		return ERR_PTR(-ENOMEM);
> >
> > +	INIT_LIST_HEAD(&ctrl->list);
> >   	nctrl = &ctrl->nctrl;
> > +	nctrl->opts = opts;
> > +	nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues +
> > +			     opts->nr_poll_queues + 1;
> > +	nctrl->sqsize = opts->queue_size - 1;
> > +	nctrl->kato = opts->kato;
> > +	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
> > +		opts->trsvcid =
> > +			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
> > +		if (!opts->trsvcid) {
> > +			rc = -ENOMEM;
> > +			goto out_free_ctrl;
> > +		}
> > +		opts->mask |= NVMF_OPT_TRSVCID;
> > +	}
> >
> > -	/* Init nvme_tcp_ofld_ctrl and nvme_ctrl params based on received opts */
> > +	rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr,
> > +				  opts->trsvcid,
> > +				  &ctrl->conn_params.remote_ip_addr);
> > +	if (rc) {
> > +		pr_err("malformed address passed: %s:%s\n",
> > +		       opts->traddr, opts->trsvcid);
> > +		goto out_free_ctrl;
> > +	}
> > +
> > +	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
> > +		rc = inet_pton_with_scope(&init_net, AF_UNSPEC,
> > +					  opts->host_traddr, NULL,
> > +					  &ctrl->conn_params.local_ip_addr);
> > +		if (rc) {
> > +			pr_err("malformed src address passed: %s\n",
> > +			       opts->host_traddr);
> > +			goto out_free_ctrl;
> > +		}
> > +	}
> > +
> > +	if (!opts->duplicate_connect &&
> > +	    nvme_tcp_ofld_existing_controller(opts)) {
> > +		rc = -EALREADY;
> > +		goto out_free_ctrl;
> > +	}
> >
> >   	/* Find device that can reach the dest addr */
> >   	dev = nvme_tcp_ofld_lookup_dev(ctrl);
> As mentioned in my previous comment: This is not necessarily unique.
> There is no guarantee that a specific host ip/target ip combination will
> devolve onto a single interface; in fact, given that this is an offload
> card which might handle only specific protocols I fully expect to have
> several interfaces through which the target ip address can be reached.
> 
> One could implement some sort of prioritisation such that offload
> engines should be preferred, but in the end it's a policy decision and
> not something we should impose from the driver side.
> 
> I'd rather be able to specify an interface directly, and tie specific
> host ip/target ip connections (ie host_traddr/traddr combinations) to
> that interface. That way we don't impose restrictions on the admin,
> and remove ambiguity when creating the controller from userspace.

I agree, and I believe this was discussed in:
Patch 8: "nvme-tcp-offload: Add nvme-tcp-offload - NVMeTCP HW offload ULP"
Patch 10: "nvme-tcp-offload: Add device scan implementation"

Just to clarify, nvme-tcp-offload supports the host_traddr option, as
sketched below.
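
To make the flow concrete, here is a rough sketch of how a vendor's
claim_dev() could honour host_traddr when selecting an interface (the
prototype and the example_* helpers are illustrative, not the exact
definitions from the series):

/*
 * Illustrative sketch only. The ULP calls claim_dev() with the
 * connection parameters parsed in nvme_tcp_ofld_create_ctrl():
 * remote_ip_addr comes from traddr/trsvcid and, when
 * NVMF_OPT_HOST_TRADDR is set, local_ip_addr comes from host_traddr.
 */
static bool example_claim_dev(struct nvme_tcp_ofld_dev *dev,
			      struct nvme_tcp_ofld_ctrl_con_params *conn_params)
{
	/* If the admin pinned a source address, only claim the controller
	 * when that address belongs to one of this device's interfaces.
	 */
	if (example_has_local_addr(conn_params) &&
	    !example_dev_owns_addr(dev, &conn_params->local_ip_addr))
		return false;

	/* Otherwise, claim it only if the target address is reachable
	 * through an offload-capable interface of this device.
	 */
	return example_dev_can_reach(dev, &conn_params->remote_ip_addr);
}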

> 
> > @@ -160,6 +566,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
> >   		goto out_free_ctrl;
> >   	}
> >
> > +	rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);
> > +	if (rc)
> > +		goto out_module_put;
> > +
> >   	ctrl->dev = dev;
> >
> >   	if (ctrl->dev->ops->max_hw_sectors)
> > @@ -167,22 +577,55 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
> >   	if (ctrl->dev->ops->max_segments)
> >   		nctrl->max_segments = ctrl->dev->ops->max_segments;
> >
> > -	/* Init queues */
> > +	ctrl->queues = kcalloc(nctrl->queue_count,
> > +			       sizeof(struct nvme_tcp_ofld_queue),
> > +			       GFP_KERNEL);
> > +	if (!ctrl->queues) {
> > +		rc = -ENOMEM;
> > +		goto out_module_put;
> > +	}
> >
> > -	/* Call nvme_init_ctrl */
> > +	for (i = 0; i < nctrl->queue_count; ++i) {
> > +		queue = &ctrl->queues[i];
> > +		queue->ctrl = ctrl;
> > +		queue->dev = dev;
> > +		queue->report_err = nvme_tcp_ofld_report_queue_err;
> > +	}
> > +
> 
> What does this 'report_err' callback do?
> Maybe it's easier to understand when it's introduced alongside with the
> code handling it.

nvme-tcp-offload exposes two options through which the vendor driver can
report errors:
  1. Calling nvme_tcp_ofld_error_recovery().
  2. Calling report_err() from the context of the specific queue.

The qedn driver currently uses only the first option. A rough sketch of
both, from the vendor driver's side, follows.
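
(Prototypes below are simplified for illustration; only
nvme_tcp_ofld_error_recovery() and the per-queue report_err callback come
from the series, the example_* wrappers are hypothetical.)

/* Option 1: controller-wide recovery, as qedn does today. */
static void example_on_fatal_ctrl_error(struct nvme_tcp_ofld_ctrl *ctrl)
{
	nvme_tcp_ofld_error_recovery(&ctrl->nctrl);
}

/*
 * Option 2: per-queue reporting through the report_err callback that
 * nvme_tcp_ofld_create_ctrl() installs on each queue
 * (nvme_tcp_ofld_report_queue_err in this patch).
 */
static void example_on_queue_error(struct nvme_tcp_ofld_queue *queue)
{
	queue->report_err(queue);
}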

> 
> > +	rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);
> > +	if (rc)
> > +		goto out_free_queues;
> > +
> > +	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {
> > +		WARN_ON_ONCE(1);
> > +		rc = -EINTR;
> > +		goto out_uninit_ctrl;
> > +	}
> >
> >   	rc = ctrl->dev->ops->setup_ctrl(ctrl, true);
> >   	if (rc)
> > -		goto out_module_put;
> > +		goto out_uninit_ctrl;
> >
> >   	rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);
> >   	if (rc)
> > -		goto out_uninit_ctrl;
> > +		goto out_release_ctrl;
> > +
> > +	dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",
> > +		 opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);
> > +
> > +	down_write(&nvme_tcp_ofld_ctrl_rwsem);
> > +	list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);
> > +	up_write(&nvme_tcp_ofld_ctrl_rwsem);
> >
> >   	return nctrl;
> >
> > -out_uninit_ctrl:
> > +out_release_ctrl:
> >   	ctrl->dev->ops->release_ctrl(ctrl);
> > +out_uninit_ctrl:
> > +	nvme_uninit_ctrl(nctrl);
> > +	nvme_put_ctrl(nctrl);
> > +out_free_queues:
> > +	kfree(ctrl->queues);
> >   out_module_put:
> >   	module_put(dev->ops->module);
> >   out_free_ctrl:
> > @@ -212,7 +655,15 @@ static int __init nvme_tcp_ofld_init_module(void)
> >
> >   static void __exit nvme_tcp_ofld_cleanup_module(void)
> >   {
> > +	struct nvme_tcp_ofld_ctrl *ctrl;
> > +
> >   	nvmf_unregister_transport(&nvme_tcp_ofld_transport);
> > +
> > +	down_write(&nvme_tcp_ofld_ctrl_rwsem);
> > +	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)
> > +		nvme_delete_ctrl(&ctrl->nctrl);
> > +	up_write(&nvme_tcp_ofld_ctrl_rwsem);
> > +	flush_workqueue(nvme_delete_wq);
> >   }
> >
> >   module_init(nvme_tcp_ofld_init_module);
> >
> Cheers,
> 
> Hannes
> --
> Dr. Hannes Reinecke                Kernel Storage Architect
> hare@...e.de                              +49 911 74053 688
> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
> 
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme@...ts.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
