[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <deb7ac09-3c0a-4ade-b22d-ef054df42b58@oracle.com>
Date: Fri, 21 May 2021 12:19:44 -0500
From: Himanshu Madhani <himanshu.madhani@...cle.com>
To: Shai Malin <smalin@...vell.com>, netdev@...r.kernel.org,
linux-nvme@...ts.infradead.org, davem@...emloft.net,
kuba@...nel.org, sagi@...mberg.me, hch@....de, axboe@...com,
kbusch@...nel.org
Cc: aelior@...vell.com, mkalderon@...vell.com, okulkarni@...vell.com,
pkushwaha@...vell.com, malin1024@...il.com,
Arie Gershberg <agershberg@...vell.com>
Subject: Re: [RFC PATCH v5 04/27] nvme-tcp-offload: Add controller level
implementation
On 5/19/21 6:13 AM, Shai Malin wrote:
> From: Arie Gershberg <agershberg@...vell.com>
>
> In this patch we implement controller level functionality including:
> - create_ctrl.
> - delete_ctrl.
> - free_ctrl.
>
> The implementation is similar to other nvme fabrics modules, the main
> difference being that the nvme-tcp-offload ULP calls the vendor specific
> claim_dev() op with the given TCP/IP parameters to determine which device
> will be used for this controller.
> Once found, the vendor specific device and controller will be paired and
> kept in a controller list managed by the ULP.
>
> Acked-by: Igor Russkikh <irusskikh@...vell.com>
> Signed-off-by: Arie Gershberg <agershberg@...vell.com>
> Signed-off-by: Prabhakar Kushwaha <pkushwaha@...vell.com>
> Signed-off-by: Omkar Kulkarni <okulkarni@...vell.com>
> Signed-off-by: Michal Kalderon <mkalderon@...vell.com>
> Signed-off-by: Ariel Elior <aelior@...vell.com>
> Signed-off-by: Shai Malin <smalin@...vell.com>
> ---
> drivers/nvme/host/tcp-offload.c | 475 +++++++++++++++++++++++++++++++-
> 1 file changed, 467 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c
> index aa7cc239abf2..f7e0dc79bedd 100644
> --- a/drivers/nvme/host/tcp-offload.c
> +++ b/drivers/nvme/host/tcp-offload.c
> @@ -12,6 +12,10 @@
>
> static LIST_HEAD(nvme_tcp_ofld_devices);
> static DECLARE_RWSEM(nvme_tcp_ofld_devices_rwsem);
> +static LIST_HEAD(nvme_tcp_ofld_ctrl_list);
> +static DECLARE_RWSEM(nvme_tcp_ofld_ctrl_rwsem);
> +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;
> +static struct blk_mq_ops nvme_tcp_ofld_mq_ops;
>
> static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)
> {
> @@ -128,28 +132,435 @@ nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl)
> return dev;
> }
>
> +static struct blk_mq_tag_set *
> +nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin)
> +{
> + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
> + struct blk_mq_tag_set *set;
> + int rc;
> +
> + if (admin) {
> + set = &ctrl->admin_tag_set;
> + memset(set, 0, sizeof(*set));
> + set->ops = &nvme_tcp_ofld_admin_mq_ops;
> + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
> + set->reserved_tags = NVMF_RESERVED_TAGS;
> + set->numa_node = nctrl->numa_node;
> + set->flags = BLK_MQ_F_BLOCKING;
> + set->cmd_size = sizeof(struct nvme_tcp_ofld_req);
> + set->driver_data = ctrl;
> + set->nr_hw_queues = 1;
> + set->timeout = NVME_ADMIN_TIMEOUT;
> + } else {
> + set = &ctrl->tag_set;
> + memset(set, 0, sizeof(*set));
> + set->ops = &nvme_tcp_ofld_mq_ops;
> + set->queue_depth = nctrl->sqsize + 1;
> + set->reserved_tags = NVMF_RESERVED_TAGS;
> + set->numa_node = nctrl->numa_node;
> + set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
> + set->cmd_size = sizeof(struct nvme_tcp_ofld_req);
> + set->driver_data = ctrl;
> + set->nr_hw_queues = nctrl->queue_count - 1;
> + set->timeout = NVME_IO_TIMEOUT;
> + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
> + }
> +
> + rc = blk_mq_alloc_tag_set(set);
> + if (rc)
> + return ERR_PTR(rc);
> +
> + return set;
> +}
> +
> +static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl,
> + bool new)
> +{
> + int rc;
> +
> + /* Placeholder - alloc_admin_queue */
> + if (new) {
> + nctrl->admin_tagset =
> + nvme_tcp_ofld_alloc_tagset(nctrl, true);
> + if (IS_ERR(nctrl->admin_tagset)) {
> + rc = PTR_ERR(nctrl->admin_tagset);
> + nctrl->admin_tagset = NULL;
> + goto out_free_queue;
> + }
> +
> + nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset);
> + if (IS_ERR(nctrl->fabrics_q)) {
> + rc = PTR_ERR(nctrl->fabrics_q);
> + nctrl->fabrics_q = NULL;
> + goto out_free_tagset;
> + }
> +
> + nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset);
> + if (IS_ERR(nctrl->admin_q)) {
> + rc = PTR_ERR(nctrl->admin_q);
> + nctrl->admin_q = NULL;
> + goto out_cleanup_fabrics_q;
> + }
> + }
> +
> + /* Placeholder - nvme_tcp_ofld_start_queue */
> +
> + rc = nvme_enable_ctrl(nctrl);
> + if (rc)
> + goto out_stop_queue;
> +
> + blk_mq_unquiesce_queue(nctrl->admin_q);
> +
> + rc = nvme_init_ctrl_finish(nctrl);
> + if (rc)
> + goto out_quiesce_queue;
> +
> + return 0;
> +
> +out_quiesce_queue:
> + blk_mq_quiesce_queue(nctrl->admin_q);
> + blk_sync_queue(nctrl->admin_q);
> +
> +out_stop_queue:
> + /* Placeholder - stop offload queue */
> + nvme_cancel_admin_tagset(nctrl);
> +
> +out_cleanup_fabrics_q:
> + if (new)
> + blk_cleanup_queue(nctrl->fabrics_q);
> +out_free_tagset:
> + if (new)
> + blk_mq_free_tag_set(nctrl->admin_tagset);
> +out_free_queue:
> + /* Placeholder - free admin queue */
> +
> + return rc;
> +}
> +
> +static int
> +nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new)
> +{
> + int rc;
> +
> + /* Placeholder - alloc_io_queues */
> +
> + if (new) {
> + nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false);
> + if (IS_ERR(nctrl->tagset)) {
> + rc = PTR_ERR(nctrl->tagset);
> + nctrl->tagset = NULL;
> + goto out_free_io_queues;
> + }
> +
> + nctrl->connect_q = blk_mq_init_queue(nctrl->tagset);
> + if (IS_ERR(nctrl->connect_q)) {
> + rc = PTR_ERR(nctrl->connect_q);
> + nctrl->connect_q = NULL;
> + goto out_free_tag_set;
> + }
> + }
> +
> + /* Placeholder - start_io_queues */
> +
> + if (!new) {
> + nvme_start_queues(nctrl);
> + if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) {
> + /*
> + * If we timed out waiting for freeze we are likely to
> + * be stuck. Fail the controller initialization just
> + * to be safe.
> + */
> + rc = -ENODEV;
> + goto out_wait_freeze_timed_out;
> + }
> + blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);
> + nvme_unfreeze(nctrl);
> + }
> +
> + return 0;
> +
> +out_wait_freeze_timed_out:
> + nvme_stop_queues(nctrl);
> + nvme_sync_io_queues(nctrl);
> +
> + /* Placeholder - Stop IO queues */
> +
> + if (new)
> + blk_cleanup_queue(nctrl->connect_q);
> +out_free_tag_set:
> + if (new)
> + blk_mq_free_tag_set(nctrl->tagset);
> +out_free_io_queues:
> + /* Placeholder - free_io_queues */
> +
> + return rc;
> +}
> +
> static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)
> {
> - /* Placeholder - validates inputs and creates admin and IO queues */
> + struct nvmf_ctrl_options *opts = nctrl->opts;
> + int rc;
> +
> + rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);
> + if (rc)
> + return rc;
> +
> + if (nctrl->icdoff) {
> + dev_err(nctrl->device, "icdoff is not supported!\n");
> + rc = -EINVAL;
> + goto destroy_admin;
> + }
> +
> + if (!(nctrl->sgls & ((1 << 0) | (1 << 1)))) {
> + dev_err(nctrl->device, "Mandatory sgls are not supported!\n");
> + goto destroy_admin;
> + }
> +
> + if (opts->queue_size > nctrl->sqsize + 1)
> + dev_warn(nctrl->device,
> + "queue_size %zu > ctrl sqsize %u, clamping down\n",
> + opts->queue_size, nctrl->sqsize + 1);
> +
> + if (nctrl->sqsize + 1 > nctrl->maxcmd) {
> + dev_warn(nctrl->device,
> + "sqsize %u > ctrl maxcmd %u, clamping down\n",
> + nctrl->sqsize + 1, nctrl->maxcmd);
> + nctrl->sqsize = nctrl->maxcmd - 1;
> + }
> +
> + if (nctrl->queue_count > 1) {
> + rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);
> + if (rc)
> + goto destroy_admin;
> + }
> +
> + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {
> + /*
> + * state change failure is ok if we started ctrl delete,
> + * unless we're during creation of a new controller to
> + * avoid races with teardown flow.
> + */
> + WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING &&
> + nctrl->state != NVME_CTRL_DELETING_NOIO);
> + WARN_ON_ONCE(new);
> + rc = -EINVAL;
> + goto destroy_io;
> + }
> +
> + nvme_start_ctrl(nctrl);
> +
> + return 0;
> +
> +destroy_io:
> + /* Placeholder - stop and destroy io queues*/
> +destroy_admin:
> + /* Placeholder - stop and destroy admin queue*/
> +
> + return rc;
> +}
> +
> +static int
> +nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts,
> + struct nvme_tcp_ofld_ops *ofld_ops)
> +{
> + unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS |
> + ofld_ops->allowed_opts | ofld_ops->required_opts;
> + if (opts->mask & ~nvme_tcp_ofld_opt_mask) {
> + pr_warn("One or more of the nvmf options isn't supported by %s.\n",
> + ofld_ops->name);
> +
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +
> +static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl)
> +{
> + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
> + struct nvme_tcp_ofld_dev *dev = ctrl->dev;
> +
> + if (list_empty(&ctrl->list))
> + goto free_ctrl;
> +
> + down_write(&nvme_tcp_ofld_ctrl_rwsem);
> + ctrl->dev->ops->release_ctrl(ctrl);
> + list_del(&ctrl->list);
> + up_write(&nvme_tcp_ofld_ctrl_rwsem);
> +
> + nvmf_free_options(nctrl->opts);
> +free_ctrl:
> + module_put(dev->ops->module);
> + kfree(ctrl->queues);
> + kfree(ctrl);
> +}
> +
> +static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg)
> +{
> + /* Placeholder - submit_async_event */
> +}
> +
> +static void
> +nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove)
> +{
> + /* Placeholder - teardown_admin_queue */
> +}
> +
> +static void
> +nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove)
> +{
> + /* Placeholder - teardown_io_queues */
> +}
> +
> +static void
> +nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown)
> +{
> + /* Placeholder - err_work and connect_work */
> + nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown);
> + blk_mq_quiesce_queue(nctrl->admin_q);
> + if (shutdown)
> + nvme_shutdown_ctrl(nctrl);
> + else
> + nvme_disable_ctrl(nctrl);
> + nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown);
> +}
> +
> +static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl)
> +{
> + nvme_tcp_ofld_teardown_ctrl(nctrl, true);
> +}
> +
> +static int
> +nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set,
> + struct request *rq,
> + unsigned int hctx_idx,
> + unsigned int numa_node)
> +{
> + struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq);
> + struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data;
> +
> + /* Placeholder - init request */
> +
> + req->done = nvme_tcp_ofld_req_done;
> + ctrl->dev->ops->init_req(req);
>
> return 0;
> }
>
> +static blk_status_t
> +nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx,
> + const struct blk_mq_queue_data *bd)
> +{
> + /* Call nvme_setup_cmd(...) */
> +
> + /* Call ops->send_req(...) */
> +
> + return BLK_STS_OK;
> +}
> +
> +static struct blk_mq_ops nvme_tcp_ofld_mq_ops = {
> + .queue_rq = nvme_tcp_ofld_queue_rq,
> + .init_request = nvme_tcp_ofld_init_request,
> + /*
> + * All additional ops will be also implemented and registered similar to
> + * tcp.c
> + */
> +};
> +
> +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = {
> + .queue_rq = nvme_tcp_ofld_queue_rq,
> + .init_request = nvme_tcp_ofld_init_request,
> + /*
> + * All additional ops will be also implemented and registered similar to
> + * tcp.c
> + */
> +};
> +
> +static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = {
> + .name = "tcp_offload",
> + .module = THIS_MODULE,
> + .flags = NVME_F_FABRICS,
> + .reg_read32 = nvmf_reg_read32,
> + .reg_read64 = nvmf_reg_read64,
> + .reg_write32 = nvmf_reg_write32,
> + .free_ctrl = nvme_tcp_ofld_free_ctrl,
> + .submit_async_event = nvme_tcp_ofld_submit_async_event,
> + .delete_ctrl = nvme_tcp_ofld_delete_ctrl,
> + .get_address = nvmf_get_address,
> +};
> +
> +static bool
> +nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts)
> +{
> + struct nvme_tcp_ofld_ctrl *ctrl;
> + bool found = false;
> +
> + down_read(&nvme_tcp_ofld_ctrl_rwsem);
> + list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) {
> + found = nvmf_ip_options_match(&ctrl->nctrl, opts);
> + if (found)
> + break;
> + }
> + up_read(&nvme_tcp_ofld_ctrl_rwsem);
> +
> + return found;
> +}
> +
> static struct nvme_ctrl *
> nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
> {
> + struct nvme_tcp_ofld_queue *queue;
> struct nvme_tcp_ofld_ctrl *ctrl;
> struct nvme_tcp_ofld_dev *dev;
> struct nvme_ctrl *nctrl;
> - int rc = 0;
> + int i, rc = 0;
>
> ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
> if (!ctrl)
> return ERR_PTR(-ENOMEM);
>
> + INIT_LIST_HEAD(&ctrl->list);
> nctrl = &ctrl->nctrl;
> + nctrl->opts = opts;
> + nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues +
> + opts->nr_poll_queues + 1;
> + nctrl->sqsize = opts->queue_size - 1;
> + nctrl->kato = opts->kato;
> + if (!(opts->mask & NVMF_OPT_TRSVCID)) {
> + opts->trsvcid =
> + kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
> + if (!opts->trsvcid) {
> + rc = -ENOMEM;
> + goto out_free_ctrl;
> + }
> + opts->mask |= NVMF_OPT_TRSVCID;
> + }
>
> - /* Init nvme_tcp_ofld_ctrl and nvme_ctrl params based on received opts */
> + rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr,
> + opts->trsvcid,
> + &ctrl->conn_params.remote_ip_addr);
> + if (rc) {
> + pr_err("malformed address passed: %s:%s\n",
> + opts->traddr, opts->trsvcid);
> + goto out_free_ctrl;
> + }
> +
> + if (opts->mask & NVMF_OPT_HOST_TRADDR) {
> + rc = inet_pton_with_scope(&init_net, AF_UNSPEC,
> + opts->host_traddr, NULL,
> + &ctrl->conn_params.local_ip_addr);
> + if (rc) {
> + pr_err("malformed src address passed: %s\n",
> + opts->host_traddr);
> + goto out_free_ctrl;
> + }
> + }
> +
> + if (!opts->duplicate_connect &&
> + nvme_tcp_ofld_existing_controller(opts)) {
> + rc = -EALREADY;
> + goto out_free_ctrl;
> + }
>
> /* Find device that can reach the dest addr */
> dev = nvme_tcp_ofld_lookup_dev(ctrl);
> @@ -160,6 +571,10 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
> goto out_free_ctrl;
> }
>
> + rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);
> + if (rc)
> + goto out_module_put;
> +
> ctrl->dev = dev;
>
> if (ctrl->dev->ops->max_hw_sectors)
> @@ -167,22 +582,58 @@ nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts)
> if (ctrl->dev->ops->max_segments)
> nctrl->max_segments = ctrl->dev->ops->max_segments;
>
> - /* Init queues */
> + ctrl->queues = kcalloc(nctrl->queue_count,
> + sizeof(struct nvme_tcp_ofld_queue),
> + GFP_KERNEL);
> + if (!ctrl->queues) {
> + rc = -ENOMEM;
> + goto out_module_put;
> + }
> +
> + for (i = 0; i < nctrl->queue_count; ++i) {
> + queue = &ctrl->queues[i];
> + queue->ctrl = ctrl;
> + queue->dev = dev;
> + queue->report_err = nvme_tcp_ofld_report_queue_err;
> + queue->hdr_digest = nctrl->opts->hdr_digest;
> + queue->data_digest = nctrl->opts->data_digest;
> + queue->tos = nctrl->opts->tos;
> + }
> +
> + rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);
> + if (rc)
> + goto out_free_queues;
>
> - /* Call nvme_init_ctrl */
> + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {
> + WARN_ON_ONCE(1);
> + rc = -EINTR;
> + goto out_uninit_ctrl;
> + }
>
> rc = ctrl->dev->ops->setup_ctrl(ctrl, true);
> if (rc)
> - goto out_module_put;
> + goto out_uninit_ctrl;
>
> rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);
> if (rc)
> - goto out_uninit_ctrl;
> + goto out_release_ctrl;
> +
> + dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",
> + opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);
> +
> + down_write(&nvme_tcp_ofld_ctrl_rwsem);
> + list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);
> + up_write(&nvme_tcp_ofld_ctrl_rwsem);
>
> return nctrl;
>
> -out_uninit_ctrl:
> +out_release_ctrl:
> ctrl->dev->ops->release_ctrl(ctrl);
> +out_uninit_ctrl:
> + nvme_uninit_ctrl(nctrl);
> + nvme_put_ctrl(nctrl);
> +out_free_queues:
> + kfree(ctrl->queues);
> out_module_put:
> module_put(dev->ops->module);
> out_free_ctrl:
> @@ -212,7 +663,15 @@ static int __init nvme_tcp_ofld_init_module(void)
>
> static void __exit nvme_tcp_ofld_cleanup_module(void)
> {
> + struct nvme_tcp_ofld_ctrl *ctrl;
> +
> nvmf_unregister_transport(&nvme_tcp_ofld_transport);
> +
> + down_write(&nvme_tcp_ofld_ctrl_rwsem);
> + list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)
> + nvme_delete_ctrl(&ctrl->nctrl);
> + up_write(&nvme_tcp_ofld_ctrl_rwsem);
> + flush_workqueue(nvme_delete_wq);
> }
>
> module_init(nvme_tcp_ofld_init_module);
>
Reviewed-by: Himanshu Madhani <himanshu.madhani@...cle.com>
--
Himanshu Madhani Oracle Linux Engineering
Powered by blists - more mailing lists