Message-ID: <42912221-29b4-e97a-bec0-9d8eec2c97fa@suse.de>
Date: Sun, 2 May 2021 13:42:20 +0200
From: Hannes Reinecke <hare@...e.de>
To: Shai Malin <smalin@...vell.com>, netdev@...r.kernel.org,
linux-nvme@...ts.infradead.org, sagi@...mberg.me, hch@....de,
axboe@...com, kbusch@...nel.org
Cc: "David S . Miller davem @ davemloft . net --cc=Jakub Kicinski"
<kuba@...nel.org>, aelior@...vell.com, mkalderon@...vell.com,
okulkarni@...vell.com, pkushwaha@...vell.com, malin1024@...il.com
Subject: Re: [RFC PATCH v4 22/27] qedn: Add IO level nvme_req and fw_cq
workqueues
On 4/29/21 9:09 PM, Shai Malin wrote:
> This patch presents the IO level workqueues:
>
>  - qedn_nvme_req_fp_wq(): processes new requests, similar to
>                           nvme_tcp_io_work(). The flow starts from
>                           send_req() and aggregates all the requests
>                           on this CPU core.
>
>  - qedn_fw_cq_fp_wq():    processes new FW completions; the flow starts from
>                           the IRQ handler, and for a single interrupt it
>                           processes all the pending NVMeoF completions in
>                           polling mode.
>
> Acked-by: Igor Russkikh <irusskikh@...vell.com>
> Signed-off-by: Prabhakar Kushwaha <pkushwaha@...vell.com>
> Signed-off-by: Omkar Kulkarni <okulkarni@...vell.com>
> Signed-off-by: Michal Kalderon <mkalderon@...vell.com>
> Signed-off-by: Ariel Elior <aelior@...vell.com>
> Signed-off-by: Shai Malin <smalin@...vell.com>
> ---
> drivers/nvme/hw/qedn/Makefile | 2 +-
> drivers/nvme/hw/qedn/qedn.h | 29 +++++++
> drivers/nvme/hw/qedn/qedn_conn.c | 3 +
> drivers/nvme/hw/qedn/qedn_main.c | 114 +++++++++++++++++++++++--
> drivers/nvme/hw/qedn/qedn_task.c | 138 +++++++++++++++++++++++++++++++
> 5 files changed, 278 insertions(+), 8 deletions(-)
> create mode 100644 drivers/nvme/hw/qedn/qedn_task.c
>
> diff --git a/drivers/nvme/hw/qedn/Makefile b/drivers/nvme/hw/qedn/Makefile
> index d8b343afcd16..c7d838a61ae6 100644
> --- a/drivers/nvme/hw/qedn/Makefile
> +++ b/drivers/nvme/hw/qedn/Makefile
> @@ -1,4 +1,4 @@
> # SPDX-License-Identifier: GPL-2.0-only
>
> obj-$(CONFIG_NVME_QEDN) += qedn.o
> -qedn-y := qedn_main.o qedn_conn.o
> +qedn-y := qedn_main.o qedn_conn.o qedn_task.o
> \ No newline at end of file
> diff --git a/drivers/nvme/hw/qedn/qedn.h b/drivers/nvme/hw/qedn/qedn.h
> index c15cac37ec1e..bd9a250cb2f5 100644
> --- a/drivers/nvme/hw/qedn/qedn.h
> +++ b/drivers/nvme/hw/qedn/qedn.h
> @@ -47,6 +47,9 @@
> #define QEDN_NON_ABORTIVE_TERMINATION 0
> #define QEDN_ABORTIVE_TERMINATION 1
>
> +#define QEDN_FW_CQ_FP_WQ_WORKQUEUE "qedn_fw_cq_fp_wq"
> +#define QEDN_NVME_REQ_FP_WQ_WORKQUEUE "qedn_nvme_req_fp_wq"
> +
> /*
> * TCP offload stack default configurations and defines.
> * Future enhancements will allow controlling the configurable
> @@ -100,6 +103,7 @@ struct qedn_fp_queue {
> struct qedn_ctx *qedn;
> struct qed_sb_info *sb_info;
> unsigned int cpu;
> + struct work_struct fw_cq_fp_wq_entry;
> u16 sb_id;
> char irqname[QEDN_IRQ_NAME_LEN];
> };
> @@ -131,6 +135,8 @@ struct qedn_ctx {
> struct qedn_fp_queue *fp_q_arr;
> struct nvmetcp_glbl_queue_entry *fw_cq_array_virt;
> dma_addr_t fw_cq_array_phy; /* Physical address of fw_cq_array_virt */
> + struct workqueue_struct *nvme_req_fp_wq;
> + struct workqueue_struct *fw_cq_fp_wq;
> };
>
> struct qedn_endpoint {
> @@ -213,6 +219,25 @@ struct qedn_ctrl {
>
> /* Connection level struct */
> struct qedn_conn_ctx {
> + /* IO path */
> + struct workqueue_struct *nvme_req_fp_wq; /* ptr to qedn->nvme_req_fp_wq */
> +	struct nvme_tcp_ofld_req *req; /* currently processed request */
> +
> + struct list_head host_pend_req_list;
> + /* Spinlock to access pending request list */
> + spinlock_t nvme_req_lock;
> + unsigned int cpu;
> +
> + /* Entry for registering to nvme_req_fp_wq */
> + struct work_struct nvme_req_fp_wq_entry;
> +	/*
> +	 * Mutex for serializing qedn_process_req() as it can be called
> +	 * from multiple places: queue_rq, async, self-requeued work
> +	 */
> + struct mutex nvme_req_mutex;
> + struct qedn_fp_queue *fp_q;
> + int qid;
> +
> struct qedn_ctx *qedn;
> struct nvme_tcp_ofld_queue *queue;
> struct nvme_tcp_ofld_ctrl *ctrl;
> @@ -280,5 +305,9 @@ int qedn_wait_for_conn_est(struct qedn_conn_ctx *conn_ctx);
> int qedn_set_con_state(struct qedn_conn_ctx *conn_ctx, enum qedn_conn_state new_state);
> void qedn_terminate_connection(struct qedn_conn_ctx *conn_ctx, int abrt_flag);
> __be16 qedn_get_in_port(struct sockaddr_storage *sa);
> +inline int qedn_validate_cccid_in_range(struct qedn_conn_ctx *conn_ctx, u16 cccid);
> +void qedn_queue_request(struct qedn_conn_ctx *qedn_conn, struct nvme_tcp_ofld_req *req);
> +void qedn_nvme_req_fp_wq_handler(struct work_struct *work);
> +void qedn_io_work_cq(struct qedn_ctx *qedn, struct nvmetcp_fw_cqe *cqe);
>
> #endif /* _QEDN_H_ */
> diff --git a/drivers/nvme/hw/qedn/qedn_conn.c b/drivers/nvme/hw/qedn/qedn_conn.c
> index 9bfc0a5f0cdb..90d8aa36d219 100644
> --- a/drivers/nvme/hw/qedn/qedn_conn.c
> +++ b/drivers/nvme/hw/qedn/qedn_conn.c
> @@ -385,6 +385,9 @@ static int qedn_prep_and_offload_queue(struct qedn_conn_ctx *conn_ctx)
> }
>
> set_bit(QEDN_CONN_RESRC_FW_SQ, &conn_ctx->resrc_state);
> + INIT_LIST_HEAD(&conn_ctx->host_pend_req_list);
> + spin_lock_init(&conn_ctx->nvme_req_lock);
> +
> rc = qed_ops->acquire_conn(qedn->cdev,
> &conn_ctx->conn_handle,
> &conn_ctx->fw_cid,
> diff --git a/drivers/nvme/hw/qedn/qedn_main.c b/drivers/nvme/hw/qedn/qedn_main.c
> index 8b5714e7e2bb..38f23dbb03a5 100644
> --- a/drivers/nvme/hw/qedn/qedn_main.c
> +++ b/drivers/nvme/hw/qedn/qedn_main.c
> @@ -267,6 +267,18 @@ static int qedn_release_ctrl(struct nvme_tcp_ofld_ctrl *ctrl)
> return 0;
> }
>
> +static void qedn_set_ctrl_io_cpus(struct qedn_conn_ctx *conn_ctx, int qid)
> +{
> + struct qedn_ctx *qedn = conn_ctx->qedn;
> + struct qedn_fp_queue *fp_q = NULL;
> + int index;
> +
> + index = qid ? (qid - 1) % qedn->num_fw_cqs : 0;
> + fp_q = &qedn->fp_q_arr[index];
> +
> + conn_ctx->cpu = fp_q->cpu;
> +}
> +
> static int qedn_create_queue(struct nvme_tcp_ofld_queue *queue, int qid, size_t q_size)
> {
> struct nvme_tcp_ofld_ctrl *ctrl = queue->ctrl;
> @@ -288,6 +300,7 @@ static int qedn_create_queue(struct nvme_tcp_ofld_queue *queue, int qid, size_t
> conn_ctx->queue = queue;
> conn_ctx->ctrl = ctrl;
> conn_ctx->sq_depth = q_size;
> + qedn_set_ctrl_io_cpus(conn_ctx, qid);
>
> init_waitqueue_head(&conn_ctx->conn_waitq);
> atomic_set(&conn_ctx->est_conn_indicator, 0);
> @@ -295,6 +308,10 @@ static int qedn_create_queue(struct nvme_tcp_ofld_queue *queue, int qid, size_t
>
> spin_lock_init(&conn_ctx->conn_state_lock);
>
> + INIT_WORK(&conn_ctx->nvme_req_fp_wq_entry, qedn_nvme_req_fp_wq_handler);
> + conn_ctx->nvme_req_fp_wq = qedn->nvme_req_fp_wq;
> + conn_ctx->qid = qid;
> +
> qedn_initialize_endpoint(&conn_ctx->ep, qedn->local_mac_addr,
> &ctrl->conn_params);
>
> @@ -356,6 +373,7 @@ static void qedn_destroy_queue(struct nvme_tcp_ofld_queue *queue)
> if (!conn_ctx)
> return;
>
> + cancel_work_sync(&conn_ctx->nvme_req_fp_wq_entry);
> qedn_terminate_connection(conn_ctx, QEDN_ABORTIVE_TERMINATION);
>
> qedn_queue_wait_for_terminate_complete(conn_ctx);
> @@ -385,12 +403,24 @@ static int qedn_init_req(struct nvme_tcp_ofld_req *req)
>
> static void qedn_commit_rqs(struct nvme_tcp_ofld_queue *queue)
> {
> - /* Placeholder - queue work */
> + struct qedn_conn_ctx *conn_ctx;
> +
> + conn_ctx = (struct qedn_conn_ctx *)queue->private_data;
> +
> + if (!list_empty(&conn_ctx->host_pend_req_list))
> + queue_work_on(conn_ctx->cpu, conn_ctx->nvme_req_fp_wq,
> + &conn_ctx->nvme_req_fp_wq_entry);
> }
>
> static int qedn_send_req(struct nvme_tcp_ofld_req *req)
> {
> - /* Placeholder - qedn_send_req */
> + struct qedn_conn_ctx *qedn_conn = (struct qedn_conn_ctx *)req->queue->private_data;
> +
> + /* Under the assumption that the cccid/tag will be in the range of 0 to sq_depth-1. */
> + if (!req->async && qedn_validate_cccid_in_range(qedn_conn, req->rq->tag))
> + return BLK_STS_NOTSUPP;
> +
> + qedn_queue_request(qedn_conn, req);
>
> return 0;
> }
> @@ -434,9 +464,59 @@ struct qedn_conn_ctx *qedn_get_conn_hash(struct qedn_ctx *qedn, u16 icid)
> }
>
> /* Fastpath IRQ handler */
> +void qedn_fw_cq_fp_handler(struct qedn_fp_queue *fp_q)
> +{
> + u16 sb_id, cq_prod_idx, cq_cons_idx;
> + struct qedn_ctx *qedn = fp_q->qedn;
> + struct nvmetcp_fw_cqe *cqe = NULL;
> +
> + sb_id = fp_q->sb_id;
> + qed_sb_update_sb_idx(fp_q->sb_info);
> +
> + /* rmb - to prevent missing new cqes */
> + rmb();
> +
> + /* Read the latest cq_prod from the SB */
> + cq_prod_idx = *fp_q->cq_prod;
> + cq_cons_idx = qed_chain_get_cons_idx(&fp_q->cq_chain);
> +
> + while (cq_cons_idx != cq_prod_idx) {
> + cqe = qed_chain_consume(&fp_q->cq_chain);
> + if (likely(cqe))
> + qedn_io_work_cq(qedn, cqe);
> + else
> + pr_err("Failed consuming cqe\n");
> +
> + cq_cons_idx = qed_chain_get_cons_idx(&fp_q->cq_chain);
> +
> + /* Check if new completions were posted */
> + if (unlikely(cq_prod_idx == cq_cons_idx)) {
> + /* rmb - to prevent missing new cqes */
> + rmb();
> +
> + /* Update the latest cq_prod from the SB */
> + cq_prod_idx = *fp_q->cq_prod;
> + }
> + }
> +}
> +
> +static void qedn_fw_cq_fq_wq_handler(struct work_struct *work)
> +{
> + struct qedn_fp_queue *fp_q = container_of(work, struct qedn_fp_queue, fw_cq_fp_wq_entry);
> +
> + qedn_fw_cq_fp_handler(fp_q);
> + qed_sb_ack(fp_q->sb_info, IGU_INT_ENABLE, 1);
> +}
> +
> static irqreturn_t qedn_irq_handler(int irq, void *dev_id)
> {
> - /* Placeholder */
> + struct qedn_fp_queue *fp_q = dev_id;
> + struct qedn_ctx *qedn = fp_q->qedn;
> +
> + fp_q->cpu = smp_processor_id();
> +
> + qed_sb_ack(fp_q->sb_info, IGU_INT_DISABLE, 0);
> + queue_work_on(fp_q->cpu, qedn->fw_cq_fp_wq, &fp_q->fw_cq_fp_wq_entry);
>
> return IRQ_HANDLED;
> }
> @@ -584,6 +664,11 @@ static void qedn_free_function_queues(struct qedn_ctx *qedn)
> int i;
>
> /* Free workqueues */
> + destroy_workqueue(qedn->fw_cq_fp_wq);
> + qedn->fw_cq_fp_wq = NULL;
> +
> + destroy_workqueue(qedn->nvme_req_fp_wq);
> + qedn->nvme_req_fp_wq = NULL;
>
> /* Free the fast path queues*/
> for (i = 0; i < qedn->num_fw_cqs; i++) {
> @@ -651,7 +736,23 @@ static int qedn_alloc_function_queues(struct qedn_ctx *qedn)
> u64 cq_phy_addr;
> int i;
>
> - /* Place holder - IO-path workqueues */
> + qedn->fw_cq_fp_wq = alloc_workqueue(QEDN_FW_CQ_FP_WQ_WORKQUEUE,
> + WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
> + if (!qedn->fw_cq_fp_wq) {
> + rc = -ENODEV;
> + pr_err("Unable to create fastpath FW CQ workqueue!\n");
> +
> + return rc;
> + }
> +
> + qedn->nvme_req_fp_wq = alloc_workqueue(QEDN_NVME_REQ_FP_WQ_WORKQUEUE,
> + WQ_HIGHPRI | WQ_MEM_RECLAIM, 1);
> + if (!qedn->nvme_req_fp_wq) {
> + rc = -ENODEV;
> + pr_err("Unable to create fastpath qedn nvme workqueue!\n");
> +
> + return rc;
> + }
>
> qedn->fp_q_arr = kcalloc(qedn->num_fw_cqs,
> sizeof(struct qedn_fp_queue), GFP_KERNEL);
Why don't you use threaded interrupts if you're spinning off a workqueue
for handling interrupts anyway?
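
Something along these lines would save the extra hop through a
workqueue (untested sketch, only meant to illustrate the idea: it reuses
qedn_fw_cq_fp_handler() and the qed_sb_ack() calls from this patch, and
the request_threaded_irq() call at the end stands in for whatever the
existing MSI-X setup path does; 'irq', 'rc' and 'fp_q' there are assumed
to come from that path):

static irqreturn_t qedn_irq_handler(int irq, void *dev_id)
{
	struct qedn_fp_queue *fp_q = dev_id;

	/* Hard IRQ half: mask the status block and wake the thread */
	qed_sb_ack(fp_q->sb_info, IGU_INT_DISABLE, 0);

	return IRQ_WAKE_THREAD;
}

static irqreturn_t qedn_irq_thread_fn(int irq, void *dev_id)
{
	struct qedn_fp_queue *fp_q = dev_id;

	/* Threaded half: drain the CQ chain, then re-arm the status block */
	qedn_fw_cq_fp_handler(fp_q);
	qed_sb_ack(fp_q->sb_info, IGU_INT_ENABLE, 1);

	return IRQ_HANDLED;
}

/* at setup time, instead of a plain request_irq(): */
rc = request_threaded_irq(irq, qedn_irq_handler, qedn_irq_thread_fn,
			  IRQF_ONESHOT, fp_q->irqname, fp_q);

And the irq thread affinity will follow the interrupt affinity
automatically, so you don't have to track the CPU yourself in the
hard IRQ handler.
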
> @@ -679,7 +780,7 @@ static int qedn_alloc_function_queues(struct qedn_ctx *qedn)
> chain_params.mode = QED_CHAIN_MODE_PBL,
> chain_params.cnt_type = QED_CHAIN_CNT_TYPE_U16,
> chain_params.num_elems = QEDN_FW_CQ_SIZE;
> - chain_params.elem_size = 64; /*Placeholder - sizeof(struct nvmetcp_fw_cqe)*/
> + chain_params.elem_size = sizeof(struct nvmetcp_fw_cqe);
>
> rc = qed_ops->common->chain_alloc(qedn->cdev,
> &fp_q->cq_chain,
> @@ -708,8 +809,7 @@ static int qedn_alloc_function_queues(struct qedn_ctx *qedn)
> sb = fp_q->sb_info->sb_virt;
> fp_q->cq_prod = (u16 *)&sb->pi_array[QEDN_PROTO_CQ_PROD_IDX];
> fp_q->qedn = qedn;
> -
> - /* Placeholder - Init IO-path workqueue */
> + INIT_WORK(&fp_q->fw_cq_fp_wq_entry, qedn_fw_cq_fq_wq_handler);
>
> /* Placeholder - Init IO-path resources */
> }
> diff --git a/drivers/nvme/hw/qedn/qedn_task.c b/drivers/nvme/hw/qedn/qedn_task.c
> new file mode 100644
> index 000000000000..d3474188efdc
> --- /dev/null
> +++ b/drivers/nvme/hw/qedn/qedn_task.c
> @@ -0,0 +1,138 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright 2021 Marvell. All rights reserved.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> + /* Kernel includes */
> +#include <linux/kernel.h>
> +
> +/* Driver includes */
> +#include "qedn.h"
> +
> +inline int qedn_validate_cccid_in_range(struct qedn_conn_ctx *conn_ctx, u16 cccid)
> +{
> + int rc = 0;
> +
> + if (unlikely(cccid >= conn_ctx->sq_depth)) {
> +		pr_err("cccid 0x%x out of range (>= sq depth)\n", cccid);
> + rc = -EINVAL;
> + }
> +
> + return rc;
> +}
> +
> +static bool qedn_process_req(struct qedn_conn_ctx *qedn_conn)
> +{
> + return true;
> +}
> +
> +/* The WQ handler can be called from 3 flows:
> + * 1. queue_rq.
> + * 2. async.
> + * 3. self-requeued.
> + * Try to send requests from the pending list. If processing a request fails,
> + * re-queue the work.
> + * If there are no additional pending requests - exit the handler.
> + */
> +void qedn_nvme_req_fp_wq_handler(struct work_struct *work)
> +{
> + struct qedn_conn_ctx *qedn_conn;
> + bool more = false;
> +
> + qedn_conn = container_of(work, struct qedn_conn_ctx, nvme_req_fp_wq_entry);
> + do {
> + if (mutex_trylock(&qedn_conn->nvme_req_mutex)) {
> + more = qedn_process_req(qedn_conn);
> + qedn_conn->req = NULL;
> + mutex_unlock(&qedn_conn->nvme_req_mutex);
> + }
> + } while (more);
> +
> + if (!list_empty(&qedn_conn->host_pend_req_list))
> + queue_work_on(qedn_conn->cpu, qedn_conn->nvme_req_fp_wq,
> + &qedn_conn->nvme_req_fp_wq_entry);
> +}
> +
> +void qedn_queue_request(struct qedn_conn_ctx *qedn_conn, struct nvme_tcp_ofld_req *req)
> +{
> + bool empty, res = false;
> +
> + spin_lock(&qedn_conn->nvme_req_lock);
> + empty = list_empty(&qedn_conn->host_pend_req_list) && !qedn_conn->req;
> + list_add_tail(&req->queue_entry, &qedn_conn->host_pend_req_list);
> + spin_unlock(&qedn_conn->nvme_req_lock);
> +
> + /* attempt workqueue bypass */
> + if (qedn_conn->cpu == smp_processor_id() && empty &&
> + mutex_trylock(&qedn_conn->nvme_req_mutex)) {
> + res = qedn_process_req(qedn_conn);
> + qedn_conn->req = NULL;
> + mutex_unlock(&qedn_conn->nvme_req_mutex);
> + if (res || list_empty(&qedn_conn->host_pend_req_list))
> + return;
> + } else if (req->last) {
> + queue_work_on(qedn_conn->cpu, qedn_conn->nvme_req_fp_wq,
> + &qedn_conn->nvme_req_fp_wq_entry);
> + }
> +}
> +
Queueing a request?
Does wonders for your latency ... Can't you do without?
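
IOW, something like this (rough sketch only; it just drops the CPU check
from your bypass path above, so every request is attempted inline first
and the workqueue is only used as a fallback when the mutex is contended
or something is left on the pending list):

void qedn_queue_request(struct qedn_conn_ctx *qedn_conn, struct nvme_tcp_ofld_req *req)
{
	spin_lock(&qedn_conn->nvme_req_lock);
	list_add_tail(&req->queue_entry, &qedn_conn->host_pend_req_list);
	spin_unlock(&qedn_conn->nvme_req_lock);

	/* Always try to send inline first, independent of the CPU */
	if (mutex_trylock(&qedn_conn->nvme_req_mutex)) {
		qedn_process_req(qedn_conn);
		qedn_conn->req = NULL;
		mutex_unlock(&qedn_conn->nvme_req_mutex);
	}

	/* Defer to the workqueue only what could not be sent inline */
	if (!list_empty(&qedn_conn->host_pend_req_list))
		queue_work_on(qedn_conn->cpu, qedn_conn->nvme_req_fp_wq,
			      &qedn_conn->nvme_req_fp_wq_entry);
}

Then commit_rqs() only has to kick the workqueue for the contended case.
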
> +struct qedn_task_ctx *qedn_cqe_get_active_task(struct nvmetcp_fw_cqe *cqe)
> +{
> + struct regpair *p = &cqe->task_opaque;
> +
> + return (struct qedn_task_ctx *)((((u64)(le32_to_cpu(p->hi)) << 32)
> + + le32_to_cpu(p->lo)));
> +}
> +
> +void qedn_io_work_cq(struct qedn_ctx *qedn, struct nvmetcp_fw_cqe *cqe)
> +{
> + struct qedn_task_ctx *qedn_task = NULL;
> + struct qedn_conn_ctx *conn_ctx = NULL;
> + u16 itid;
> + u32 cid;
> +
> + conn_ctx = qedn_get_conn_hash(qedn, le16_to_cpu(cqe->conn_id));
> + if (unlikely(!conn_ctx)) {
> + pr_err("CID 0x%x: Failed to fetch conn_ctx from hash\n",
> + le16_to_cpu(cqe->conn_id));
> +
> + return;
> + }
> +
> + cid = conn_ctx->fw_cid;
> + itid = le16_to_cpu(cqe->itid);
> + qedn_task = qedn_cqe_get_active_task(cqe);
> + if (unlikely(!qedn_task))
> + return;
> +
> + if (likely(cqe->cqe_type == NVMETCP_FW_CQE_TYPE_NORMAL)) {
> + /* Placeholder - verify the connection was established */
> +
> + switch (cqe->task_type) {
> + case NVMETCP_TASK_TYPE_HOST_WRITE:
> + case NVMETCP_TASK_TYPE_HOST_READ:
> +
> + /* Placeholder - IO flow */
> +
> + break;
> +
> + case NVMETCP_TASK_TYPE_HOST_READ_NO_CQE:
> +
> + /* Placeholder - IO flow */
> +
> + break;
> +
> + case NVMETCP_TASK_TYPE_INIT_CONN_REQUEST:
> +
> + /* Placeholder - ICReq flow */
> +
> + break;
> + default:
> + pr_info("Could not identify task type\n");
> + }
> + } else {
> + /* Placeholder - Recovery flows */
> + }
> +}
>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@...e.de +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer