[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <a104a5d1-b4cb-4275-6ced-b80f911b6f47@grimberg.me>
Date: Wed, 3 Feb 2021 01:06:53 -0800
From: Sagi Grimberg <sagi@...mberg.me>
To: Boris Pismenny <borisp@...lanox.com>, dsahern@...il.com,
kuba@...nel.org, davem@...emloft.net, saeedm@...dia.com,
hch@....de, axboe@...com, kbusch@...nel.org,
viro@...iv.linux.org.uk, edumazet@...gle.com, smalin@...vell.com
Cc: boris.pismenny@...il.com, linux-nvme@...ts.infradead.org,
netdev@...r.kernel.org, benishay@...dia.com, ogerlitz@...dia.com,
yorayz@...dia.com, Ben Ben-Ishay <benishay@...lanox.com>,
Or Gerlitz <ogerlitz@...lanox.com>,
Yoray Zack <yorayz@...lanox.com>
Subject: Re: [PATCH v3 net-next 08/21] nvme-tcp : Recalculate crc in the end
of the capsule
On 2/1/21 2:04 AM, Boris Pismenny wrote:
> From: Ben Ben-ishay <benishay@...dia.com>
>
> crc offload of the nvme capsule. Check if all the skb bits
> are on, and if not recalculate the crc in SW and check it.
>
> This patch reworks the receive-side crc calculation to always
> run at the end, so as to keep a single flow for both offload
> and non-offload. This change simplifies the code, but it may degrade
> performance for non-offload crc calculation.
>
> Signed-off-by: Boris Pismenny <borisp@...lanox.com>
> Signed-off-by: Ben Ben-Ishay <benishay@...lanox.com>
> Signed-off-by: Or Gerlitz <ogerlitz@...lanox.com>
> Signed-off-by: Yoray Zack <yorayz@...lanox.com>
> ---
> drivers/nvme/host/tcp.c | 118 ++++++++++++++++++++++++++++++++--------
> 1 file changed, 95 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 5cb46deb56e0..eb47cf6982d7 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -69,6 +69,7 @@ enum nvme_tcp_queue_flags {
> NVME_TCP_Q_LIVE = 1,
> NVME_TCP_Q_POLLING = 2,
> NVME_TCP_Q_OFF_DDP = 3,
> + NVME_TCP_Q_OFF_DDGST_RX = 4,
> };
>
> enum nvme_tcp_recv_state {
> @@ -96,6 +97,7 @@ struct nvme_tcp_queue {
> size_t data_remaining;
> size_t ddgst_remaining;
> unsigned int nr_cqe;
> + bool ddgst_valid;
>
> /* send state */
> struct nvme_tcp_request *request;
> @@ -234,7 +236,56 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
> return nvme_tcp_pdu_data_left(req) <= len;
> }
>
> -#ifdef CONFIG_TCP_DDP
> +static inline bool nvme_tcp_ddp_ddgst_ok(struct nvme_tcp_queue *queue)
> +{
> + return queue->ddgst_valid;
> +}
> +
> +static inline void nvme_tcp_ddp_ddgst_update(struct nvme_tcp_queue *queue,
> + struct sk_buff *skb)
> +{
> + if (queue->ddgst_valid)
> +#ifdef CONFIG_TCP_DDP_CRC
> + queue->ddgst_valid = skb->ddp_crc;
> +#else
> + queue->ddgst_valid = false;
> +#endif
> +}
> +
> +static int nvme_tcp_req_map_sg(struct nvme_tcp_request *req, struct request *rq)
> +{
> + int ret;
> +
> + req->ddp.sg_table.sgl = req->ddp.first_sgl;
> + ret = sg_alloc_table_chained(&req->ddp.sg_table, blk_rq_nr_phys_segments(rq),
> + req->ddp.sg_table.sgl, SG_CHUNK_SIZE);
> + if (ret)
> + return -ENOMEM;
> + req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
> + return 0;
> +}
> +
> +static void nvme_tcp_ddp_ddgst_recalc(struct ahash_request *hash,
> + struct request *rq)
> +{
> + struct nvme_tcp_request *req;
> +
> + if (!rq)
> + return;
> +
> + req = blk_mq_rq_to_pdu(rq);
> +
> + if (!req->offloaded && nvme_tcp_req_map_sg(req, rq))
> + return;
> +
> + crypto_ahash_init(hash);
> + req->ddp.sg_table.sgl = req->ddp.first_sgl;
> + ahash_request_set_crypt(hash, req->ddp.sg_table.sgl, NULL,
> + le32_to_cpu(req->data_len));
> + crypto_ahash_update(hash);
> +}
> +
> +#if defined(CONFIG_TCP_DDP) || defined(CONFIG_TCP_DDP_CRC)
>
> static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
> static void nvme_tcp_ddp_teardown_done(void *ddp_ctx);
> @@ -290,12 +341,9 @@ int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
> }
>
> req->ddp.command_id = command_id;
> - req->ddp.sg_table.sgl = req->ddp.first_sgl;
> - ret = sg_alloc_table_chained(&req->ddp.sg_table, blk_rq_nr_phys_segments(rq),
> - req->ddp.sg_table.sgl, SG_CHUNK_SIZE);
> + ret = nvme_tcp_req_map_sg(req, rq);
Why didn't you introduce nvme_tcp_req_map_sg in the first place?
> if (ret)
> return -ENOMEM;
> - req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
>
> ret = netdev->tcp_ddp_ops->tcp_ddp_setup(netdev,
> queue->sock->sk,
> @@ -317,7 +365,7 @@ int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
> return -ENODEV;
> }
>
> - if (!(netdev->features & NETIF_F_HW_TCP_DDP)) {
> + if (!(netdev->features & (NETIF_F_HW_TCP_DDP | NETIF_F_HW_TCP_DDP_CRC_RX))) {
> dev_put(netdev);
> return -EOPNOTSUPP;
> }
> @@ -345,6 +393,9 @@ int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
> if (netdev->features & NETIF_F_HW_TCP_DDP)
> set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags);
>
> + if (netdev->features & NETIF_F_HW_TCP_DDP_CRC_RX)
> + set_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags);
> +
> return ret;
> }
>
> @@ -376,7 +427,7 @@ int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue)
> return -ENODEV;
> }
>
> - if (netdev->features & NETIF_F_HW_TCP_DDP &&
> + if ((netdev->features & (NETIF_F_HW_TCP_DDP | NETIF_F_HW_TCP_DDP_CRC_RX)) &&
> netdev->tcp_ddp_ops &&
> netdev->tcp_ddp_ops->tcp_ddp_limits)
> ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, &limits);
> @@ -739,6 +790,7 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> queue->pdu_offset = 0;
> queue->data_remaining = -1;
> queue->ddgst_remaining = 0;
> + queue->ddgst_valid = true;
> }
>
> static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> @@ -919,7 +971,8 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>
> u64 pdu_seq = TCP_SKB_CB(skb)->seq + *offset - queue->pdu_offset;
>
> - if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
> + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags) ||
> + test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
> nvme_tcp_resync_response(queue, pdu_seq);
>
> ret = skb_copy_bits(skb, *offset,
> @@ -988,6 +1041,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> struct nvme_tcp_request *req;
> struct request *rq;
>
> + if (queue->data_digest && test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
> + nvme_tcp_ddp_ddgst_update(queue, skb);
> rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
> if (!rq) {
> dev_err(queue->ctrl->ctrl.device,
> @@ -1025,15 +1080,17 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> recv_len = min_t(size_t, recv_len,
> iov_iter_count(&req->iter));
>
> - if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) {
> - if (queue->data_digest)
> + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) {
> + if (queue->data_digest &&
> + !test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
> ret = skb_ddp_copy_and_hash_datagram_iter(skb, *offset,
> &req->iter, recv_len, queue->rcv_hash);
> else
> ret = skb_ddp_copy_datagram_iter(skb, *offset,
> &req->iter, recv_len);
> } else {
> - if (queue->data_digest)
> + if (queue->data_digest &&
> + !test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
> ret = skb_copy_and_hash_datagram_iter(skb, *offset,
> &req->iter, recv_len, queue->rcv_hash);
> else
> @@ -1055,7 +1112,6 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>
> if (!queue->data_remaining) {
> if (queue->data_digest) {
> - nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
> queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
> } else {
> if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
> @@ -1076,8 +1132,12 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
> char *ddgst = (char *)&queue->recv_ddgst;
> size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
> off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> + bool offload_fail, offload_en;
> + struct request *rq = NULL;
> int ret;
>
> + if (test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
> + nvme_tcp_ddp_ddgst_update(queue, skb);
> ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
> if (unlikely(ret))
> return ret;
> @@ -1088,17 +1148,29 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
> if (queue->ddgst_remaining)
> return 0;
>
> - if (queue->recv_ddgst != queue->exp_ddgst) {
> - dev_err(queue->ctrl->ctrl.device,
> - "data digest error: recv %#x expected %#x\n",
> - le32_to_cpu(queue->recv_ddgst),
> - le32_to_cpu(queue->exp_ddgst));
> - return -EIO;
> + offload_fail = !nvme_tcp_ddp_ddgst_ok(queue);
> + offload_en = test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags);
> + if (!offload_en || offload_fail) {
> + if (offload_en && offload_fail) { // software-fallback
> + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
> + pdu->command_id);
> + nvme_tcp_ddp_ddgst_recalc(queue->rcv_hash, rq);
> + }
> +
> + nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
> + if (queue->recv_ddgst != queue->exp_ddgst) {
> + dev_err(queue->ctrl->ctrl.device,
> + "data digest error: recv %#x expected %#x\n",
> + le32_to_cpu(queue->recv_ddgst),
> + le32_to_cpu(queue->exp_ddgst));
> + return -EIO;
> + }
I still dislike this hunk. Can you split the recalc login to its
own ddp function at least? This is just a confusing piece of code.
> }
>
> if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
> - struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
> - pdu->command_id);
> + if (!rq)
> + rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
> + pdu->command_id);
Why is this change needed? Maybe just move this assignment up?
>
> nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
> queue->nr_cqe++;
> @@ -1841,8 +1913,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> nvme_tcp_restore_sock_calls(queue);
> cancel_work_sync(&queue->io_work);
>
> - if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
> + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags) ||
> + test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
> nvme_tcp_unoffload_socket(queue);
> +
extra newline
> }
>
> static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> @@ -1970,8 +2044,6 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
> {
> int ret;
>
> - to_tcp_ctrl(ctrl)->offloading_netdev = NULL;
> -
Unclear what is the intent here.
> ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
> if (ret)
> return ret;
>
Powered by blists - more mailing lists