Message-ID: <a5de567b-c07e-d21b-318a-5d2a9e38045c@gmail.com>
Date: Wed, 9 Dec 2020 10:25:24 +0200
From: Boris Pismenny <borispismenny@...il.com>
To: David Ahern <dsahern@...il.com>,
Boris Pismenny <borisp@...lanox.com>, kuba@...nel.org,
davem@...emloft.net, saeedm@...dia.com, hch@....de,
sagi@...mberg.me, axboe@...com, kbusch@...nel.org,
viro@...iv.linux.org.uk, edumazet@...gle.com
Cc: boris.pismenny@...il.com, linux-nvme@...ts.infradead.org,
netdev@...r.kernel.org, benishay@...dia.com, ogerlitz@...dia.com,
yorayz@...dia.com, Ben Ben-Ishay <benishay@...lanox.com>,
Or Gerlitz <ogerlitz@...lanox.com>,
Yoray Zack <yorayz@...lanox.com>
Subject: Re: [PATCH v1 net-next 02/15] net: Introduce direct data placement
tcp offload
On 09/12/2020 2:57, David Ahern wrote:
> On 12/7/20 2:06 PM, Boris Pismenny wrote:
>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
>> index 934de56644e7..fb35dcac03d2 100644
>> --- a/include/linux/netdev_features.h
>> +++ b/include/linux/netdev_features.h
>> @@ -84,6 +84,7 @@ enum {
>> NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */
>>
>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>> + NETIF_F_HW_TCP_DDP_BIT, /* TCP direct data placement offload */
>>
>> /*
>> * Add your fresh new feature above and remember to update
>> @@ -157,6 +158,7 @@ enum {
>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
>> +#define NETIF_F_HW_TCP_DDP __NETIF_F(HW_TCP_DDP)
>
> All of the DDP naming seems wrong to me. I realize the specific use case
> is targeted payloads of a ULP, but it is still S/W handing H/W specific
> buffers for a payload of a flow.
>
>
This is intended to be used strictly by ULPs. DDP is the established term
for this kind of offload. It is more than zerocopy, as explained before, so
naming it zerocopy would be misleading at best. I can propose another name.
How about "autonomous copy offload (ACO)" and "autonomous CRC offload (ACRC)"?
These names indicate that the offloads are independent of other offloads,
i.e. autonomous, while also informing users of the functionality (copy/CRC).
Other names are welcome too.
>> * @icsk_listen_portaddr_node hash to the portaddr listener hashtable
>> * @icsk_ca_state: Congestion control state
>> * @icsk_retransmits: Number of unrecovered [RTO] timeouts
>> @@ -94,6 +96,8 @@ struct inet_connection_sock {
>> const struct tcp_ulp_ops *icsk_ulp_ops;
>> void __rcu *icsk_ulp_data;
>> void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
>> + const struct tcp_ddp_ulp_ops *icsk_ulp_ddp_ops;
>> + void __rcu *icsk_ulp_ddp_data;
>> struct hlist_node icsk_listen_portaddr_node;
>> unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
>> __u8 icsk_ca_state:5,
>> diff --git a/include/net/tcp_ddp.h b/include/net/tcp_ddp.h
>> new file mode 100644
>> index 000000000000..df3264be4600
>> --- /dev/null
>> +++ b/include/net/tcp_ddp.h
>> @@ -0,0 +1,129 @@
>> +/* SPDX-License-Identifier: GPL-2.0
>> + *
>> + * tcp_ddp.h
>> + * Author: Boris Pismenny <borisp@...lanox.com>
>> + * Copyright (C) 2020 Mellanox Technologies.
>> + */
>> +#ifndef _TCP_DDP_H
>> +#define _TCP_DDP_H
>> +
>> +#include <linux/netdevice.h>
>> +#include <net/inet_connection_sock.h>
>> +#include <net/sock.h>
>> +
>> +/* limits returned by the offload driver, zero means don't care */
>> +struct tcp_ddp_limits {
>> + int max_ddp_sgl_len;
>> +};
>> +
>> +enum tcp_ddp_type {
>> + TCP_DDP_NVME = 1,
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_config - Generic tcp ddp configuration: tcp ddp IO queue
>> + * config implementations must use this as the first member.
>> + * Add new instances of tcp_ddp_config below (nvme-tcp, etc.).
>> + */
>> +struct tcp_ddp_config {
>> + enum tcp_ddp_type type;
>> + unsigned char buf[];
>
> you have this variable length buf, but it is not used (as far as I can
> tell). But then ...
>
>
True. This buf[] is here to indicate that users are expected to extend the
struct with ULP-specific data, as nvme_tcp_ddp_config does below. We can
remove it and leave a comment instead if you prefer.
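
To make the pattern concrete, here is a sketch of the ULP side: fill the
ULP-specific config and hand the embedded generic header to the driver.
This assumes the tcp_ddp_ops pointer this series adds to struct net_device;
the queue fields and nvme_tcp_queue_id() helper are the usual nvme-tcp ones
and are meant as illustration only:

/* Sketch: the generic header is the first member, so the driver
 * receives a struct tcp_ddp_config and can recover the rest.
 */
struct nvme_tcp_ddp_config config = {
	.cfg.type   = TCP_DDP_NVME,
	.pfv        = NVME_TCP_PFV_1_0,
	.cpda       = 0,
	.dgst       = NVME_TCP_DATA_DIGEST_ENABLE,
	.queue_size = queue->queue_size,
	.queue_id   = nvme_tcp_queue_id(queue),
	.io_cpu     = queue->io_cpu,
};

ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev, queue->sock->sk,
					   &config.cfg);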
>> +};
>> +
>> +/**
>> + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
>> + *
>> + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
>> + * @cpda: controller pdu data alignment (dwords, 0's based)
>> + * @dgst: digest types enabled.
>> + * The netdev will offload crc if ddp_crc is supported.
>> + * @queue_size: number of nvme-tcp IO queue elements
>> + * @queue_id: queue identifier
>> + * @io_cpu: cpu core running the IO thread for this queue
>> + */
>> +struct nvme_tcp_ddp_config {
>> + struct tcp_ddp_config cfg;
>
> ... how would you use it within another struct like this?
>
You don't. The flexible array member only marks the extension point; a
struct like nvme_tcp_ddp_config embeds struct tcp_ddp_config as its first
member and appends its own fields, so buf[] itself is never accessed.
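
For illustration, the driver side would recover the outer struct from the
generic header it receives; the "foo_" prefix here is a made-up placeholder:

/* Sketch: a driver's tcp_ddp_sk_add() recovers the ULP-specific
 * config via container_of(); buf[] is never referenced.
 */
static int foo_tcp_ddp_sk_add(struct net_device *netdev,
			      struct sock *sk,
			      struct tcp_ddp_config *tconfig)
{
	struct nvme_tcp_ddp_config *config;

	if (tconfig->type != TCP_DDP_NVME)
		return -EOPNOTSUPP;

	config = container_of(tconfig, struct nvme_tcp_ddp_config, cfg);
	/* program the HW queue from config->pfv, config->dgst, ... */
	return 0;
}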
>> +
>> + u16 pfv;
>> + u8 cpda;
>> + u8 dgst;
>> + int queue_size;
>> + int queue_id;
>> + int io_cpu;
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_io - tcp ddp configuration for an IO request.
>> + *
>> + * @command_id: identifier on the wire associated with these buffers
>> + * @nents: number of entries in the sg_table
>> + * @sg_table: describing the buffers for this IO request
>> + * @first_sgl: first SGL in sg_table
>> + */
>> +struct tcp_ddp_io {
>> + u32 command_id;
>> + int nents;
>> + struct sg_table sg_table;
>> + struct scatterlist first_sgl[SG_CHUNK_SIZE];
>> +};
>> +
>> +/* struct tcp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload
>> + *
>> + * @tcp_ddp_limits: limit the number of scatter gather entries per IO.
>> + * the device driver can use this to limit the resources allocated per queue.
>> + * @tcp_ddp_sk_add: add offload for the queue represented by the socket+config pair.
>> + * this function is used to configure either copy, crc or both offloads.
>> + * @tcp_ddp_sk_del: remove offload from the socket, and release any device related resources.
>> + * @tcp_ddp_setup: request copy offload for buffers associated with a command_id in tcp_ddp_io.
>> + * @tcp_ddp_teardown: release offload resources association between buffers and command_id in
>> + * tcp_ddp_io.
>> + * @tcp_ddp_resync: respond to the driver's resync_request. Called only if resync is successful.
>> + */
>> +struct tcp_ddp_dev_ops {
>> + int (*tcp_ddp_limits)(struct net_device *netdev,
>> + struct tcp_ddp_limits *limits);
>> + int (*tcp_ddp_sk_add)(struct net_device *netdev,
>> + struct sock *sk,
>> + struct tcp_ddp_config *config);
>> + void (*tcp_ddp_sk_del)(struct net_device *netdev,
>> + struct sock *sk);
>> + int (*tcp_ddp_setup)(struct net_device *netdev,
>> + struct sock *sk,
>> + struct tcp_ddp_io *io);
>> + int (*tcp_ddp_teardown)(struct net_device *netdev,
>> + struct sock *sk,
>> + struct tcp_ddp_io *io,
>> + void *ddp_ctx);
>> + void (*tcp_ddp_resync)(struct net_device *netdev,
>> + struct sock *sk, u32 seq);
>> +};
>> +
>> +#define TCP_DDP_RESYNC_REQ (1 << 0)
>> +
>> +/**
>> + * struct tcp_ddp_ulp_ops - Interface to register upper layer Direct Data Placement (DDP) TCP offload
>> + */
>> +struct tcp_ddp_ulp_ops {
>> + /* NIC requests ulp to indicate if @seq is the start of a message */
>> + bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
>> + /* NIC driver informs the ulp that ddp teardown is done - used for async completions */
>> + void (*ddp_teardown_done)(void *ddp_ctx);
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_ctx - Generic tcp ddp context: device driver per queue contexts must
>> + * use this as the first member.
>> + */
>> +struct tcp_ddp_ctx {
>> + enum tcp_ddp_type type;
>> + unsigned char buf[];
>
> similar to my comment above, I did not see any uses of the buf element.
>
Same idea; we will remove it and leave a comment.
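
The driver-side per-queue context follows the same first-member pattern,
sketched here with a made-up driver struct name:

/* Sketch: the generic context is the first member, so core code can
 * pass a struct tcp_ddp_ctx around and the driver can recover its own
 * per-queue state from it.
 */
struct foo_nvme_tcp_queue_ctx {
	struct tcp_ddp_ctx ddp_ctx;	/* must remain the first member */
	u32 hw_queue_id;		/* device-specific state follows */
};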