lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230803140441.53596-10-huangjie.albert@bytedance.com>
Date: Thu,  3 Aug 2023 22:04:35 +0800
From: "huangjie.albert" <huangjie.albert@...edance.com>
To: davem@...emloft.net,
	edumazet@...gle.com,
	kuba@...nel.org,
	pabeni@...hat.com
Cc: "huangjie.albert" <huangjie.albert@...edance.com>,
	Alexei Starovoitov <ast@...nel.org>,
	Daniel Borkmann <daniel@...earbox.net>,
	Jesper Dangaard Brouer <hawk@...nel.org>,
	John Fastabend <john.fastabend@...il.com>,
	Björn Töpel <bjorn@...nel.org>,
	Magnus Karlsson <magnus.karlsson@...el.com>,
	Maciej Fijalkowski <maciej.fijalkowski@...el.com>,
	Jonathan Lemon <jonathan.lemon@...il.com>,
	Pavel Begunkov <asml.silence@...il.com>,
	Yunsheng Lin <linyunsheng@...wei.com>,
	Kees Cook <keescook@...omium.org>,
	Richard Gobert <richardbgobert@...il.com>,
	netdev@...r.kernel.org (open list:NETWORKING DRIVERS),
	linux-kernel@...r.kernel.org (open list),
	bpf@...r.kernel.org (open list:XDP (eXpress Data Path))
Subject: [RFC Optimizing veth xsk performance 09/10] veth: support zero copy for af xdp

The following conditions must be satisfied to achieve zero-copy:
1. The tx desc has enough space to store the xdp_frame and skb_shared_info.
2. The memory pointed to by the tx desc lies entirely within a single page.

Tested zero copy with libxdp.
Performance:
                      | MSS (bytes) | Packet rate (PPS)
AF_XDP                | 1300        | 480K
AF_XDP with zero copy | 1300        | 540K

Signed-off-by: huangjie.albert <huangjie.albert@...edance.com>
---
 drivers/net/veth.c | 207 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 178 insertions(+), 29 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 600225e27e9e..e4f1a8345f42 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -103,6 +103,11 @@ struct veth_xdp_tx_bq {
 	unsigned int count;
 };
 
+struct veth_seg_info {	/* XSK TX descriptor addresses carried by one skb */
+	u32 segs;	/* number of entries of desc[] in use */
+	u64 desc[] ____cacheline_aligned_in_smp;	/* addresses handed to xsk_tx_completed_addr() */
+};
+
 /*
  * ethtool interface
  */
@@ -645,6 +650,100 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
 	return 0;
 }
 
+/* Turn the buffer at @head (@buflen bytes) into an skb via build_skb(), skip
+ * @headroom bytes and account @len bytes of data; NULL if build_skb() fails.
+ */
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+				      int buflen)
+{
+	struct sk_buff *new_skb = build_skb(head, buflen);
+
+	if (new_skb) {
+		skb_reserve(new_skb, headroom);
+		skb_put(new_skb, len);
+	}
+	return new_skb;
+}
+
+/* skb destructor installed on zero-copy XSK TX skbs: completes every descriptor
+ * address recorded in the skb's veth_seg_info on the pool, then frees the record.
+ */
+static void veth_xsk_destruct_skb(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct xsk_buff_pool *pool = (struct xsk_buff_pool *)shinfo->destructor_arg_xsk_pool;
+	struct veth_seg_info *seg_info = shinfo->destructor_arg;
+	unsigned long flags;
+	u32 i;
+
+	/* desc[] is u64: pass it as-is, a round-trip via long truncates on 32-bit */
+	spin_lock_irqsave(&pool->cq_lock, flags);
+	for (i = 0; i < seg_info->segs; i++)
+		xsk_tx_completed_addr(pool, seg_info->desc[i]);
+	spin_unlock_irqrestore(&pool->cq_lock, flags);
+	kfree(seg_info);
+	shinfo->destructor_arg = NULL;
+	shinfo->destructor_arg_xsk_pool = NULL;
+}
+
+/*
+ * Build an skb on top of the umem page that holds @desc's buffer (no copy).
+ * desc->addr is recorded in a veth_seg_info so that veth_xsk_destruct_skb()
+ * can complete it when the skb is freed. Returns NULL on allocation failure;
+ * nothing is held in that case and completing @desc is left to the caller.
+ */
+static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
+					      struct xdp_desc *desc)
+{
+	struct veth_seg_info *seg_info;
+	struct sk_buff *skb;
+	struct page *page;
+	u32 len = desc->len;
+	void *buffer;
+	u32 index, ts;
+	u64 addr;
+
+	buffer = xsk_buff_raw_get_data(pool, desc->addr);
+	ts = pool->unaligned ? len : pool->chunk_size;
+
+	/* NOTE(review): the caller feeds the skb to napi_gro_receive(), so this
+	 * presumably runs in atomic context - confirm. Do not sleep here.
+	 */
+	seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS), GFP_ATOMIC);
+	if (!seg_info)
+		return NULL;
+
+	/* page of the umem that holds the desc's buffer */
+	addr = buffer - pool->addrs;
+	page = pool->umem->pgs[addr >> PAGE_SHIFT];
+	/* in order to avoid to get freed by kfree_skb */
+	get_page(page);
+
+	skb = veth_build_skb(page_to_virt(page), offset_in_page(buffer), len, ts);
+	if (!skb) {
+		put_page(page);
+		kfree(seg_info);
+		return NULL;
+	}
+
+	/* later we will support gso for this */
+	index = skb_shinfo(skb)->gso_segs;
+	seg_info->desc[index] = desc->addr;
+	seg_info->segs = ++index;
+
+	skb->truesize += ts;
+	skb->dev = dev;
+	skb_shinfo(skb)->destructor_arg = seg_info;
+	skb_shinfo(skb)->destructor_arg_xsk_pool = (void *)pool;
+	skb->destructor = veth_xsk_destruct_skb;
+
+	/* set the mac header */
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* to do: charge ts to the socket's sk_wmem_alloc; may not be needed */
+	return skb;
+}
+
 static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
 					  struct xdp_frame *frame,
 					  struct veth_xdp_tx_bq *bq,
@@ -1063,6 +1162,20 @@ static int veth_poll(struct napi_struct *napi, int budget)
 	return done;
 }
 
+/*
+ * Tell whether the @len bytes starting at @buffer stay inside the page that
+ * @buffer points into, i.e. do not run past that page's end.
+ */
+static inline bool buffer_in_page(void *buffer, u32 len)
+{
+	unsigned long room;
+
+	/* bytes from @buffer's position up to the end of its page */
+	room = PAGE_SIZE - offset_in_page(buffer);
+
+	return room >= len;
+}
+
 static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
 {
 	struct veth_priv *priv, *peer_priv;
@@ -1073,6 +1186,9 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 	struct veth_xdp_tx_bq bq;
 	struct xdp_desc desc;
 	void *xdpf;
+	struct sk_buff *skb = NULL;
+	bool zc = xsk_pool->umem->zc;
+	u32 xsk_headroom = xsk_pool->headroom;
 	int done = 0;
 
 	bq.count = 0;
@@ -1102,12 +1218,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 			break;
 		}
 
-		/*
-		* Get a xmit addr
-		* desc.addr is a offset, so we should to convert to real virtual address
-		*/
-		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
-
 		/* can not hold all data in a page */
 		truesize =  SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + desc.len + sizeof(struct xdp_frame);
 		if (truesize > PAGE_SIZE) {
@@ -1116,16 +1226,39 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 			continue;
 		}
 
-		page = dev_alloc_page();
-		if (!page) {
-			/*
-			* error , release xdp frame and increase drops
-			*/
-			xsk_tx_completed_addr(xsk_pool, desc.addr);
-			stats.xdp_drops++;
-			break;
+		/*
+		* Get a xmit addr
+		* desc.addr is a offset, so we should to convert to real virtual address
+		*/
+		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+		/*
+		 * in order to support zero copy, headroom must have enough space to hold xdp_frame
+		 */
+		if (zc && (xsk_headroom < sizeof(struct xdp_frame)))
+			zc = false;
+
+		/*
+		 * if desc not contain in a page, also do not support zero copy
+		*/
+		if (!buffer_in_page(addr, desc.len))
+			zc = false;
+
+		if (zc) {
+			/* headroom is reserved for xdp_frame */
+			new_addr = addr - sizeof(struct xdp_frame);
+		} else {
+			page = dev_alloc_page();
+			if (!page) {
+				/*
+				* error , release xdp frame and increase drops
+				*/
+				xsk_tx_completed_addr(xsk_pool, desc.addr);
+				stats.xdp_drops++;
+				break;
+			}
+			new_addr = page_to_virt(page);
 		}
-		new_addr = page_to_virt(page);
 
 		p_frame = new_addr;
 		new_addr += sizeof(struct xdp_frame);
@@ -1137,19 +1270,37 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 		 */
 		p_frame->headroom = 0;
 		p_frame->metasize = 0;
-		p_frame->frame_sz = PAGE_SIZE;
 		p_frame->flags = 0;
-		p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
-		memcpy(p_frame->data, addr, p_frame->len);
-		xsk_tx_completed_addr(xsk_pool, desc.addr);
-
-		/* if peer have xdp prog, if it has ,just send to peer */
-		p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
-		/* if no xdp with this queue, convert to skb to xmit*/
-		if (p_frame) {
-			xdpf = p_frame;
-			veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
-			p_frame = NULL;
+
+		if (zc) {
+			p_frame->frame_sz = xsk_pool->frame_len;
+			/* to do: if there is a xdp, how to recycle the tx desc */
+			p_frame->mem.type = MEM_TYPE_XSK_BUFF_POOL_TX;
+			/* no need to copy address for af+xdp */
+			p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+			if (p_frame) {
+				skb = veth_build_skb_zerocopy(peer_dev, xsk_pool, &desc);
+				if (skb) {
+					napi_gro_receive(&peer_rq->xdp_napi, skb);
+					skb = NULL;
+				} else {
+					xsk_tx_completed_addr(xsk_pool, desc.addr);
+				}
+			}
+		} else {
+			p_frame->frame_sz = PAGE_SIZE;
+			p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+			memcpy(p_frame->data, addr, p_frame->len);
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+			/* if peer have xdp prog, if it has ,just send to peer */
+			p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+			/* if no xdp with this queue, convert to skb to xmit*/
+			if (p_frame) {
+				xdpf = p_frame;
+				veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+				p_frame = NULL;
+			}
 		}
 
 		stats.xdp_bytes += desc.len;
@@ -1163,8 +1314,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
 		xsk_tx_release(xsk_pool);
 	}
 
-
-
 	/* just for peer rq */
 	if (peer_stats.xdp_tx > 0)
 		veth_xdp_flush(peer_rq, &bq);
-- 
2.20.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ