lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:   Mon, 31 May 2021 14:24:00 +0800
From:   Jason Wang <jasowang@...hat.com>
To:     Xuan Zhuo <xuanzhuo@...ux.alibaba.com>, netdev@...r.kernel.org
Cc:     "Michael S. Tsirkin" <mst@...hat.com>,
        "David S. Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Alexei Starovoitov <ast@...nel.org>,
        Daniel Borkmann <daniel@...earbox.net>,
        Jesper Dangaard Brouer <hawk@...nel.org>,
        John Fastabend <john.fastabend@...il.com>,
        Andrii Nakryiko <andrii@...nel.org>,
        Martin KaFai Lau <kafai@...com>,
        Song Liu <songliubraving@...com>, Yonghong Song <yhs@...com>,
        KP Singh <kpsingh@...nel.org>,
        virtualization@...ts.linux-foundation.org, bpf@...r.kernel.org
Subject: Re: [PATCH net-next] virtio-net: Refactor the code related to
 page_to_skb()


在 2021/5/19 下午5:00, Xuan Zhuo 写道:
> Due to long-term development, the current code structure of
> page_to_skb() and the semantic expression of variables are rather
> chaotic, so it is necessary to reconstruct this piece of logic.
>
> The code has been tested with the following test code.
> Call test_merge and test_big to test "merge" and "big" modes
> respectively.
>
> ================================= test.sh ========================
> function test()
> {
>      flag=$1
>      max=$2
>      inc=$3
>      if [ "x$max" == "x" ]
>      then
>          max=4096
>      fi
>      echo $max
>
>      for s in $(seq 64 $inc $max)
>      do
>          echo >> log
>          echo $flag :UDP_STREAM $s >> log
>          netperf -H 192.168.122.202 -l 5 -t UDP_STREAM -- -m $s >> log
>          echo $flag :UDP_STREAM $s $?
>      done
>
>      for s in $(seq 64 $inc $max)
>      do
>          echo >> log
>          echo $flag :TCP_STREAM $s >> log
>          netperf -H 192.168.122.202 -l 5 -t TCP_STREAM -- -m $s >> log
>          echo $flag :TCP_STREAM $s $?
>      done
> }
>
> function test_merge()
> {
>      XDP='no-xdp'
>
>      ssh root@....168.122.202 ip link set dev eth0 xdp off
>      test $XDP
>
>      XDP='xdp-pass'
>
>      ssh root@....168.122.202 ip link set dev eth0 xdp off
>      ssh root@....168.122.202 ip link set dev eth0 xdp object xdp.o sec xdp
>      test $XDP
>
>      XDP='xdp-meta'
>
>      ssh root@....168.122.202 ip link set dev eth0 xdp off
>      ssh root@....168.122.202 ip link set dev eth0 xdp object xdp_meta.o sec xdp_mark
>      test $XDP
> }
>
> function test_big()
> {
>      # set host net dev mtu to 60000
>      test "big" 5000
>      test "big" 60000 10
> }
>
> ssh root@....168.122.202 ./netserver
> echo '' > log
>
> test_merge # or test_big
> ================================== xdp.c =================================
>
> SEC("xdp")
> int _xdp(struct xdp_md *xdp)
> {
>          return XDP_PASS;
> }
>
> char _license[] SEC("license") = "GPL";
>
> ================================== xdp_meta.c =================================
>
> static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54;
>
> struct meta_info {
>          __u32 mark;
> } __attribute__((aligned(4)));
>
> SEC("xdp_mark")
> int _xdp_mark(struct xdp_md *ctx)
> {
>          struct meta_info *meta;
>          void *data, *data_end;
>          int ret;
>
>          ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta));
>          if (ret < 0)
>                  return XDP_ABORTED;
>
>          data = (void *)(unsigned long)ctx->data;
>
>          /* Check data_meta have room for meta_info struct */
>          meta = (void *)(unsigned long)ctx->data_meta;
>          if ((void *)(meta + 1) > data)
>                  return XDP_ABORTED;
>
>          meta->mark = 42;
>
>          return XDP_PASS;
> }
>
> char _license[] SEC("license") = "GPL";
> ===================================================================


I'm not sure virtio-net needs such a huge refactoring that just do the 
simplification of parameters passing for page_to_skb(). This will 
complicate stable backports in the future.

I suspect it may have performance regression.

Do you have any perf numbers? E.g pps.

Thanks


>
> Signed-off-by: Xuan Zhuo <xuanzhuo@...ux.alibaba.com>
> ---
>   drivers/net/virtio_net.c | 309 +++++++++++++++++++++++----------------
>   1 file changed, 185 insertions(+), 124 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7fda2ae4c40f..a117b3496653 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -249,6 +249,35 @@ struct padded_vnet_hdr {
>   	char padding[4];
>   };
>   
> +struct virtnet_page_info {
> +	struct virtnet_info *vi;
> +	struct receive_queue *rq;
> +
> +	/* this may be the head_page, buf not starts with this page */
> +	struct page *page;
> +
> +	/* the allcated buf. this may point to the headroom */
> +	char *buf;
> +
> +	/* the size of the buf */
> +	unsigned int buf_size;
> +
> +	/* OUT. the offset of the remaining data in the page */
> +	unsigned int offset;
> +
> +	char *virtnet_hdr;
> +
> +	/* packet data. generally point to eth header */
> +	char *packet;
> +
> +	/* IN. packet len without virtnet hdr
> +	 * OUT. the size of the remaining data
> +	 */
> +	unsigned int len;
> +
> +	unsigned int metasize;
> +};
> +
>   static bool is_xdp_frame(void *ptr)
>   {
>   	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
> @@ -357,6 +386,89 @@ static void skb_xmit_done(struct virtqueue *vq)
>   		netif_wake_subqueue(vi->dev, vq2txq(vq));
>   }
>   
> +static struct sk_buff *virtnet_page_to_skb(struct virtnet_page_info *pinfo)
> +{
> +	int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	struct virtio_net_hdr_mrg_rxbuf *hdr;
> +	struct sk_buff *skb;
> +	int tailroom, copy;
> +
> +	/* In the case of "big", tailroom may be negative, because len can be
> +	 * greater than PAGE_SIZE.
> +	 */
> +	tailroom = pinfo->buf + pinfo->buf_size - (pinfo->packet + pinfo->len);
> +
> +	if (!NET_IP_ALIGN && tailroom >= shinfo_size) {
> +		skb = build_skb(pinfo->buf, pinfo->buf_size);
> +		if (unlikely(!skb))
> +			return NULL;
> +
> +		skb_reserve(skb, pinfo->packet - pinfo->buf);
> +		skb_put(skb, pinfo->len);
> +
> +		/* mark. page has been used. */
> +		pinfo->page = NULL;
> +	} else {
> +		/* copy small data so we can reuse these pages for small data
> +		 *
> +		 * GOOD_COPY_LEN is used to save network headers, such as eth
> +		 * header, ip header, tcp header. If you want to save metadata
> +		 * information, we should apply for a larger space. Prevent the
> +		 * network header cannot fit in the linear space.
> +		 */
> +		skb = napi_alloc_skb(&pinfo->rq->napi,
> +				     pinfo->metasize + GOOD_COPY_LEN);
> +		if (unlikely(!skb))
> +			return NULL;
> +
> +		/* Copy all frame if it fits skb->head, otherwise
> +		 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
> +		 */
> +		if (pinfo->len <= GOOD_COPY_LEN)
> +			copy = pinfo->len;
> +		else
> +			copy = ETH_HLEN;
> +
> +		skb_put_data(skb, pinfo->packet - pinfo->metasize,
> +			     copy + pinfo->metasize);
> +		__skb_pull(skb, pinfo->metasize);
> +		pinfo->len -= copy;
> +		pinfo->offset = pinfo->packet + copy -
> +				(char *)page_address(pinfo->page);
> +	}
> +
> +	if (pinfo->metasize)
> +		skb_metadata_set(skb, pinfo->metasize);
> +
> +	if (pinfo->virtnet_hdr) {
> +		hdr = skb_vnet_hdr(skb);
> +		memcpy(hdr, pinfo->virtnet_hdr, pinfo->vi->hdr_len);
> +	}
> +
> +	return skb;
> +}
> +
> +static struct sk_buff *virtnet_merge_page_to_skb(struct virtnet_page_info *pinfo)
> +{
> +	struct sk_buff *skb;
> +
> +	skb = virtnet_page_to_skb(pinfo);
> +	if (unlikely(!skb))
> +		return NULL;
> +
> +	/* page has been used by build_skb() */
> +	if (!pinfo->page)
> +		return skb;
> +
> +	if (pinfo->len)
> +		skb_add_rx_frag(skb, 0, pinfo->page, pinfo->offset, pinfo->len,
> +				pinfo->buf_size);
> +	else
> +		put_page(pinfo->page);
> +
> +	return skb;
> +}
> +
>   #define MRG_CTX_HEADER_SHIFT 22
>   static void *mergeable_len_to_ctx(unsigned int truesize,
>   				  unsigned int headroom)
> @@ -375,86 +487,30 @@ static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
>   }
>   
>   /* Called from bottom half context */
> -static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> -				   struct receive_queue *rq,
> -				   struct page *page, unsigned int offset,
> -				   unsigned int len, unsigned int truesize,
> -				   bool hdr_valid, unsigned int metasize,
> -				   unsigned int headroom)
> +static struct sk_buff *virtnet_big_page_to_skb(struct virtnet_page_info *pinfo)
>   {
> +	unsigned int len, offset, truesize;
> +	struct receive_queue *rq;
>   	struct sk_buff *skb;
> -	struct virtio_net_hdr_mrg_rxbuf *hdr;
> -	unsigned int copy, hdr_len, hdr_padded_len;
> -	struct page *page_to_free = NULL;
> -	int tailroom, shinfo_size;
> -	char *p, *hdr_p, *buf;
> +	struct page *page;
>   
> -	p = page_address(page) + offset;
> -	hdr_p = p;
> +	/* save next page */
> +	page = (struct page *)pinfo->page->private;
>   
> -	hdr_len = vi->hdr_len;
> -	if (vi->mergeable_rx_bufs)
> -		hdr_padded_len = sizeof(*hdr);
> -	else
> -		hdr_padded_len = sizeof(struct padded_vnet_hdr);
> -
> -	/* If headroom is not 0, there is an offset between the beginning of the
> -	 * data and the allocated space, otherwise the data and the allocated
> -	 * space are aligned.
> -	 */
> -	if (headroom) {
> -		/* Buffers with headroom use PAGE_SIZE as alloc size,
> -		 * see add_recvbuf_mergeable() + get_mergeable_buf_len()
> -		 */
> -		truesize = PAGE_SIZE;
> -		tailroom = truesize - len - offset;
> -		buf = page_address(page);
> -	} else {
> -		tailroom = truesize - len;
> -		buf = p;
> -	}
> -
> -	len -= hdr_len;
> -	offset += hdr_padded_len;
> -	p += hdr_padded_len;
> -
> -	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -
> -	/* copy small packet so we can reuse these pages */
> -	if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
> -		skb = build_skb(buf, truesize);
> -		if (unlikely(!skb))
> -			return NULL;
> -
> -		skb_reserve(skb, p - buf);
> -		skb_put(skb, len);
> -		goto ok;
> -	}
> -
> -	/* copy small packet so we can reuse these pages for small data */
> -	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
> +	skb = virtnet_page_to_skb(pinfo);
>   	if (unlikely(!skb))
>   		return NULL;
>   
> -	/* Copy all frame if it fits skb->head, otherwise
> -	 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
> -	 */
> -	if (len <= skb_tailroom(skb))
> -		copy = len;
> -	else
> -		copy = ETH_HLEN + metasize;
> -	skb_put_data(skb, p, copy);
> +	rq = pinfo->rq;
>   
> -	len -= copy;
> -	offset += copy;
> +	/* page has been used by build_skb() */
> +	if (!pinfo->page)
> +		goto end;
>   
> -	if (vi->mergeable_rx_bufs) {
> -		if (len)
> -			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
> -		else
> -			page_to_free = page;
> -		goto ok;
> -	}
> +	page     = pinfo->page;
> +	len      = pinfo->len;
> +	offset   = pinfo->offset;
> +	truesize = pinfo->buf_size;
>   
>   	/*
>   	 * Verify that we can indeed put this data into a skb.
> @@ -477,23 +533,10 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   		offset = 0;
>   	}
>   
> +end:
>   	if (page)
>   		give_pages(rq, page);
>   
> -ok:
> -	/* hdr_valid means no XDP, so we can copy the vnet header */
> -	if (hdr_valid) {
> -		hdr = skb_vnet_hdr(skb);
> -		memcpy(hdr, hdr_p, hdr_len);
> -	}
> -	if (page_to_free)
> -		put_page(page_to_free);
> -
> -	if (metasize) {
> -		__skb_pull(skb, metasize);
> -		skb_metadata_set(skb, metasize);
> -	}
> -
>   	return skb;
>   }
>   
> @@ -654,17 +697,17 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
>    */
>   static struct page *xdp_linearize_page(struct receive_queue *rq,
>   				       u16 *num_buf,
> -				       struct page *p,
> -				       int offset,
> +				       void *buf,
>   				       int page_off,
>   				       unsigned int *len)
>   {
>   	struct page *page = alloc_page(GFP_ATOMIC);
> +	struct page *p;
>   
>   	if (!page)
>   		return NULL;
>   
> -	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
> +	memcpy(page_address(page) + page_off, buf, *len);
>   	page_off += *len;
>   
>   	while (--*num_buf) {
> @@ -739,18 +782,18 @@ static struct sk_buff *receive_small(struct net_device *dev,
>   			goto err_xdp;
>   
>   		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
> -			int offset = buf - page_address(page) + header_offset;
>   			unsigned int tlen = len + vi->hdr_len;
>   			u16 num_buf = 1;
>   
> +			buf += header_offset;
> +
>   			xdp_headroom = virtnet_get_headroom(vi);
>   			header_offset = VIRTNET_RX_PAD + xdp_headroom;
>   			headroom = vi->hdr_len + header_offset;
>   			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
>   				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> -			xdp_page = xdp_linearize_page(rq, &num_buf, page,
> -						      offset, header_offset,
> -						      &tlen);
> +			xdp_page = xdp_linearize_page(rq, &num_buf, buf,
> +						      header_offset, &tlen);
>   			if (!xdp_page)
>   				goto err_xdp;
>   
> @@ -842,9 +885,21 @@ static struct sk_buff *receive_big(struct net_device *dev,
>   				   unsigned int len,
>   				   struct virtnet_rq_stats *stats)
>   {
> +	struct virtnet_page_info pinfo;
>   	struct page *page = buf;
> -	struct sk_buff *skb =
> -		page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
> +	struct sk_buff *skb;
> +
> +	pinfo.rq          = rq;
> +	pinfo.vi          = vi;
> +	pinfo.page        = page;
> +	pinfo.buf         = page_address(page);
> +	pinfo.buf_size    = PAGE_SIZE;
> +	pinfo.virtnet_hdr = pinfo.buf;
> +	pinfo.packet      = pinfo.virtnet_hdr + sizeof(struct padded_vnet_hdr);
> +	pinfo.len         = len - vi->hdr_len;
> +	pinfo.metasize    = 0;
> +
> +	skb = virtnet_big_page_to_skb(&pinfo);
>   
>   	stats->bytes += len - vi->hdr_len;
>   	if (unlikely(!skb))
> @@ -870,12 +925,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
>   	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
>   	struct page *page = virt_to_head_page(buf);
> -	int offset = buf - page_address(page);
>   	struct sk_buff *head_skb, *curr_skb;
> +	struct virtnet_page_info pinfo;
>   	struct bpf_prog *xdp_prog;
>   	unsigned int truesize = mergeable_ctx_to_truesize(ctx);
>   	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
> -	unsigned int metasize = 0;
>   	unsigned int frame_sz;
>   	int err;
>   
> @@ -887,8 +941,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	if (xdp_prog) {
>   		struct xdp_frame *xdpf;
>   		struct page *xdp_page;
> +		void *hard_start;
>   		struct xdp_buff xdp;
> -		void *data;
>   		u32 act;
>   
>   		/* Transient failure which in theory could occur if
> @@ -912,54 +966,47 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   		if (unlikely(num_buf > 1 ||
>   			     headroom < virtnet_get_headroom(vi))) {
>   			/* linearize data for XDP */
> -			xdp_page = xdp_linearize_page(rq, &num_buf,
> -						      page, offset,
> +			xdp_page = xdp_linearize_page(rq, &num_buf, buf,
>   						      VIRTIO_XDP_HEADROOM,
>   						      &len);
>   			frame_sz = PAGE_SIZE;
>   
>   			if (!xdp_page)
>   				goto err_xdp;
> -			offset = VIRTIO_XDP_HEADROOM;
> +
> +			hard_start = page_address(xdp_page) + vi->hdr_len;
>   		} else {
>   			xdp_page = page;
> +			hard_start = buf + vi->hdr_len - VIRTIO_XDP_HEADROOM;
>   		}
>   
>   		/* Allow consuming headroom but reserve enough space to push
>   		 * the descriptor on if we get an XDP_TX return code.
>   		 */
> -		data = page_address(xdp_page) + offset;
>   		xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
> -		xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
> -				 VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
> +		xdp_prepare_buff(&xdp, hard_start, VIRTIO_XDP_HEADROOM,
> +				 len - vi->hdr_len, true);
>   
>   		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>   		stats->xdp_packets++;
>   
>   		switch (act) {
>   		case XDP_PASS:
> -			metasize = xdp.data - xdp.data_meta;
> -
> -			/* recalculate offset to account for any header
> -			 * adjustments and minus the metasize to copy the
> -			 * metadata in page_to_skb(). Note other cases do not
> -			 * build an skb and avoid using offset
> -			 */
> -			offset = xdp.data - page_address(xdp_page) -
> -				 vi->hdr_len - metasize;
> -
> -			/* recalculate len if xdp.data, xdp.data_end or
> -			 * xdp.data_meta were adjusted
> -			 */
> -			len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
> +			pinfo.rq          = rq;
> +			pinfo.vi          = vi;
> +			pinfo.page        = xdp_page;
> +			pinfo.buf         = xdp.data_hard_start - vi->hdr_len;
> +			pinfo.buf_size    = PAGE_SIZE;
> +			pinfo.virtnet_hdr = NULL;
> +			pinfo.packet      = xdp.data;
> +			pinfo.len         = xdp.data_end - xdp.data;
> +			pinfo.metasize    = xdp.data - xdp.data_meta;
>   			/* We can only create skb based on xdp_page. */
>   			if (unlikely(xdp_page != page)) {
>   				rcu_read_unlock();
>   				put_page(page);
> -				head_skb = page_to_skb(vi, rq, xdp_page, offset,
> -						       len, PAGE_SIZE, false,
> -						       metasize, headroom);
> -				return head_skb;
> +
> +				return virtnet_merge_page_to_skb(&pinfo);
>   			}
>   			break;
>   		case XDP_TX:
> @@ -1005,8 +1052,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   				__free_pages(xdp_page, 0);
>   			goto err_xdp;
>   		}
> +		rcu_read_unlock();
> +
> +		/* pinfo has been filled inside XDP_PASS */
> +	} else {
> +		rcu_read_unlock();
> +
> +		pinfo.rq          = rq;
> +		pinfo.vi          = vi;
> +		pinfo.page        = page;
> +		pinfo.buf         = buf - headroom;
> +		pinfo.buf_size    = headroom ? PAGE_SIZE : truesize;
> +		pinfo.virtnet_hdr = buf;
> +		pinfo.packet      = buf + sizeof(*hdr);
> +		pinfo.len         = len - sizeof(*hdr);
> +		pinfo.metasize    = 0;
>   	}
> -	rcu_read_unlock();
>   
>   	if (unlikely(len > truesize)) {
>   		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> @@ -1015,14 +1076,14 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   		goto err_skb;
>   	}
>   
> -	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
> -			       metasize, headroom);
> +	head_skb = virtnet_merge_page_to_skb(&pinfo);
>   	curr_skb = head_skb;
>   
>   	if (unlikely(!curr_skb))
>   		goto err_skb;
>   	while (--num_buf) {
>   		int num_skb_frags;
> +		int offset;
>   
>   		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
>   		if (unlikely(!buf)) {

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ