lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Y+zxY07GZ8aI7LrV@lore-desk>
Date:   Wed, 15 Feb 2023 15:51:15 +0100
From:   Lorenzo Bianconi <lorenzo.bianconi@...hat.com>
To:     Tirthendu Sarkar <tirthendu.sarkar@...el.com>
Cc:     intel-wired-lan@...ts.osuosl.org, jesse.brandeburg@...el.com,
        anthony.l.nguyen@...el.com, netdev@...r.kernel.org,
        bpf@...r.kernel.org, magnus.karlsson@...el.com,
        maciej.fijalkowski@...el.com
Subject: Re: [PATCH intel-next v4 8/8] i40e: add support for XDP multi-buffer
 Rx

> This patch adds multi-buffer support for the i40e_driver.
> 

[...]

>  
>  	netdev->features &= ~NETIF_F_HW_TC;
>  	netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
> -			       NETDEV_XDP_ACT_XSK_ZEROCOPY;
> +			       NETDEV_XDP_ACT_XSK_ZEROCOPY |
> +			       NETDEV_XDP_ACT_RX_SG;

Hi Tirthendu,

I guess we should set it just for I40E_VSI_MAIN, I posted a patch yesterday
to fix it:
https://patchwork.kernel.org/project/netdevbpf/patch/f2b537f86b34fc176fbc6b3d249b46a20a87a2f3.1676405131.git.lorenzo@kernel.org/

can you please rebase on top of it?

Regards,
Lorenzo

>  
>  	if (vsi->type == I40E_VSI_MAIN) {
>  		SET_NETDEV_DEV(netdev, &pf->pdev->dev);
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index dc2c9aae0ffe..7ace7b7ec256 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -1477,9 +1477,6 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
>  	if (!rx_ring->rx_bi)
>  		return;
>  
> -	dev_kfree_skb(rx_ring->skb);
> -	rx_ring->skb = NULL;
> -
>  	if (rx_ring->xsk_pool) {
>  		i40e_xsk_clean_rx_ring(rx_ring);
>  		goto skip_free;
> @@ -2033,36 +2030,6 @@ static void i40e_rx_buffer_flip(struct i40e_rx_buffer *rx_buffer,
>  #endif
>  }
>  
> -/**
> - * i40e_add_rx_frag - Add contents of Rx buffer to sk_buff
> - * @rx_ring: rx descriptor ring to transact packets on
> - * @rx_buffer: buffer containing page to add
> - * @skb: sk_buff to place the data into
> - * @size: packet length from rx_desc
> - *
> - * This function will add the data contained in rx_buffer->page to the skb.
> - * It will just attach the page as a frag to the skb.
> - *
> - * The function will then update the page offset.
> - **/
> -static void i40e_add_rx_frag(struct i40e_ring *rx_ring,
> -			     struct i40e_rx_buffer *rx_buffer,
> -			     struct sk_buff *skb,
> -			     unsigned int size)
> -{
> -#if (PAGE_SIZE < 8192)
> -	unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
> -#else
> -	unsigned int truesize = SKB_DATA_ALIGN(size + rx_ring->rx_offset);
> -#endif
> -
> -	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
> -			rx_buffer->page_offset, size, truesize);
> -
> -	/* page is being used so we must update the page offset */
> -	i40e_rx_buffer_flip(rx_buffer, truesize);
> -}
> -
>  /**
>   * i40e_get_rx_buffer - Fetch Rx buffer and synchronize data for use
>   * @rx_ring: rx descriptor ring to transact packets on
> @@ -2099,20 +2066,82 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
>  }
>  
>  /**
> - * i40e_construct_skb - Allocate skb and populate it
> + * i40e_put_rx_buffer - Clean up used buffer and either recycle or free
>   * @rx_ring: rx descriptor ring to transact packets on
>   * @rx_buffer: rx buffer to pull data from
> + *
> + * This function will clean up the contents of the rx_buffer.  It will
> + * either recycle the buffer or unmap it and free the associated resources.
> + */
> +static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
> +			       struct i40e_rx_buffer *rx_buffer)
> +{
> +	if (i40e_can_reuse_rx_page(rx_buffer, &rx_ring->rx_stats)) {
> +		/* hand second half of page back to the ring */
> +		i40e_reuse_rx_page(rx_ring, rx_buffer);
> +	} else {
> +		/* we are not reusing the buffer so unmap it */
> +		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
> +				     i40e_rx_pg_size(rx_ring),
> +				     DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
> +		__page_frag_cache_drain(rx_buffer->page,
> +					rx_buffer->pagecnt_bias);
> +		/* clear contents of buffer_info */
> +		rx_buffer->page = NULL;
> +	}
> +}
> +
> +/**
> + * i40e_process_rx_buffs- Processing of buffers post XDP prog or on error
> + * @rx_ring: Rx descriptor ring to transact packets on
> + * @xdp_res: Result of the XDP program
> + * @xdp: xdp_buff pointing to the data
> + **/
> +static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
> +				  struct xdp_buff *xdp)
> +{
> +	u16 next = rx_ring->next_to_clean;
> +	struct i40e_rx_buffer *rx_buffer;
> +
> +	xdp->flags = 0;
> +
> +	while (1) {
> +		rx_buffer = i40e_rx_bi(rx_ring, next);
> +		if (++next == rx_ring->count)
> +			next = 0;
> +
> +		if (!rx_buffer->page)
> +			continue;
> +
> +		if (xdp_res == I40E_XDP_CONSUMED)
> +			rx_buffer->pagecnt_bias++;
> +		else
> +			i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
> +
> +		/* EOP buffer will be put in i40e_clean_rx_irq() */
> +		if (next == rx_ring->next_to_process)
> +			return;
> +
> +		i40e_put_rx_buffer(rx_ring, rx_buffer);
> +	}
> +}
> +
> +/**
> + * i40e_construct_skb - Allocate skb and populate it
> + * @rx_ring: rx descriptor ring to transact packets on
>   * @xdp: xdp_buff pointing to the data
> + * @nr_frags: number of buffers for the packet
>   *
>   * This function allocates an skb.  It then populates it with the page
>   * data from the current receive descriptor, taking care to set up the
>   * skb correctly.
>   */
>  static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
> -					  struct i40e_rx_buffer *rx_buffer,
> -					  struct xdp_buff *xdp)
> +					  struct xdp_buff *xdp,
> +					  u32 nr_frags)
>  {
>  	unsigned int size = xdp->data_end - xdp->data;
> +	struct i40e_rx_buffer *rx_buffer;
>  	unsigned int headlen;
>  	struct sk_buff *skb;
>  
> @@ -2152,13 +2181,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
>  	memcpy(__skb_put(skb, headlen), xdp->data,
>  	       ALIGN(headlen, sizeof(long)));
>  
> +	rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
>  	/* update all of the pointers */
>  	size -= headlen;
>  	if (size) {
> +		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
> +			dev_kfree_skb(skb);
> +			return NULL;
> +		}
>  		skb_add_rx_frag(skb, 0, rx_buffer->page,
>  				rx_buffer->page_offset + headlen,
>  				size, xdp->frame_sz);
> -
>  		/* buffer is used by skb, update page_offset */
>  		i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
>  	} else {
> @@ -2166,21 +2199,40 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
>  		rx_buffer->pagecnt_bias++;
>  	}
>  
> +	if (unlikely(xdp_buff_has_frags(xdp))) {
> +		struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb);
> +
> +		sinfo = xdp_get_shared_info_from_buff(xdp);
> +		memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0],
> +		       sizeof(skb_frag_t) * nr_frags);
> +
> +		xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags,
> +					   sinfo->xdp_frags_size,
> +					   nr_frags * xdp->frame_sz,
> +					   xdp_buff_is_frag_pfmemalloc(xdp));
> +
> +		/* First buffer has already been processed, so bump ntc */
> +		if (++rx_ring->next_to_clean == rx_ring->count)
> +			rx_ring->next_to_clean = 0;
> +
> +		i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp);
> +	}
> +
>  	return skb;
>  }
>  
>  /**
>   * i40e_build_skb - Build skb around an existing buffer
>   * @rx_ring: Rx descriptor ring to transact packets on
> - * @rx_buffer: Rx buffer to pull data from
>   * @xdp: xdp_buff pointing to the data
> + * @nr_frags: number of buffers for the packet
>   *
>   * This function builds an skb around an existing Rx buffer, taking care
>   * to set up the skb correctly and avoid any memcpy overhead.
>   */
>  static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
> -				      struct i40e_rx_buffer *rx_buffer,
> -				      struct xdp_buff *xdp)
> +				      struct xdp_buff *xdp,
> +				      u32 nr_frags)
>  {
>  	unsigned int metasize = xdp->data - xdp->data_meta;
>  	struct sk_buff *skb;
> @@ -2203,36 +2255,25 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
>  	if (metasize)
>  		skb_metadata_set(skb, metasize);
>  
> -	/* buffer is used by skb, update page_offset */
> -	i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
> +	if (unlikely(xdp_buff_has_frags(xdp))) {
> +		struct skb_shared_info *sinfo;
>  
> -	return skb;
> -}
> +		sinfo = xdp_get_shared_info_from_buff(xdp);
> +		xdp_update_skb_shared_info(skb, nr_frags,
> +					   sinfo->xdp_frags_size,
> +					   nr_frags * xdp->frame_sz,
> +					   xdp_buff_is_frag_pfmemalloc(xdp));
>  
> -/**
> - * i40e_put_rx_buffer - Clean up used buffer and either recycle or free
> - * @rx_ring: rx descriptor ring to transact packets on
> - * @rx_buffer: rx buffer to pull data from
> - *
> - * This function will clean up the contents of the rx_buffer.  It will
> - * either recycle the buffer or unmap it and free the associated resources.
> - */
> -static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
> -			       struct i40e_rx_buffer *rx_buffer)
> -{
> -	if (i40e_can_reuse_rx_page(rx_buffer, &rx_ring->rx_stats)) {
> -		/* hand second half of page back to the ring */
> -		i40e_reuse_rx_page(rx_ring, rx_buffer);
> +		i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp);
>  	} else {
> -		/* we are not reusing the buffer so unmap it */
> -		dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
> -				     i40e_rx_pg_size(rx_ring),
> -				     DMA_FROM_DEVICE, I40E_RX_DMA_ATTR);
> -		__page_frag_cache_drain(rx_buffer->page,
> -					rx_buffer->pagecnt_bias);
> -		/* clear contents of buffer_info */
> -		rx_buffer->page = NULL;
> +		struct i40e_rx_buffer *rx_buffer;
> +
> +		rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
> +		/* buffer is used by skb, update page_offset */
> +		i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
>  	}
> +
> +	return skb;
>  }
>  
>  /**
> @@ -2387,6 +2428,55 @@ static void i40e_inc_ntp(struct i40e_ring *rx_ring)
>  	prefetch(I40E_RX_DESC(rx_ring, ntp));
>  }
>  
> +/**
> + * i40e_add_xdp_frag: Add a frag to xdp_buff
> + * @xdp: xdp_buff pointing to the data
> + * @nr_frags: return number of buffers for the packet
> + * @rx_buffer: rx_buffer holding data of the current frag
> + * @size: size of data of current frag
> + */
> +static int i40e_add_xdp_frag(struct xdp_buff *xdp, u32 *nr_frags,
> +			     struct i40e_rx_buffer *rx_buffer, u32 size)
> +{
> +	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> +
> +	if (!xdp_buff_has_frags(xdp)) {
> +		sinfo->nr_frags = 0;
> +		sinfo->xdp_frags_size = 0;
> +		xdp_buff_set_frags_flag(xdp);
> +	} else if (unlikely(sinfo->nr_frags >= MAX_SKB_FRAGS)) {
> +		/* Overflowing packet: All frags need to be dropped */
> +		return -ENOMEM;
> +	}
> +
> +	__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buffer->page,
> +				   rx_buffer->page_offset, size);
> +
> +	sinfo->xdp_frags_size += size;
> +
> +	if (page_is_pfmemalloc(rx_buffer->page))
> +		xdp_buff_set_frag_pfmemalloc(xdp);
> +	*nr_frags = sinfo->nr_frags;
> +
> +	return 0;
> +}
> +
> +/**
> + * i40e_consume_xdp_buff - Consume all the buffers of the packet and update ntc
> + * @rx_ring: rx descriptor ring to transact packets on
> + * @xdp: xdp_buff pointing to the data
> + * @rx_buffer: rx_buffer of eop desc
> + */
> +static void i40e_consume_xdp_buff(struct i40e_ring *rx_ring,
> +				  struct xdp_buff *xdp,
> +				  struct i40e_rx_buffer *rx_buffer)
> +{
> +	i40e_process_rx_buffs(rx_ring, I40E_XDP_CONSUMED, xdp);
> +	i40e_put_rx_buffer(rx_ring, rx_buffer);
> +	rx_ring->next_to_clean = rx_ring->next_to_process;
> +	xdp->data = NULL;
> +}
> +
>  /**
>   * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
>   * @rx_ring: rx descriptor ring to transact packets on
> @@ -2405,9 +2495,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
>  {
>  	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
>  	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
> +	u16 clean_threshold = rx_ring->count / 2;
>  	unsigned int offset = rx_ring->rx_offset;
>  	struct xdp_buff *xdp = &rx_ring->xdp;
> -	struct sk_buff *skb = rx_ring->skb;
>  	unsigned int xdp_xmit = 0;
>  	struct bpf_prog *xdp_prog;
>  	bool failure = false;
> @@ -2419,11 +2509,14 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
>  		u16 ntp = rx_ring->next_to_process;
>  		struct i40e_rx_buffer *rx_buffer;
>  		union i40e_rx_desc *rx_desc;
> +		struct sk_buff *skb;
>  		unsigned int size;
> +		u32 nfrags = 0;
> +		bool neop;
>  		u64 qword;
>  
>  		/* return some buffers to hardware, one at a time is too slow */
> -		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
> +		if (cleaned_count >= clean_threshold) {
>  			failure = failure ||
>  				  i40e_alloc_rx_buffers(rx_ring, cleaned_count);
>  			cleaned_count = 0;
> @@ -2461,76 +2554,83 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
>  			break;
>  
>  		i40e_trace(clean_rx_irq, rx_ring, rx_desc, xdp);
> +		/* retrieve a buffer from the ring */
>  		rx_buffer = i40e_get_rx_buffer(rx_ring, size);
>  
> -		/* retrieve a buffer from the ring */
> -		if (!skb) {
> +		neop = i40e_is_non_eop(rx_ring, rx_desc);
> +		i40e_inc_ntp(rx_ring);
> +
> +		if (!xdp->data) {
>  			unsigned char *hard_start;
>  
>  			hard_start = page_address(rx_buffer->page) +
>  				     rx_buffer->page_offset - offset;
>  			xdp_prepare_buff(xdp, hard_start, offset, size, true);
> -			xdp_buff_clear_frags_flag(xdp);
>  #if (PAGE_SIZE > 4096)
>  			/* At larger PAGE_SIZE, frame_sz depend on len size */
>  			xdp->frame_sz = i40e_rx_frame_truesize(rx_ring, size);
>  #endif
> -			xdp_res = i40e_run_xdp(rx_ring, xdp, xdp_prog);
> +		} else if (i40e_add_xdp_frag(xdp, &nfrags, rx_buffer, size) &&
> +			   !neop) {
> +			/* Overflowing packet: Drop all frags on EOP */
> +			i40e_consume_xdp_buff(rx_ring, xdp, rx_buffer);
> +			break;
>  		}
>  
> +		if (neop)
> +			continue;
> +
> +		xdp_res = i40e_run_xdp(rx_ring, xdp, xdp_prog);
> +
>  		if (xdp_res) {
> -			if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
> -				xdp_xmit |= xdp_res;
> +			xdp_xmit |= xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR);
> +
> +			if (unlikely(xdp_buff_has_frags(xdp))) {
> +				i40e_process_rx_buffs(rx_ring, xdp_res, xdp);
> +				size = xdp_get_buff_len(xdp);
> +			} else if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
>  				i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
>  			} else {
>  				rx_buffer->pagecnt_bias++;
>  			}
>  			total_rx_bytes += size;
> -			total_rx_packets++;
> -		} else if (skb) {
> -			i40e_add_rx_frag(rx_ring, rx_buffer, skb, size);
> -		} else if (ring_uses_build_skb(rx_ring)) {
> -			skb = i40e_build_skb(rx_ring, rx_buffer, xdp);
>  		} else {
> -			skb = i40e_construct_skb(rx_ring, rx_buffer, xdp);
> -		}
> +			if (ring_uses_build_skb(rx_ring))
> +				skb = i40e_build_skb(rx_ring, xdp, nfrags);
> +			else
> +				skb = i40e_construct_skb(rx_ring, xdp, nfrags);
> +
> +			/* drop if we failed to retrieve a buffer */
> +			if (!skb) {
> +				rx_ring->rx_stats.alloc_buff_failed++;
> +				i40e_consume_xdp_buff(rx_ring, xdp, rx_buffer);
> +				break;
> +			}
>  
> -		/* exit if we failed to retrieve a buffer */
> -		if (!xdp_res && !skb) {
> -			rx_ring->rx_stats.alloc_buff_failed++;
> -			rx_buffer->pagecnt_bias++;
> -			break;
> -		}
> +			if (i40e_cleanup_headers(rx_ring, skb, rx_desc))
> +				goto process_next;
>  
> -		i40e_put_rx_buffer(rx_ring, rx_buffer);
> -		cleaned_count++;
> +			/* probably a little skewed due to removing CRC */
> +			total_rx_bytes += skb->len;
>  
> -		i40e_inc_ntp(rx_ring);
> -		rx_ring->next_to_clean = rx_ring->next_to_process;
> -		if (i40e_is_non_eop(rx_ring, rx_desc))
> -			continue;
> +			/* populate checksum, VLAN, and protocol */
> +			i40e_process_skb_fields(rx_ring, rx_desc, skb);
>  
> -		if (xdp_res || i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
> -			skb = NULL;
> -			continue;
> +			i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, xdp);
> +			napi_gro_receive(&rx_ring->q_vector->napi, skb);
>  		}
>  
> -		/* probably a little skewed due to removing CRC */
> -		total_rx_bytes += skb->len;
> -
> -		/* populate checksum, VLAN, and protocol */
> -		i40e_process_skb_fields(rx_ring, rx_desc, skb);
> -
> -		i40e_trace(clean_rx_irq_rx, rx_ring, rx_desc, xdp);
> -		napi_gro_receive(&rx_ring->q_vector->napi, skb);
> -		skb = NULL;
> -
>  		/* update budget accounting */
>  		total_rx_packets++;
> +process_next:
> +		cleaned_count += nfrags + 1;
> +		i40e_put_rx_buffer(rx_ring, rx_buffer);
> +		rx_ring->next_to_clean = rx_ring->next_to_process;
> +
> +		xdp->data = NULL;
>  	}
>  
>  	i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
> -	rx_ring->skb = skb;
>  
>  	i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
>  
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index e86abc25bb5e..14ad074639ab 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -393,14 +393,6 @@ struct i40e_ring {
>  
>  	struct rcu_head rcu;		/* to avoid race on free */
>  	u16 next_to_alloc;
> -	struct sk_buff *skb;		/* When i40e_clean_rx_ring_irq() must
> -					 * return before it sees the EOP for
> -					 * the current packet, we save that skb
> -					 * here and resume receiving this
> -					 * packet the next time
> -					 * i40e_clean_rx_ring_irq() is called
> -					 * for this ring.
> -					 */
>  
>  	struct i40e_channel *ch;
>  	u16 rx_offset;
> -- 
> 2.34.1
> 

Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ