Message-ID: <1e78d2df-2b9e-3e46-017e-5cc42807db03@intel.com>
Date:   Mon, 30 Jan 2023 13:26:39 +0100
From:   Alexander Lobakin <alexandr.lobakin@...el.com>
To:     William Tu <u9012063@...il.com>
CC:     <netdev@...r.kernel.org>, <jsankararama@...are.com>,
        <gyang@...are.com>, <doshir@...are.com>,
        <alexander.duyck@...il.com>, <bang@...are.com>,
        "Yifeng Sun" <yifengs@...are.com>,
        Alexander Duyck <alexanderduyck@...com>
Subject: Re: [RFC PATCH net-next v15] vmxnet3: Add XDP support.

From: William Tu <u9012063@...il.com>
Date: Fri, 27 Jan 2023 08:30:27 -0800

> The patch adds native-mode XDP support: XDP DROP, PASS, TX, and REDIRECT.
> 
> Background:
> The vmxnet3 rx path consists of three rings: ring0, ring1, and dataring.
> For r0 and r1, buffers in r0 are allocated using the alloc_skb APIs and DMA
> mapped to the ring's descriptors. If LRO is enabled and the packet size is
> larger than 3K, VMXNET3_MAX_SKB_BUF_SIZE, then r1 is used to map the rest of
> the buffer beyond VMXNET3_MAX_SKB_BUF_SIZE. Each buffer in r1 is allocated
> using alloc_page. So for LRO packets, the payload will be in one buffer from
> r0 and multiple buffers from r1; for non-LRO packets smaller than 3K, only
> one descriptor in r0 is used.

[...]

> @@ -1404,6 +1496,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
>  	struct Vmxnet3_RxDesc rxCmdDesc;
>  	struct Vmxnet3_RxCompDesc rxComp;
>  #endif
> +	bool need_flush = false;
> +
>  	vmxnet3_getRxComp(rcd, &rq->comp_ring.base[rq->comp_ring.next2proc].rcd,
>  			  &rxComp);
>  	while (rcd->gen == rq->comp_ring.gen) {
> @@ -1444,6 +1538,31 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
>  			goto rcd_done;
>  		}
>  
> +		if (rcd->sop && rcd->eop && vmxnet3_xdp_enabled(adapter)) {

Hmm, it's a relatively big block of code for one `if`. I mean, could we
do it like this?

		if (!rcd->sop || !rcd->eop || !vmxnet3_xdp_enabled(adapter))
			goto skip_xdp;

		if (VMXNET3_RX_DATA_RING(adapter, rcd->rqID)) {
			...

This way you would save one indent level and make the code a little more
readable.
But it might be a matter of personal taste :) I just noticed you have
this `skip_xdp` label anyway, so why not reuse it here?
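
Just to illustrate the shape, the whole block from your hunk could then
read roughly like this (untested sketch only; skb_xdp_pass and act would
have to move up one scope):

		if (!rcd->sop || !rcd->eop || !vmxnet3_xdp_enabled(adapter))
			goto skip_xdp;

		if (VMXNET3_RX_DATA_RING(adapter, rcd->rqID)) {
			ctx->skb = NULL;
			goto skip_xdp; /* Handle it later. */
		}

		if (rbi->buf_type != VMXNET3_RX_BUF_XDP)
			goto rcd_done;

		act = vmxnet3_process_xdp(adapter, rq, rcd, rbi, rxd,
					  &skb_xdp_pass);
		if (act == XDP_PASS) {
			ctx->skb = skb_xdp_pass;
			goto sop_done;
		}
		ctx->skb = NULL;
		if (act == XDP_REDIRECT)
			need_flush = true;
		goto rcd_done;

skip_xdp: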

> +			struct sk_buff *skb_xdp_pass;
> +			int act;
> +
> +			if (VMXNET3_RX_DATA_RING(adapter, rcd->rqID)) {
> +				ctx->skb = NULL;
> +				goto skip_xdp; /* Handle it later. */
> +			}
> +
> +			if (rbi->buf_type != VMXNET3_RX_BUF_XDP)
> +				goto rcd_done;
> +
> +			act = vmxnet3_process_xdp(adapter, rq, rcd, rbi, rxd,
> +						  &skb_xdp_pass);
> +			if (act == XDP_PASS) {
> +				ctx->skb = skb_xdp_pass;
> +				goto sop_done;
> +			}
> +			ctx->skb = NULL;
> +			if (act == XDP_REDIRECT)
> +				need_flush = true;

			need_flush |= act == XDP_REDIRECT;

But this looks a bit ugly to be honest, I just wrote it to show off a
bit :D Your `if` is perfectly fine; I'm not even sure the line I wrote
above produces more optimized object code.

> +			goto rcd_done;
> +		}
> +skip_xdp:
> +
>  		if (rcd->sop) { /* first buf of the pkt */
>  			bool rxDataRingUsed;
>  			u16 len;
> @@ -1452,7 +1571,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
>  			       (rcd->rqID != rq->qid &&
>  				rcd->rqID != rq->dataRingQid));
>  
> -			BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_SKB);
> +			BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_SKB &&
> +			       rbi->buf_type != VMXNET3_RX_BUF_XDP);
>  			BUG_ON(ctx->skb != NULL || rbi->skb == NULL);
>  
>  			if (unlikely(rcd->len == 0)) {
> @@ -1470,6 +1590,26 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
>  			rxDataRingUsed =
>  				VMXNET3_RX_DATA_RING(adapter, rcd->rqID);
>  			len = rxDataRingUsed ? rcd->len : rbi->len;
> +
> +			if (rxDataRingUsed && vmxnet3_xdp_enabled(adapter)) {

Maybe save one level here as well:

			if (!rxDataRingUsed || !vmxnet3_xdp_enabled(adapter))
				goto alloc_skb;

			sz = rcd->rxdIdx * ...

> +				struct sk_buff *skb_xdp_pass;
> +				size_t sz;
> +				int act;
> +
> +				sz = rcd->rxdIdx * rq->data_ring.desc_size;
> +				act = vmxnet3_process_xdp_small(adapter, rq,
> +								&rq->data_ring.base[sz],
> +								rcd->len,
> +								&skb_xdp_pass);
> +				if (act == XDP_PASS) {
> +					ctx->skb = skb_xdp_pass;
> +					goto sop_done;
> +				}
> +				if (act == XDP_REDIRECT)
> +					need_flush = true;
> +
> +				goto rcd_done;
> +			}

alloc_skb:
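
Put together, that path would then collapse to something like this
(again only a sketch; sz, act and skb_xdp_pass would move up a scope
accordingly):

			if (!rxDataRingUsed || !vmxnet3_xdp_enabled(adapter))
				goto alloc_skb;

			sz = rcd->rxdIdx * rq->data_ring.desc_size;
			act = vmxnet3_process_xdp_small(adapter, rq,
							&rq->data_ring.base[sz],
							rcd->len,
							&skb_xdp_pass);
			if (act == XDP_PASS) {
				ctx->skb = skb_xdp_pass;
				goto sop_done;
			}
			if (act == XDP_REDIRECT)
				need_flush = true;
			goto rcd_done;

alloc_skb: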

>  			new_skb = netdev_alloc_skb_ip_align(adapter->netdev,
>  							    len);
>  			if (new_skb == NULL) {

[...]

> @@ -217,6 +221,9 @@ struct vmxnet3_tq_driver_stats {
>  	u64 linearized;         /* # of pkts linearized */
>  	u64 copy_skb_header;    /* # of times we have to copy skb header */
>  	u64 oversized_hdr;
> +
> +	u64 xdp_xmit;
> +	u64 xdp_xmit_err;

Sorry for missing this earlier... You use plain u64 here for the stats;
we used to do that everywhere, but u64_stats_t was introduced since then
to exclude partial updates and tearing. I know the existing stats above
are still plain u64, but maybe now is the best time to use u64_stats_t
for the new fields? :)
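
Roughly like this, if you go for it (just a sketch; where exactly the
syncp lives and how the ethtool read side is structured is of course up
to you):

	/* in struct vmxnet3_tq_driver_stats, from <linux/u64_stats_sync.h>: */
	struct u64_stats_sync	syncp;
	u64_stats_t		xdp_xmit;
	u64_stats_t		xdp_xmit_err;

	/* writer, e.g. in the XDP xmit path: */
	u64_stats_update_begin(&tq->stats.syncp);
	u64_stats_inc(&tq->stats.xdp_xmit);
	u64_stats_update_end(&tq->stats.syncp);

	/* reader, e.g. when filling the ethtool stats: */
	unsigned int start;
	u64 val;

	do {
		start = u64_stats_fetch_begin(&tq->stats.syncp);
		val = u64_stats_read(&tq->stats.xdp_xmit);
	} while (u64_stats_fetch_retry(&tq->stats.syncp, start));

u64_stats_update_begin()/_end() should be no-ops on 64-bit, so it won't
cost anything in the hotpath there.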

>  };
>  
>  struct vmxnet3_tx_ctx {
> @@ -253,12 +260,13 @@ struct vmxnet3_tx_queue {
>  						    * stopped */
>  	int				qid;
>  	u16				txdata_desc_size;
> -} __attribute__((__aligned__(SMP_CACHE_BYTES)));
> +} ____cacheline_aligned;
>  
>  enum vmxnet3_rx_buf_type {
>  	VMXNET3_RX_BUF_NONE = 0,
>  	VMXNET3_RX_BUF_SKB = 1,
> -	VMXNET3_RX_BUF_PAGE = 2
> +	VMXNET3_RX_BUF_PAGE = 2,
> +	VMXNET3_RX_BUF_XDP = 3,
>  };
>  
>  #define VMXNET3_RXD_COMP_PENDING        0
> @@ -285,6 +293,12 @@ struct vmxnet3_rq_driver_stats {
>  	u64 drop_err;
>  	u64 drop_fcs;
>  	u64 rx_buf_alloc_failure;
> +
> +	u64 xdp_packets;	/* Total packets processed by XDP. */
> +	u64 xdp_tx;
> +	u64 xdp_redirects;
> +	u64 xdp_drops;
> +	u64 xdp_aborted;

(same)

>  };
>  
>  struct vmxnet3_rx_data_ring {

[...]

> +static int
> +vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter,
> +		       struct xdp_frame *xdpf,
> +		       struct vmxnet3_tx_queue *tq, bool dma_map)
> +{
> +	struct vmxnet3_tx_buf_info *tbi = NULL;
> +	union Vmxnet3_GenericDesc *gdesc;
> +	struct vmxnet3_tx_ctx ctx;
> +	int tx_num_deferred;
> +	struct page *page;
> +	u32 buf_size;
> +	int ret = 0;
> +	u32 dw2;
> +
> +	dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
> +	dw2 |= xdpf->len;
> +	ctx.sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill;
> +	gdesc = ctx.sop_txd;
> +
> +	buf_size = xdpf->len;
> +	tbi = tq->buf_info + tq->tx_ring.next2fill;
> +
> +	if (vmxnet3_cmd_ring_desc_avail(&tq->tx_ring) == 0) {
> +		tq->stats.tx_ring_full++;
> +		return -ENOSPC;
> +	}
> +
> +	tbi->map_type = VMXNET3_MAP_XDP;
> +	if (dma_map) { /* ndo_xdp_xmit */
> +		tbi->dma_addr = dma_map_single(&adapter->pdev->dev,
> +					       xdpf->data, buf_size,
> +					       DMA_TO_DEVICE);
> +		if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr))
> +			return -EFAULT;
> +		tbi->map_type |= VMXNET3_MAP_SINGLE;
> +	} else { /* XDP buffer from page pool */
> +		page = virt_to_head_page(xdpf->data);

Nit: your page pools are always order-0, thus you could shortcut those
virt_to_head_page() calls throughout the driver to just virt_to_page()
and save at least one unlikely() condition check this way (it will
always be false for the pages coming from your page pools). I remember
we once had a noticeable CPU hotspot on this compound_head() check
inside virt_to_head_page()...
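
I.e. the page-pool branch above would become just (assuming the frames
there always come from your order-0 pools):

	} else { /* XDP buffer from the page pool, always order-0 */
		page = virt_to_page(xdpf->data);
		...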

> +		tbi->dma_addr = page_pool_get_dma_addr(page) +
> +				XDP_PACKET_HEADROOM;
> +		dma_sync_single_for_device(&adapter->pdev->dev,
> +					   tbi->dma_addr, buf_size,
> +					   DMA_BIDIRECTIONAL);
> +	}

[...]

Those are some minor nits/propositions, up to you whether to address
them; the rest was addressed perfectly by you, thanks!

Reviewed-by: Alexander Lobakin <alexandr.lobakin@...el.com>

Olek
