Message-ID: <1e78d2df-2b9e-3e46-017e-5cc42807db03@intel.com>
Date: Mon, 30 Jan 2023 13:26:39 +0100
From: Alexander Lobakin <alexandr.lobakin@...el.com>
To: William Tu <u9012063@...il.com>
CC: <netdev@...r.kernel.org>, <jsankararama@...are.com>,
<gyang@...are.com>, <doshir@...are.com>,
<alexander.duyck@...il.com>, <bang@...are.com>,
"Yifeng Sun" <yifengs@...are.com>,
Alexander Duyck <alexanderduyck@...com>
Subject: Re: [RFC PATCH net-next v15] vmxnet3: Add XDP support.
From: William Tu <u9012063@...il.com>
Date: Fri, 27 Jan 2023 08:30:27 -0800
> The patch adds native-mode XDP support: XDP DROP, PASS, TX, and REDIRECT.
>
> Background:
> The vmxnet3 rx path consists of three rings: ring0 (r0), ring1 (r1), and
> the dataring. Buffers in r0 are allocated using the alloc_skb APIs and
> DMA mapped to the ring's descriptors. If LRO is enabled and the packet
> size is larger than 3K (VMXNET3_MAX_SKB_BUF_SIZE), then r1 is used to map
> the part of the buffer beyond VMXNET3_MAX_SKB_BUF_SIZE; each buffer in r1
> is allocated using alloc_page. So for LRO packets, the payload ends up in
> one buffer from r0 and multiple buffers from r1, while for non-LRO
> packets smaller than 3K only one descriptor in r0 is used.
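(Restating the split in code just to make sure I read it right -- a rough
illustrative sketch, not actual driver code:)

	/* Purely illustrative: number of r1 page buffers an LRO frame of
	 * @pkt_len bytes would spill into, given the 3K limit on r0 buffers.
	 */
	static unsigned int r1_bufs_needed(unsigned int pkt_len)
	{
		if (pkt_len <= VMXNET3_MAX_SKB_BUF_SIZE)
			return 0;	/* a single r0 skb buffer is enough */

		/* the rest goes into order-0 pages mapped via r1 */
		return DIV_ROUND_UP(pkt_len - VMXNET3_MAX_SKB_BUF_SIZE,
				    PAGE_SIZE);
	}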
[...]
> @@ -1404,6 +1496,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
> struct Vmxnet3_RxDesc rxCmdDesc;
> struct Vmxnet3_RxCompDesc rxComp;
> #endif
> + bool need_flush = false;
> +
> vmxnet3_getRxComp(rcd, &rq->comp_ring.base[rq->comp_ring.next2proc].rcd,
> &rxComp);
> while (rcd->gen == rq->comp_ring.gen) {
> @@ -1444,6 +1538,31 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
> goto rcd_done;
> }
>
> + if (rcd->sop && rcd->eop && vmxnet3_xdp_enabled(adapter)) {
Hmm, it's a relatively big block of code for one `if`. I mean, could we
do it like this?

	if (!rcd->sop || !vmxnet3_xdp_enabled(adapter))
		goto skip_xdp;

	if (VMXNET3_RX_DATA_RING(adapter, rcd->rqID)) {
		...

This way you would save one indent level and make the code a little bit
more readable.
But it might be a matter of personal taste :) I just noticed you have
this `skip_xdp` label anyway, so why not reuse it here.
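To be fully explicit, this is roughly what I mean (only a sketch --
keeping the eop check from your condition; skb_xdp_pass and act would
then have to be declared further up):

	if (!rcd->sop || !rcd->eop || !vmxnet3_xdp_enabled(adapter))
		goto skip_xdp;

	if (VMXNET3_RX_DATA_RING(adapter, rcd->rqID)) {
		ctx->skb = NULL;
		goto skip_xdp; /* Handle it later. */
	}

	if (rbi->buf_type != VMXNET3_RX_BUF_XDP)
		goto rcd_done;

	act = vmxnet3_process_xdp(adapter, rq, rcd, rbi, rxd,
				  &skb_xdp_pass);
	if (act == XDP_PASS) {
		ctx->skb = skb_xdp_pass;
		goto sop_done;
	}

	ctx->skb = NULL;
	if (act == XDP_REDIRECT)
		need_flush = true;
	goto rcd_done;

skip_xdp: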
> + struct sk_buff *skb_xdp_pass;
> + int act;
> +
> + if (VMXNET3_RX_DATA_RING(adapter, rcd->rqID)) {
> + ctx->skb = NULL;
> + goto skip_xdp; /* Handle it later. */
> + }
> +
> + if (rbi->buf_type != VMXNET3_RX_BUF_XDP)
> + goto rcd_done;
> +
> + act = vmxnet3_process_xdp(adapter, rq, rcd, rbi, rxd,
> + &skb_xdp_pass);
> + if (act == XDP_PASS) {
> + ctx->skb = skb_xdp_pass;
> + goto sop_done;
> + }
> + ctx->skb = NULL;
> + if (act == XDP_REDIRECT)
> + need_flush = true;
need_flush |= act == XDP_REDIRECT;
But this looks a bit ugly to be honest, I just wrote it to show off a
bit :D Your `if` is perfectly fine, and I'm not even sure the line above
produces more optimized object code.
> + goto rcd_done;
> + }
> +skip_xdp:
> +
> if (rcd->sop) { /* first buf of the pkt */
> bool rxDataRingUsed;
> u16 len;
> @@ -1452,7 +1571,8 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
> (rcd->rqID != rq->qid &&
> rcd->rqID != rq->dataRingQid));
>
> - BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_SKB);
> + BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_SKB &&
> + rbi->buf_type != VMXNET3_RX_BUF_XDP);
> BUG_ON(ctx->skb != NULL || rbi->skb == NULL);
>
> if (unlikely(rcd->len == 0)) {
> @@ -1470,6 +1590,26 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
> rxDataRingUsed =
> VMXNET3_RX_DATA_RING(adapter, rcd->rqID);
> len = rxDataRingUsed ? rcd->len : rbi->len;
> +
> + if (rxDataRingUsed && vmxnet3_xdp_enabled(adapter)) {
Maybe save 1 level here as well:

	if (!rxDataRingUsed || !vmxnet3_xdp_enabled(ad))
		goto alloc_skb;

	sz = rcd->rxdIdx * ...
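...and then the rest of the block would read (again just a sketch, with
sz, act and skb_xdp_pass declared further up):

	sz = rcd->rxdIdx * rq->data_ring.desc_size;
	act = vmxnet3_process_xdp_small(adapter, rq,
					&rq->data_ring.base[sz],
					rcd->len, &skb_xdp_pass);
	if (act == XDP_PASS) {
		ctx->skb = skb_xdp_pass;
		goto sop_done;
	}
	if (act == XDP_REDIRECT)
		need_flush = true;

	goto rcd_done;

alloc_skb: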
> + struct sk_buff *skb_xdp_pass;
> + size_t sz;
> + int act;
> +
> + sz = rcd->rxdIdx * rq->data_ring.desc_size;
> + act = vmxnet3_process_xdp_small(adapter, rq,
> + &rq->data_ring.base[sz],
> + rcd->len,
> + &skb_xdp_pass);
> + if (act == XDP_PASS) {
> + ctx->skb = skb_xdp_pass;
> + goto sop_done;
> + }
> + if (act == XDP_REDIRECT)
> + need_flush = true;
> +
> + goto rcd_done;
> + }
alloc_skb:
> new_skb = netdev_alloc_skb_ip_align(adapter->netdev,
> len);
> if (new_skb == NULL) {
[...]
> @@ -217,6 +221,9 @@ struct vmxnet3_tq_driver_stats {
> u64 linearized; /* # of pkts linearized */
> u64 copy_skb_header; /* # of times we have to copy skb header */
> u64 oversized_hdr;
> +
> + u64 xdp_xmit;
> + u64 xdp_xmit_err;
Sorry for missing this earlier... You use u64 here for the stats, and
that is how it used to be done, but u64_stats_t was introduced since
then to exclude partial updates and tearing. I know the existing stats
above are still plain u64, but maybe right now is the best time to use
u64_stats_t for the new fields? :)
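For the new counters that would be roughly (only a sketch -- field names
copied from your patch, and the stats struct would also need a struct
u64_stats_sync member):

	#include <linux/u64_stats_sync.h>

	/* in the per-queue stats struct */
	struct u64_stats_sync syncp;
	u64_stats_t xdp_xmit;
	u64_stats_t xdp_xmit_err;

	/* writer (Tx path) */
	u64_stats_update_begin(&tq->stats.syncp);
	u64_stats_inc(&tq->stats.xdp_xmit);
	u64_stats_update_end(&tq->stats.syncp);

	/* reader (ethtool / ndo_get_stats64) */
	unsigned int start;
	u64 xmit;

	do {
		start = u64_stats_fetch_begin(&tq->stats.syncp);
		xmit = u64_stats_read(&tq->stats.xdp_xmit);
	} while (u64_stats_fetch_retry(&tq->stats.syncp, start));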
> };
>
> struct vmxnet3_tx_ctx {
> @@ -253,12 +260,13 @@ struct vmxnet3_tx_queue {
> * stopped */
> int qid;
> u16 txdata_desc_size;
> -} __attribute__((__aligned__(SMP_CACHE_BYTES)));
> +} ____cacheline_aligned;
>
> enum vmxnet3_rx_buf_type {
> VMXNET3_RX_BUF_NONE = 0,
> VMXNET3_RX_BUF_SKB = 1,
> - VMXNET3_RX_BUF_PAGE = 2
> + VMXNET3_RX_BUF_PAGE = 2,
> + VMXNET3_RX_BUF_XDP = 3,
> };
>
> #define VMXNET3_RXD_COMP_PENDING 0
> @@ -285,6 +293,12 @@ struct vmxnet3_rq_driver_stats {
> u64 drop_err;
> u64 drop_fcs;
> u64 rx_buf_alloc_failure;
> +
> + u64 xdp_packets; /* Total packets processed by XDP. */
> + u64 xdp_tx;
> + u64 xdp_redirects;
> + u64 xdp_drops;
> + u64 xdp_aborted;
(same)
> };
>
> struct vmxnet3_rx_data_ring {
[...]
> +static int
> +vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter,
> + struct xdp_frame *xdpf,
> + struct vmxnet3_tx_queue *tq, bool dma_map)
> +{
> + struct vmxnet3_tx_buf_info *tbi = NULL;
> + union Vmxnet3_GenericDesc *gdesc;
> + struct vmxnet3_tx_ctx ctx;
> + int tx_num_deferred;
> + struct page *page;
> + u32 buf_size;
> + int ret = 0;
> + u32 dw2;
> +
> + dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
> + dw2 |= xdpf->len;
> + ctx.sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill;
> + gdesc = ctx.sop_txd;
> +
> + buf_size = xdpf->len;
> + tbi = tq->buf_info + tq->tx_ring.next2fill;
> +
> + if (vmxnet3_cmd_ring_desc_avail(&tq->tx_ring) == 0) {
> + tq->stats.tx_ring_full++;
> + return -ENOSPC;
> + }
> +
> + tbi->map_type = VMXNET3_MAP_XDP;
> + if (dma_map) { /* ndo_xdp_xmit */
> + tbi->dma_addr = dma_map_single(&adapter->pdev->dev,
> + xdpf->data, buf_size,
> + DMA_TO_DEVICE);
> + if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr))
> + return -EFAULT;
> + tbi->map_type |= VMXNET3_MAP_SINGLE;
> + } else { /* XDP buffer from page pool */
> + page = virt_to_head_page(xdpf->data);
Nit: your page pools are always order-0, thus you could shortcut those
virt_to_head_page() calls throughout the driver to just virt_to_page()
and save at least one unlikely() condition check this way (it will
always be false for pages coming from your page pools). I remember the
compound_head() check inside virt_to_head_page() showing up as a real
CPU hotspot at some point...
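I.e. simply:

		/* page pool buffers here are always order-0, so the
		 * compound_head() handling in virt_to_head_page() is
		 * never needed
		 */
		page = virt_to_page(xdpf->data);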
> + tbi->dma_addr = page_pool_get_dma_addr(page) +
> + XDP_PACKET_HEADROOM;
> + dma_sync_single_for_device(&adapter->pdev->dev,
> + tbi->dma_addr, buf_size,
> + DMA_BIDIRECTIONAL);
> + }
[...]
Those are only minor nits/suggestions, up to you whether to address
them; the rest was addressed perfectly, thanks!
Reviewed-by: Alexander Lobakin <alexandr.lobakin@...el.com>
Olek