Message-ID: <CAJ+HfNjiU5JXm3cKJYrEF9ySWQ-kOcWXJZXUFp2=N=ujjw+Png@mail.gmail.com>
Date: Thu, 18 Apr 2019 12:10:29 +0200
From: Björn Töpel <bjorn.topel@...il.com>
To: Jonathan Lemon <jonathan.lemon@...il.com>
Cc: Netdev <netdev@...r.kernel.org>,
"Karlsson, Magnus" <magnus.karlsson@...el.com>, kernel-team@...com,
Jesper Dangaard Brouer <brouer@...hat.com>,
maciej.fijalkowski@...el.com
Subject: Re: [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets.
On Wed, 17 Apr 2019 at 21:58, Jonathan Lemon <jonathan.lemon@...il.com> wrote:
>
> When the XDP program attached to a zero-copy AF_XDP socket returns XDP_TX,
> queue the umem frame on the XDP TX ring, and arrange for it to be released
> via the ZCA free routine, which should place it back onto the reuseq.
>
There are a bunch of compiler errors, but I'll leave those out of the comments!
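To recap what this enables: with an XDP program along the lines of the
minimal sketch below attached to the queue that the zero-copy AF_XDP
socket is bound to, every Rx frame the program bounces with XDP_TX
would now take the new i40e_xmit_rcvd_zc() path. (Just an
illustration, not from the patch.)

/* Reflect every packet back out the interface it arrived on. */
#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("xdp")
int xdp_reflect(struct xdp_md *ctx)
{
	return XDP_TX;
}

char _license[] SEC("license") = "GPL";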
> Signed-off-by: Jonathan Lemon <jonathan.lemon@...il.com>
> ---
> drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 +
> drivers/net/ethernet/intel/i40e/i40e_xsk.c | 52 +++++++++++++++++++--
> include/net/xdp.h | 20 ++++++--
> 3 files changed, 65 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 100e92d2982f..3e7954277737 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -274,6 +274,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
> #define I40E_TX_FLAGS_TSYN BIT(8)
> #define I40E_TX_FLAGS_FD_SB BIT(9)
> #define I40E_TX_FLAGS_UDP_TUNNEL BIT(10)
> +#define I40E_TX_FLAGS_ZC_FRAME BIT(11)
> #define I40E_TX_FLAGS_VLAN_MASK 0xffff0000
> #define I40E_TX_FLAGS_VLAN_PRIO_MASK 0xe0000000
> #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT 29
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index d2e212d007c3..16a31c57906a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -188,7 +188,6 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
> static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
> {
> int err, result = I40E_XDP_PASS;
> - struct i40e_ring *xdp_ring;
> struct bpf_prog *xdp_prog;
> u32 act;
>
> @@ -202,9 +201,8 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
> switch (act) {
> case XDP_PASS:
> break;
> - case XDP_TX:
> - xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> - result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
> + case XDP_TX:
> + result = i40e_xmit_rcvd_zc(rx_ring, xdp);
> break;
> case XDP_REDIRECT:
> err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> @@ -623,6 +621,48 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
> return failure ? budget : (int)total_rx_packets;
> }
>
> +static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
> +{
> + struct i40e_ring *xdp_ring;
> + struct i40e_tx_desc *tx_desc;
> + struct i40e_tx_buffer *tx_bi;
> + struct xdp_frame *xdpf;
> + dma_addr_t dma;
> +
> + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> +
> + if (unlikely(!I40E_DESC_UNUSED(xdp_ring))) {
> + xdp_ring->tx_stats.tx_busy++;
> + return I40E_XDP_CONSUMED;
> + }
> + xdpf = convert_to_xdp_frame_keep_zc(xdp);
> + if (unlikely(!xdpf))
> + return I40E_XDP_CONSUMED;
> + xdpf->handle = xdp->handle;
> +
> + dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
> + tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
> + tx_bi->bytecount = xdpf->len;
> + tx_bi->gso_segs = 1;
> + tx_bi->xdpf = xdpf;
> + tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
> +
> + tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
> + tx_desc->buffer_addr = cpu_to_le64(dma);
> + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> + I40E_TX_DESC_CMD_EOP,
> + 0, xdpf->len, 0);
> + smp_wmb();
> +
> + xdp_ring->next_to_use++;
> + if (xdp_ring->next_to_use == xdp_ring->count)
> + xdp_ring->next_to_use = 0;
> +
> + tx_bi->next_to_watch = tx_desc;
> +
> + return I40E_XDP_TX;
> +}
What you're basically doing here is an AF_XDP Tx, but triggered from
the Rx path, and instead of completing (after the packet is sent) to
the completion ring, the buffer is recycled to the Rx HW ring (via
zca_free). I like the idea, but we need more plumbing first. Let me
expand:

Unfortunately, the current recycle mechanism requires that, at the
point of recycling, there is space in the Rx ring. In the XDP_TX case
there is no completion ring, and we cannot guarantee that there is
space in the Rx ring (since Rx and Tx are asynchronous). IOW, Rx
recycling can currently *only* be done in an Rx context.
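To make that concrete, here is a stripped-down sketch of what the
zca_free recycle path does today (written from memory, so treat the
details as approximate rather than authoritative):

/* Approximate sketch of the existing Rx recycle path. The completed
 * buffer is written straight into the Rx ring at next_to_alloc;
 * nothing here checks that the slot is actually free. In the Rx path
 * that is guaranteed, because we only recycle right after consuming
 * a buffer. In a Tx-completion context there is no such guarantee. */
static void i40e_zca_free_sketch(struct zero_copy_allocator *alloc,
				 unsigned long handle)
{
	struct i40e_ring *rx_ring =
		container_of(alloc, struct i40e_ring, zca);
	u16 nta = rx_ring->next_to_alloc;
	struct i40e_rx_buffer *bi = &rx_ring->rx_bi[nta];

	nta++;
	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

	/* Re-point the Rx buffer at the umem chunk behind the handle. */
	bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
	bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
	bi->handle = handle;
}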
What I would like to do is move i40e-xsk over to Jesper's page_pool,
instead of the existing recycle mechanism. Then we could return the
descriptor to the pool if the Rx ring doesn't have space for the
completed/sent buffer.
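Hand-waving over the details (i40e-xsk has no page_pool today, and
both helpers below are hypothetical), the Tx-completion path would
then look something like:

/* Hypothetical Tx-completion recycle after a page_pool conversion:
 * prefer the Rx ring when it has room, otherwise park the page in
 * the pool instead of requiring Rx-ring space at completion time. */
static void i40e_recycle_zc_buffer(struct i40e_ring *rx_ring,
				   struct page *page)
{
	if (i40e_rx_ring_has_space(rx_ring))		/* hypothetical */
		i40e_reuse_rx_page_zc(rx_ring, page);	/* hypothetical */
	else
		page_pool_put_page(rx_ring->page_pool, page, false);
}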
TL;DR version: Passing zc-frames in XDP_TX cannot be done properly
until the Rx recycle mechanism is more robust. :-(
(I think Maciej is looking into using the page_pool on the ice driver.)
But again, I like the idea!
Thanks,
Björn
> +
> /**
> * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
> * @xdp_ring: XDP Tx ring
> @@ -689,6 +729,10 @@ static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
> struct i40e_tx_buffer *tx_bi)
> {
> xdp_return_frame(tx_bi->xdpf);
> + if (tx_bi->tx_flags & I40E_TX_FLAGS_ZC_FRAME) {
> + tx_bi->tx_flags = 0;
> + return;
> + }
> dma_unmap_single(tx_ring->dev,
> dma_unmap_addr(tx_bi, dma),
> dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 0f25b3675c5c..191359c5ebdd 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -82,6 +82,7 @@ struct xdp_frame {
> */
> struct xdp_mem_info mem;
> struct net_device *dev_rx; /* used by cpumap */
> + unsigned long handle;
> };
>
> /* Clear kernel pointers in xdp_frame */
> @@ -95,15 +96,12 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
>
> /* Convert xdp_buff to xdp_frame */
> static inline
> -struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +struct xdp_frame *__convert_to_xdp_frame(struct xdp_buff *xdp)
> {
> struct xdp_frame *xdp_frame;
> int metasize;
> int headroom;
>
> - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> - return xdp_convert_zc_to_xdp_frame(xdp);
> -
> /* Assure headroom is available for storing info */
> headroom = xdp->data - xdp->data_hard_start;
> metasize = xdp->data - xdp->data_meta;
> @@ -125,6 +123,20 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> return xdp_frame;
> }
>
> +static inline
> +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +{
> + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> + return xdp_convert_zc_to_xdp_frame(xdp);
> + return __convert_to_xdp_frame(xdp);
> +}
> +
> +static inline
> +struct xdp_frame *convert_to_xdp_frame_keep_zc(struct xdp_buff *xdp)
> +{
> + return __convert_to_xdp_frame(xdp);
> +}
> +
> void xdp_return_frame(struct xdp_frame *xdpf);
> void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
> void xdp_return_buff(struct xdp_buff *xdp);
> --
> 2.17.1
>