[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJ+HfNjiU5JXm3cKJYrEF9ySWQ-kOcWXJZXUFp2=N=ujjw+Png@mail.gmail.com>
Date:   Thu, 18 Apr 2019 12:10:29 +0200
From:   Björn Töpel <bjorn.topel@...il.com>
To:     Jonathan Lemon <jonathan.lemon@...il.com>
Cc:     Netdev <netdev@...r.kernel.org>,
        "Karlsson, Magnus" <magnus.karlsson@...el.com>, kernel-team@...com,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        maciej.fijalkowski@...el.com
Subject: Re: [PATCH RFC] xdp: Support zero-copy XDP_TX from AF_XDP sockets.
On Wed, 17 Apr 2019 at 21:58, Jonathan Lemon <jonathan.lemon@...il.com> wrote:
>
> When the XDP program attached to a zero-copy AF_XDP socket returns XDP_TX,
> queue the umem frame on the XDP TX ring, and arrange for it to be released
> via the ZCA free routine, which should place it back onto the reuseq.
>
There are a bunch of compiler errors, but I'll leave them out from the comments!
> Signed-off-by: Jonathan Lemon <jonathan.lemon@...il.com>
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h |  1 +
>  drivers/net/ethernet/intel/i40e/i40e_xsk.c  | 52 +++++++++++++++++++--
>  include/net/xdp.h                           | 20 ++++++--
>  3 files changed, 65 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index 100e92d2982f..3e7954277737 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -274,6 +274,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
>  #define I40E_TX_FLAGS_TSYN             BIT(8)
>  #define I40E_TX_FLAGS_FD_SB            BIT(9)
>  #define I40E_TX_FLAGS_UDP_TUNNEL       BIT(10)
> +#define I40E_TX_FLAGS_ZC_FRAME         BIT(11)
>  #define I40E_TX_FLAGS_VLAN_MASK                0xffff0000
>  #define I40E_TX_FLAGS_VLAN_PRIO_MASK   0xe0000000
>  #define I40E_TX_FLAGS_VLAN_PRIO_SHIFT  29
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> index d2e212d007c3..16a31c57906a 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
> @@ -188,7 +188,6 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
>  static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
>  {
>         int err, result = I40E_XDP_PASS;
> -       struct i40e_ring *xdp_ring;
>         struct bpf_prog *xdp_prog;
>         u32 act;
>
> @@ -202,9 +201,8 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
>         switch (act) {
>         case XDP_PASS:
>                 break;
> -       case XDP_TX:
> -               xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> -               result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
> +       case XDP_TX
> +               result = i40e_xmit_rcvd_zc(rx_ring, xdp);
>                 break;
>         case XDP_REDIRECT:
>                 err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> @@ -623,6 +621,48 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
>         return failure ? budget : (int)total_rx_packets;
>  }
>
> +static int i40e_xmit_rcvd_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
> +{
> +       struct i40e_ring *xdp_ring;
> +       struct i40e_tx_desc *tx_desc;
> +       struct i40e_tx_buffer *tx_bi;
> +       struct xdp_frame *xdpf;
> +       dma_addr_t dma;
> +
> +       xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> +
> +       if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
> +               xdp_ring->tx_stats.tx_busy++;
> +               return I40E_XDP_CONSUMED;
> +       }
> +       xpdf = convert_to_xdp_frame_keep_zc(xdp);
> +       if (unlikely(!xpdf)
> +               return I40E_XDP_CONSUMED;
> +       xpdf->handle = xdp->handle;
> +
> +       dma = xdp_umem_get_dma(rx_ring->xsk_umem, xdp->handle);
> +       tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
> +       tx_bi->bytecount = xpdf->len;
> +       tx_bi->gso_segs = 1;
> +       tx_bi->xdpf = xdpf;
> +       tx_bi->tx_flags = I40E_TX_FLAGS_ZC_FRAME;
> +
> +       tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
> +       tx_desc->buffer_addr = cpu_to_le64(dma);
> +       tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
> +                                                 I40E_TX_DESC_CMD_EOP,
> +                                                 0, xpdf->len, 0);
> +       smp_wmb();
> +
> +       xdp_ring->next_to_use++;
> +       if (xdp_ring->next_to_use == xdp_ring->count)
> +               xdp_ring->next_to_use = 0;
> +
> +       tx_bi->next_to_watch = tx_desc;
> +
> +       return I40E_XDP_TX;
> +}
What you're basically doing here is a AF_XDP Tx, but triggered from
the Rx path, and instead of completion (after the packet is sent) to
the completion ring, it's recycled to the Rx HW ring (via zca_free). I
like the idea but we need more plumbing first. Let me expand;
Unfortunately, the current recycle mechanism requires that at the
point of recycling, there has to be space in Rx ring. In the XDP_TX
case, there's no completion ring, and we cannot guarantee that there's
space on Rx ring (since Rx and Tx are asynchronous). IOW, Rx recycling
can currently *only* be done in an Rx context.
What I would like to do, is moving i40e-xsk to Jesper's page-pool,
instead of the existing recycle mechanism. Then we could return the
descriptor to the pool, if the Rx ring doesn't have space for the
completed/sent buffer.
TL;DR version: Passing zc-frames in XDP_TX cannot be done properly
until the Rx recycle mechanism is more robust. :-(
(I think Maciej is looking into using the page_pool on the ice driver.)
But again, I like the idea!
Thanks,
Björn
> +
>  /**
>   * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
>   * @xdp_ring: XDP Tx ring
> @@ -689,6 +729,10 @@ static void i40e_clean_xdp_tx_buffer(struct i40e_ring *tx_ring,
>                                      struct i40e_tx_buffer *tx_bi)
>  {
>         xdp_return_frame(tx_bi->xdpf);
> +       if (tx_bi->tx_flags & I40E_TX_FLAGS_ZC_FRAME) {
> +               tx_bi->tx_flags = 0;
> +               return;
> +       }
>         dma_unmap_single(tx_ring->dev,
>                          dma_unmap_addr(tx_bi, dma),
>                          dma_unmap_len(tx_bi, len), DMA_TO_DEVICE);
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 0f25b3675c5c..191359c5ebdd 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -82,6 +82,7 @@ struct xdp_frame {
>          */
>         struct xdp_mem_info mem;
>         struct net_device *dev_rx; /* used by cpumap */
> +       unsigned long handle;
>  };
>
>  /* Clear kernel pointers in xdp_frame */
> @@ -95,15 +96,12 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
>
>  /* Convert xdp_buff to xdp_frame */
>  static inline
> -struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +struct xdp_frame *__convert_to_xdp_frame(struct xdp_buff *xdp)
>  {
>         struct xdp_frame *xdp_frame;
>         int metasize;
>         int headroom;
>
> -       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> -               return xdp_convert_zc_to_xdp_frame(xdp);
> -
>         /* Assure headroom is available for storing info */
>         headroom = xdp->data - xdp->data_hard_start;
>         metasize = xdp->data - xdp->data_meta;
> @@ -125,6 +123,20 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
>         return xdp_frame;
>  }
>
> +static inline
> +struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
> +{
> +       if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
> +               return xdp_convert_zc_to_xdp_frame(xdp);
> +       return __convert_to_xdp_frame(xdp);
> +}
> +
> +static inline
> +struct xdp_frame *convert_to_xdp_frame_keep_zc(struct xdp_buff *xdp)
> +{
> +       return __convert_to_xdp_frame(xdp);
> +}
> +
>  void xdp_return_frame(struct xdp_frame *xdpf);
>  void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
>  void xdp_return_buff(struct xdp_buff *xdp);
> --
> 2.17.1
>
Powered by blists - more mailing lists
 
