Message-ID: <CAKgT0UdBMFtmMSXGChj_Hq0UOs1OLMEDxtQ2Lg2c-bY=oObMeg@mail.gmail.com>
Date: Tue, 15 May 2018 13:25:54 -0700
From: Alexander Duyck <alexander.duyck@...il.com>
To: Björn Töpel <bjorn.topel@...il.com>
Cc: magnus.karlsson@...il.com,
"Karlsson, Magnus" <magnus.karlsson@...el.com>,
"Duyck, Alexander H" <alexander.h.duyck@...el.com>,
John Fastabend <john.fastabend@...il.com>,
Alexei Starovoitov <ast@...com>,
Jesper Dangaard Brouer <brouer@...hat.com>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Daniel Borkmann <daniel@...earbox.net>,
"Michael S. Tsirkin" <mst@...hat.com>,
Netdev <netdev@...r.kernel.org>,
Björn Töpel <bjorn.topel@...el.com>,
michael.lundkvist@...csson.com,
"Brandeburg, Jesse" <jesse.brandeburg@...el.com>,
Anjali Singhai Jain <anjali.singhai@...el.com>,
qi.z.zhang@...el.com,
intel-wired-lan <intel-wired-lan@...ts.osuosl.org>
Subject: Re: [RFC PATCH bpf-next 11/12] i40e: implement AF_XDP zero-copy
support for Rx
On Tue, May 15, 2018 at 12:06 PM, Björn Töpel <bjorn.topel@...il.com> wrote:
> From: Björn Töpel <bjorn.topel@...el.com>
>
> A lot of things here. First we add support for the new
> XDP_SETUP_XSK_UMEM command in ndo_bpf. This allows the AF_XDP socket
> to pass a UMEM to the driver. The driver will then DMA map all the
> frames in the UMEM. Next, the Rx code will allocate frames from the
> UMEM fill queue, instead of the regular page allocator.
>
> Externally, for the rest of the XDP code, the driver-internal UMEM
> allocator will appear as MEM_TYPE_ZERO_COPY.
>
> Keep in mind that frames coming from userland require some extra care
> when they are passed to the regular kernel stack. In these cases the
> ZC frame must be copied.
>
> The commit also introduces completely new clean_rx_irq/allocator
> functions for zero-copy, and a means (function pointers) to set the
> allocator and clean_rx functions per ring.
>
> Finally, a lot of things are *not* implemented here. To mention some:
>
> * No passing to the stack via XDP_PASS (clone/copy to skb).
> * No XDP redirect to targets other than sockets (convert_to_xdp_frame
>   does not clone the frame yet).
>
> And yes, too much C&P and too big commit. :-)
>
> Signed-off-by: Björn Töpel <bjorn.topel@...el.com>
A few minor comments below.
> ---
> drivers/net/ethernet/intel/i40e/i40e.h | 20 ++
> drivers/net/ethernet/intel/i40e/i40e_main.c | 202 +++++++++++++-
> drivers/net/ethernet/intel/i40e/i40e_txrx.c | 400 ++++++++++++++++++++++++++--
> drivers/net/ethernet/intel/i40e/i40e_txrx.h | 30 ++-
> 4 files changed, 619 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
> index 7a80652e2500..e6ee6c9bf094 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e.h
> @@ -786,6 +786,12 @@ struct i40e_vsi {
>
> /* VSI specific handlers */
> irqreturn_t (*irq_handler)(int irq, void *data);
> +
> + /* AF_XDP zero-copy */
> + struct xdp_umem **xsk_umems;
> + u16 num_xsk_umems_used;
> + u16 num_xsk_umems;
> +
> } ____cacheline_internodealigned_in_smp;
>
> struct i40e_netdev_priv {
> @@ -1090,6 +1096,20 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi)
> return !!vsi->xdp_prog;
> }
>
> +static inline struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring)
> +{
> + bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi);
> + int qid = ring->queue_index;
> +
> + if (ring_is_xdp(ring))
> + qid -= ring->vsi->alloc_queue_pairs;
> +
> + if (!ring->vsi->xsk_umems || !ring->vsi->xsk_umems[qid] || !xdp_on)
> + return NULL;
> +
> + return ring->vsi->xsk_umems[qid];
> +}
> +
> int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch);
> int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate);
> int i40e_add_del_cloud_filter(struct i40e_vsi *vsi,
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index b4c23cf3979c..dc3d668a741e 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -5,6 +5,7 @@
> #include <linux/of_net.h>
> #include <linux/pci.h>
> #include <linux/bpf.h>
> +#include <net/xdp_sock.h>
>
> /* Local includes */
> #include "i40e.h"
> @@ -3054,6 +3055,9 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
> i40e_status err = 0;
> u32 qtx_ctl = 0;
>
> + if (ring_is_xdp(ring))
> + ring->xsk_umem = i40e_xsk_umem(ring);
> +
> /* some ATR related tx ring init */
> if (vsi->back->flags & I40E_FLAG_FD_ATR_ENABLED) {
> ring->atr_sample_rate = vsi->back->atr_sample_rate;
> @@ -3163,13 +3167,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
> struct i40e_hw *hw = &vsi->back->hw;
> struct i40e_hmc_obj_rxq rx_ctx;
> i40e_status err = 0;
> + int ret;
>
> bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
>
> /* clear the context structure first */
> memset(&rx_ctx, 0, sizeof(rx_ctx));
>
> - ring->rx_buf_len = vsi->rx_buf_len;
> + ring->xsk_umem = i40e_xsk_umem(ring);
> + if (ring->xsk_umem) {
> + ring->clean_rx_irq = i40e_clean_rx_irq_zc;
> + ring->alloc_rx_buffers = i40e_alloc_rx_buffers_zc;
> + ring->rx_buf_len = ring->xsk_umem->props.frame_size -
> + ring->xsk_umem->frame_headroom -
> + XDP_PACKET_HEADROOM;
> + ring->zca.free = i40e_zca_free;
> + ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
> + MEM_TYPE_ZERO_COPY,
> + &ring->zca);
> + if (ret)
> + return ret;
> + } else {
> + ring->clean_rx_irq = i40e_clean_rx_irq;
> + ring->alloc_rx_buffers = i40e_alloc_rx_buffers;
> + ring->rx_buf_len = vsi->rx_buf_len;
> + }
>
> rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
> BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT));
> @@ -3225,7 +3247,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
> ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
> writel(0, ring->tail);
>
> - i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
> + ring->alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
>
> return 0;
> }
> @@ -12050,6 +12072,179 @@ static int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair)
> return err;
> }
>
> +static int i40e_alloc_xsk_umems(struct i40e_vsi *vsi)
> +{
> + if (vsi->xsk_umems)
> + return 0;
> +
> + vsi->num_xsk_umems_used = 0;
> + vsi->num_xsk_umems = vsi->alloc_queue_pairs;
> + vsi->xsk_umems = kcalloc(vsi->num_xsk_umems, sizeof(*vsi->xsk_umems),
> + GFP_KERNEL);
> + if (!vsi->xsk_umems) {
> + vsi->num_xsk_umems = 0;
> + return -ENOMEM;
> + }
> +
> + return 0;
> +}
> +
> +static int i40e_add_xsk_umem(struct i40e_vsi *vsi, struct xdp_umem *umem,
> + u16 qid)
> +{
> + int err;
> +
> + err = i40e_alloc_xsk_umems(vsi);
> + if (err)
> + return err;
> +
> + vsi->xsk_umems[qid] = umem;
> + vsi->num_xsk_umems_used++;
> +
> + return 0;
> +}
> +
> +static void i40e_remove_xsk_umem(struct i40e_vsi *vsi, u16 qid)
> +{
> + vsi->xsk_umems[qid] = NULL;
> + vsi->num_xsk_umems_used--;
> +
> + if (vsi->num_xsk_umems_used == 0) {
> + kfree(vsi->xsk_umems);
> + vsi->xsk_umems = NULL;
> + vsi->num_xsk_umems = 0;
> + }
> +}
> +
> +static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
> +{
> + struct i40e_pf *pf = vsi->back;
> + struct device *dev;
> + unsigned int i, j;
> + dma_addr_t dma;
> +
> + dev = &pf->pdev->dev;
> +
> + for (i = 0; i < umem->props.nframes; i++) {
> + dma = dma_map_single_attrs(dev, umem->frames[i].addr,
> + umem->props.frame_size,
> + DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
> + if (dma_mapping_error(dev, dma))
> + goto out_unmap;
> +
> + umem->frames[i].dma = dma;
> + }
> +
> + return 0;
> +
> +out_unmap:
> + for (j = 0; j < i; j++) {
> + dma_unmap_single_attrs(dev, umem->frames[j].dma,
> + umem->props.frame_size,
> + DMA_BIDIRECTIONAL,
> + I40E_RX_DMA_ATTR);
> + umem->frames[j].dma = 0;
> + }
> +
> + return -1;
> +}
> +
> +static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
> +{
> + struct i40e_pf *pf = vsi->back;
> + struct device *dev;
> + unsigned int i;
> +
> + dev = &pf->pdev->dev;
> +
> + for (i = 0; i < umem->props.nframes; i++) {
> + dma_unmap_single_attrs(dev, umem->frames[i].dma,
> + umem->props.frame_size,
> + DMA_BIDIRECTIONAL,
> + I40E_RX_DMA_ATTR);
> +
> + umem->frames[i].dma = 0;
> + }
> +}
> +
> +static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
> + u16 qid)
> +{
> + bool if_running;
> + int err;
> +
> + if (vsi->type != I40E_VSI_MAIN)
> + return -EINVAL;
> +
> + if (qid >= vsi->num_queue_pairs)
> + return -EINVAL;
> +
> + if (vsi->xsk_umems && vsi->xsk_umems[qid])
> + return -EBUSY;
> +
> + err = i40e_xsk_umem_dma_map(vsi, umem);
> + if (err)
> + return err;
> +
> + if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
> +
> + if (if_running) {
> + err = i40e_queue_pair_disable(vsi, qid);
> + if (err)
> + return err;
> + }
> +
> + err = i40e_add_xsk_umem(vsi, umem, qid);
> + if (err)
> + return err;
> +
> + if (if_running) {
> + err = i40e_queue_pair_enable(vsi, qid);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> +static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid)
> +{
> + bool if_running;
> + int err;
> +
> + if (!vsi->xsk_umems || qid >= vsi->num_xsk_umems ||
> + !vsi->xsk_umems[qid])
> + return -EINVAL;
> +
> + if_running = netif_running(vsi->netdev) && i40e_enabled_xdp_vsi(vsi);
> +
> + if (if_running) {
> + err = i40e_queue_pair_disable(vsi, qid);
> + if (err)
> + return err;
> + }
> +
> + i40e_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]);
> + i40e_remove_xsk_umem(vsi, qid);
> +
> + if (if_running) {
> + err = i40e_queue_pair_enable(vsi, qid);
> + if (err)
> + return err;
> + }
> +
> + return 0;
> +}
> +
> +static int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
> + u16 qid)
> +{
> + if (umem)
> + return i40e_xsk_umem_enable(vsi, umem, qid);
> +
> + return i40e_xsk_umem_disable(vsi, qid);
> +}
> +
> /**
> * i40e_xdp - implements ndo_bpf for i40e
> * @dev: netdevice
> @@ -12071,6 +12266,9 @@ static int i40e_xdp(struct net_device *dev,
> xdp->prog_attached = i40e_enabled_xdp_vsi(vsi);
> xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0;
> return 0;
> + case XDP_SETUP_XSK_UMEM:
> + return i40e_xsk_umem_setup(vsi, xdp->xsk.umem,
> + xdp->xsk.queue_id);
> default:
> return -EINVAL;
> }
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 5efa68de935b..f89ac524652c 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -5,6 +5,7 @@
> #include <net/busy_poll.h>
> #include <linux/bpf_trace.h>
> #include <net/xdp.h>
> +#include <net/xdp_sock.h>
> #include "i40e.h"
> #include "i40e_trace.h"
> #include "i40e_prototype.h"
> @@ -1373,31 +1374,35 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
> }
>
> /* Free all the Rx ring sk_buffs */
> - for (i = 0; i < rx_ring->count; i++) {
> - struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
> + if (!rx_ring->xsk_umem) {
> + for (i = 0; i < rx_ring->count; i++) {
I'm not a fan of all this extra indenting. This could be much more
easily handled with just a goto and a label.
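Something like this (completely untested, just to sketch the shape I
have in mind; the body of the loop stays at its current indentation):

	/* Buffers in a UMEM are owned by user space/the AF_XDP socket,
	 * so there is nothing for the driver to free in that case.
	 */
	if (rx_ring->xsk_umem)
		goto skip_free;

	/* Free all the Rx ring sk_buffs */
	for (i = 0; i < rx_ring->count; i++) {
		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];

		/* ... existing sync/unmap/free logic, unchanged ... */
	}

skip_free:
	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
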
> + struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
>
> - if (!rx_bi->page)
> - continue;
> -
> - /* Invalidate cache lines that may have been written to by
> - * device so that we avoid corrupting memory.
> - */
> - dma_sync_single_range_for_cpu(rx_ring->dev,
> - rx_bi->dma,
> - rx_bi->page_offset,
> - rx_ring->rx_buf_len,
> - DMA_FROM_DEVICE);
> -
> - /* free resources associated with mapping */
> - dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
> - i40e_rx_pg_size(rx_ring),
> - DMA_FROM_DEVICE,
> - I40E_RX_DMA_ATTR);
> -
> - __page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias);
> + if (!rx_bi->page)
> + continue;
>
> - rx_bi->page = NULL;
> - rx_bi->page_offset = 0;
> + /* Invalidate cache lines that may have been
> + * written to by device so that we avoid
> + * corrupting memory.
> + */
> + dma_sync_single_range_for_cpu(rx_ring->dev,
> + rx_bi->dma,
> + rx_bi->page_offset,
> + rx_ring->rx_buf_len,
> + DMA_FROM_DEVICE);
> +
> + /* free resources associated with mapping */
> + dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
> + i40e_rx_pg_size(rx_ring),
> + DMA_FROM_DEVICE,
> + I40E_RX_DMA_ATTR);
> +
> + __page_frag_cache_drain(rx_bi->page,
> + rx_bi->pagecnt_bias);
> +
> + rx_bi->page = NULL;
> + rx_bi->page_offset = 0;
> + }
> }
>
> bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
> @@ -2214,8 +2219,6 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
> if (!xdp_prog)
> goto xdp_out;
>
> - prefetchw(xdp->data_hard_start); /* xdp_frame write */
> -
> act = bpf_prog_run_xdp(xdp_prog, xdp);
> switch (act) {
> case XDP_PASS:
> @@ -2284,7 +2287,7 @@ static inline void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring)
> *
> * Returns amount of work completed
> **/
> -static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
> +int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
> {
> unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> struct sk_buff *skb = rx_ring->skb;
> @@ -2426,6 +2429,349 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
> return failure ? budget : (int)total_rx_packets;
> }
>
How much of the code below is actually reused anywhere else? I would
almost be inclined to say that maybe the zero-copy path should be
moved to a new file since so much of this is being duplicated from the
original tx/rx code path. I can easily see this becoming confusing as
to which is which when a bug gets found and needs to be fixed.
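As a strawman (file name and exact split entirely up to you), a
hypothetical i40e_xsk.h could carry the zero-copy entry points, with
the implementations living in a matching i40e_xsk.c, so i40e_txrx.c
keeps only the existing path:

/* i40e_xsk.h -- hypothetical home for the AF_XDP zero-copy path */
#ifndef _I40E_XSK_H_
#define _I40E_XSK_H_

#include <linux/types.h>

struct i40e_ring;
struct zero_copy_allocator;

bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);

#endif /* _I40E_XSK_H_ */
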
> +static struct sk_buff *i40e_run_xdp_zc(struct i40e_ring *rx_ring,
> + struct xdp_buff *xdp)
> +{
> + int err, result = I40E_XDP_PASS;
> + struct i40e_ring *xdp_ring;
> + struct bpf_prog *xdp_prog;
> + u32 act;
> +
> + rcu_read_lock();
> + xdp_prog = READ_ONCE(rx_ring->xdp_prog);
> +
> + act = bpf_prog_run_xdp(xdp_prog, xdp);
> + switch (act) {
> + case XDP_PASS:
> + break;
> + case XDP_TX:
> + xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> + result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
> + break;
> + case XDP_REDIRECT:
> + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
> + result = !err ? I40E_XDP_TX : I40E_XDP_CONSUMED;
> + break;
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + case XDP_ABORTED:
> + trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
> + /* fallthrough -- handle aborts by dropping packet */
> + case XDP_DROP:
> + result = I40E_XDP_CONSUMED;
> + break;
> + }
> +
> + rcu_read_unlock();
> + return ERR_PTR(-result);
> +}
> +
> +static bool i40e_alloc_frame_zc(struct i40e_ring *rx_ring,
> + struct i40e_rx_buffer *bi)
> +{
> + struct xdp_umem *umem = rx_ring->xsk_umem;
> + void *addr = bi->addr;
> + u32 *id;
> +
> + if (addr) {
> + rx_ring->rx_stats.page_reuse_count++;
> + return true;
> + }
> +
> + id = xsk_umem_peek_id(umem);
> + if (unlikely(!id)) {
> + rx_ring->rx_stats.alloc_page_failed++;
> + return false;
> + }
> +
> + bi->dma = umem->frames[*id].dma + umem->frame_headroom +
> + XDP_PACKET_HEADROOM;
> + bi->addr = umem->frames[*id].addr + umem->frame_headroom +
> + XDP_PACKET_HEADROOM;
> + bi->id = *id;
> +
> + xsk_umem_discard_id(umem);
> + return true;
> +}
> +
> +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count)
> +{
> + u16 ntu = rx_ring->next_to_use;
> + union i40e_rx_desc *rx_desc;
> + struct i40e_rx_buffer *bi;
> +
> + rx_desc = I40E_RX_DESC(rx_ring, ntu);
> + bi = &rx_ring->rx_bi[ntu];
> +
> + do {
> + if (!i40e_alloc_frame_zc(rx_ring, bi))
> + goto no_buffers;
> +
> + /* sync the buffer for use by the device */
> + dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0,
> + rx_ring->rx_buf_len,
> + DMA_BIDIRECTIONAL);
> +
> + /* Refresh the desc even if buffer_addrs didn't change
> + * because each write-back erases this info.
> + */
> + rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
> +
> + rx_desc++;
> + bi++;
> + ntu++;
> + if (unlikely(ntu == rx_ring->count)) {
> + rx_desc = I40E_RX_DESC(rx_ring, 0);
> + bi = rx_ring->rx_bi;
> + ntu = 0;
> + }
> +
> + /* clear the status bits for the next_to_use descriptor */
> + rx_desc->wb.qword1.status_error_len = 0;
> +
> + cleaned_count--;
> + } while (cleaned_count);
> +
> + if (rx_ring->next_to_use != ntu)
> + i40e_release_rx_desc(rx_ring, ntu);
> +
> + return false;
> +
> +no_buffers:
> + if (rx_ring->next_to_use != ntu)
> + i40e_release_rx_desc(rx_ring, ntu);
> +
> + /* make sure to come back via polling to try again after
> + * allocation failure
> + */
> + return true;
> +}
> +
> +static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring,
> + const unsigned int size)
> +{
> + struct i40e_rx_buffer *rx_buffer;
> +
> + rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
> +
> + /* we are reusing so sync this buffer for CPU use */
> + dma_sync_single_range_for_cpu(rx_ring->dev,
> + rx_buffer->dma, 0,
> + size,
> + DMA_BIDIRECTIONAL);
> +
> + return rx_buffer;
> +}
> +
> +static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
> + struct i40e_rx_buffer *old_buff)
> +{
> + struct i40e_rx_buffer *new_buff;
> + u16 nta = rx_ring->next_to_alloc;
> +
> + new_buff = &rx_ring->rx_bi[nta];
> +
> + /* update, and store next to alloc */
> + nta++;
> + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
> +
> + /* transfer page from old buffer to new buffer */
> + new_buff->dma = old_buff->dma;
> + new_buff->addr = old_buff->addr;
> + new_buff->id = old_buff->id;
> +}
> +
> +/* Called from the XDP return API in NAPI context. */
> +void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
> +{
> + struct i40e_rx_buffer *new_buff;
> + struct i40e_ring *rx_ring;
> + u16 nta;
> +
> + rx_ring = container_of(alloc, struct i40e_ring, zca);
> + nta = rx_ring->next_to_alloc;
> +
> + new_buff = &rx_ring->rx_bi[nta];
> +
> + /* update, and store next to alloc */
> + nta++;
> + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
> +
> + new_buff->dma = rx_ring->xsk_umem->frames[handle].dma;
> + new_buff->addr = rx_ring->xsk_umem->frames[handle].addr;
> + new_buff->id = (u32)handle;
> +}
> +
> +static struct sk_buff *i40e_zc_frame_to_skb(struct i40e_ring *rx_ring,
> + struct i40e_rx_buffer *rx_buffer,
> + struct xdp_buff *xdp)
> +{
> + // XXX implement alloc skb and copy
> + i40e_reuse_rx_buffer_zc(rx_ring, rx_buffer);
> + return NULL;
> +}
> +
> +static void i40e_clean_programming_status_zc(struct i40e_ring *rx_ring,
> + union i40e_rx_desc *rx_desc,
> + u64 qw)
> +{
> + struct i40e_rx_buffer *rx_buffer;
> + u32 ntc = rx_ring->next_to_clean;
> + u8 id;
> +
> + /* fetch, update, and store next to clean */
> + rx_buffer = &rx_ring->rx_bi[ntc++];
> + ntc = (ntc < rx_ring->count) ? ntc : 0;
> + rx_ring->next_to_clean = ntc;
> +
> + prefetch(I40E_RX_DESC(rx_ring, ntc));
> +
> + /* place unused page back on the ring */
> + i40e_reuse_rx_buffer_zc(rx_ring, rx_buffer);
> + rx_ring->rx_stats.page_reuse_count++;
> +
> + /* clear contents of buffer_info */
> + rx_buffer->addr = NULL;
> +
> + id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
> + I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
> +
> + if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
> + i40e_fd_handle_status(rx_ring, rx_desc, id);
> +}
> +
> +int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
> +{
> + unsigned int total_rx_bytes = 0, total_rx_packets = 0;
> + u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
> + bool failure = false, xdp_xmit = false;
> + struct sk_buff *skb;
> + struct xdp_buff xdp;
> +
> + xdp.rxq = &rx_ring->xdp_rxq;
> +
> + while (likely(total_rx_packets < (unsigned int)budget)) {
> + struct i40e_rx_buffer *rx_buffer;
> + union i40e_rx_desc *rx_desc;
> + unsigned int size;
> + u16 vlan_tag;
> + u8 rx_ptype;
> + u64 qword;
> + u32 ntc;
> +
> + /* return some buffers to hardware, one at a time is too slow */
> + if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
> + failure = failure ||
> + i40e_alloc_rx_buffers_zc(rx_ring,
> + cleaned_count);
> + cleaned_count = 0;
> + }
> +
> + rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
> +
> + /* status_error_len will always be zero for unused descriptors
> + * because it's cleared in cleanup, and overlaps with hdr_addr
> + * which is always zero because packet split isn't used, if the
> + * hardware wrote DD then the length will be non-zero
> + */
> + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> +
> + /* This memory barrier is needed to keep us from reading
> + * any other fields out of the rx_desc until we have
> + * verified the descriptor has been written back.
> + */
> + dma_rmb();
> +
> + if (unlikely(i40e_rx_is_programming_status(qword))) {
> + i40e_clean_programming_status_zc(rx_ring, rx_desc,
> + qword);
> + cleaned_count++;
> + continue;
> + }
> + size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
> + I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
> + if (!size)
> + break;
> +
> + rx_buffer = i40e_get_rx_buffer_zc(rx_ring, size);
> +
> + /* retrieve a buffer from the ring */
> + xdp.data = rx_buffer->addr;
> + xdp_set_data_meta_invalid(&xdp);
> + xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
> + xdp.data_end = xdp.data + size;
> + xdp.handle = rx_buffer->id;
> +
> + skb = i40e_run_xdp_zc(rx_ring, &xdp);
> +
> + if (IS_ERR(skb)) {
> + if (PTR_ERR(skb) == -I40E_XDP_TX)
> + xdp_xmit = true;
> + else
> + i40e_reuse_rx_buffer_zc(rx_ring, rx_buffer);
> + total_rx_bytes += size;
> + total_rx_packets++;
> + } else {
> + skb = i40e_zc_frame_to_skb(rx_ring, rx_buffer, &xdp);
> + if (!skb) {
> + rx_ring->rx_stats.alloc_buff_failed++;
> + break;
> + }
> + }
> +
> + rx_buffer->addr = NULL;
> + cleaned_count++;
> +
> + /* don't care about non-EOP frames in XDP mode */
> + ntc = rx_ring->next_to_clean + 1;
> + ntc = (ntc < rx_ring->count) ? ntc : 0;
> + rx_ring->next_to_clean = ntc;
> + prefetch(I40E_RX_DESC(rx_ring, ntc));
> +
> + if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) {
> + skb = NULL;
> + continue;
> + }
> +
> + /* probably a little skewed due to removing CRC */
> + total_rx_bytes += skb->len;
> +
> + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
> + rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
> + I40E_RXD_QW1_PTYPE_SHIFT;
> +
> + /* populate checksum, VLAN, and protocol */
> + i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
> +
> + vlan_tag = (qword & BIT(I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) ?
> + le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1) : 0;
> +
> + i40e_receive_skb(rx_ring, skb, vlan_tag);
> + skb = NULL;
> +
> + /* update budget accounting */
> + total_rx_packets++;
> + }
> +
> + if (xdp_xmit) {
> + struct i40e_ring *xdp_ring =
> + rx_ring->vsi->xdp_rings[rx_ring->queue_index];
> +
> + i40e_xdp_ring_update_tail(xdp_ring);
> + xdp_do_flush_map();
> + }
> +
> + u64_stats_update_begin(&rx_ring->syncp);
> + rx_ring->stats.packets += total_rx_packets;
> + rx_ring->stats.bytes += total_rx_bytes;
> + u64_stats_update_end(&rx_ring->syncp);
> + rx_ring->q_vector->rx.total_packets += total_rx_packets;
> + rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
> +
> + /* guarantee a trip back through this routine if there was a failure */
> + return failure ? budget : (int)total_rx_packets;
> +}
> +
> static inline u32 i40e_buildreg_itr(const int type, u16 itr)
> {
> u32 val;
> @@ -2576,7 +2922,7 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
> budget_per_ring = max(budget/q_vector->num_ringpairs, 1);
>
> i40e_for_each_ring(ring, q_vector->rx) {
> - int cleaned = i40e_clean_rx_irq(ring, budget_per_ring);
> + int cleaned = ring->clean_rx_irq(ring, budget_per_ring);
>
> work_done += cleaned;
> /* if we clean as many as budgeted, we must not be done */
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index fdd2c55f03a6..9d5d9862e9f1 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -296,13 +296,22 @@ struct i40e_tx_buffer {
>
> struct i40e_rx_buffer {
> dma_addr_t dma;
> - struct page *page;
> + union {
> + struct {
> + struct page *page;
> #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
> - __u32 page_offset;
> + __u32 page_offset;
> #else
> - __u16 page_offset;
> + __u16 page_offset;
> #endif
> - __u16 pagecnt_bias;
> + __u16 pagecnt_bias;
> + };
> + struct {
> + /* for umem */
> + void *addr;
> + u32 id;
> + };
> + };
> };
>
> struct i40e_queue_stats {
> @@ -344,6 +353,8 @@ enum i40e_ring_state_t {
> #define I40E_RX_SPLIT_TCP_UDP 0x4
> #define I40E_RX_SPLIT_SCTP 0x8
>
> +void i40e_zc_recycle(struct zero_copy_allocator *alloc, unsigned long handle);
> +
> /* struct that defines a descriptor ring, associated with a VSI */
> struct i40e_ring {
> struct i40e_ring *next; /* pointer to next ring in q_vector */
> @@ -414,6 +425,12 @@ struct i40e_ring {
>
> struct i40e_channel *ch;
> struct xdp_rxq_info xdp_rxq;
> +
> + int (*clean_rx_irq)(struct i40e_ring *, int);
> + bool (*alloc_rx_buffers)(struct i40e_ring *, u16);
> + struct xdp_umem *xsk_umem;
> +
> + struct zero_copy_allocator zca; /* ZC allocator anchor */
> } ____cacheline_internodealigned_in_smp;
>
> static inline bool ring_uses_build_skb(struct i40e_ring *ring)
> @@ -474,6 +491,7 @@ static inline unsigned int i40e_rx_pg_order(struct i40e_ring *ring)
> #define i40e_rx_pg_size(_ring) (PAGE_SIZE << i40e_rx_pg_order(_ring))
>
> bool i40e_alloc_rx_buffers(struct i40e_ring *rxr, u16 cleaned_count);
> +bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
> netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
> void i40e_clean_tx_ring(struct i40e_ring *tx_ring);
> void i40e_clean_rx_ring(struct i40e_ring *rx_ring);
> @@ -489,6 +507,9 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
> bool __i40e_chk_linearize(struct sk_buff *skb);
> int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
> void i40e_xdp_flush(struct net_device *dev);
> +int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget);
> +int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
> +void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
>
> /**
> * i40e_get_head - Retrieve head from head writeback
> @@ -575,4 +596,5 @@ static inline struct netdev_queue *txring_txq(const struct i40e_ring *ring)
> {
> return netdev_get_tx_queue(ring->netdev, ring->queue_index);
> }
> +
> #endif /* _I40E_TXRX_H_ */
> --
> 2.14.1
>