Message-ID: <546CE067.9010903@redhat.com>
Date: Wed, 19 Nov 2014 10:24:39 -0800
From: Alexander Duyck <alexander.h.duyck@...hat.com>
To: Jeff Kirsher <jeffrey.t.kirsher@...el.com>, davem@...emloft.net
CC: Emil Tantilov <emil.s.tantilov@...el.com>, netdev@...r.kernel.org,
nhorman@...hat.com, sassmann@...hat.com, jogreene@...hat.com
Subject: Re: [net-next 07/15] ixgbevf: Change receive model to use double
buffered page based receives
There were a few things in this patch that should be addressed.
Comments inline below.
- Alex
On 11/18/2014 08:10 PM, Jeff Kirsher wrote:
> From: Emil Tantilov <emil.s.tantilov@...el.com>
>
> This patch changes the basic receive path for ixgbevf so that, instead of
> receiving the data into an skb, it is received into a double buffered page.
> The main change is that receives are now done into pages only; the driver
> then pulls the header out of the page and copies it into the sk_buff data.
>
> This has the advantages of reduced cache misses and improved performance on
> IOMMU-enabled systems.
>
> CC: Alexander Duyck <alexander.h.duyck@...hat.com>
> Signed-off-by: Emil Tantilov <emil.s.tantilov@...el.com>
> Tested-by: Phil Schmitt <phillip.j.schmitt@...el.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@...el.com>
> ---
> drivers/net/ethernet/intel/ixgbevf/ixgbevf.h | 24 +-
> drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 452 +++++++++++++++-------
> 2 files changed, 331 insertions(+), 145 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> index 72a354b..2362001 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf.h
> @@ -58,8 +58,9 @@ struct ixgbevf_tx_buffer {
> };
>
> struct ixgbevf_rx_buffer {
> - struct sk_buff *skb;
> dma_addr_t dma;
> + struct page *page;
> + unsigned int page_offset;
> };
>
> struct ixgbevf_stats {
> @@ -91,9 +92,10 @@ struct ixgbevf_ring {
> void *desc; /* descriptor ring memory */
> dma_addr_t dma; /* phys. address of descriptor ring */
> unsigned int size; /* length in bytes */
> - unsigned int count; /* amount of descriptors */
> - unsigned int next_to_use;
> - unsigned int next_to_clean;
> + u16 count; /* amount of descriptors */
> + u16 next_to_use;
> + u16 next_to_clean;
> + u16 next_to_alloc;
>
> union {
> struct ixgbevf_tx_buffer *tx_buffer_info;
> @@ -109,12 +111,11 @@ struct ixgbevf_ring {
>
> u64 hw_csum_rx_error;
> u8 __iomem *tail;
> + struct sk_buff *skb;
>
> u16 reg_idx; /* holds the special value that gets the hardware register
> * offset associated with this ring, which is different
> * for DCB and RSS modes */
> -
> - u16 rx_buf_len;
> int queue_index; /* needed for multiqueue queue management */
> };
>
> @@ -133,12 +134,10 @@ struct ixgbevf_ring {
>
> /* Supported Rx Buffer Sizes */
> #define IXGBEVF_RXBUFFER_256 256 /* Used for packet split */
> -#define IXGBEVF_RXBUFFER_2K 2048
> -#define IXGBEVF_RXBUFFER_4K 4096
> -#define IXGBEVF_RXBUFFER_8K 8192
> -#define IXGBEVF_RXBUFFER_10K 10240
> +#define IXGBEVF_RXBUFFER_2048 2048
>
> #define IXGBEVF_RX_HDR_SIZE IXGBEVF_RXBUFFER_256
> +#define IXGBEVF_RX_BUFSZ IXGBEVF_RXBUFFER_2048
>
> #define MAXIMUM_ETHERNET_VLAN_SIZE (VLAN_ETH_FRAME_LEN + ETH_FCS_LEN)
>
> @@ -430,11 +429,6 @@ enum ixbgevf_state_t {
> __IXGBEVF_WORK_INIT,
> };
>
> -struct ixgbevf_cb {
> - struct sk_buff *prev;
> -};
> -#define IXGBE_CB(skb) ((struct ixgbevf_cb *)(skb)->cb)
> -
> enum ixgbevf_boards {
> board_82599_vf,
> board_X540_vf,
> diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> index 20bebd2..2ca7c96 100644
> --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
> @@ -422,8 +422,7 @@ static void ixgbevf_process_skb_fields(struct ixgbevf_ring *rx_ring,
> * that this is in fact a non-EOP buffer.
> **/
> static bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring,
> - union ixgbe_adv_rx_desc *rx_desc,
> - struct sk_buff *skb)
> + union ixgbe_adv_rx_desc *rx_desc)
> {
> u32 ntc = rx_ring->next_to_clean + 1;
>
> @@ -439,37 +438,40 @@ static bool ixgbevf_is_non_eop(struct ixgbevf_ring *rx_ring,
> return true;
> }
>
> -static bool ixgbevf_alloc_mapped_skb(struct ixgbevf_ring *rx_ring,
> - struct ixgbevf_rx_buffer *bi)
> +static bool ixgbevf_alloc_mapped_page(struct ixgbevf_ring *rx_ring,
> + struct ixgbevf_rx_buffer *bi)
> {
> - struct sk_buff *skb = bi->skb;
> + struct page *page = bi->page;
> dma_addr_t dma = bi->dma;
>
> - if (unlikely(skb))
> + /* since we are recycling buffers we should seldom need to alloc */
> + if (likely(page))
> return true;
>
> - skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
> - rx_ring->rx_buf_len);
> - if (unlikely(!skb)) {
> - rx_ring->rx_stats.alloc_rx_buff_failed++;
> + /* alloc new page for storage */
> + page = dev_alloc_page();
> + if (unlikely(!page)) {
> + rx_ring->rx_stats.alloc_rx_page_failed++;
> return false;
> }
>
> - dma = dma_map_single(rx_ring->dev, skb->data,
> - rx_ring->rx_buf_len, DMA_FROM_DEVICE);
> + /* map page for use */
> + dma = dma_map_page(rx_ring->dev, page, 0,
> + PAGE_SIZE, DMA_FROM_DEVICE);
>
> /* if mapping failed free memory back to system since
> * there isn't much point in holding memory we can't use
> */
> if (dma_mapping_error(rx_ring->dev, dma)) {
> - dev_kfree_skb_any(skb);
> + __free_page(page);
>
> rx_ring->rx_stats.alloc_rx_buff_failed++;
> return false;
> }
>
> - bi->skb = skb;
> bi->dma = dma;
> + bi->page = page;
> + bi->page_offset = 0;
>
> return true;
> }
> @@ -495,13 +497,13 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring,
> i -= rx_ring->count;
>
> do {
> - if (!ixgbevf_alloc_mapped_skb(rx_ring, bi))
> + if (!ixgbevf_alloc_mapped_page(rx_ring, bi))
> break;
>
> /* Refresh the desc even if pkt_addr didn't change
> * because each write-back erases this info.
> */
> - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
> + rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
>
> rx_desc++;
> bi++;
> @@ -524,6 +526,9 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring,
> /* record the next descriptor to use */
> rx_ring->next_to_use = i;
>
> + /* update next to alloc since we have filled the ring */
> + rx_ring->next_to_alloc = i;
> +
> /* Force memory writes to complete before letting h/w
> * know there are new descriptors to fetch. (Only
> * applicable for weak-ordered memory model archs,
> @@ -534,6 +539,257 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_ring *rx_ring,
> }
> }
>
> +/* ixgbevf_pull_tail - ixgbevf specific version of skb_pull_tail
> + * @rx_ring: Rx descriptor ring packet is being transacted on
> + * @skb: pointer to current skb being adjusted
> + *
> + * This function is an ixgbevf specific version of __pskb_pull_tail. The
> + * main difference between this version and the original function is that
> + * this function can make several assumptions about the state of things
> + * that allow for significant optimizations versus the standard function.
> + * As a result we can do things like drop a frag and maintain an accurate
> + * truesize for the skb.
> + */
> +static void ixgbevf_pull_tail(struct ixgbevf_ring *rx_ring,
> + struct sk_buff *skb)
> +{
> + struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
> + unsigned char *va;
> + unsigned int pull_len;
> +
> + /* it is valid to use page_address instead of kmap since we are
> + * working with pages allocated out of the lomem pool per
> + * alloc_page(GFP_ATOMIC)
> + */
> + va = skb_frag_address(frag);
> +
> + /* we need the header to contain the greater of either ETH_HLEN or
> + * 60 bytes if the skb->len is less than 60 for skb_pad.
> + */
> + pull_len = eth_get_headlen(va, IXGBEVF_RX_HDR_SIZE);
> +
> + /* align pull length to size of long to optimize memcpy performance */
> + skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
> +
> + /* update all of the pointers */
> + skb_frag_size_sub(frag, pull_len);
> + frag->page_offset += pull_len;
> + skb->data_len -= pull_len;
> + skb->tail += pull_len;
> +}
I really think we should look at making this a generic function. I may
submit something later today to add a common helper for it, maybe
something like eth_pull_tail.
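Just to sketch what I mean (completely untested, and the helper name and
its final home are only placeholders):

/* eth_pull_tail - pull the header portion of the first frag into the
 * linear area of the skb
 *
 * Untested sketch of a possible generic helper.  Assumes the skb was
 * built by attaching page frags and that the linear area is still
 * empty, the same assumptions ixgbevf_pull_tail makes.
 */
static void eth_pull_tail(struct sk_buff *skb, unsigned int max_len)
{
	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
	unsigned char *va = skb_frag_address(frag);
	unsigned int pull_len;

	/* determine how much of the header we actually need */
	pull_len = eth_get_headlen(va, max_len);

	/* align pull length to size of long to optimize memcpy performance */
	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));

	/* update all of the pointers */
	skb_frag_size_sub(frag, pull_len);
	frag->page_offset += pull_len;
	skb->data_len -= pull_len;
	skb->tail += pull_len;
}

The per-driver versions would then just be thin wrappers around (or
direct callers of) something like that.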
> +/* ixgbevf_cleanup_headers - Correct corrupted or empty headers
> + * @rx_ring: Rx descriptor ring packet is being transacted on
> + * @rx_desc: pointer to the EOP Rx descriptor
> + * @skb: pointer to current skb being fixed
> + *
> + * Check for corrupted packet headers caused by senders on the local L2
> + * embedded NIC switch not setting up their Tx Descriptors right. These
> + * should be very rare.
> + *
> + * Also address the case where we are pulling data in on pages only
> + * and as such no data is present in the skb header.
> + *
> + * In addition if skb is not at least 60 bytes we need to pad it so that
> + * it is large enough to qualify as a valid Ethernet frame.
> + *
> + * Returns true if an error was encountered and skb was freed.
> + */
> +static bool ixgbevf_cleanup_headers(struct ixgbevf_ring *rx_ring,
> + union ixgbe_adv_rx_desc *rx_desc,
> + struct sk_buff *skb)
> +{
> + /* verify that the packet does not have any known errors */
> + if (unlikely(ixgbevf_test_staterr(rx_desc,
> + IXGBE_RXDADV_ERR_FRAME_ERR_MASK))) {
> + struct net_device *netdev = rx_ring->netdev;
> +
> + if (!(netdev->features & NETIF_F_RXALL)) {
> + dev_kfree_skb_any(skb);
> + return true;
> + }
> + }
> +
> + /* place header in linear portion of buffer */
> + if (skb_is_nonlinear(skb))
> + ixgbevf_pull_tail(rx_ring, skb);
> +
> + /* if skb_pad returns an error the skb was freed */
> + if (unlikely(skb->len < 60)) {
> + int pad_len = 60 - skb->len;
> +
> + if (skb_pad(skb, pad_len))
> + return true;
> + __skb_put(skb, pad_len);
> + }
> +
> + return false;
> +}
The same goes for the padding bit here. Maybe something like eth_skb_pad.
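Again, just to illustrate (untested sketch; eth_skb_pad doesn't exist
yet and the name is only a suggestion):

/* eth_skb_pad - pad skb to the minimum Ethernet frame size
 *
 * Untested sketch.  Returns 0 on success, a negative errno if the skb
 * had to be freed, mirroring the open-coded check above.
 */
static inline int eth_skb_pad(struct sk_buff *skb)
{
	unsigned int pad;

	if (skb->len >= ETH_ZLEN)
		return 0;

	pad = ETH_ZLEN - skb->len;

	/* skb_pad frees the skb on failure */
	if (skb_pad(skb, pad))
		return -ENOMEM;

	__skb_put(skb, pad);
	return 0;
}

That would also let the drivers drop the open-coded "60" in favor of
ETH_ZLEN.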
> +/* ixgbevf_reuse_rx_page - page flip buffer and store it back on the ring
> + * @rx_ring: Rx descriptor ring to store buffers on
> + * @old_buff: donor buffer to have page reused
> + *
> + * Synchronizes page for reuse by the adapter
> + */
> +static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring,
> + struct ixgbevf_rx_buffer *old_buff)
> +{
> + struct ixgbevf_rx_buffer *new_buff;
> + u16 nta = rx_ring->next_to_alloc;
> +
> + new_buff = &rx_ring->rx_buffer_info[nta];
> +
> + /* update, and store next to alloc */
> + nta++;
> + rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
> +
> + /* transfer page from old buffer to new buffer */
> + new_buff->page = old_buff->page;
> + new_buff->dma = old_buff->dma;
> + new_buff->page_offset = old_buff->page_offset;
> +
> + /* sync the buffer for use by the device */
> + dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
> + new_buff->page_offset,
> + IXGBEVF_RX_BUFSZ,
> + DMA_FROM_DEVICE);
> +}
> +
> +/* ixgbevf_add_rx_frag - Add contents of Rx buffer to sk_buff
> + * @rx_ring: Rx descriptor ring to transact packets on
> + * @rx_buffer: buffer containing page to add
> + * @rx_desc: descriptor containing length of buffer written by hardware
> + * @skb: sk_buff to place the data into
> + *
> + * This function will add the data contained in rx_buffer->page to the skb.
> + * This is done either through a direct copy if the data in the buffer is
> + * less than the skb header size, otherwise it will just attach the page as
> + * a frag to the skb.
> + *
> + * The function will then update the page offset if necessary and return
> + * true if the buffer can be reused by the adapter.
> + */
> +static bool ixgbevf_add_rx_frag(struct ixgbevf_ring *rx_ring,
> + struct ixgbevf_rx_buffer *rx_buffer,
> + union ixgbe_adv_rx_desc *rx_desc,
> + struct sk_buff *skb)
> +{
> + struct page *page = rx_buffer->page;
> + unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
> +#if (PAGE_SIZE < 8192)
> + unsigned int truesize = IXGBEVF_RX_BUFSZ;
> +#else
> + unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
> +#endif
> +
> + if ((size <= IXGBEVF_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
> + unsigned char *va = page_address(page) + rx_buffer->page_offset;
> +
> + memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
> +
> + /* we can reuse buffer as-is, just make sure it is local */
> + if (likely(page_to_nid(page) == numa_node_id()))
> + return true;
> +
> + /* this page cannot be reused so discard it */
> + put_page(page);
> + return false;
> + }
> +
> + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
> + rx_buffer->page_offset, size, truesize);
> +
> + /* avoid re-using remote pages */
> + if (unlikely(page_to_nid(page) != numa_node_id()))
> + return false;
> +
This is missing the pfmemalloc fix that is already in igb, and that I
submitted for ixgbe and fm10k.
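For reference, the additional check is along these lines (sketch only;
on this kernel the flag lives in page->pfmemalloc):

	/* avoid re-using remote pages */
	if (unlikely(page_to_nid(page) != numa_node_id()))
		return false;

	/* pages pulled from the emergency reserves must not be
	 * recycled, hand them back to the page allocator instead
	 */
	if (unlikely(page->pfmemalloc))
		return false;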
> +#if (PAGE_SIZE < 8192)
> + /* if we are only owner of page we can reuse it */
> + if (unlikely(page_count(page) != 1))
> + return false;
> +
> + /* flip page offset to other buffer */
> + rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ;
> +
> + /* since we are the only owner of the page and we need to
> + * increment it.
> + */
> + atomic_inc(&page->_count);
> +#else
> + /* move offset up to the next cache line */
> + rx_buffer->page_offset += truesize;
> +
> + if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ))
> + return false;
> +
> + /* bump ref count on page before it is given to the stack */
> + get_page(page);
> +#endif
> +
> + return true;
> +}
The get_page and atomic_inc calls can be pulled out and placed after the
#if/else logic. The preference is to use atomic_inc, since you are using
an order 0 page and don't need to check for PageTail().
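In other words something like this (untested):

#if (PAGE_SIZE < 8192)
	/* if we are only owner of page we can reuse it */
	if (unlikely(page_count(page) != 1))
		return false;

	/* flip page offset to other buffer */
	rx_buffer->page_offset ^= IXGBEVF_RX_BUFSZ;
#else
	/* move offset up to the next cache line */
	rx_buffer->page_offset += truesize;

	if (rx_buffer->page_offset > (PAGE_SIZE - IXGBEVF_RX_BUFSZ))
		return false;
#endif

	/* bump ref count on the page before it is handed to the stack;
	 * atomic_inc is sufficient since this is an order 0 page, so
	 * there is no PageTail() case for get_page() to worry about
	 */
	atomic_inc(&page->_count);

	return true;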
> +static struct sk_buff *ixgbevf_fetch_rx_buffer(struct ixgbevf_ring *rx_ring,
> + union ixgbe_adv_rx_desc *rx_desc,
> + struct sk_buff *skb)
> +{
> + struct ixgbevf_rx_buffer *rx_buffer;
> + struct page *page;
> +
> + rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
> + page = rx_buffer->page;
> + prefetchw(page);
> +
> + if (likely(!skb)) {
> + void *page_addr = page_address(page) +
> + rx_buffer->page_offset;
> +
> + /* prefetch first cache line of first page */
> + prefetch(page_addr);
> +#if L1_CACHE_BYTES < 128
> + prefetch(page_addr + L1_CACHE_BYTES);
> +#endif
> +
> + /* allocate a skb to store the frags */
> + skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
> + IXGBEVF_RX_HDR_SIZE);
> + if (unlikely(!skb)) {
> + rx_ring->rx_stats.alloc_rx_buff_failed++;
> + return NULL;
> + }
> +
> + /* we will be copying header into skb->data in
> + * pskb_may_pull so it is in our interest to prefetch
> + * it now to avoid a possible cache miss
> + */
> + prefetchw(skb->data);
> + }
> +
> + /* we are reusing so sync this buffer for CPU use */
> + dma_sync_single_range_for_cpu(rx_ring->dev,
> + rx_buffer->dma,
> + rx_buffer->page_offset,
> + IXGBEVF_RX_BUFSZ,
> + DMA_FROM_DEVICE);
> +
> + /* pull page into skb */
> + if (ixgbevf_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
> + /* hand second half of page back to the ring */
> + ixgbevf_reuse_rx_page(rx_ring, rx_buffer);
> + } else {
> + /* we are not reusing the buffer so unmap it */
> + dma_unmap_page(rx_ring->dev, rx_buffer->dma,
> + PAGE_SIZE, DMA_FROM_DEVICE);
> + }
> +
> + /* clear contents of buffer_info */
> + rx_buffer->dma = 0;
> + rx_buffer->page = NULL;
> +
> + return skb;
> +}
> +
> static inline void ixgbevf_irq_enable_queues(struct ixgbevf_adapter *adapter,
> u32 qmask)
> {
<snip>
> @@ -3320,21 +3522,11 @@ static int ixgbevf_set_mac(struct net_device *netdev, void *p)
> static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu)
> {
> struct ixgbevf_adapter *adapter = netdev_priv(netdev);
> + struct ixgbe_hw *hw = &adapter->hw;
> int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN;
> - int max_possible_frame = MAXIMUM_ETHERNET_VLAN_SIZE;
> -
> - switch (adapter->hw.api_version) {
> - case ixgbe_mbox_api_11:
> - max_possible_frame = IXGBE_MAX_JUMBO_FRAME_SIZE;
> - break;
> - default:
> - if (adapter->hw.mac.type == ixgbe_mac_X540_vf)
> - max_possible_frame = IXGBE_MAX_JUMBO_FRAME_SIZE;
> - break;
> - }
>
> /* MTU < 68 is an error and causes problems on some kernels */
> - if ((new_mtu < 68) || (max_frame > max_possible_frame))
> + if ((new_mtu < 68) || (max_frame > IXGBE_MAX_JUMBO_FRAME_SIZE))
> return -EINVAL;
>
> hw_dbg(&adapter->hw, "changing MTU from %d to %d\n",
This is wrong. You are still limited by the PF, so with a version 1.0
mailbox on an 82599 you cannot enable jumbo frames. The VF hardware can
support them, but the PF won't let you use them.
> @@ -3342,8 +3534,8 @@ static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu)
> /* must set new MTU before calling down or up */
> netdev->mtu = new_mtu;
>
> - if (netif_running(netdev))
> - ixgbevf_reinit_locked(adapter);
> + /* notify the PF of our intent to use this size of frame */
> + ixgbevf_rlpml_set_vf(hw, max_frame);
>
> return 0;
> }
>
This is the reason the change is wrong. If the mailbox API is version
1.0 you cannot support jumbo frames, so ixgbevf_rlpml_set_vf will get an
error back via the mailbox indicating that the message is not supported.
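To keep honoring the limit the PF imposes you would need to keep
something like the check this patch removes (sketch, reusing the code
from the hunk above):

	struct ixgbe_hw *hw = &adapter->hw;
	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN;
	int max_possible_frame = MAXIMUM_ETHERNET_VLAN_SIZE;

	/* only the 1.1 mailbox api, or an X540 PF, will accept a
	 * jumbo sized receive limit from the VF
	 */
	switch (hw->api_version) {
	case ixgbe_mbox_api_11:
		max_possible_frame = IXGBE_MAX_JUMBO_FRAME_SIZE;
		break;
	default:
		if (hw->mac.type == ixgbe_mac_X540_vf)
			max_possible_frame = IXGBE_MAX_JUMBO_FRAME_SIZE;
		break;
	}

	/* MTU < 68 is an error and causes problems on some kernels */
	if ((new_mtu < 68) || (max_frame > max_possible_frame))
		return -EINVAL;

Either that or ixgbevf_rlpml_set_vf needs to start reporting the mailbox
error back so this function can fail the MTU change.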