lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Sat, 17 Mar 2012 01:50:53 -0700
From:	Jeff Kirsher <jeffrey.t.kirsher@...el.com>
To:	davem@...emloft.net
Cc:	Alexander Duyck <alexander.h.duyck@...el.com>,
	netdev@...r.kernel.org, gospo@...hat.com, sassmann@...hat.com,
	Jeff Kirsher <jeffrey.t.kirsher@...el.com>
Subject: [net-next 05/12] ixgbe: Replace standard receive path with a page based receive

From: Alexander Duyck <alexander.h.duyck@...el.com>

This patch replaces the existing Rx hot-path in the ixgbe driver with a new
implementation that is based on performing a double buffered receive.  The
ixgbe driver already had something similar in place for its' packet split
path, however in that case we were still receiving the header for the
packet into the sk_buff.  The big change here is the entire receive path
will receive into pages only, and then pull the header out of the page and
copy it into the sk_buff data.  There are several motivations behind this
approach.

First, this allows us to avoid several cache misses as we were taking a
set of cache misses for allocating the sk_buff and then another set for
receiving data into the sk_buff.  We are able to avoid these misses on
receive now as we allocate the sk_buff when data is available.

Second we are able to see a considerable performance gain when an IOMMU is
enabled because we are no longer unmapping every buffer on receive.
Instead we can delay the unmap until we are unable to use the page, and
instead we can simply call sync_single_range on the half of the page that
contains new data.

Finally we are able to drop a considerable amount of code from the driver
as we no longer have to support 2 different receive modes, packet split and
one buffer.  This allows us to optimize the Rx path further since less
branching is required.

Signed-off-by: Alexander Duyck <alexander.h.duyck@...el.com>
Tested-by: Ross Brattain <ross.b.brattain@...el.com>
Tested-by: Stephen Ko <stephen.s.ko@...el.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@...el.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |   45 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   23 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  819 +++++++++++-----------
 3 files changed, 457 insertions(+), 430 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index f069c1b..b1e3baf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -72,12 +72,6 @@
 
 /* Supported Rx Buffer Sizes */
 #define IXGBE_RXBUFFER_512   512    /* Used for packet split */
-#define IXGBE_RXBUFFER_2K   2048
-#define IXGBE_RXBUFFER_3K   3072
-#define IXGBE_RXBUFFER_4K   4096
-#define IXGBE_RXBUFFER_7K   7168
-#define IXGBE_RXBUFFER_8K   8192
-#define IXGBE_RXBUFFER_15K  15360
 #define IXGBE_MAX_RXBUFFER  16384  /* largest size for a single descriptor */
 
 /*
@@ -168,7 +162,6 @@ struct ixgbe_rx_buffer {
 	struct sk_buff *skb;
 	dma_addr_t dma;
 	struct page *page;
-	dma_addr_t page_dma;
 	unsigned int page_offset;
 };
 
@@ -193,21 +186,15 @@ struct ixgbe_rx_queue_stats {
 	u64 csum_err;
 };
 
-enum ixbge_ring_state_t {
+enum ixgbe_ring_state_t {
 	__IXGBE_TX_FDIR_INIT_DONE,
 	__IXGBE_TX_DETECT_HANG,
 	__IXGBE_HANG_CHECK_ARMED,
-	__IXGBE_RX_PS_ENABLED,
 	__IXGBE_RX_RSC_ENABLED,
 	__IXGBE_RX_CSUM_UDP_ZERO_ERR,
+	__IXGBE_RX_FCOE_BUFSZ,
 };
 
-#define ring_is_ps_enabled(ring) \
-	test_bit(__IXGBE_RX_PS_ENABLED, &(ring)->state)
-#define set_ring_ps_enabled(ring) \
-	set_bit(__IXGBE_RX_PS_ENABLED, &(ring)->state)
-#define clear_ring_ps_enabled(ring) \
-	clear_bit(__IXGBE_RX_PS_ENABLED, &(ring)->state)
 #define check_for_tx_hang(ring) \
 	test_bit(__IXGBE_TX_DETECT_HANG, &(ring)->state)
 #define set_check_for_tx_hang(ring) \
@@ -233,7 +220,6 @@ struct ixgbe_ring {
 	u8 __iomem *tail;
 
 	u16 count;			/* amount of descriptors */
-	u16 rx_buf_len;
 
 	u8 queue_index; /* needed for multiqueue queue management */
 	u8 reg_idx;			/* holds the special value that gets
@@ -241,8 +227,13 @@ struct ixgbe_ring {
 					 * associated with this ring, which is
 					 * different for DCB and RSS modes
 					 */
-	u8 atr_sample_rate;
-	u8 atr_count;
+	union {
+		struct {
+			u8 atr_sample_rate;
+			u8 atr_count;
+		};
+		u16 next_to_alloc;
+	};
 
 	u16 next_to_use;
 	u16 next_to_clean;
@@ -287,6 +278,22 @@ struct ixgbe_ring_feature {
 	int mask;
 } ____cacheline_internodealigned_in_smp;
 
+/*
+ * FCoE requires that all Rx buffers be over 2200 bytes in length.  Since
+ * this is twice the size of a half page we need to double the page order
+ * for FCoE enabled Rx queues.
+ */
+#if defined(IXGBE_FCOE) && (PAGE_SIZE < 8192)
+static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring)
+{
+	return test_bit(__IXGBE_RX_FCOE_BUFSZ, &ring->state) ? 1 : 0;
+}
+#else
+#define ixgbe_rx_pg_order(_ring) 0
+#endif
+#define ixgbe_rx_pg_size(_ring) (PAGE_SIZE << ixgbe_rx_pg_order(_ring))
+#define ixgbe_rx_bufsz(_ring) ((PAGE_SIZE / 2) << ixgbe_rx_pg_order(_ring))
+
 struct ixgbe_ring_container {
 	struct ixgbe_ring *ring;	/* pointer to linked list of rings */
 	unsigned int total_bytes;	/* total bytes processed this int */
@@ -554,7 +561,7 @@ struct ixgbe_cb {
 	};
 	dma_addr_t dma;
 	u16 append_cnt;
-	bool delay_unmap;
+	bool page_released;
 };
 #define IXGBE_CB(skb) ((struct ixgbe_cb *)(skb)->cb)
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 24f7291..b09e67c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -35,6 +35,7 @@
 #include <linux/netdevice.h>
 #include <linux/ethtool.h>
 #include <linux/vmalloc.h>
+#include <linux/highmem.h>
 #include <linux/uaccess.h>
 
 #include "ixgbe.h"
@@ -1615,7 +1616,6 @@ static int ixgbe_setup_desc_rings(struct ixgbe_adapter *adapter)
 	rx_ring->dev = &adapter->pdev->dev;
 	rx_ring->netdev = adapter->netdev;
 	rx_ring->reg_idx = adapter->rx_ring[0]->reg_idx;
-	rx_ring->rx_buf_len = IXGBE_RXBUFFER_2K;
 
 	err = ixgbe_setup_rx_resources(rx_ring);
 	if (err) {
@@ -1718,13 +1718,15 @@ static bool ixgbe_check_lbtest_frame(struct ixgbe_rx_buffer *rx_buffer,
 
 	frame_size >>= 1;
 
-	data = rx_buffer->skb->data;
+	data = kmap(rx_buffer->page) + rx_buffer->page_offset;
 
 	if (data[3] != 0xFF ||
 	    data[frame_size + 10] != 0xBE ||
 	    data[frame_size + 12] != 0xAF)
 		match = false;
 
+	kunmap(rx_buffer->page);
+
 	return match;
 }
 
@@ -1746,17 +1748,22 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
 		/* check Rx buffer */
 		rx_buffer = &rx_ring->rx_buffer_info[rx_ntc];
 
-		/* unmap Rx buffer, will be remapped by alloc_rx_buffers */
-		dma_unmap_single(rx_ring->dev,
-				 rx_buffer->dma,
-				 rx_ring->rx_buf_len,
-				 DMA_FROM_DEVICE);
-		rx_buffer->dma = 0;
+		/* sync Rx buffer for CPU read */
+		dma_sync_single_for_cpu(rx_ring->dev,
+					rx_buffer->dma,
+					ixgbe_rx_bufsz(rx_ring),
+					DMA_FROM_DEVICE);
 
 		/* verify contents of skb */
 		if (ixgbe_check_lbtest_frame(rx_buffer, size))
 			count++;
 
+		/* sync Rx buffer for device write */
+		dma_sync_single_for_device(rx_ring->dev,
+					   rx_buffer->dma,
+					   ixgbe_rx_bufsz(rx_ring),
+					   DMA_FROM_DEVICE);
+
 		/* unmap buffer on Tx side */
 		tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
 		ixgbe_unmap_and_free_tx_resource(tx_ring, tx_buffer);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cae763c..e97ef45 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -469,17 +469,7 @@ rx_ring_summary:
 					print_hex_dump(KERN_INFO, "",
 					   DUMP_PREFIX_ADDRESS, 16, 1,
 					   phys_to_virt(rx_buffer_info->dma),
-					   rx_ring->rx_buf_len, true);
-
-					if (rx_ring->rx_buf_len
-						< IXGBE_RXBUFFER_2K)
-						print_hex_dump(KERN_INFO, "",
-						  DUMP_PREFIX_ADDRESS, 16, 1,
-						  phys_to_virt(
-						    rx_buffer_info->page_dma +
-						    rx_buffer_info->page_offset
-						  ),
-						  PAGE_SIZE/2, true);
+					   ixgbe_rx_bufsz(rx_ring), true);
 				}
 			}
 
@@ -1006,6 +996,7 @@ static inline void ixgbe_rx_hash(struct ixgbe_ring *ring,
 		skb->rxhash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
 }
 
+#ifdef IXGBE_FCOE
 /**
  * ixgbe_rx_is_fcoe - check the rx desc for incoming pkt type
  * @adapter: address of board private structure
@@ -1024,6 +1015,7 @@ static inline bool ixgbe_rx_is_fcoe(struct ixgbe_adapter *adapter,
 			     IXGBE_RXDADV_PKTTYPE_ETQF_SHIFT)));
 }
 
+#endif /* IXGBE_FCOE */
 /**
  * ixgbe_rx_checksum - indicate in skb if hw indicated a good cksum
  * @ring: structure containing ring specific data
@@ -1051,7 +1043,7 @@ static inline void ixgbe_rx_checksum(struct ixgbe_ring *ring,
 		return;
 
 	if (ixgbe_test_staterr(rx_desc, IXGBE_RXDADV_ERR_TCPE)) {
-		u16 pkt_info = rx_desc->wb.lower.lo_dword.hs_rss.pkt_info;
+		__le16 pkt_info = rx_desc->wb.lower.lo_dword.hs_rss.pkt_info;
 
 		/*
 		 * 82599 errata, UDP frames with a 0 checksum can be marked as
@@ -1072,6 +1064,9 @@ static inline void ixgbe_rx_checksum(struct ixgbe_ring *ring,
 static inline void ixgbe_release_rx_desc(struct ixgbe_ring *rx_ring, u32 val)
 {
 	rx_ring->next_to_use = val;
+
+	/* update next to alloc since we have filled the ring */
+	rx_ring->next_to_alloc = val;
 	/*
 	 * Force memory writes to complete before letting h/w
 	 * know there are new descriptors to fetch.  (Only
@@ -1082,67 +1077,46 @@ static inline void ixgbe_release_rx_desc(struct ixgbe_ring *rx_ring, u32 val)
 	writel(val, rx_ring->tail);
 }
 
-static bool ixgbe_alloc_mapped_skb(struct ixgbe_ring *rx_ring,
-				   struct ixgbe_rx_buffer *bi)
-{
-	struct sk_buff *skb = bi->skb;
-	dma_addr_t dma = bi->dma;
-
-	if (dma)
-		return true;
-
-	if (likely(!skb)) {
-		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
-						rx_ring->rx_buf_len);
-		bi->skb = skb;
-		if (!skb) {
-			rx_ring->rx_stats.alloc_rx_buff_failed++;
-			return false;
-		}
-	}
-
-	dma = dma_map_single(rx_ring->dev, skb->data,
-			     rx_ring->rx_buf_len, DMA_FROM_DEVICE);
-
-	if (dma_mapping_error(rx_ring->dev, dma)) {
-		rx_ring->rx_stats.alloc_rx_buff_failed++;
-		return false;
-	}
-
-	bi->dma = dma;
-	return true;
-}
-
 static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
 				    struct ixgbe_rx_buffer *bi)
 {
 	struct page *page = bi->page;
-	dma_addr_t page_dma = bi->page_dma;
-	unsigned int page_offset = bi->page_offset ^ (PAGE_SIZE / 2);
+	dma_addr_t dma = bi->dma;
 
-	if (page_dma)
+	/* since we are recycling buffers we should seldom need to alloc */
+	if (likely(dma))
 		return true;
 
-	if (!page) {
-		page = alloc_page(GFP_ATOMIC | __GFP_COLD);
-		bi->page = page;
+	/* alloc new page for storage */
+	if (likely(!page)) {
+		page = alloc_pages(GFP_ATOMIC | __GFP_COLD,
+				   ixgbe_rx_pg_order(rx_ring));
 		if (unlikely(!page)) {
 			rx_ring->rx_stats.alloc_rx_page_failed++;
 			return false;
 		}
+		bi->page = page;
 	}
 
-	page_dma = dma_map_page(rx_ring->dev, page,
-				page_offset, PAGE_SIZE / 2,
-				DMA_FROM_DEVICE);
+	/* map page for use */
+	dma = dma_map_page(rx_ring->dev, page, 0,
+			   ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
+
+	/*
+	 * if mapping failed free memory back to system since
+	 * there isn't much point in holding memory we can't use
+	 */
+	if (dma_mapping_error(rx_ring->dev, dma)) {
+		put_page(page);
+		bi->page = NULL;
 
-	if (dma_mapping_error(rx_ring->dev, page_dma)) {
 		rx_ring->rx_stats.alloc_rx_page_failed++;
 		return false;
 	}
 
-	bi->page_dma = page_dma;
-	bi->page_offset = page_offset;
+	bi->dma = dma;
+	bi->page_offset ^= ixgbe_rx_bufsz(rx_ring);
+
 	return true;
 }
 
@@ -1157,30 +1131,23 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 	struct ixgbe_rx_buffer *bi;
 	u16 i = rx_ring->next_to_use;
 
-	/* nothing to do or no valid netdev defined */
-	if (!cleaned_count || !rx_ring->netdev)
+	/* nothing to do */
+	if (!cleaned_count)
 		return;
 
 	rx_desc = IXGBE_RX_DESC(rx_ring, i);
 	bi = &rx_ring->rx_buffer_info[i];
 	i -= rx_ring->count;
 
-	while (cleaned_count--) {
-		if (!ixgbe_alloc_mapped_skb(rx_ring, bi))
+	do {
+		if (!ixgbe_alloc_mapped_page(rx_ring, bi))
 			break;
 
-		/* Refresh the desc even if buffer_addrs didn't change
-		 * because each write-back erases this info. */
-		if (ring_is_ps_enabled(rx_ring)) {
-			rx_desc->read.hdr_addr = cpu_to_le64(bi->dma);
-
-			if (!ixgbe_alloc_mapped_page(rx_ring, bi))
-				break;
-
-			rx_desc->read.pkt_addr = cpu_to_le64(bi->page_dma);
-		} else {
-			rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
-		}
+		/*
+		 * Refresh the desc even if buffer_addrs didn't change
+		 * because each write-back erases this info.
+		 */
+		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
 
 		rx_desc++;
 		bi++;
@@ -1193,7 +1160,9 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 
 		/* clear the hdr_addr for the next_to_use descriptor */
 		rx_desc->read.hdr_addr = 0;
-	}
+
+		cleaned_count--;
+	} while (cleaned_count);
 
 	i += rx_ring->count;
 
@@ -1201,90 +1170,6 @@ void ixgbe_alloc_rx_buffers(struct ixgbe_ring *rx_ring, u16 cleaned_count)
 		ixgbe_release_rx_desc(rx_ring, i);
 }
 
-static inline u16 ixgbe_get_hlen(union ixgbe_adv_rx_desc *rx_desc)
-{
-	/* HW will not DMA in data larger than the given buffer, even if it
-	 * parses the (NFS, of course) header to be larger.  In that case, it
-	 * fills the header buffer and spills the rest into the page.
-	 */
-	u16 hdr_info = le16_to_cpu(rx_desc->wb.lower.lo_dword.hs_rss.hdr_info);
-	u16 hlen = (hdr_info &  IXGBE_RXDADV_HDRBUFLEN_MASK) >>
-		    IXGBE_RXDADV_HDRBUFLEN_SHIFT;
-	if (hlen > IXGBE_RX_HDR_SIZE)
-		hlen = IXGBE_RX_HDR_SIZE;
-	return hlen;
-}
-
-/**
- * ixgbe_merge_active_tail - merge active tail into lro skb
- * @tail: pointer to active tail in frag_list
- *
- * This function merges the length and data of an active tail into the
- * skb containing the frag_list.  It resets the tail's pointer to the head,
- * but it leaves the heads pointer to tail intact.
- **/
-static inline struct sk_buff *ixgbe_merge_active_tail(struct sk_buff *tail)
-{
-	struct sk_buff *head = IXGBE_CB(tail)->head;
-
-	if (!head)
-		return tail;
-
-	head->len += tail->len;
-	head->data_len += tail->len;
-	head->truesize += tail->len;
-
-	IXGBE_CB(tail)->head = NULL;
-
-	return head;
-}
-
-/**
- * ixgbe_add_active_tail - adds an active tail into the skb frag_list
- * @head: pointer to the start of the skb
- * @tail: pointer to active tail to add to frag_list
- *
- * This function adds an active tail to the end of the frag list.  This tail
- * will still be receiving data so we cannot yet ad it's stats to the main
- * skb.  That is done via ixgbe_merge_active_tail.
- **/
-static inline void ixgbe_add_active_tail(struct sk_buff *head,
-					 struct sk_buff *tail)
-{
-	struct sk_buff *old_tail = IXGBE_CB(head)->tail;
-
-	if (old_tail) {
-		ixgbe_merge_active_tail(old_tail);
-		old_tail->next = tail;
-	} else {
-		skb_shinfo(head)->frag_list = tail;
-	}
-
-	IXGBE_CB(tail)->head = head;
-	IXGBE_CB(head)->tail = tail;
-}
-
-/**
- * ixgbe_close_active_frag_list - cleanup pointers on a frag_list skb
- * @head: pointer to head of an active frag list
- *
- * This function will clear the frag_tail_tracker pointer on an active
- * frag_list and returns true if the pointer was actually set
- **/
-static inline bool ixgbe_close_active_frag_list(struct sk_buff *head)
-{
-	struct sk_buff *tail = IXGBE_CB(head)->tail;
-
-	if (!tail)
-		return false;
-
-	ixgbe_merge_active_tail(tail);
-
-	IXGBE_CB(head)->tail = NULL;
-
-	return true;
-}
-
 /**
  * ixgbe_get_headlen - determine size of header for RSC/LRO/GRO/FCOE
  * @data: pointer to the start of the headers
@@ -1346,7 +1231,7 @@ static unsigned int ixgbe_get_headlen(unsigned char *data,
 		/* record next protocol */
 		nexthdr = hdr.ipv4->protocol;
 		hdr.network += hlen;
-#ifdef CONFIG_FCOE
+#ifdef IXGBE_FCOE
 	} else if (protocol == __constant_htons(ETH_P_FCOE)) {
 		if ((hdr.network - data) > (max_len - FCOE_HEADER_LEN))
 			return max_len;
@@ -1409,7 +1294,7 @@ static void ixgbe_get_rsc_cnt(struct ixgbe_ring *rx_ring,
 static void ixgbe_set_rsc_gso_size(struct ixgbe_ring *ring,
 				   struct sk_buff *skb)
 {
-	u16 hdr_len = ixgbe_get_headlen(skb->data, skb_headlen(skb));
+	u16 hdr_len = skb_headlen(skb);
 
 	/* set gso_size to avoid messing up TCP MSS */
 	skb_shinfo(skb)->gso_size = DIV_ROUND_UP((skb->len - hdr_len),
@@ -1473,150 +1358,346 @@ static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector,
 		netif_rx(skb);
 }
 
+/**
+ * ixgbe_is_non_eop - process handling of non-EOP buffers
+ * @rx_ring: Rx ring being processed
+ * @rx_desc: Rx descriptor for current buffer
+ * @skb: Current socket buffer containing buffer in progress
+ *
+ * This function updates next to clean.  If the buffer is an EOP buffer
+ * this function exits returning false, otherwise it will place the
+ * sk_buff in the next buffer to be chained and return true indicating
+ * that this is in fact a non-EOP buffer.
+ **/
+static bool ixgbe_is_non_eop(struct ixgbe_ring *rx_ring,
+			     union ixgbe_adv_rx_desc *rx_desc,
+			     struct sk_buff *skb)
+{
+	u32 ntc = rx_ring->next_to_clean + 1;
+
+	/* fetch, update, and store next to clean */
+	ntc = (ntc < rx_ring->count) ? ntc : 0;
+	rx_ring->next_to_clean = ntc;
+
+	prefetch(IXGBE_RX_DESC(rx_ring, ntc));
+
+	if (likely(ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP)))
+		return false;
+
+	/* append_cnt indicates packet is RSC, if so fetch nextp */
+	if (IXGBE_CB(skb)->append_cnt) {
+		ntc = le32_to_cpu(rx_desc->wb.upper.status_error);
+		ntc &= IXGBE_RXDADV_NEXTP_MASK;
+		ntc >>= IXGBE_RXDADV_NEXTP_SHIFT;
+	}
+
+	/* place skb in next buffer to be received */
+	rx_ring->rx_buffer_info[ntc].skb = skb;
+	rx_ring->rx_stats.non_eop_descs++;
+
+	return true;
+}
+
+/**
+ * ixgbe_cleanup_headers - Correct corrupted or empty headers
+ * @rx_ring: rx descriptor ring packet is being transacted on
+ * @rx_desc: pointer to the EOP Rx descriptor
+ * @skb: pointer to current skb being fixed
+ *
+ * Check for corrupted packet headers caused by senders on the local L2
+ * embedded NIC switch not setting up their Tx Descriptors right.  These
+ * should be very rare.
+ *
+ * Also address the case where we are pulling data in on pages only
+ * and as such no data is present in the skb header.
+ *
+ * In addition if skb is not at least 60 bytes we need to pad it so that
+ * it is large enough to qualify as a valid Ethernet frame.
+ *
+ * Returns true if an error was encountered and skb was freed.
+ **/
+static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
+				  union ixgbe_adv_rx_desc *rx_desc,
+				  struct sk_buff *skb)
+{
+	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
+	struct net_device *netdev = rx_ring->netdev;
+	unsigned char *va;
+	unsigned int pull_len;
+
+	/* if the page was released unmap it, else just sync our portion */
+	if (unlikely(IXGBE_CB(skb)->page_released)) {
+		dma_unmap_page(rx_ring->dev, IXGBE_CB(skb)->dma,
+			       ixgbe_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
+		IXGBE_CB(skb)->page_released = false;
+	} else {
+		dma_sync_single_range_for_cpu(rx_ring->dev,
+					      IXGBE_CB(skb)->dma,
+					      frag->page_offset,
+					      ixgbe_rx_bufsz(rx_ring),
+					      DMA_FROM_DEVICE);
+	}
+	IXGBE_CB(skb)->dma = 0;
+
+	/* verify that the packet does not have any known errors */
+	if (unlikely(ixgbe_test_staterr(rx_desc,
+					IXGBE_RXDADV_ERR_FRAME_ERR_MASK) &&
+	    !(netdev->features & NETIF_F_RXALL))) {
+		dev_kfree_skb_any(skb);
+		return true;
+	}
+
+	/*
+	 * it is valid to use page_address instead of kmap since we are
+	 * working with pages allocated out of the lomem pool per
+	 * alloc_page(GFP_ATOMIC)
+	 */
+	va = skb_frag_address(frag);
+
+	/*
+	 * we need the header to contain the greater of either ETH_HLEN or
+	 * 60 bytes if the skb->len is less than 60 for skb_pad.
+	 */
+	pull_len = skb_frag_size(frag);
+	if (pull_len > 256)
+		pull_len = ixgbe_get_headlen(va, pull_len);
+
+	/* align pull length to size of long to optimize memcpy performance */
+	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
+
+	/* update all of the pointers */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+
+	/*
+	 * if we sucked the frag empty then we should free it,
+	 * if there are other frags here something is screwed up in hardware
+	 */
+	if (skb_frag_size(frag) == 0) {
+		BUG_ON(skb_shinfo(skb)->nr_frags != 1);
+		skb_shinfo(skb)->nr_frags = 0;
+		__skb_frag_unref(frag);
+		skb->truesize -= ixgbe_rx_bufsz(rx_ring);
+	}
+
+	/* if skb_pad returns an error the skb was freed */
+	if (unlikely(skb->len < 60)) {
+		int pad_len = 60 - skb->len;
+
+		if (skb_pad(skb, pad_len))
+			return true;
+		__skb_put(skb, pad_len);
+	}
+
+	return false;
+}
+
+/**
+ * ixgbe_can_reuse_page - determine if we can reuse a page
+ * @rx_buffer: pointer to rx_buffer containing the page we want to reuse
+ *
+ * Returns true if page can be reused in another Rx buffer
+ **/
+static inline bool ixgbe_can_reuse_page(struct ixgbe_rx_buffer *rx_buffer)
+{
+	struct page *page = rx_buffer->page;
+
+	/* if we are only owner of page and it is local we can reuse it */
+	return likely(page_count(page) == 1) &&
+	       likely(page_to_nid(page) == numa_node_id());
+}
+
+/**
+ * ixgbe_reuse_rx_page - page flip buffer and store it back on the ring
+ * @rx_ring: rx descriptor ring to store buffers on
+ * @old_buff: donor buffer to have page reused
+ *
+ * Syncronizes page for reuse by the adapter
+ **/
+static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring,
+				struct ixgbe_rx_buffer *old_buff)
+{
+	struct ixgbe_rx_buffer *new_buff;
+	u16 nta = rx_ring->next_to_alloc;
+	u16 bufsz = ixgbe_rx_bufsz(rx_ring);
+
+	new_buff = &rx_ring->rx_buffer_info[nta];
+
+	/* update, and store next to alloc */
+	nta++;
+	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+	/* transfer page from old buffer to new buffer */
+	new_buff->page = old_buff->page;
+	new_buff->dma = old_buff->dma;
+
+	/* flip page offset to other buffer and store to new_buff */
+	new_buff->page_offset = old_buff->page_offset ^ bufsz;
+
+	/* sync the buffer for use by the device */
+	dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
+					 new_buff->page_offset, bufsz,
+					 DMA_FROM_DEVICE);
+
+	/* bump ref count on page before it is given to the stack */
+	get_page(new_buff->page);
+}
+
+/**
+ * ixgbe_add_rx_frag - Add contents of Rx buffer to sk_buff
+ * @rx_ring: rx descriptor ring to transact packets on
+ * @rx_buffer: buffer containing page to add
+ * @rx_desc: descriptor containing length of buffer written by hardware
+ * @skb: sk_buff to place the data into
+ *
+ * This function is based on skb_add_rx_frag.  I would have used that
+ * function however it doesn't handle the truesize case correctly since we
+ * are allocating more memory than might be used for a single receive.
+ **/
+static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring,
+			      struct ixgbe_rx_buffer *rx_buffer,
+			      struct sk_buff *skb, int size)
+{
+	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
+			   rx_buffer->page, rx_buffer->page_offset,
+			   size);
+	skb->len += size;
+	skb->data_len += size;
+	skb->truesize += ixgbe_rx_bufsz(rx_ring);
+}
+
+/**
+ * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
+ * @q_vector: structure containing interrupt and ring information
+ * @rx_ring: rx descriptor ring to transact packets on
+ * @budget: Total limit on number of packets to process
+ *
+ * This function provides a "bounce buffer" approach to Rx interrupt
+ * processing.  The advantage to this is that on systems that have
+ * expensive overhead for IOMMU access this provides a means of avoiding
+ * it by maintaining the mapping of the page to the syste.
+ *
+ * Returns true if all work is completed without reaching budget
+ **/
 static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			       struct ixgbe_ring *rx_ring,
 			       int budget)
 {
-	union ixgbe_adv_rx_desc *rx_desc, *next_rxd;
-	struct ixgbe_rx_buffer *rx_buffer_info;
-	struct sk_buff *skb;
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
-	const int current_node = numa_node_id();
-	struct ixgbe_adapter *adapter = q_vector->adapter;
 #ifdef IXGBE_FCOE
+	struct ixgbe_adapter *adapter = q_vector->adapter;
 	int ddp_bytes = 0;
 #endif /* IXGBE_FCOE */
-	u16 i;
-	u16 cleaned_count = 0;
+	u16 cleaned_count = ixgbe_desc_unused(rx_ring);
 
-	i = rx_ring->next_to_clean;
-	rx_desc = IXGBE_RX_DESC(rx_ring, i);
+	do {
+		struct ixgbe_rx_buffer *rx_buffer;
+		union ixgbe_adv_rx_desc *rx_desc;
+		struct sk_buff *skb;
+		struct page *page;
+		u16 ntc;
 
-	while (ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_DD)) {
-		u32 upper_len = 0;
+		/* return some buffers to hardware, one at a time is too slow */
+		if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
+			ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
+			cleaned_count = 0;
+		}
 
-		rmb(); /* read descriptor and rx_buffer_info after status DD */
+		ntc = rx_ring->next_to_clean;
+		rx_desc = IXGBE_RX_DESC(rx_ring, ntc);
+		rx_buffer = &rx_ring->rx_buffer_info[ntc];
 
-		rx_buffer_info = &rx_ring->rx_buffer_info[i];
+		if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_DD))
+			break;
 
-		skb = rx_buffer_info->skb;
-		rx_buffer_info->skb = NULL;
-		prefetch(skb->data);
+		/*
+		 * This memory barrier is needed to keep us from reading
+		 * any other fields out of the rx_desc until we know the
+		 * RXD_STAT_DD bit is set
+		 */
+		rmb();
 
-		/* linear means we are building an skb from multiple pages */
-		if (!skb_is_nonlinear(skb)) {
-			u16 hlen;
-			if (ring_is_ps_enabled(rx_ring)) {
-				hlen = ixgbe_get_hlen(rx_desc);
-				upper_len = le16_to_cpu(rx_desc->wb.upper.length);
-			} else {
-				hlen = le16_to_cpu(rx_desc->wb.upper.length);
+		page = rx_buffer->page;
+		prefetchw(page);
+
+		skb = rx_buffer->skb;
+
+		if (likely(!skb)) {
+			void *page_addr = page_address(page) +
+					  rx_buffer->page_offset;
+
+			/* prefetch first cache line of first page */
+			prefetch(page_addr);
+#if L1_CACHE_BYTES < 128
+			prefetch(page_addr + L1_CACHE_BYTES);
+#endif
+
+			/* allocate a skb to store the frags */
+			skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
+							IXGBE_RX_HDR_SIZE);
+			if (unlikely(!skb)) {
+				rx_ring->rx_stats.alloc_rx_buff_failed++;
+				break;
 			}
 
-			skb_put(skb, hlen);
+			/*
+			 * we will be copying header into skb->data in
+			 * pskb_may_pull so it is in our interest to prefetch
+			 * it now to avoid a possible cache miss
+			 */
+			prefetchw(skb->data);
 
 			/*
 			 * Delay unmapping of the first packet. It carries the
 			 * header information, HW may still access the header
-			 * after writeback.  Only unmap it when EOP is reached
+			 * after the writeback.  Only unmap it when EOP is
+			 * reached
 			 */
-			if (!IXGBE_CB(skb)->head) {
-				IXGBE_CB(skb)->delay_unmap = true;
-				IXGBE_CB(skb)->dma = rx_buffer_info->dma;
-			} else {
-				skb = ixgbe_merge_active_tail(skb);
-				dma_unmap_single(rx_ring->dev,
-						 rx_buffer_info->dma,
-						 rx_ring->rx_buf_len,
-						 DMA_FROM_DEVICE);
-			}
-			rx_buffer_info->dma = 0;
+			IXGBE_CB(skb)->dma = rx_buffer->dma;
 		} else {
-			/* assume packet split since header is unmapped */
-			upper_len = le16_to_cpu(rx_desc->wb.upper.length);
+			/* we are reusing so sync this buffer for CPU use */
+			dma_sync_single_range_for_cpu(rx_ring->dev,
+						      rx_buffer->dma,
+						      rx_buffer->page_offset,
+						      ixgbe_rx_bufsz(rx_ring),
+						      DMA_FROM_DEVICE);
 		}
 
-		if (upper_len) {
-			dma_unmap_page(rx_ring->dev,
-				       rx_buffer_info->page_dma,
-				       PAGE_SIZE / 2,
-				       DMA_FROM_DEVICE);
-			rx_buffer_info->page_dma = 0;
-			skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
-					   rx_buffer_info->page,
-					   rx_buffer_info->page_offset,
-					   upper_len);
-
-			if ((page_count(rx_buffer_info->page) == 1) &&
-			    (page_to_nid(rx_buffer_info->page) == current_node))
-				get_page(rx_buffer_info->page);
-			else
-				rx_buffer_info->page = NULL;
+		/* pull page into skb */
+		ixgbe_add_rx_frag(rx_ring, rx_buffer, skb,
+				  le16_to_cpu(rx_desc->wb.upper.length));
 
-			skb->len += upper_len;
-			skb->data_len += upper_len;
-			skb->truesize += PAGE_SIZE / 2;
+		if (ixgbe_can_reuse_page(rx_buffer)) {
+			/* hand second half of page back to the ring */
+			ixgbe_reuse_rx_page(rx_ring, rx_buffer);
+		} else if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
+			/* the page has been released from the ring */
+			IXGBE_CB(skb)->page_released = true;
+		} else {
+			/* we are not reusing the buffer so unmap it */
+			dma_unmap_page(rx_ring->dev, rx_buffer->dma,
+				       ixgbe_rx_pg_size(rx_ring),
+				       DMA_FROM_DEVICE);
 		}
 
-		ixgbe_get_rsc_cnt(rx_ring, rx_desc, skb);
+		/* clear contents of buffer_info */
+		rx_buffer->skb = NULL;
+		rx_buffer->dma = 0;
+		rx_buffer->page = NULL;
 
-		i++;
-		if (i == rx_ring->count)
-			i = 0;
+		ixgbe_get_rsc_cnt(rx_ring, rx_desc, skb);
 
-		next_rxd = IXGBE_RX_DESC(rx_ring, i);
-		prefetch(next_rxd);
 		cleaned_count++;
 
-		if ((!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))) {
-			struct ixgbe_rx_buffer *next_buffer;
-			u32 nextp;
-
-			if (IXGBE_CB(skb)->append_cnt) {
-				nextp = le32_to_cpu(
-						rx_desc->wb.upper.status_error);
-				nextp >>= IXGBE_RXDADV_NEXTP_SHIFT;
-			} else {
-				nextp = i;
-			}
-
-			next_buffer = &rx_ring->rx_buffer_info[nextp];
-
-			if (ring_is_ps_enabled(rx_ring)) {
-				rx_buffer_info->skb = next_buffer->skb;
-				rx_buffer_info->dma = next_buffer->dma;
-				next_buffer->skb = skb;
-				next_buffer->dma = 0;
-			} else {
-				struct sk_buff *next_skb = next_buffer->skb;
-				ixgbe_add_active_tail(skb, next_skb);
-				IXGBE_CB(next_skb)->head = skb;
-			}
-			rx_ring->rx_stats.non_eop_descs++;
-			goto next_desc;
-		}
-
-		dma_unmap_single(rx_ring->dev,
-				 IXGBE_CB(skb)->dma,
-				 rx_ring->rx_buf_len,
-				 DMA_FROM_DEVICE);
-		IXGBE_CB(skb)->dma = 0;
-		IXGBE_CB(skb)->delay_unmap = false;
-
-		if (ixgbe_close_active_frag_list(skb) &&
-		    !IXGBE_CB(skb)->append_cnt) {
-			/* if we got here without RSC the packet is invalid */
-			dev_kfree_skb_any(skb);
-			goto next_desc;
-		}
+		/* place incomplete frames back on ring for completion */
+		if (ixgbe_is_non_eop(rx_ring, rx_desc, skb))
+			continue;
 
-		/* ERR_MASK will only have valid bits if EOP set */
-		if (unlikely(ixgbe_test_staterr(rx_desc,
-					    IXGBE_RXDADV_ERR_FRAME_ERR_MASK) &&
-		    !(adapter->netdev->features & NETIF_F_RXALL))) {
-			dev_kfree_skb_any(skb);
-			goto next_desc;
-		}
+		/* verify the packet layout is correct */
+		if (ixgbe_cleanup_headers(rx_ring, rx_desc, skb))
+			continue;
 
 		/* probably a little skewed due to removing CRC */
 		total_rx_bytes += skb->len;
@@ -1631,32 +1712,16 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			ddp_bytes = ixgbe_fcoe_ddp(adapter, rx_desc, skb);
 			if (!ddp_bytes) {
 				dev_kfree_skb_any(skb);
-				goto next_desc;
+				continue;
 			}
 		}
+
 #endif /* IXGBE_FCOE */
 		ixgbe_rx_skb(q_vector, skb);
 
+		/* update budget accounting */
 		budget--;
-next_desc:
-		if (!budget)
-			break;
-
-		/* return some buffers to hardware, one at a time is too slow */
-		if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
-			ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
-			cleaned_count = 0;
-		}
-
-		/* use prefetched values */
-		rx_desc = next_rxd;
-	}
-
-	rx_ring->next_to_clean = i;
-	cleaned_count = ixgbe_desc_unused(rx_ring);
-
-	if (cleaned_count)
-		ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
+	} while (likely(budget));
 
 #ifdef IXGBE_FCOE
 	/* include DDPed FCoE data */
@@ -1671,8 +1736,8 @@ next_desc:
 		total_rx_bytes += ddp_bytes;
 		total_rx_packets += DIV_ROUND_UP(ddp_bytes, mss);
 	}
-#endif /* IXGBE_FCOE */
 
+#endif /* IXGBE_FCOE */
 	u64_stats_update_begin(&rx_ring->syncp);
 	rx_ring->stats.packets += total_rx_packets;
 	rx_ring->stats.bytes += total_rx_bytes;
@@ -1680,6 +1745,9 @@ next_desc:
 	q_vector->rx.total_packets += total_rx_packets;
 	q_vector->rx.total_bytes += total_rx_bytes;
 
+	if (cleaned_count)
+		ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
+
 	return !!budget;
 }
 
@@ -2635,18 +2703,12 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter,
 	srrctl |= (IXGBE_RX_HDR_SIZE << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT) &
 		  IXGBE_SRRCTL_BSIZEHDR_MASK;
 
-	if (ring_is_ps_enabled(rx_ring)) {
-#if (PAGE_SIZE / 2) > IXGBE_MAX_RXBUFFER
-		srrctl |= IXGBE_MAX_RXBUFFER >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
+#if PAGE_SIZE > IXGBE_MAX_RXBUFFER
+	srrctl |= IXGBE_MAX_RXBUFFER >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
 #else
-		srrctl |= (PAGE_SIZE / 2) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
+	srrctl |= ixgbe_rx_bufsz(rx_ring) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
 #endif
-		srrctl |= IXGBE_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
-	} else {
-		srrctl |= ALIGN(rx_ring->rx_buf_len, 1024) >>
-			  IXGBE_SRRCTL_BSIZEPKT_SHIFT;
-		srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
-	}
+	srrctl |= IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
 
 	IXGBE_WRITE_REG(&adapter->hw, IXGBE_SRRCTL(reg_idx), srrctl);
 }
@@ -2729,13 +2791,11 @@ static void ixgbe_configure_rscctl(struct ixgbe_adapter *adapter,
 {
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 rscctrl;
-	int rx_buf_len;
 	u8 reg_idx = ring->reg_idx;
 
 	if (!ring_is_rsc_enabled(ring))
 		return;
 
-	rx_buf_len = ring->rx_buf_len;
 	rscctrl = IXGBE_READ_REG(hw, IXGBE_RSCCTL(reg_idx));
 	rscctrl |= IXGBE_RSCCTL_RSCEN;
 	/*
@@ -2743,24 +2803,13 @@ static void ixgbe_configure_rscctl(struct ixgbe_adapter *adapter,
 	 * total size of max desc * buf_len is not greater
 	 * than 65536
 	 */
-	if (ring_is_ps_enabled(ring)) {
-#if (PAGE_SIZE < 8192)
-		rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
-#elif (PAGE_SIZE < 16384)
-		rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
-#elif (PAGE_SIZE < 32768)
-		rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
+#if (PAGE_SIZE <= 8192)
+	rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
+#elif (PAGE_SIZE <= 16384)
+	rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
 #else
-		rscctrl |= IXGBE_RSCCTL_MAXDESC_1;
+	rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
 #endif
-	} else {
-		if (rx_buf_len <= IXGBE_RXBUFFER_4K)
-			rscctrl |= IXGBE_RSCCTL_MAXDESC_16;
-		else if (rx_buf_len <= IXGBE_RXBUFFER_8K)
-			rscctrl |= IXGBE_RSCCTL_MAXDESC_8;
-		else
-			rscctrl |= IXGBE_RSCCTL_MAXDESC_4;
-	}
 	IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(reg_idx), rscctrl);
 }
 
@@ -2977,23 +3026,10 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct net_device *netdev = adapter->netdev;
 	int max_frame = netdev->mtu + ETH_HLEN + ETH_FCS_LEN;
-	int rx_buf_len;
 	struct ixgbe_ring *rx_ring;
 	int i;
 	u32 mhadd, hlreg0;
 
-	/* Decide whether to use packet split mode or not */
-	/* On by default */
-	adapter->flags |= IXGBE_FLAG_RX_PS_ENABLED;
-
-	/* Do not use packet split if we're in SR-IOV Mode */
-	if (adapter->num_vfs)
-		adapter->flags &= ~IXGBE_FLAG_RX_PS_ENABLED;
-
-	/* Disable packet split due to 82599 erratum #45 */
-	if (hw->mac.type == ixgbe_mac_82599EB)
-		adapter->flags &= ~IXGBE_FLAG_RX_PS_ENABLED;
-
 #ifdef IXGBE_FCOE
 	/* adjust max frame to be able to do baby jumbo for FCoE */
 	if ((adapter->flags & IXGBE_FLAG_FCOE_ENABLED) &&
@@ -3012,27 +3048,6 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 	/* MHADD will allow an extra 4 bytes past for vlan tagged frames */
 	max_frame += VLAN_HLEN;
 
-	/* Set the RX buffer length according to the mode */
-	if (adapter->flags & IXGBE_FLAG_RX_PS_ENABLED) {
-		rx_buf_len = IXGBE_RX_HDR_SIZE;
-	} else {
-		if (!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) &&
-		    (netdev->mtu <= ETH_DATA_LEN))
-			rx_buf_len = MAXIMUM_ETHERNET_VLAN_SIZE;
-		/*
-		 * Make best use of allocation by using all but 1K of a
-		 * power of 2 allocation that will be used for skb->head.
-		 */
-		else if (max_frame <= IXGBE_RXBUFFER_3K)
-			rx_buf_len = IXGBE_RXBUFFER_3K;
-		else if (max_frame <= IXGBE_RXBUFFER_7K)
-			rx_buf_len = IXGBE_RXBUFFER_7K;
-		else if (max_frame <= IXGBE_RXBUFFER_15K)
-			rx_buf_len = IXGBE_RXBUFFER_15K;
-		else
-			rx_buf_len = IXGBE_MAX_RXBUFFER;
-	}
-
 	hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
 	/* set jumbo enable since MHADD.MFS is keeping size locked at max_frame */
 	hlreg0 |= IXGBE_HLREG0_JUMBOEN;
@@ -3044,32 +3059,16 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 	 */
 	for (i = 0; i < adapter->num_rx_queues; i++) {
 		rx_ring = adapter->rx_ring[i];
-		rx_ring->rx_buf_len = rx_buf_len;
-
-		if (adapter->flags & IXGBE_FLAG_RX_PS_ENABLED)
-			set_ring_ps_enabled(rx_ring);
-		else
-			clear_ring_ps_enabled(rx_ring);
-
 		if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
 			set_ring_rsc_enabled(rx_ring);
 		else
 			clear_ring_rsc_enabled(rx_ring);
-
 #ifdef IXGBE_FCOE
 		if (netdev->features & NETIF_F_FCOE_MTU) {
 			struct ixgbe_ring_feature *f;
 			f = &adapter->ring_feature[RING_F_FCOE];
-			if ((i >= f->mask) && (i < f->mask + f->indices)) {
-				clear_ring_ps_enabled(rx_ring);
-				if (rx_buf_len < IXGBE_FCOE_JUMBO_FRAME_SIZE)
-					rx_ring->rx_buf_len =
-						IXGBE_FCOE_JUMBO_FRAME_SIZE;
-			} else if (!ring_is_rsc_enabled(rx_ring) &&
-				   !ring_is_ps_enabled(rx_ring)) {
-				rx_ring->rx_buf_len =
-						IXGBE_FCOE_JUMBO_FRAME_SIZE;
-			}
+			if ((i >= f->mask) && (i < f->mask + f->indices))
+				set_bit(__IXGBE_RX_FCOE_BUFSZ, &rx_ring->state);
 		}
 #endif /* IXGBE_FCOE */
 	}
@@ -3991,6 +3990,27 @@ void ixgbe_reset(struct ixgbe_adapter *adapter)
 }
 
 /**
+ * ixgbe_init_rx_page_offset - initialize page offset values for Rx buffers
+ * @rx_ring: ring to setup
+ *
+ * On many IA platforms the L1 cache has a critical stride of 4K, this
+ * results in each receive buffer starting in the same cache set.  To help
+ * reduce the pressure on this cache set we can interleave the offsets so
+ * that only every other buffer will be in the same cache set.
+ **/
+static void ixgbe_init_rx_page_offset(struct ixgbe_ring *rx_ring)
+{
+	struct ixgbe_rx_buffer *rx_buffer = rx_ring->rx_buffer_info;
+	u16 i;
+
+	for (i = 0; i < rx_ring->count; i += 2) {
+		rx_buffer[0].page_offset = 0;
+		rx_buffer[1].page_offset = ixgbe_rx_bufsz(rx_ring);
+		rx_buffer = &rx_buffer[2];
+	}
+}
+
+/**
  * ixgbe_clean_rx_ring - Free Rx Buffers per Queue
  * @rx_ring: ring to free buffers from
  **/
@@ -4006,49 +4026,40 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 
 	/* Free all the Rx ring sk_buffs */
 	for (i = 0; i < rx_ring->count; i++) {
-		struct ixgbe_rx_buffer *rx_buffer_info;
-
-		rx_buffer_info = &rx_ring->rx_buffer_info[i];
-		if (rx_buffer_info->dma) {
-			dma_unmap_single(rx_ring->dev, rx_buffer_info->dma,
-					 rx_ring->rx_buf_len,
-					 DMA_FROM_DEVICE);
-			rx_buffer_info->dma = 0;
-		}
-		if (rx_buffer_info->skb) {
-			struct sk_buff *skb = rx_buffer_info->skb;
-			rx_buffer_info->skb = NULL;
-			/* We need to clean up RSC frag lists */
-			skb = ixgbe_merge_active_tail(skb);
-			ixgbe_close_active_frag_list(skb);
-			if (IXGBE_CB(skb)->delay_unmap) {
-				dma_unmap_single(dev,
-						 IXGBE_CB(skb)->dma,
-						 rx_ring->rx_buf_len,
-						 DMA_FROM_DEVICE);
-				IXGBE_CB(skb)->dma = 0;
-				IXGBE_CB(skb)->delay_unmap = false;
+		struct ixgbe_rx_buffer *rx_buffer;
+
+		rx_buffer = &rx_ring->rx_buffer_info[i];
+		if (rx_buffer->skb) {
+			struct sk_buff *skb = rx_buffer->skb;
+			if (IXGBE_CB(skb)->page_released) {
+				dma_unmap_page(dev,
+					       IXGBE_CB(skb)->dma,
+					       ixgbe_rx_bufsz(rx_ring),
+					       DMA_FROM_DEVICE);
+				IXGBE_CB(skb)->page_released = false;
 			}
 			dev_kfree_skb(skb);
 		}
-		if (!rx_buffer_info->page)
-			continue;
-		if (rx_buffer_info->page_dma) {
-			dma_unmap_page(dev, rx_buffer_info->page_dma,
-				       PAGE_SIZE / 2, DMA_FROM_DEVICE);
-			rx_buffer_info->page_dma = 0;
-		}
-		put_page(rx_buffer_info->page);
-		rx_buffer_info->page = NULL;
-		rx_buffer_info->page_offset = 0;
+		rx_buffer->skb = NULL;
+		if (rx_buffer->dma)
+			dma_unmap_page(dev, rx_buffer->dma,
+				       ixgbe_rx_pg_size(rx_ring),
+				       DMA_FROM_DEVICE);
+		rx_buffer->dma = 0;
+		if (rx_buffer->page)
+			put_page(rx_buffer->page);
+		rx_buffer->page = NULL;
 	}
 
 	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 	memset(rx_ring->rx_buffer_info, 0, size);
 
+	ixgbe_init_rx_page_offset(rx_ring);
+
 	/* Zero out the descriptor ring */
 	memset(rx_ring->desc, 0, rx_ring->size);
 
+	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 }
@@ -5412,6 +5423,8 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
+	ixgbe_init_rx_page_offset(rx_ring);
+
 	return 0;
 err:
 	vfree(rx_ring->rx_buffer_info);
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ