Message-ID: <1335448252.2775.41.camel@edumazet-glaptop>
Date:	Thu, 26 Apr 2012 15:50:52 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Maciej Żenczykowski <maze@...gle.com>
Cc:	David Miller <davem@...emloft.net>, ilpo.jarvinen@...sinki.fi,
	rick.jones2@...com, netdev@...r.kernel.org, therbert@...gle.com,
	ncardwell@...gle.com, ycheng@...gle.com
Subject: Re: [RFC] allow skb->head to point/alias to first skb frag

On Thu, 2012-04-26 at 14:32 +0200, Eric Dumazet wrote:
> On Thu, 2012-04-26 at 03:09 -0700, Maciej Żenczykowski wrote:
> > it would be very useful if there was an api for freeing skb->head memory...
> > 
> > I've fooled around with a single memory buffer for both the head and
> > the data and seen huge performance wins under heavy packet load (half
> > the amount of malloc/free), but could never quite get it to be 100%
> > stable.
> 
> My patch is almost ready, and seems very good.
> 
> I added a parameter to build_skb() to tell it the memory comes from a (sub)page.
> 
> tg3 uses the new parameter.
> 
> GRO does the correct merging.
> 
> (By the way, I noticed GRO understates skb truesize, since it accumulates
> frag lengths instead of allocated space, giving a 1448/2048 mismatch for
> MTU=1500.)
> 
> And this apparently solves the SLUB performance issue I had on my dual
> quad-core machine, since I no longer hit the SLUB slow path.
> 
> 
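
To put numbers on the truesize remark above: each merged segment adds its
1448-byte payload to truesize but actually sits in a 2048-byte receive buffer,
so an skb coalesced from, say, 16 such segments is accounted 16 * 1448 = 23168
bytes while really pinning 16 * 2048 = 32768 bytes, i.e. it holds roughly 40%
more memory than its truesize claims.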

Here is the raw patch. (The tg3 part is dirty and needs some helpers.)

I am working on the TCP coalesce part and the splice() helpers (to avoid a
copy), and on splitting this into five separate patches for further
discussion/inclusion.
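
For discussion, here is a rough userspace sketch of the sub-page carving that
the tg3 hunk below relies on: one page is split into 2048-byte fragments, the
page reference count is raised once per fragment handed out, and freeing a
fragment is just dropping a page reference. The real code uses
alloc_page()/put_page() and finds the owning page with virt_to_head_page();
the mock_page/frag_pool/frag_alloc/frag_put names below are illustrative only,
not part of the patch.

#include <stdio.h>
#include <stdlib.h>

#define MOCK_PAGE_SIZE 4096
#define FRAG_SIZE      2048

struct mock_page {
	int refcount;                   /* stands in for page->_count */
	char data[MOCK_PAGE_SIZE];
};

struct frag_pool {
	struct mock_page *page;         /* page currently being carved */
	size_t remaining;               /* bytes still unused in it */
};

/* Hand out the next 2048-byte fragment, grabbing a fresh page when needed. */
static void *frag_alloc(struct frag_pool *pool)
{
	void *frag;

	if (pool->remaining < FRAG_SIZE) {
		struct mock_page *page = malloc(sizeof(*page));

		if (!page)
			return NULL;
		/* One reference per fragment the page will yield; the patch
		 * gets there with atomic_add(PAGE_SIZE / 2048 - 1, &page->_count)
		 * on top of the reference alloc_page() already took.
		 */
		page->refcount = MOCK_PAGE_SIZE / FRAG_SIZE;
		pool->page = page;
		pool->remaining = MOCK_PAGE_SIZE;
	}
	frag = pool->page->data + (MOCK_PAGE_SIZE - pool->remaining);
	pool->remaining -= FRAG_SIZE;
	return frag;
}

/* Mock of put_page(virt_to_head_page(data)): drop one reference and free the
 * page once the last fragment is gone. The mock has no virt_to_head_page(),
 * so callers remember the owning page themselves.
 */
static void frag_put(struct mock_page *page)
{
	if (--page->refcount == 0)
		free(page);
}

int main(void)
{
	struct frag_pool pool = { NULL, 0 };
	struct mock_page *owner;
	void *a, *b;

	a = frag_alloc(&pool);
	b = frag_alloc(&pool);
	if (!a || !b)
		return 1;
	owner = pool.page;
	printf("two frags carved from one page: %p and %p\n", a, b);

	frag_put(owner);        /* first consumer done */
	frag_put(owner);        /* last reference: page freed */
	return 0;
}

When such a fragment is handed to build_skb(data, 2048), skb->head_frag gets
set and skb_free_head() does a put_page() instead of kfree(), which is also
what lets skb_gro_receive() turn the head into a page frag of the aggregated
skb.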


 drivers/net/ethernet/broadcom/bnx2.c            |    2 
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |    4 
 drivers/net/ethernet/broadcom/tg3.c             |   61 +++++++++++---
 drivers/net/ethernet/broadcom/tg3.h             |    2 
 include/linux/skbuff.h                          |    7 +
 net/core/dev.c                                  |    5 -
 net/core/skbuff.c                               |   46 ++++++++--
 net/ipv4/tcp_input.c                            |    1 
 8 files changed, 103 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index ab55979..ac7b744 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -3006,7 +3006,7 @@ error:
 
 	dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
 			 PCI_DMA_FROMDEVICE);
-	skb = build_skb(data);
+	skb = build_skb(data, 0);
 	if (!skb) {
 		kfree(data);
 		goto error;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 60d5b54..9c5bc5d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -513,7 +513,7 @@ static inline void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
 			 fp->rx_buf_size, DMA_FROM_DEVICE);
 	if (likely(new_data))
-		skb = build_skb(data);
+		skb = build_skb(data, 0);
 
 	if (likely(skb)) {
 #ifdef BNX2X_STOP_ON_ERROR
@@ -721,7 +721,7 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 						 dma_unmap_addr(rx_buf, mapping),
 						 fp->rx_buf_size,
 						 DMA_FROM_DEVICE);
-				skb = build_skb(data);
+				skb = build_skb(data, 0);
 				if (unlikely(!skb)) {
 					kfree(data);
 					fp->eth_q_stats.rx_skb_alloc_failed++;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 0c3e7c7..d216076 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -5619,12 +5619,18 @@ static void tg3_tx(struct tg3_napi *tnapi)
 
 static void tg3_rx_data_free(struct tg3 *tp, struct ring_info *ri, u32 map_sz)
 {
+	unsigned int skb_size = SKB_DATA_ALIGN(map_sz + TG3_RX_OFFSET(tp)) +
+		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
 	if (!ri->data)
 		return;
 
 	pci_unmap_single(tp->pdev, dma_unmap_addr(ri, mapping),
 			 map_sz, PCI_DMA_FROMDEVICE);
-	kfree(ri->data);
+	if (skb_size <= 2048)
+		put_page(virt_to_head_page(ri->data));
+	else
+		kfree(ri->data);
 	ri->data = NULL;
 }
 
@@ -5640,7 +5646,8 @@ static void tg3_rx_data_free(struct tg3 *tp, struct ring_info *ri, u32 map_sz)
  * (to fetch the error flags, vlan tag, checksum, and opaque cookie).
  */
 static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
-			    u32 opaque_key, u32 dest_idx_unmasked)
+			     u32 opaque_key, u32 dest_idx_unmasked,
+			     unsigned int *frag_size)
 {
 	struct tg3_rx_buffer_desc *desc;
 	struct ring_info *map;
@@ -5675,16 +5682,36 @@ static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
 	 */
 	skb_size = SKB_DATA_ALIGN(data_size + TG3_RX_OFFSET(tp)) +
 		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	data = kmalloc(skb_size, GFP_ATOMIC);
-	if (!data)
-		return -ENOMEM;
-
+	if (skb_size <= 2048) {
+		if (tpr->rx_page_size < 2048) {
+			struct page *page = alloc_page(GFP_ATOMIC);
+			if (!page)
+				return -ENOMEM;
+			atomic_add((PAGE_SIZE / 2048) - 1, &page->_count); 
+			tpr->rx_page_addr = page_address(page);
+			tpr->rx_page_size = PAGE_SIZE;
+		}
+		data = tpr->rx_page_addr;
+		tpr->rx_page_addr += 2048;
+		tpr->rx_page_size -= 2048;
+		*frag_size = 2048;
+	} else {
+		data = kmalloc(skb_size, GFP_ATOMIC);
+		if (!data)
+			return -ENOMEM;
+		*frag_size = 0;
+	}
 	mapping = pci_map_single(tp->pdev,
 				 data + TG3_RX_OFFSET(tp),
 				 data_size,
 				 PCI_DMA_FROMDEVICE);
 	if (pci_dma_mapping_error(tp->pdev, mapping)) {
-		kfree(data);
+		if (skb_size <= 2048) {
+			tpr->rx_page_addr -= 2048;
+			tpr->rx_page_size += 2048;
+		} else {
+			kfree(data);
+		}
 		return -EIO;
 	}
 
@@ -5835,18 +5862,22 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
 
 		if (len > TG3_RX_COPY_THRESH(tp)) {
 			int skb_size;
+			unsigned int frag_size;
 
 			skb_size = tg3_alloc_rx_data(tp, tpr, opaque_key,
-						    *post_ptr);
+						    *post_ptr, &frag_size);
 			if (skb_size < 0)
 				goto drop_it;
 
 			pci_unmap_single(tp->pdev, dma_addr, skb_size,
 					 PCI_DMA_FROMDEVICE);
 
-			skb = build_skb(data);
+			skb = build_skb(data, frag_size);
 			if (!skb) {
-				kfree(data);
+				if (frag_size)
+					put_page(virt_to_head_page(data));
+				else
+					kfree(data);
 				goto drop_it_no_recycle;
 			}
 			skb_reserve(skb, TG3_RX_OFFSET(tp));
@@ -7279,7 +7310,10 @@ static int tg3_rx_prodring_alloc(struct tg3 *tp,
 
 	/* Now allocate fresh SKBs for each rx ring. */
 	for (i = 0; i < tp->rx_pending; i++) {
-		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_STD, i) < 0) {
+		unsigned int frag_size;
+
+		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_STD, i,
+				      &frag_size) < 0) {
 			netdev_warn(tp->dev,
 				    "Using a smaller RX standard ring. Only "
 				    "%d out of %d buffers were allocated "
@@ -7311,7 +7345,10 @@ static int tg3_rx_prodring_alloc(struct tg3 *tp,
 	}
 
 	for (i = 0; i < tp->rx_jumbo_pending; i++) {
-		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_JUMBO, i) < 0) {
+		unsigned int frag_size;
+
+		if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_JUMBO, i,
+				      &frag_size) < 0) {
 			netdev_warn(tp->dev,
 				    "Using a smaller RX jumbo ring. Only %d "
 				    "out of %d buffers were allocated "
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 93865f8..7c85545 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -2815,6 +2815,8 @@ struct tg3_rx_prodring_set {
 	struct ring_info		*rx_jmb_buffers;
 	dma_addr_t			rx_std_mapping;
 	dma_addr_t			rx_jmb_mapping;
+	void				*rx_page_addr;
+	unsigned int			rx_page_size;
 };
 
 #define TG3_IRQ_MAX_VECS_RSS		5
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4a656b5..1161111 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,8 @@ struct sk_buff {
 	__u8			wifi_acked_valid:1;
 	__u8			wifi_acked:1;
 	__u8			no_fcs:1;
-	/* 9/11 bit hole (depending on ndisc_nodetype presence) */
+	__u8			head_frag:1;
+	/* 8/10 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
 #ifdef CONFIG_NET_DMA
@@ -558,11 +559,13 @@ static inline struct rtable *skb_rtable(const struct sk_buff *skb)
 }
 
 extern void kfree_skb(struct sk_buff *skb);
+extern struct kmem_cache *skbuff_head_cache;
+
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
 				   gfp_t priority, int fclone, int node);
-extern struct sk_buff *build_skb(void *data);
+extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 501f3cc..9537321 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3546,7 +3546,10 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 		break;
 
 	case GRO_MERGED_FREE:
-		consume_skb(skb);
+		if (skb->head_frag)
+			kmem_cache_free(skbuff_head_cache, skb);
+		else
+			consume_skb(skb);
 		break;
 
 	case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2342a72..d8fb4b2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -69,7 +69,7 @@
 #include <trace/events/skb.h>
 #include <linux/highmem.h>
 
-static struct kmem_cache *skbuff_head_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
 
 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -245,6 +245,7 @@ EXPORT_SYMBOL(__alloc_skb);
 /**
  * build_skb - build a network buffer
  * @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
  *
  * Allocate a new &sk_buff. Caller provides space holding head and
  * skb_shared_info. @data must have been allocated by kmalloc()
@@ -258,20 +259,21 @@ EXPORT_SYMBOL(__alloc_skb);
  *  before giving packet to stack.
  *  RX rings only contains data buffers, not full skbs.
  */
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
 {
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
-	unsigned int size;
+	unsigned int size = frag_size ? : ksize(data);
 
 	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
 	if (!skb)
 		return NULL;
 
-	size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = SKB_TRUESIZE(size);
+	skb->head_frag = frag_size != 0;
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
@@ -376,6 +378,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
+static void skb_free_head(struct sk_buff *skb)
+{
+	if (skb->head_frag)
+		put_page(virt_to_head_page(skb->head));
+	else
+		kfree(skb->head);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
@@ -402,7 +412,7 @@ static void skb_release_data(struct sk_buff *skb)
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
-		kfree(skb->head);
+		skb_free_head(skb);
 	}
 }
 
@@ -644,6 +654,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(tail);
 	C(end);
 	C(head);
+	C(head_frag);
 	C(data);
 	C(truesize);
 	atomic_set(&n->users, 1);
@@ -940,7 +951,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
 	}
 
-	if (fastpath &&
+	if (fastpath && !skb->head_frag &&
 	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
 		memmove(skb->head + size, skb_shinfo(skb),
 			offsetof(struct skb_shared_info,
@@ -967,7 +978,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
 	if (fastpath) {
-		kfree(skb->head);
+		skb_free_head(skb);
 	} else {
 		/* copy this zero copy skb frags */
 		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
@@ -985,6 +996,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
+	skb->head_frag = 0;
 adjust_others:
 	skb->data    += off;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
@@ -2889,6 +2901,26 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 		NAPI_GRO_CB(skb)->free = 1;
 		goto done;
+	} else if (skb->head_frag) {
+		int nr_frags = pinfo->nr_frags;
+		skb_frag_t *frag = pinfo->frags + nr_frags;
+		struct page *page = virt_to_head_page(skb->head);
+
+
+		if (nr_frags >= MAX_SKB_FRAGS)
+			return -E2BIG;
+		offset -= headlen;
+		offset += skb->head - (unsigned char *)page_address(page);
+
+		pinfo->nr_frags = nr_frags + 1;
+
+		frag->page.p	  = page;
+		frag->page_offset = offset;
+		skb_frag_size_set(frag, len);
+
+		/* note : napi_skb_finish() will free skb, not skb->head */
+		NAPI_GRO_CB(skb)->free = 1;
+		goto done;
 	} else if (skb_gro_len(p) != pinfo->gso_size)
 		return -E2BIG;
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c93b0cb..69d379a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4478,6 +4478,7 @@ merge:
 		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
 		return true;
 	}
+	/* TODO : handle special case where from->head_frag is true */
 	if (skb_headlen(from) == 0 &&
 	    !skb_has_frag_list(to) &&
 	    !skb_has_frag_list(from) &&


