[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1335448252.2775.41.camel@edumazet-glaptop>
Date: Thu, 26 Apr 2012 15:50:52 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Maciej Żenczykowski <maze@...gle.com>
Cc: David Miller <davem@...emloft.net>, ilpo.jarvinen@...sinki.fi,
rick.jones2@...com, netdev@...r.kernel.org, therbert@...gle.com,
ncardwell@...gle.com, ycheng@...gle.com
Subject: Re: [RFC] allow skb->head to point/alias to first skb frag
On Thu, 2012-04-26 at 14:32 +0200, Eric Dumazet wrote:
> On Thu, 2012-04-26 at 03:09 -0700, Maciej Żenczykowski wrote:
> > it would be very useful if there was an api for freeing skb->head memory...
> >
> > I've fooled around with a single memory buffer for both the head and
> > the data and seen huge performance wins under heavy packet load (half
> > the amount of malloc/free), but could never quite get it to be 100%
> > stable.
>
> My patch is almost ready, and seems very good.
>
> I added a param to skb_build() to tell the memory comes from a (sub)page
>
> tg3 uses the new thing.
>
> GRO does the correct merging.
>
> (by the way, I noticed GRO lies about skb truesize... since it
> accumulates frag lengths instead of allocated space... with a 1448/2048
> mismatch for MTU=1500)
>
> And this apparently solves my slub performance issue I had on my dual
> quad core machine, since I no longer hit slub slow path.
>
>
Here is the raw patch. (tg3 part is dirty and needs some helpers)
I am working on the TCP coalesce part, the splice() helpers (to avoid a
copy), and split in five different patches for further
discussion/inclusion.
drivers/net/ethernet/broadcom/bnx2.c | 2
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4
drivers/net/ethernet/broadcom/tg3.c | 61 +++++++++++---
drivers/net/ethernet/broadcom/tg3.h | 2
include/linux/skbuff.h | 7 +
net/core/dev.c | 5 -
net/core/skbuff.c | 46 ++++++++--
net/ipv4/tcp_input.c | 1
8 files changed, 103 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index ab55979..ac7b744 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -3006,7 +3006,7 @@ error:
dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
PCI_DMA_FROMDEVICE);
- skb = build_skb(data);
+ skb = build_skb(data, 0);
if (!skb) {
kfree(data);
goto error;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 60d5b54..9c5bc5d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -513,7 +513,7 @@ static inline void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
fp->rx_buf_size, DMA_FROM_DEVICE);
if (likely(new_data))
- skb = build_skb(data);
+ skb = build_skb(data, 0);
if (likely(skb)) {
#ifdef BNX2X_STOP_ON_ERROR
@@ -721,7 +721,7 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
dma_unmap_addr(rx_buf, mapping),
fp->rx_buf_size,
DMA_FROM_DEVICE);
- skb = build_skb(data);
+ skb = build_skb(data, 0);
if (unlikely(!skb)) {
kfree(data);
fp->eth_q_stats.rx_skb_alloc_failed++;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 0c3e7c7..d216076 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -5619,12 +5619,18 @@ static void tg3_tx(struct tg3_napi *tnapi)
static void tg3_rx_data_free(struct tg3 *tp, struct ring_info *ri, u32 map_sz)
{
+ unsigned int skb_size = SKB_DATA_ALIGN(map_sz + TG3_RX_OFFSET(tp)) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
if (!ri->data)
return;
pci_unmap_single(tp->pdev, dma_unmap_addr(ri, mapping),
map_sz, PCI_DMA_FROMDEVICE);
- kfree(ri->data);
+ if (skb_size <= 2048)
+ put_page(virt_to_head_page(ri->data));
+ else
+ kfree(ri->data);
ri->data = NULL;
}
@@ -5640,7 +5646,8 @@ static void tg3_rx_data_free(struct tg3 *tp, struct ring_info *ri, u32 map_sz)
* (to fetch the error flags, vlan tag, checksum, and opaque cookie).
*/
static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
- u32 opaque_key, u32 dest_idx_unmasked)
+ u32 opaque_key, u32 dest_idx_unmasked,
+ unsigned int *frag_size)
{
struct tg3_rx_buffer_desc *desc;
struct ring_info *map;
@@ -5675,16 +5682,36 @@ static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
*/
skb_size = SKB_DATA_ALIGN(data_size + TG3_RX_OFFSET(tp)) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc(skb_size, GFP_ATOMIC);
- if (!data)
- return -ENOMEM;
-
+ if (skb_size <= 2048) {
+ if (tpr->rx_page_size < 2048) {
+ struct page *page = alloc_page(GFP_ATOMIC);
+ if (!page)
+ return -ENOMEM;
+ atomic_add((PAGE_SIZE / 2048) - 1, &page->_count);
+ tpr->rx_page_addr = page_address(page);
+ tpr->rx_page_size = PAGE_SIZE;
+ }
+ data = tpr->rx_page_addr;
+ tpr->rx_page_addr += 2048;
+ tpr->rx_page_size -= 2048;
+ *frag_size = 2048;
+ } else {
+ data = kmalloc(skb_size, GFP_ATOMIC);
+ if (!data)
+ return -ENOMEM;
+ *frag_size = 0;
+ }
mapping = pci_map_single(tp->pdev,
data + TG3_RX_OFFSET(tp),
data_size,
PCI_DMA_FROMDEVICE);
if (pci_dma_mapping_error(tp->pdev, mapping)) {
- kfree(data);
+ if (skb_size <= 2048) {
+ tpr->rx_page_addr -= 2048;
+ tpr->rx_page_size += 2048;
+ } else {
+ kfree(data);
+ }
return -EIO;
}
@@ -5835,18 +5862,22 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
if (len > TG3_RX_COPY_THRESH(tp)) {
int skb_size;
+ unsigned int frag_size;
skb_size = tg3_alloc_rx_data(tp, tpr, opaque_key,
- *post_ptr);
+ *post_ptr, &frag_size);
if (skb_size < 0)
goto drop_it;
pci_unmap_single(tp->pdev, dma_addr, skb_size,
PCI_DMA_FROMDEVICE);
- skb = build_skb(data);
+ skb = build_skb(data, frag_size);
if (!skb) {
- kfree(data);
+ if (frag_size)
+ put_page(virt_to_head_page(data));
+ else
+ kfree(data);
goto drop_it_no_recycle;
}
skb_reserve(skb, TG3_RX_OFFSET(tp));
@@ -7279,7 +7310,10 @@ static int tg3_rx_prodring_alloc(struct tg3 *tp,
/* Now allocate fresh SKBs for each rx ring. */
for (i = 0; i < tp->rx_pending; i++) {
- if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_STD, i) < 0) {
+ unsigned int frag_size;
+
+ if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_STD, i,
+ &frag_size) < 0) {
netdev_warn(tp->dev,
"Using a smaller RX standard ring. Only "
"%d out of %d buffers were allocated "
@@ -7311,7 +7345,10 @@ static int tg3_rx_prodring_alloc(struct tg3 *tp,
}
for (i = 0; i < tp->rx_jumbo_pending; i++) {
- if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_JUMBO, i) < 0) {
+ unsigned int frag_size;
+
+ if (tg3_alloc_rx_data(tp, tpr, RXD_OPAQUE_RING_JUMBO, i,
+ &frag_size) < 0) {
netdev_warn(tp->dev,
"Using a smaller RX jumbo ring. Only %d "
"out of %d buffers were allocated "
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 93865f8..7c85545 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -2815,6 +2815,8 @@ struct tg3_rx_prodring_set {
struct ring_info *rx_jmb_buffers;
dma_addr_t rx_std_mapping;
dma_addr_t rx_jmb_mapping;
+ void *rx_page_addr;
+ unsigned int rx_page_size;
};
#define TG3_IRQ_MAX_VECS_RSS 5
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4a656b5..1161111 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,8 @@ struct sk_buff {
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
- /* 9/11 bit hole (depending on ndisc_nodetype presence) */
+ __u8 head_frag:1;
+ /* 8/10 bit hole (depending on ndisc_nodetype presence) */
kmemcheck_bitfield_end(flags2);
#ifdef CONFIG_NET_DMA
@@ -558,11 +559,13 @@ static inline struct rtable *skb_rtable(const struct sk_buff *skb)
}
extern void kfree_skb(struct sk_buff *skb);
+extern struct kmem_cache *skbuff_head_cache;
+
extern void consume_skb(struct sk_buff *skb);
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
gfp_t priority, int fclone, int node);
-extern struct sk_buff *build_skb(void *data);
+extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index 501f3cc..9537321 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3546,7 +3546,10 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
break;
case GRO_MERGED_FREE:
- consume_skb(skb);
+ if (skb->head_frag)
+ kmem_cache_free(skbuff_head_cache, skb);
+ else
+ consume_skb(skb);
break;
case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2342a72..d8fb4b2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -69,7 +69,7 @@
#include <trace/events/skb.h>
#include <linux/highmem.h>
-static struct kmem_cache *skbuff_head_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -245,6 +245,7 @@ EXPORT_SYMBOL(__alloc_skb);
/**
* build_skb - build a network buffer
* @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
*
* Allocate a new &sk_buff. Caller provides space holding head and
* skb_shared_info. @data must have been allocated by kmalloc()
@@ -258,20 +259,21 @@ EXPORT_SYMBOL(__alloc_skb);
* before giving packet to stack.
* RX rings only contains data buffers, not full skbs.
*/
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
struct skb_shared_info *shinfo;
struct sk_buff *skb;
- unsigned int size;
+ unsigned int size = frag_size ? : ksize(data);
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
return NULL;
- size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
memset(skb, 0, offsetof(struct sk_buff, tail));
skb->truesize = SKB_TRUESIZE(size);
+ skb->head_frag = frag_size != 0;
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -376,6 +378,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+static void skb_free_head(struct sk_buff *skb)
+{
+ if (skb->head_frag)
+ put_page(virt_to_head_page(skb->head));
+ else
+ kfree(skb->head);
+}
+
static void skb_release_data(struct sk_buff *skb)
{
if (!skb->cloned ||
@@ -402,7 +412,7 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
- kfree(skb->head);
+ skb_free_head(skb);
}
}
@@ -644,6 +654,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
C(tail);
C(end);
C(head);
+ C(head_frag);
C(data);
C(truesize);
atomic_set(&n->users, 1);
@@ -940,7 +951,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
}
- if (fastpath &&
+ if (fastpath && !skb->head_frag &&
size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
memmove(skb->head + size, skb_shinfo(skb),
offsetof(struct skb_shared_info,
@@ -967,7 +978,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
if (fastpath) {
- kfree(skb->head);
+ skb_free_head(skb);
} else {
/* copy this zero copy skb frags */
if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
@@ -985,6 +996,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
off = (data + nhead) - skb->head;
skb->head = data;
+ skb->head_frag = 0;
adjust_others:
skb->data += off;
#ifdef NET_SKBUFF_DATA_USES_OFFSET
@@ -2889,6 +2901,26 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
NAPI_GRO_CB(skb)->free = 1;
goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+
+
+ if (nr_frags >= MAX_SKB_FRAGS)
+ return -E2BIG;
+ offset -= headlen;
+ offset += skb->head - (unsigned char *)page_address(page);
+
+ pinfo->nr_frags = nr_frags + 1;
+
+ frag->page.p = page;
+ frag->page_offset = offset;
+ skb_frag_size_set(frag, len);
+
+ /* note : napi_skb_finish() will free skb, not skb->head */
+ NAPI_GRO_CB(skb)->free = 1;
+ goto done;
} else if (skb_gro_len(p) != pinfo->gso_size)
return -E2BIG;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c93b0cb..69d379a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4478,6 +4478,7 @@ merge:
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
return true;
}
+ /* TODO : handle special case where from->frag_head is true */
if (skb_headlen(from) == 0 &&
!skb_has_frag_list(to) &&
!skb_has_frag_list(from) &&
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists