Message-ID: <4A019DC9.7000307@cosmosbay.com>
Date: Wed, 06 May 2009 16:25:13 +0200
From: Eric Dumazet <dada1@...mosbay.com>
To: Lennert Buytenhek <buytenh@...tstofly.org>
CC: netdev@...r.kernel.org, afleming@...escale.com, bkostya@...vell.com
Subject: Re: [PATCH,RFC] generic skb recycling
Lennert Buytenhek wrote:
> This RFC patch removes the skb recycling that was added to mv643xx_eth
> a while ago and moves it into the stack, based on an earlier idea by
> Kostya Belezko <bkostya@...vell.com>. There are a couple of reasons for
> doing this:
>
> - Make skb recycling available to all drivers, without needing driver
> modifications.
>
> - Allow recycling skbuffs in more cases, by having the recycle check
> in __kfree_skb() instead of in the ethernet driver transmit
> completion routine. This also allows, for example, recycling locally
> destined skbuffs, instead of only recycling forwarded skbuffs as
> the transmit completion-time check does.
>
> - Allow more consumers of skbuffs in the system to use recycled skbuffs,
> and not just the rx refill process in the driver.
>
> - Having a per-interface recycle list doesn't allow skb recycling when
> you're e.g. unidirectionally routing from eth0 to eth1, as eth1 will
> be producing a lot of recycled skbuffs but eth0 won't have any skbuffs
> to allocate from its recycle list.
>
>
> Generic skb recycling is slightly slower than doing it in the driver,
> e.g. in the case of mv643xx_eth about 70 cycles per packet. Given the
> benefits I think that's an OK price to pay.
>
>
> Open items:
>
> - I tried putting the recycle list in the per-CPU softnet state, but
> the skb allocator is initialised before the softnet state is, and I
> ended up with ugly tests in __{alloc,kfree}_skb() to check whether
> softnet init is done yet. (Maybe softnet state can be initialised
> earlier?)
>
> - I picked SKB_DATA_ALIGN(ETH_FRAME_LEN + SMP_CACHE_BYTES) as the skb
> size threshold for skb recycling, since that, with NET_SKB_PAD padding
> included, is what I suppose most non-frag RX drivers will end up allocating
> for their receive rings (which are the main source of recycled skbuffs).
> I haven't yet measured the effect on frag RX with LRO/GRO to see if
> there's a benefit in recycling there as well.
>
> - Determine a sensible value for the recycle queue length. For
> in-driver recycling, I chose the rx queue length, as we'll never
> allocate more than that in one go, but here it's a bit less clear
> what a good value would be.
>
>
> Thoughts?
Interesting idea, but:
1) Won't it break the copybreak feature that exists in some drivers?
After your patch, an __alloc_skb(small size) can return a full-size recycled skb, so some UDP servers
might consume much more RAM :( (see the copybreak sketch below)
2) It breaks the NUMA properties of the existing code, and may increase cache line ping-pong on SMP.
(On some setups, one CPU receives all incoming frames and handles TX completion,
while another CPU, or several, sends frames.)
3) Are you sure your uses of &__get_cpu_var(skbuff_recycle_list) are safe
against preemption? For example, __alloc_skb() can be called with preemption enabled :)
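For illustration only: the usual pattern for making that per-cpu access
preemption-safe is the get_cpu_var()/put_cpu_var() pair, which keeps the task
on the same CPU across the dequeue. A rough, untested sketch of what the
__alloc_skb() fast path could look like with it:

	if (size <= SKB_MIN_RECYCLE_SIZE && !fclone) {
		/* get_cpu_var() disables preemption, so the queue we
		 * dequeue from really belongs to the CPU we run on */
		struct sk_buff_head *h = &get_cpu_var(skbuff_recycle_list);
		struct sk_buff *skb = skb_dequeue(h);

		put_cpu_var(skbuff_recycle_list);
		if (skb != NULL)
			return skb;
	}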
So... we definitely want some numbers and benchmarks :)
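For reference, the copybreak pattern point 1 is about looks roughly like this
in the rx path of drivers that implement it (a paraphrased sketch, not taken
from any particular driver; dev, skb, len and RX_COPYBREAK are placeholders):

	if (len < RX_COPYBREAK) {
		struct sk_buff *nskb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);

		if (nskb != NULL) {
			skb_reserve(nskb, NET_IP_ALIGN);
			skb_copy_to_linear_data(nskb, skb->data, len);
			skb_put(nskb, len);
			/* the small copy goes up the stack; the full-size
			 * skb stays in the rx ring for reuse */
			nskb->protocol = eth_type_trans(nskb, dev);
			netif_receive_skb(nskb);
		}
	}

With generic recycling, that netdev_alloc_skb() call can hand back a full-size
recycled skb, so a small packet sitting in a socket backlog (UDP, say) can pin
far more memory than its payload.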
>
>
> diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
> index d583852..738b5c3 100644
> --- a/drivers/net/mv643xx_eth.c
> +++ b/drivers/net/mv643xx_eth.c
> @@ -403,7 +403,6 @@ struct mv643xx_eth_private {
> u8 work_rx_refill;
>
> int skb_size;
> - struct sk_buff_head rx_recycle;
>
> /*
> * RX state.
> @@ -656,10 +655,7 @@ static int rxq_refill(struct rx_queue *rxq, int budget)
> int rx;
> struct rx_desc *rx_desc;
>
> - skb = __skb_dequeue(&mp->rx_recycle);
> - if (skb == NULL)
> - skb = dev_alloc_skb(mp->skb_size);
> -
> + skb = dev_alloc_skb(mp->skb_size);
> if (skb == NULL) {
> mp->oom = 1;
> goto oom;
> @@ -962,14 +958,8 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force)
> desc->byte_cnt, DMA_TO_DEVICE);
> }
>
> - if (skb != NULL) {
> - if (skb_queue_len(&mp->rx_recycle) <
> - mp->rx_ring_size &&
> - skb_recycle_check(skb, mp->skb_size))
> - __skb_queue_head(&mp->rx_recycle, skb);
> - else
> - dev_kfree_skb(skb);
> - }
> + if (skb != NULL)
> + dev_kfree_skb(skb);
> }
>
> __netif_tx_unlock(nq);
> @@ -2368,8 +2358,6 @@ static int mv643xx_eth_open(struct net_device *dev)
>
> napi_enable(&mp->napi);
>
> - skb_queue_head_init(&mp->rx_recycle);
> -
> mp->int_mask = INT_EXT;
>
> for (i = 0; i < mp->rxq_count; i++) {
> @@ -2464,8 +2452,6 @@ static int mv643xx_eth_stop(struct net_device *dev)
> mib_counters_update(mp);
> del_timer_sync(&mp->mib_counters_timer);
>
> - skb_queue_purge(&mp->rx_recycle);
> -
> for (i = 0; i < mp->rxq_count; i++)
> rxq_deinit(mp->rxq + i);
> for (i = 0; i < mp->txq_count; i++)
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index d152394..c9c111f 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -56,6 +56,7 @@
> #include <linux/init.h>
> #include <linux/scatterlist.h>
> #include <linux/errqueue.h>
> +#include <linux/cpu.h>
>
> #include <net/protocol.h>
> #include <net/dst.h>
> @@ -71,6 +72,9 @@
>
> static struct kmem_cache *skbuff_head_cache __read_mostly;
> static struct kmem_cache *skbuff_fclone_cache __read_mostly;
> +static DEFINE_PER_CPU(struct sk_buff_head, skbuff_recycle_list);
> +
> +#define SKB_MIN_RECYCLE_SIZE SKB_DATA_ALIGN(ETH_FRAME_LEN + SMP_CACHE_BYTES)
>
> static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
> struct pipe_buffer *buf)
> @@ -176,6 +180,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> struct sk_buff *skb;
> u8 *data;
>
> + if (size <= SKB_MIN_RECYCLE_SIZE && !fclone) {
> + struct sk_buff_head *h = &__get_cpu_var(skbuff_recycle_list);
> +
> + skb = skb_dequeue(h);
> + if (skb != NULL)
> + return skb;
> + }
> +
> cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>
> /* Get the HEAD */
> @@ -423,7 +435,37 @@ static void skb_release_all(struct sk_buff *skb)
>
> void __kfree_skb(struct sk_buff *skb)
> {
> - skb_release_all(skb);
> + struct sk_buff_head *h = &__get_cpu_var(skbuff_recycle_list);
> +
> + skb_release_head_state(skb);
> +
> + if (skb_queue_len(h) < 256 &&
> + !skb_cloned(skb) && !skb_is_nonlinear(skb) &&
> + skb->fclone == SKB_FCLONE_UNAVAILABLE &&
> + skb_end_pointer(skb) - skb->head >= SKB_MIN_RECYCLE_SIZE) {
> + struct skb_shared_info *shinfo;
> +
> + shinfo = skb_shinfo(skb);
> + atomic_set(&shinfo->dataref, 1);
> + shinfo->nr_frags = 0;
> + shinfo->gso_size = 0;
> + shinfo->gso_segs = 0;
> + shinfo->gso_type = 0;
> + shinfo->ip6_frag_id = 0;
> + shinfo->tx_flags.flags = 0;
> + shinfo->frag_list = NULL;
> + memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
> +
> + memset(skb, 0, offsetof(struct sk_buff, tail));
> + skb->data = skb->head;
> + skb_reset_tail_pointer(skb);
> +
> + skb_queue_head(h, skb);
> +
> + return;
> + }
> +
> + skb_release_data(skb);
> kfree_skbmem(skb);
> }
> EXPORT_SYMBOL(__kfree_skb);
> @@ -2756,8 +2798,24 @@ done:
> }
> EXPORT_SYMBOL_GPL(skb_gro_receive);
>
> +static int
> +skb_cpu_callback(struct notifier_block *nfb,
> + unsigned long action, void *ocpu)
> +{
> + unsigned long oldcpu = (unsigned long)ocpu;
> +
> + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
> + struct sk_buff_head *h = &per_cpu(skbuff_recycle_list, oldcpu);
> + skb_queue_purge(h);
> + }
> +
> + return NOTIFY_OK;
> +}
> +
> void __init skb_init(void)
> {
> + int i;
> +
> skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
> sizeof(struct sk_buff),
> 0,
> @@ -2769,6 +2827,13 @@ void __init skb_init(void)
> 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC,
> NULL);
> +
> + for_each_possible_cpu(i) {
> + struct sk_buff_head *h = &per_cpu(skbuff_recycle_list, i);
> + skb_queue_head_init(h);
> + }
> +
> + hotcpu_notifier(skb_cpu_callback, 0);
> }
>
> /**