Message-ID: <1363245955.14913.21.camel@localhost>
Date: Thu, 14 Mar 2013 08:25:55 +0100
From: Jesper Dangaard Brouer <brouer@...hat.com>
To: Eric Dumazet <eric.dumazet@...il.com>,
Hannes Frederic Sowa <hannes@...essinduktion.org>
Cc: netdev@...r.kernel.org, yoshfuji@...ux-ipv6.org
Subject: RFC crap-patch [PATCH] net: Per CPU separate frag mem accounting
This is NOT the patch I just mentioned in the other thread (about
removing the LRU list). This patch does real per-CPU mem accounting,
and a per-CPU LRU list.

I get really good performance numbers with this patch, but I still
think this might not be the correct solution.
My current best results, from the changes that got applied recently,
compared to this patch:
 Test-type:  Test-20G64K  Test-20G3F  20G64K+DoS   20G3F+DoS  [Mbit/s]
 Patch-06:       18486.7    10723.20     3657.85     4560.64
 curr-best:      19041.0    12105.20    10160.40    11179.30
Thus, I have almost solved the DoS effect for Test-20G3F: 12 Gbit/s ->
11 Gbit/s under DoS. The 64K+DoS case is not perfect yet: 19 Gbit/s ->
11 Gbit/s.
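
Condensed sketch of the core idea, using the same struct and helper
names as the patch quoted below (not a complete compilable unit, just
the gist of the inet_frag.h changes): each CPU gets its own mem
counter, LRU list and lock, and a frag queue is always accounted
against the CPU it was allocated on (q->cpu_alloc), so hot-path
accounting and LRU updates stay on a CPU-local cache line. The
evictor then only walks that CPU's LRU list.

 /* Per-CPU resource limit state, to avoid cache-line bouncing */
 struct frag_cpu_limit {
         atomic_t         mem;       /* frag memory accounted on this CPU */
         struct list_head lru_list;  /* per-CPU LRU walked by the evictor */
         spinlock_t       lru_lock;
 } ____cacheline_aligned_in_smp;

 /* Accounting always targets the CPU the queue was allocated on */
 static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
 {
         struct frag_cpu_limit *percpu =
                 per_cpu_ptr(q->net->percpu, q->cpu_alloc);

         atomic_add(i, &percpu->mem);
 }
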
--Jesper
On Fri, 2013-03-08 at 23:17 +0100, Jesper Dangaard Brouer wrote:
> ... testing if the percpu_counter does not scale in DoS situations
> ---
>
> include/net/inet_frag.h | 99 ++++++++++++++++++-------------
> net/ipv4/inet_fragment.c | 61 +++++++++++++++----
> net/ipv4/ip_fragment.c | 3 +
> net/ipv6/netfilter/nf_conntrack_reasm.c | 2 -
> net/ipv6/reassembly.c | 2 -
> 5 files changed, 110 insertions(+), 57 deletions(-)
>
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index f2b46a5..974434a 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -1,22 +1,31 @@
> #ifndef __NET_FRAG_H__
> #define __NET_FRAG_H__
>
> -#include <linux/percpu_counter.h>
> +//#include <linux/percpu_counter.h>
> +#include <linux/spinlock.h>
> +#include <linux/atomic.h>
> +#include <linux/percpu.h>
>
> -struct netns_frags {
> - int nqueues;
> - struct list_head lru_list;
> - spinlock_t lru_lock;
> +/* Need to maintain these resource limits per CPU, else we will kill
> + * performance due to cache-line bouncing
> + */
> +struct frag_cpu_limit {
> + atomic_t mem;
> + struct list_head lru_list;
> + spinlock_t lru_lock;
> +} ____cacheline_aligned_in_smp;
>
> - /* The percpu_counter "mem" need to be cacheline aligned.
> - * mem.count must not share cacheline with other writers
> - */
> - struct percpu_counter mem ____cacheline_aligned_in_smp;
> +struct netns_frags {
>
> /* sysctls */
> int timeout;
> int high_thresh;
> int low_thresh;
> +
> + struct frag_cpu_limit __percpu *percpu;
> +
> + // TODO move "nqueues" elsewere...
> + int nqueues ____cacheline_aligned_in_smp;
> };
>
> struct inet_frag_queue {
> @@ -25,6 +34,7 @@ struct inet_frag_queue {
> struct list_head lru_list; /* lru list member */
> struct hlist_node list;
> atomic_t refcnt;
> + u32 cpu_alloc; /* used for mem limit accounting */
> struct sk_buff *fragments; /* list of received fragments */
> struct sk_buff *fragments_tail;
> ktime_t stamp;
> @@ -80,7 +90,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
> void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
> void inet_frag_destroy(struct inet_frag_queue *q,
> struct inet_frags *f, int *work);
> -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
> +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
> + bool force, int on_cpu);
> struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> struct inet_frags *f, void *key, unsigned int hash)
> __releases(&f->lock);
> @@ -93,59 +104,65 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
>
> /* Memory Tracking Functions. */
>
> -/* The default percpu_counter batch size is not big enough to scale to
> - * fragmentation mem acct sizes.
> - * The mem size of a 64K fragment is approx:
> - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
> - */
> -static unsigned int frag_percpu_counter_batch = 130000;
> -
> -static inline int frag_mem_limit(struct netns_frags *nf)
> -{
> - return percpu_counter_read(&nf->mem);
> -}
> -
> static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
> {
> - __percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
> + int cpu = q->cpu_alloc;
> + struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> + atomic_sub(i, &percpu->mem);
> }
>
> static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
> {
> - __percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
> -}
> -
> -static inline void init_frag_mem_limit(struct netns_frags *nf)
> -{
> - percpu_counter_init(&nf->mem, 0);
> + int cpu = q->cpu_alloc;
> + struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> + atomic_add(i, &percpu->mem);
> }
>
> static inline int sum_frag_mem_limit(struct netns_frags *nf)
> {
> - return percpu_counter_sum_positive(&nf->mem);
> + unsigned int sum = 0;
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> + sum += atomic_read(&percpu->mem);
> + }
> + return sum;
> }
>
> +/* LRU (Least Recently Used) resource functions */
> +
> static inline void inet_frag_lru_move(struct inet_frag_queue *q)
> {
> - spin_lock(&q->net->lru_lock);
> - list_move_tail(&q->lru_list, &q->net->lru_list);
> - spin_unlock(&q->net->lru_lock);
> + int cpu = q->cpu_alloc;
> + struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +
> + spin_lock(&percpu->lru_lock);
> + list_move_tail(&q->lru_list, &percpu->lru_list);
> + spin_unlock(&percpu->lru_lock);
> }
>
> static inline void inet_frag_lru_del(struct inet_frag_queue *q)
> {
> - spin_lock(&q->net->lru_lock);
> - list_del(&q->lru_list);
> - q->net->nqueues--;
> - spin_unlock(&q->net->lru_lock);
> + int cpu = q->cpu_alloc;
> + struct frag_cpu_limit *percpu = per_cpu_ptr(q->net->percpu, cpu);
> +
> + spin_lock(&percpu->lru_lock);
> + list_del(&q->lru_list);
> + q->net->nqueues--; //FIXME
> + spin_unlock(&percpu->lru_lock);
> }
>
> static inline void inet_frag_lru_add(struct netns_frags *nf,
> struct inet_frag_queue *q)
> {
> - spin_lock(&nf->lru_lock);
> - list_add_tail(&q->lru_list, &nf->lru_list);
> - q->net->nqueues++;
> - spin_unlock(&nf->lru_lock);
> + int cpu = q->cpu_alloc;
> + struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> + spin_lock(&percpu->lru_lock);
> + list_add_tail(&q->lru_list, &percpu->lru_list);
> + q->net->nqueues++; //FIXME
> + spin_unlock(&percpu->lru_lock);
> }
> #endif
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index e5c426f..f09fa7e 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -23,6 +23,18 @@
>
> #include <net/inet_frag.h>
>
> +static inline int frag_mem_limit_on_cpu(struct netns_frags *nf, int on_cpu)
> +{
> + struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, on_cpu);
> + return atomic_read(&percpu->mem);
> +}
> +
> +static inline int frag_mem_limit(struct netns_frags *nf)
> +{
> + int cpu = smp_processor_id();
> + return frag_mem_limit_on_cpu(nf, cpu);
> +}
> +
> static void inet_frag_secret_rebuild(unsigned long dummy)
> {
> struct inet_frags *f = (struct inet_frags *)dummy;
> @@ -81,12 +93,28 @@ void inet_frags_init(struct inet_frags *f)
> }
> EXPORT_SYMBOL(inet_frags_init);
>
> +static int inet_frags_init_percpu_limit(struct netns_frags *nf)
> +{
> + int cpu;
> +
> + nf->percpu = alloc_percpu(struct frag_cpu_limit);
> + if (!nf->percpu)
> + return -ENOMEM;
> +
> + for_each_possible_cpu(cpu) {
> + struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
> +
> + INIT_LIST_HEAD(&percpu->lru_list);
> + spin_lock_init(&percpu->lru_lock);
> + atomic_set(&percpu->mem, 0);
> + }
> + return 1;
> +}
> +
> void inet_frags_init_net(struct netns_frags *nf)
> {
> nf->nqueues = 0; //remove?
> - init_frag_mem_limit(nf);
> - INIT_LIST_HEAD(&nf->lru_list);
> - spin_lock_init(&nf->lru_lock);
> + inet_frags_init_percpu_limit(nf);
> }
> EXPORT_SYMBOL(inet_frags_init_net);
>
> @@ -98,13 +126,16 @@ EXPORT_SYMBOL(inet_frags_fini);
>
> void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
> {
> + int cpu;
> +
> nf->low_thresh = 0;
>
> local_bh_disable();
> - inet_frag_evictor(nf, f, true);
> + for_each_possible_cpu(cpu)
> + inet_frag_evictor(nf, f, true, cpu);
> local_bh_enable();
>
> - percpu_counter_destroy(&nf->mem);
> + free_percpu(nf->percpu);
> }
> EXPORT_SYMBOL(inet_frags_exit_net);
>
> @@ -179,33 +210,36 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
> }
> EXPORT_SYMBOL(inet_frag_destroy);
>
> -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
> +int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f,
> + bool force, int on_cpu)
> {
> struct inet_frag_queue *q;
> int work, evicted = 0;
> + int cpu = (likely(on_cpu < 0)) ? smp_processor_id() : on_cpu;
> + struct frag_cpu_limit *percpu = per_cpu_ptr(nf->percpu, cpu);
>
> if (!force) {
> - if (frag_mem_limit(nf) <= nf->high_thresh)
> + if (frag_mem_limit_on_cpu(nf, cpu) <= nf->high_thresh)
> return 0;
> }
>
> - work = frag_mem_limit(nf) - nf->low_thresh;
> + work = frag_mem_limit_on_cpu(nf, cpu) - nf->low_thresh;
> while (work > 0) {
> - spin_lock(&nf->lru_lock);
> + spin_lock(&percpu->lru_lock);
>
> - if (list_empty(&nf->lru_list)) {
> - spin_unlock(&nf->lru_lock);
> + if (list_empty(&percpu->lru_list)) {
> + spin_unlock(&percpu->lru_lock);
> break;
> }
>
> - q = list_first_entry(&nf->lru_list,
> + q = list_first_entry(&percpu->lru_list,
> struct inet_frag_queue, lru_list);
> atomic_inc(&q->refcnt);
>
> // TEST: remove q from list to avoid more CPUs grabbing it
> list_del_init(&q->lru_list);
>
> - spin_unlock(&nf->lru_lock);
> + spin_unlock(&percpu->lru_lock);
>
> spin_lock(&q->lock);
> if (!(q->last_in & INET_FRAG_COMPLETE))
> @@ -283,6 +317,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
> return NULL;
>
> q->net = nf;
> + q->cpu_alloc = (u32) smp_processor_id();
> f->constructor(q, arg);
> add_frag_mem_limit(q, f->qsize);
>
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index 1211613..4241417 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -18,6 +18,7 @@
> * John McDonald : 0 length frag bug.
> * Alexey Kuznetsov: SMP races, threading, cleanup.
> * Patrick McHardy : LRU queue of frag heads for evictor.
> + * Jesper Brouer : SMP/NUMA scalability
> */
>
> #define pr_fmt(fmt) "IPv4: " fmt
> @@ -212,7 +213,7 @@ static void ip_evictor(struct net *net)
> {
> int evicted;
>
> - evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
> + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false, -1);
> if (evicted)
> IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
> }
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index c674f15..37291f9 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -569,7 +569,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
> fhdr = (struct frag_hdr *)skb_transport_header(clone);
>
> local_bh_disable();
> - inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
> + inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false, -1);
> local_bh_enable();
>
> fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index bab2c27..d1e70dd 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -529,7 +529,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
> return 1;
> }
>
> - evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
> + evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false, -1);
> if (evicted)
> IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
> IPSTATS_MIB_REASMFAILS, evicted);
>