Message-ID: <20100819152149.GA19744@Krystal>
Date: Thu, 19 Aug 2010 11:21:49 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...ymtl.ca>
To: Changli Gao <xiaosuo@...il.com>
Cc: Patrick McHardy <kaber@...sh.net>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <eric.dumazet@...il.com>,
netfilter-devel@...r.kernel.org, netdev@...r.kernel.org
Subject: Re: [PATCH v2] netfilter: save the hash of the tuple in the
 original direction for later use

* Changli Gao (xiaosuo@...il.com) wrote:
> Since we don't change the tuple in the original direction, we can save its
> hash in ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev for later use by
> __nf_conntrack_confirm().
>
> __hash_conntrack() is split into two steps: ____hash_conntrack() is used
> to get the raw hash, and __hash_bucket() is used to get the bucket id.
>
> In the SYN-flood case, early_drop() doesn't need to compute the hash again.
>
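To restate the split for readers (a minimal sketch using the names
introduced below; the call sites in the patch inline these two steps):

	/* Step 1: full 32-bit hash of the tuple, independent of the
	 * table size, so it can be cached and reused later. */
	u32 raw = ____hash_conntrack(tuple, zone);

	/* Step 2: reduce the raw hash to a bucket index for the
	 * current table size. */
	u32 bucket = __hash_bucket(raw, net->ct.htable_size);
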
> Signed-off-by: Changli Gao <xiaosuo@...il.com>
> ---
> v2: use cmpxchg() to save 2 variables.
> net/netfilter/nf_conntrack_core.c | 114 ++++++++++++++++++++++++++------------
> 1 file changed, 79 insertions(+), 35 deletions(-)
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index df3eedb..0e02205 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -65,14 +65,20 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max);
> DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
> EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
>
> -static int nf_conntrack_hash_rnd_initted;
> -static unsigned int nf_conntrack_hash_rnd;
> -
> -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
> - u16 zone, unsigned int size, unsigned int rnd)
> +static u32 ____hash_conntrack(const struct nf_conntrack_tuple *tuple, u16 zone)
> {
> unsigned int n;
> u_int32_t h;
> + static unsigned int rnd;
Why are you putting a hidden static variable here? It should probably
be declared outside the function scope.
> +
> + if (unlikely(!rnd)) {
> + unsigned int rand;
> +
> + get_random_bytes(&rand, sizeof(rand));
> + if (!rand)
> + rand = 1;
> + cmpxchg(&rnd, 0, rand);
This really belongs in an init() function, not in a dynamic check on
the __hash_conntrack() hot path. If you do it at init time, you don't
even need the cmpxchg.

Or maybe I completely misunderstand your goal here; maybe you really
are trying to re-generate the random bytes. But more explanation is
called for, because I don't see where the value is ever brought back
to 0.
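
Something along these lines would keep the hot path clean (an untested
sketch: the variable moves to file scope and is seeded once from an
init path, e.g. nf_conntrack_init(); the exact hook is up to you):

	static unsigned int nf_conntrack_hash_rnd;

	/* Seed once at init time; re-roll until nonzero so a plain
	 * "!rnd" load can still mean "uninitialized". No cmpxchg is
	 * needed since this runs before any hashing takes place. */
	static void nf_conntrack_hash_rnd_init(void)
	{
		while (!nf_conntrack_hash_rnd)
			get_random_bytes(&nf_conntrack_hash_rnd,
					 sizeof(nf_conntrack_hash_rnd));
	}
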
Thanks,
Mathieu
> + }
>
> /* The direction must be ignored, so we hash everything up to the
> * destination ports (which is a multiple of 4) and treat the last
> @@ -83,14 +89,29 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
> zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
> tuple->dst.protonum));
>
> - return ((u64)h * size) >> 32;
> + return h;
> +}
> +
> +static u32 __hash_bucket(u32 __hash, unsigned int size)
> +{
> + return ((u64)__hash * size) >> 32;
> +}
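
As an aside for readers: the ((u64)__hash * size) >> 32 reduction
computes floor(__hash * size / 2^32), mapping a uniform 32-bit hash
onto [0, size) without a division. A userspace sketch of the same
arithmetic:

	#include <stdint.h>
	#include <stdio.h>

	/* Same reduction as __hash_bucket(), outside the kernel. */
	static uint32_t bucket_of(uint32_t h, uint32_t size)
	{
		return ((uint64_t)h * size) >> 32;
	}

	int main(void)
	{
		/* 0xC0000000 is 3/4 of the 32-bit range, so it lands
		 * in bucket 768 of 1024; the top hash value maps to
		 * the last bucket. */
		printf("%u\n", bucket_of(0xC0000000u, 1024)); /* 768 */
		printf("%u\n", bucket_of(0xFFFFFFFFu, 1024)); /* 1023 */
		return 0;
	}
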
> +
> +static u32 hash_bucket(u32 __hash, const struct net *net)
> +{
> + return __hash_bucket(__hash, net->ct.htable_size);
> +}
> +
> +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
> + u16 zone, unsigned int size)
> +{
> + return __hash_bucket(____hash_conntrack(tuple, zone), size);
> }
>
> static inline u_int32_t hash_conntrack(const struct net *net, u16 zone,
> const struct nf_conntrack_tuple *tuple)
> {
> - return __hash_conntrack(tuple, zone, net->ct.htable_size,
> - nf_conntrack_hash_rnd);
> + return __hash_conntrack(tuple, zone, net->ct.htable_size);
> }
>
> bool
> @@ -292,13 +313,13 @@ static void death_by_timeout(unsigned long ul_conntrack)
> * OR
> * - Caller must lock nf_conntrack_lock before calling this function
> */
> -struct nf_conntrack_tuple_hash *
> -__nf_conntrack_find(struct net *net, u16 zone,
> - const struct nf_conntrack_tuple *tuple)
> +static struct nf_conntrack_tuple_hash *
> +____nf_conntrack_find(struct net *net, u16 zone,
> + const struct nf_conntrack_tuple *tuple, u32 __hash)
> {
> struct nf_conntrack_tuple_hash *h;
> struct hlist_nulls_node *n;
> - unsigned int hash = hash_conntrack(net, zone, tuple);
> + unsigned int hash = hash_bucket(__hash, net);
>
> /* Disable BHs the entire time since we normally need to disable them
> * at least once for the stats anyway.
> @@ -327,19 +348,27 @@ begin:
>
> return NULL;
> }
> +
> +struct nf_conntrack_tuple_hash *
> +__nf_conntrack_find(struct net *net, u16 zone,
> + const struct nf_conntrack_tuple *tuple)
> +{
> + return ____nf_conntrack_find(net, zone, tuple,
> + ____hash_conntrack(tuple, zone));
> +}
> EXPORT_SYMBOL_GPL(__nf_conntrack_find);
>
> /* Find a connection corresponding to a tuple. */
> -struct nf_conntrack_tuple_hash *
> -nf_conntrack_find_get(struct net *net, u16 zone,
> - const struct nf_conntrack_tuple *tuple)
> +static struct nf_conntrack_tuple_hash *
> +__nf_conntrack_find_get(struct net *net, u16 zone,
> + const struct nf_conntrack_tuple *tuple, u32 __hash)
> {
> struct nf_conntrack_tuple_hash *h;
> struct nf_conn *ct;
>
> rcu_read_lock();
> begin:
> - h = __nf_conntrack_find(net, zone, tuple);
> + h = ____nf_conntrack_find(net, zone, tuple, __hash);
> if (h) {
> ct = nf_ct_tuplehash_to_ctrack(h);
> if (unlikely(nf_ct_is_dying(ct) ||
> @@ -357,6 +386,14 @@ begin:
>
> return h;
> }
> +
> +struct nf_conntrack_tuple_hash *
> +nf_conntrack_find_get(struct net *net, u16 zone,
> + const struct nf_conntrack_tuple *tuple)
> +{
> + return __nf_conntrack_find_get(net, zone, tuple,
> + ____hash_conntrack(tuple, zone));
> +}
> EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
>
> static void __nf_conntrack_hash_insert(struct nf_conn *ct,
> @@ -409,7 +446,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
> return NF_ACCEPT;
>
> zone = nf_ct_zone(ct);
> - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
> + /* reuse the raw hash saved at allocation time */
> + hash = hash_bucket(*(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev, net);
> repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
>
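As an aside, this store/load pair might read better wrapped in small
helpers (hypothetical names; same semantics as the open-coded casts in
this patch):

	/* The REPLY-direction node is not linked into the hash table
	 * until confirm time, so its pprev slot is dead storage that
	 * can carry the raw hash in the meantime. */
	static inline void nf_ct_stash_hash(struct nf_conn *ct, u32 hash)
	{
		ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev =
			(struct hlist_nulls_node **)(unsigned long)hash;
	}

	static inline u32 nf_ct_stashed_hash(const struct nf_conn *ct)
	{
		return (u32)(unsigned long)
			ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
	}
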
> /* We're not in hash table, and we refuse to set up related
> @@ -567,25 +605,20 @@ static noinline int early_drop(struct net *net, unsigned int hash)
> return dropped;
> }
>
> -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
> - const struct nf_conntrack_tuple *orig,
> - const struct nf_conntrack_tuple *repl,
> - gfp_t gfp)
> +static struct nf_conn *
> +__nf_conntrack_alloc(struct net *net, u16 zone,
> + const struct nf_conntrack_tuple *orig,
> + const struct nf_conntrack_tuple *repl,
> + gfp_t gfp, u32 __hash)
> {
> struct nf_conn *ct;
>
> - if (unlikely(!nf_conntrack_hash_rnd_initted)) {
> - get_random_bytes(&nf_conntrack_hash_rnd,
> - sizeof(nf_conntrack_hash_rnd));
> - nf_conntrack_hash_rnd_initted = 1;
> - }
> -
> /* We don't want any race condition at early drop stage */
> atomic_inc(&net->ct.count);
>
> if (nf_conntrack_max &&
> unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
> - unsigned int hash = hash_conntrack(net, zone, orig);
> + unsigned int hash = hash_bucket(__hash, net);
> if (!early_drop(net, hash)) {
> atomic_dec(&net->ct.count);
> if (net_ratelimit())
> @@ -616,7 +649,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
> ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
> ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
> ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
> - ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
> + /* save the raw hash for reuse at confirm time */
> + *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = __hash;
> /* Don't set timer yet: wait for confirmation */
> setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
> write_pnet(&ct->ct_net, net);
> @@ -643,6 +677,14 @@ out_free:
> return ERR_PTR(-ENOMEM);
> #endif
> }
> +
> +struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
> + const struct nf_conntrack_tuple *orig,
> + const struct nf_conntrack_tuple *repl,
> + gfp_t gfp)
> +{
> + return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
> +}
> EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
>
> void nf_conntrack_free(struct nf_conn *ct)
> @@ -664,7 +706,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
> struct nf_conntrack_l3proto *l3proto,
> struct nf_conntrack_l4proto *l4proto,
> struct sk_buff *skb,
> - unsigned int dataoff)
> + unsigned int dataoff, u32 __hash)
> {
> struct nf_conn *ct;
> struct nf_conn_help *help;
> @@ -678,7 +720,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
> return NULL;
> }
>
> - ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC);
> + ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
> + __hash);
> if (IS_ERR(ct)) {
> pr_debug("Can't allocate conntrack.\n");
> return (struct nf_conntrack_tuple_hash *)ct;
> @@ -755,6 +798,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
> struct nf_conntrack_tuple_hash *h;
> struct nf_conn *ct;
> u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
> + u32 __hash;
>
> if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
> dataoff, l3num, protonum, &tuple, l3proto,
> @@ -764,10 +808,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
> }
>
> /* look for tuple match */
> - h = nf_conntrack_find_get(net, zone, &tuple);
> + __hash = ____hash_conntrack(&tuple, zone);
> + h = __nf_conntrack_find_get(net, zone, &tuple, __hash);
> if (!h) {
> h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
> - skb, dataoff);
> + skb, dataoff, __hash);
> if (!h)
> return NULL;
> if (IS_ERR(h))
> @@ -1307,8 +1352,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
> ct = nf_ct_tuplehash_to_ctrack(h);
> hlist_nulls_del_rcu(&h->hnnode);
> bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
> - hashsize,
> - nf_conntrack_hash_rnd);
> + hashsize);
> hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
> }
> }
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com