[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b803a854-f050-a12e-680f-c53da35b3151@virtuozzo.com>
Date: Fri, 30 Mar 2018 14:44:29 +0300
From: Kirill Tkhai <ktkhai@...tuozzo.com>
To: Eric Dumazet <edumazet@...gle.com>,
"David S . Miller" <davem@...emloft.net>
Cc: netdev <netdev@...r.kernel.org>, Florian Westphal <fw@...len.de>,
Herbert Xu <herbert@...dor.apana.org.au>,
Thomas Graf <tgraf@...g.ch>,
Jesper Dangaard Brouer <brouer@...hat.com>,
Alexander Aring <alex.aring@...il.com>,
Stefan Schmidt <stefan@....samsung.com>,
Eric Dumazet <eric.dumazet@...il.com>,
Nikolay Aleksandrov <nikolay@...hat.com>
Subject: Re: [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly
units
Hi, Eric,
On 30.03.2018 08:22, Eric Dumazet wrote:
> Some applications still rely on IP fragmentation, and to be fair linux
> reassembly unit is not working under any serious load.
>
> It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)
>
> A work queue is supposed to garbage collect items when host is under memory
> pressure, and doing a hash rebuild, changing seed used in hash computations.
>
> This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
> occurring every 5 seconds if host is under fire.
>
> Then there is the problem of sharing this hash table for all netns.
>
> It is time to switch to rhashtables, and allocate one of them per netns
> to speedup netns dismantle, since this is a critical metric these days.
>
> Lookup is now using RCU. A followup patch will even remove
> the refcount hold/release left from prior implementation and save
> a couple of atomic operations.
>
> Before this patch, 16 cpus (16 RX queue NIC) could not handle more
> than 1 Mpps frags DDOS.
>
> After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
> of storage for the fragments.
Great results!
Please see some comments below.
> $ grep FRAG /proc/net/sockstat
> FRAG: inuse 1966916 memory 2140004608
>
> A followup patch will change the limits for 64bit arches.
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> Cc: Florian Westphal <fw@...len.de>
> Cc: Nikolay Aleksandrov <nikolay@...hat.com>
> Cc: Jesper Dangaard Brouer <brouer@...hat.com>
> Cc: Alexander Aring <alex.aring@...il.com>
> Cc: Stefan Schmidt <stefan@....samsung.com>
> ---
> Documentation/networking/ip-sysctl.txt | 7 +-
> include/net/inet_frag.h | 99 +++---
> include/net/ipv6.h | 20 +-
> net/ieee802154/6lowpan/6lowpan_i.h | 26 +-
> net/ieee802154/6lowpan/reassembly.c | 108 +++----
> net/ipv4/inet_fragment.c | 399 +++++-------------------
> net/ipv4/ip_fragment.c | 165 +++++-----
> net/ipv6/netfilter/nf_conntrack_reasm.c | 62 ++--
> net/ipv6/reassembly.c | 152 +++++----
> 9 files changed, 344 insertions(+), 694 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 1d1120753ae82d0aee3e934a3d9c074b70dcbca6..c3b65f24e58aa72b720861d816fb76f9956800f0 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -134,13 +134,10 @@ min_adv_mss - INTEGER
> IP Fragmentation:
>
> ipfrag_high_thresh - INTEGER
> - Maximum memory used to reassemble IP fragments. When
> - ipfrag_high_thresh bytes of memory is allocated for this purpose,
> - the fragment handler will toss packets until ipfrag_low_thresh
> - is reached. This also serves as a maximum limit to namespaces
> - different from the initial one.
> + Maximum memory used to reassemble IP fragments.
>
> ipfrag_low_thresh - INTEGER
> + (Obsolete since linux-4.17)
> Maximum memory used to reassemble IP fragments before the kernel
> begins to remove incomplete fragment queues to free up resources.
> The kernel still accepts new fragments for defragmentation.
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index 69e531ed81894393e07cac9e953825fcb55ef42a..05099f9f980e2384c0c8cd7e74659656b585cd22 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -2,15 +2,20 @@
> #ifndef __NET_FRAG_H__
> #define __NET_FRAG_H__
>
> +#include <linux/rhashtable.h>
> +
> struct netns_frags {
> - /* Keep atomic mem on separate cachelines in structs that include it */
> - atomic_t mem ____cacheline_aligned_in_smp;
> /* sysctls */
> int timeout;
> int high_thresh;
> int low_thresh;
> int max_dist;
> struct inet_frags *f;
> +
> + /* Keep atomic mem on separate cachelines in structs that include it */
> + atomic_t mem ____cacheline_aligned_in_smp;
The patch is big, and it seems possible to extract refactorings like this
into a separate patch (or patches). Here, just two lines are moved down.
> +
> + struct rhashtable rhashtable ____cacheline_aligned_in_smp;
> };
>
> /**
> @@ -26,12 +31,31 @@ enum {
> INET_FRAG_COMPLETE = BIT(2),
> };
>
> +struct frag_v4_compare_key {
> + __be32 saddr;
> + __be32 daddr;
> + u32 user;
> + u32 vif;
> + __be16 id;
> + u16 protocol;
> +};
> +
> +struct frag_v6_compare_key {
> + struct in6_addr saddr;
> + struct in6_addr daddr;
> + u32 user;
> + __be32 id;
> + u32 iif;
> +};
> +
> /**
> * struct inet_frag_queue - fragment queue
> *
> - * @lock: spinlock protecting the queue
> + * @node: rhash node
> + * @key: keys identifying this frag.
> * @timer: queue expiration timer
> - * @list: hash bucket list
> + * @net: namespace that this frag belongs to
> + * @lock: spinlock protecting this frag
> * @refcnt: reference count of the queue
> * @fragments: received fragments head
> * @fragments_tail: received fragments tail
> @@ -40,66 +64,38 @@ enum {
> * @meat: length of received fragments so far
> * @flags: fragment queue flags
> * @max_size: maximum received fragment size
> - * @net: namespace that this frag belongs to
> - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
> + * @rcu: rcu head for freeing deferall
> */
> struct inet_frag_queue {
> - spinlock_t lock;
> + struct rhash_head node;
> + union {
> + struct frag_v4_compare_key v4;
> + struct frag_v6_compare_key v6;
> + } key;
> struct timer_list timer;
> - struct hlist_node list;
> + struct netns_frags *net;
> + spinlock_t lock;
Here, lock and net just change their position within the struct { }.
> refcount_t refcnt;
> struct sk_buff *fragments;
> struct sk_buff *fragments_tail;
> ktime_t stamp;
> int len;
> int meat;
> - __u8 flags;
> + u8 flags;
Here, just the type is changed.
> u16 max_size;
> - struct netns_frags *net;
> - struct hlist_node list_evictor;
> -};
> -
> -#define INETFRAGS_HASHSZ 1024
> -
> -/* averaged:
> - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
> - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
> - * struct frag_queue))
> - */
> -#define INETFRAGS_MAXDEPTH 128
> -
> -struct inet_frag_bucket {
> - struct hlist_head chain;
> - spinlock_t chain_lock;
> + struct rcu_head rcu;
> };
>
> struct inet_frags {
> - struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
> -
> - struct work_struct frags_work;
> - unsigned int next_bucket;
> - unsigned long last_rebuild_jiffies;
> - bool rebuild;
> -
> - /* The first call to hashfn is responsible to initialize
> - * rnd. This is best done with net_get_random_once.
> - *
> - * rnd_seqlock is used to let hash insertion detect
> - * when it needs to re-lookup the hash chain to use.
> - */
> - u32 rnd;
> - seqlock_t rnd_seqlock;
> unsigned int qsize;
>
> - unsigned int (*hashfn)(const struct inet_frag_queue *);
> - bool (*match)(const struct inet_frag_queue *q,
> - const void *arg);
> - void (*constructor)(struct inet_frag_queue *q,
> + void (*constructor)(struct inet_frag_queue *fq,
Here, just the parameter name is changed.
> const void *arg);
> - void (*destructor)(struct inet_frag_queue *);
> + void (*destructor)(struct inet_frag_queue *fq);
The same as above.
> void (*frag_expire)(struct timer_list *t);
> struct kmem_cache *frags_cachep;
> const char *frags_cache_name;
> + struct rhashtable_params rhash_params;
> };
>
> int inet_frags_init(struct inet_frags *);
> @@ -108,17 +104,13 @@ void inet_frags_fini(struct inet_frags *);
> static inline int inet_frags_init_net(struct netns_frags *nf)
> {
> atomic_set(&nf->mem, 0);
> - return 0;
> + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
> }
> void inet_frags_exit_net(struct netns_frags *nf);
>
> void inet_frag_kill(struct inet_frag_queue *q);
> void inet_frag_destroy(struct inet_frag_queue *q);
> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> - struct inet_frags *f, void *key, unsigned int hash);
> -
> -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
> - const char *prefix);
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
>
> static inline void inet_frag_put(struct inet_frag_queue *q)
> {
> @@ -126,11 +118,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
> inet_frag_destroy(q);
> }
>
> -static inline bool inet_frag_evicting(struct inet_frag_queue *q)
> -{
> - return !hlist_unhashed(&q->list_evictor);
> -}
> -
> /* Memory Tracking Functions. */
>
> static inline int frag_mem_limit(struct netns_frags *nf)
> diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> index 5c18836672e9d1c560cdce15f5b34928c337abfd..76f84d4be91b92761fb9a26e7f52e2101ee34c0a 100644
> --- a/include/net/ipv6.h
> +++ b/include/net/ipv6.h
> @@ -579,36 +579,20 @@ enum ip6_defrag_users {
> __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
> };
>
> -struct ip6_create_arg {
> - __be32 id;
> - u32 user;
> - const struct in6_addr *src;
> - const struct in6_addr *dst;
> - int iif;
> - u8 ecn;
> -};
> -
> void ip6_frag_init(struct inet_frag_queue *q, const void *a);
> -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
>
> /*
> - * Equivalent of ipv4 struct ip
> + * Equivalent of ipv4 struct ipq
> */
> struct frag_queue {
> struct inet_frag_queue q;
>
> - __be32 id; /* fragment id */
> - u32 user;
> - struct in6_addr saddr;
> - struct in6_addr daddr;
> -
> int iif;
> __u16 nhoffset;
> u8 ecn;
> };
>
> -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
> - struct inet_frags *frags);
> +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
>
> static inline bool ipv6_addr_any(const struct in6_addr *a)
> {
> diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
> index d8de3bcfb1032a1133402cb2a4c50a2448133846..b8d95cb71c25dd69c8a88b2c886a3f0d2ce1174f 100644
> --- a/net/ieee802154/6lowpan/6lowpan_i.h
> +++ b/net/ieee802154/6lowpan/6lowpan_i.h
> @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
> #define LOWPAN_DISPATCH_FRAG1 0xc0
> #define LOWPAN_DISPATCH_FRAGN 0xe0
>
> -struct lowpan_create_arg {
> +struct frag_lowpan_compare_key {
> u16 tag;
> u16 d_size;
> - const struct ieee802154_addr *src;
> - const struct ieee802154_addr *dst;
> + const struct ieee802154_addr src;
> + const struct ieee802154_addr dst;
> };
>
> -/* Equivalent of ipv4 struct ip
> +/* Equivalent of ipv4 struct ipq
> */
> struct lowpan_frag_queue {
> struct inet_frag_queue q;
> -
> - u16 tag;
> - u16 d_size;
> - struct ieee802154_addr saddr;
> - struct ieee802154_addr daddr;
> };
>
> -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
> -{
> - switch (a->mode) {
> - case IEEE802154_ADDR_LONG:
> - return (((__force u64)a->extended_addr) >> 32) ^
> - (((__force u64)a->extended_addr) & 0xffffffff);
> - case IEEE802154_ADDR_SHORT:
> - return (__force u32)(a->short_addr + (a->pan_id << 16));
> - default:
> - return 0;
> - }
> -}
> -
> int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
> void lowpan_net_frag_exit(void);
> int lowpan_net_frag_init(void);
> diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
> index 6badc055555b7baedac2051a1aaea15f9e9b180c..9ee4d22666c26d6d9796d0f484bb4beb265dea42 100644
> --- a/net/ieee802154/6lowpan/reassembly.c
> +++ b/net/ieee802154/6lowpan/reassembly.c
> @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
> static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
> struct sk_buff *prev, struct net_device *ldev);
>
> -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
> - const struct ieee802154_addr *saddr,
> - const struct ieee802154_addr *daddr)
> -{
> - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
> - return jhash_3words(ieee802154_addr_hash(saddr),
> - ieee802154_addr_hash(daddr),
> - (__force u32)(tag + (d_size << 16)),
> - lowpan_frags.rnd);
> -}
> -
> -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
> -{
> - const struct lowpan_frag_queue *fq;
> -
> - fq = container_of(q, struct lowpan_frag_queue, q);
> - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
> -}
> -
> -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> - const struct lowpan_frag_queue *fq;
> - const struct lowpan_create_arg *arg = a;
> -
> - fq = container_of(q, struct lowpan_frag_queue, q);
> - return fq->tag == arg->tag && fq->d_size == arg->d_size &&
> - ieee802154_addr_equal(&fq->saddr, arg->src) &&
> - ieee802154_addr_equal(&fq->daddr, arg->dst);
> -}
> -
> static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
> {
> - const struct lowpan_create_arg *arg = a;
> + const struct frag_lowpan_compare_key *key = a;
> struct lowpan_frag_queue *fq;
>
> fq = container_of(q, struct lowpan_frag_queue, q);
>
> - fq->tag = arg->tag;
> - fq->d_size = arg->d_size;
> - fq->saddr = *arg->src;
> - fq->daddr = *arg->dst;
> + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
> + memcpy(&q->key, key, sizeof(*key));
> }
>
> static void lowpan_frag_expire(struct timer_list *t)
> @@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
> const struct ieee802154_addr *src,
> const struct ieee802154_addr *dst)
> {
> - struct inet_frag_queue *q;
> - struct lowpan_create_arg arg;
> - unsigned int hash;
> struct netns_ieee802154_lowpan *ieee802154_lowpan =
> net_ieee802154_lowpan(net);
> + struct frag_lowpan_compare_key key = {
> + .tag = cb->d_tag,
> + .d_size = cb->d_size,
> + .src = *src,
> + .dst = *dst,
> + };
> + struct inet_frag_queue *q;
>
> - arg.tag = cb->d_tag;
> - arg.d_size = cb->d_size;
> - arg.src = src;
> - arg.dst = dst;
> -
> - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
> -
> - q = inet_frag_find(&ieee802154_lowpan->frags,
> - &lowpan_frags, &arg, hash);
> - if (IS_ERR_OR_NULL(q)) {
> - inet_frag_maybe_warn_overflow(q, pr_fmt());
> + q = inet_frag_find(&ieee802154_lowpan->frags, &key);
> + if (IS_ERR_OR_NULL(q))
> return NULL;
> - }
> +
> return container_of(q, struct lowpan_frag_queue, q);
> }
>
> @@ -588,6 +551,7 @@ static int __net_init lowpan_frags_init_net(struct net *net)
> ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
> ieee802154_lowpan->frags.f = &lowpan_frags;
>
> + ieee802154_lowpan->frags.f = &lowpan_frags;
> res = inet_frags_init_net(&ieee802154_lowpan->frags);
> if (res < 0)
> return res;
> @@ -611,6 +575,36 @@ static struct pernet_operations lowpan_frags_ops = {
> .exit = lowpan_frags_exit_net,
> };
>
> +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> + return jhash2(data,
> + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> + const struct inet_frag_queue *fq = data;
> +
> + return jhash2((const u32 *)&fq->key,
> + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> + const struct frag_lowpan_compare_key *key = arg->key;
> + const struct inet_frag_queue *fq = ptr;
> +
> + return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +const struct rhashtable_params lowpan_rhash_params = {
> + .head_offset = offsetof(struct inet_frag_queue, node),
> + .hashfn = lowpan_key_hashfn,
> + .obj_hashfn = lowpan_obj_hashfn,
> + .obj_cmpfn = lowpan_obj_cmpfn,
> + .automatic_shrinking = true,
> +};
> +
> int __init lowpan_net_frag_init(void)
> {
> int ret;
> @@ -619,22 +613,24 @@ int __init lowpan_net_frag_init(void)
> if (ret)
> return ret;
>
> - ret = register_pernet_subsys(&lowpan_frags_ops);
> - if (ret)
> - goto err_pernet;
> -
> - lowpan_frags.hashfn = lowpan_hashfn;
> lowpan_frags.constructor = lowpan_frag_init;
> lowpan_frags.destructor = NULL;
> lowpan_frags.qsize = sizeof(struct frag_queue);
> - lowpan_frags.match = lowpan_frag_match;
> lowpan_frags.frag_expire = lowpan_frag_expire;
> lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
> + lowpan_frags.rhash_params = lowpan_rhash_params;
> ret = inet_frags_init(&lowpan_frags);
> if (ret)
> goto err_pernet;
>
> + ret = register_pernet_subsys(&lowpan_frags_ops);
> + if (ret)
> + goto err_pernet_frags;
> +
Can't we move this register_pernet_subsys() into a separate patch?
> return ret;
> +
> +err_pernet_frags:
> + inet_frags_fini(&lowpan_frags);
> err_pernet:
> lowpan_frags_sysctl_unregister();
> return ret;
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 1ac69f65d0dee600d0ab4db20ff5942952932c40..8ccaf605630f14270996ee1b5a37376299d78661 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -25,12 +25,6 @@
> #include <net/inet_frag.h>
> #include <net/inet_ecn.h>
>
> -#define INETFRAGS_EVICT_BUCKETS 128
> -#define INETFRAGS_EVICT_MAX 512
> -
> -/* don't rebuild inetfrag table with new secret more often than this */
> -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
> -
> /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
> * Value : 0xff if frame should be dropped.
> * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
> @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
> };
> EXPORT_SYMBOL(ip_frag_ecn_table);
>
> -static unsigned int
> -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
> -{
> - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
> -}
> -
> -static bool inet_frag_may_rebuild(struct inet_frags *f)
> -{
> - return time_after(jiffies,
> - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
> -}
> -
> -static void inet_frag_secret_rebuild(struct inet_frags *f)
> -{
> - int i;
> -
> - write_seqlock_bh(&f->rnd_seqlock);
> -
> - if (!inet_frag_may_rebuild(f))
> - goto out;
> -
> - get_random_bytes(&f->rnd, sizeof(u32));
> -
> - for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> - struct inet_frag_bucket *hb;
> - struct inet_frag_queue *q;
> - struct hlist_node *n;
> -
> - hb = &f->hash[i];
> - spin_lock(&hb->chain_lock);
> -
> - hlist_for_each_entry_safe(q, n, &hb->chain, list) {
> - unsigned int hval = inet_frag_hashfn(f, q);
> -
> - if (hval != i) {
> - struct inet_frag_bucket *hb_dest;
> -
> - hlist_del(&q->list);
> -
> - /* Relink to new hash chain. */
> - hb_dest = &f->hash[hval];
> -
> - /* This is the only place where we take
> - * another chain_lock while already holding
> - * one. As this will not run concurrently,
> - * we cannot deadlock on hb_dest lock below, if its
> - * already locked it will be released soon since
> - * other caller cannot be waiting for hb lock
> - * that we've taken above.
> - */
> - spin_lock_nested(&hb_dest->chain_lock,
> - SINGLE_DEPTH_NESTING);
> - hlist_add_head(&q->list, &hb_dest->chain);
> - spin_unlock(&hb_dest->chain_lock);
> - }
> - }
> - spin_unlock(&hb->chain_lock);
> - }
> -
> - f->rebuild = false;
> - f->last_rebuild_jiffies = jiffies;
> -out:
> - write_sequnlock_bh(&f->rnd_seqlock);
> -}
> -
> -static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
> -{
> - if (!hlist_unhashed(&q->list_evictor))
> - return false;
> -
> - return q->net->low_thresh == 0 ||
> - frag_mem_limit(q->net) >= q->net->low_thresh;
> -}
> -
> -static unsigned int
> -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
> -{
> - struct inet_frag_queue *fq;
> - struct hlist_node *n;
> - unsigned int evicted = 0;
> - HLIST_HEAD(expired);
> -
> - spin_lock(&hb->chain_lock);
> -
> - hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
> - if (!inet_fragq_should_evict(fq))
> - continue;
> -
> - if (!del_timer(&fq->timer))
> - continue;
> -
> - hlist_add_head(&fq->list_evictor, &expired);
> - ++evicted;
> - }
> -
> - spin_unlock(&hb->chain_lock);
> -
> - hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
> - f->frag_expire(&fq->timer);
> -
> - return evicted;
> -}
> -
> -static void inet_frag_worker(struct work_struct *work)
> -{
> - unsigned int budget = INETFRAGS_EVICT_BUCKETS;
> - unsigned int i, evicted = 0;
> - struct inet_frags *f;
> -
> - f = container_of(work, struct inet_frags, frags_work);
> -
> - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
> -
> - local_bh_disable();
> -
> - for (i = READ_ONCE(f->next_bucket); budget; --budget) {
> - evicted += inet_evict_bucket(f, &f->hash[i]);
> - i = (i + 1) & (INETFRAGS_HASHSZ - 1);
> - if (evicted > INETFRAGS_EVICT_MAX)
> - break;
> - }
> -
> - f->next_bucket = i;
> -
> - local_bh_enable();
> -
> - if (f->rebuild && inet_frag_may_rebuild(f))
> - inet_frag_secret_rebuild(f);
> -}
> -
> -static void inet_frag_schedule_worker(struct inet_frags *f)
> -{
> - if (unlikely(!work_pending(&f->frags_work)))
> - schedule_work(&f->frags_work);
> -}
> -
> int inet_frags_init(struct inet_frags *f)
> {
> - int i;
> -
> - INIT_WORK(&f->frags_work, inet_frag_worker);
> -
> - for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> - struct inet_frag_bucket *hb = &f->hash[i];
> -
> - spin_lock_init(&hb->chain_lock);
> - INIT_HLIST_HEAD(&hb->chain);
> - }
> -
> - seqlock_init(&f->rnd_seqlock);
> - f->last_rebuild_jiffies = 0;
> f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
> NULL);
> if (!f->frags_cachep)
> @@ -214,93 +59,80 @@ EXPORT_SYMBOL(inet_frags_init);
>
> void inet_frags_fini(struct inet_frags *f)
> {
> - cancel_work_sync(&f->frags_work);
> + rcu_barrier();
What does this barrier wait for? This should have a comment.
> kmem_cache_destroy(f->frags_cachep);
> + f->frags_cachep = NULL;
> }
> EXPORT_SYMBOL(inet_frags_fini);
>
> void inet_frags_exit_net(struct netns_frags *nf)
> {
> - struct inet_frags *f =nf->f;
> - unsigned int seq;
> - int i;
> -
> - nf->low_thresh = 0;
> + struct rhashtable_iter hti;
> + struct inet_frag_queue *fq;
>
> -evict_again:
> - local_bh_disable();
> - seq = read_seqbegin(&f->rnd_seqlock);
> + nf->low_thresh = 0; /* prevent creation of new frags */
>
> - for (i = 0; i < INETFRAGS_HASHSZ ; i++)
> - inet_evict_bucket(f, &f->hash[i]);
> + rhashtable_walk_enter(&nf->rhashtable, &hti);
> + do {
> + rhashtable_walk_start(&hti);
>
> - local_bh_enable();
> - cond_resched();
> + while ((fq = rhashtable_walk_next(&hti)) && !IS_ERR(fq)) {
> + if (refcount_inc_not_zero(&fq->refcnt)) {
> + spin_lock_bh(&fq->lock);
> + inet_frag_kill(fq);
> + spin_unlock_bh(&fq->lock);
> + inet_frag_put(fq);
> + }
> + }
>
> - if (read_seqretry(&f->rnd_seqlock, seq) ||
> - sum_frag_mem_limit(nf))
> - goto evict_again;
> + rhashtable_walk_stop(&hti);
> + } while (cond_resched(), fq == ERR_PTR(-EAGAIN));
> + rhashtable_walk_exit(&hti);
> + rhashtable_destroy(&nf->rhashtable);
> }
> EXPORT_SYMBOL(inet_frags_exit_net);
>
> -static struct inet_frag_bucket *
> -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
> -__acquires(hb->chain_lock)
> -{
> - struct inet_frag_bucket *hb;
> - unsigned int seq, hash;
> -
> - restart:
> - seq = read_seqbegin(&f->rnd_seqlock);
> -
> - hash = inet_frag_hashfn(f, fq);
> - hb = &f->hash[hash];
> -
> - spin_lock(&hb->chain_lock);
> - if (read_seqretry(&f->rnd_seqlock, seq)) {
> - spin_unlock(&hb->chain_lock);
> - goto restart;
> - }
> -
> - return hb;
> -}
> -
> -static inline void fq_unlink(struct inet_frag_queue *fq)
> -{
> - struct inet_frag_bucket *hb;
> -
> - hb = get_frag_bucket_locked(fq, fq->net->f);
> - hlist_del(&fq->list);
> - fq->flags |= INET_FRAG_COMPLETE;
> - spin_unlock(&hb->chain_lock);
> -}
> -
> void inet_frag_kill(struct inet_frag_queue *fq)
> {
> if (del_timer(&fq->timer))
> refcount_dec(&fq->refcnt);
>
> if (!(fq->flags & INET_FRAG_COMPLETE)) {
> - fq_unlink(fq);
> + struct netns_frags *nf = fq->net;
> +
> + fq->flags |= INET_FRAG_COMPLETE;
> + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
> refcount_dec(&fq->refcnt);
> }
> }
> EXPORT_SYMBOL(inet_frag_kill);
>
> -void inet_frag_destroy(struct inet_frag_queue *q)
> +static void inet_frag_destroy_rcu(struct rcu_head *head)
> {
> - struct sk_buff *fp;
> + struct inet_frag_queue *fq = container_of(head, struct inet_frag_queue,
> + rcu);
> + struct inet_frags *f = fq->net->f;
> +
> + if (f->destructor)
> + f->destructor(fq);
> + kmem_cache_free(f->frags_cachep, fq);
> +}
> +
> +void inet_frag_destroy(struct inet_frag_queue *fq)
> +{
> + unsigned int sum_truesize;
> struct netns_frags *nf;
> - unsigned int sum, sum_truesize = 0;
> struct inet_frags *f;
> + struct sk_buff *fp;
>
> - WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
> - WARN_ON(del_timer(&q->timer) != 0);
> + WARN_ON(!(fq->flags & INET_FRAG_COMPLETE));
> + WARN_ON(del_timer(&fq->timer) != 0);
This is actually a result of renaming the variable.
The type of the variable remains the same, while the name
has changed. So, this should go in a separate patch.
>
> /* Release all fragment data. */
> - fp = q->fragments;
> - nf = q->net;
> + fp = fq->fragments;
> + nf = fq->net;
The same here
> f = nf->f;
> + sum_truesize = f->qsize;
> while (fp) {
> struct sk_buff *xp = fp->next;
>
> @@ -308,136 +140,63 @@ void inet_frag_destroy(struct inet_frag_queue *q)
> kfree_skb(fp);
> fp = xp;
> }
> - sum = sum_truesize + f->qsize;
>
> - if (f->destructor)
> - f->destructor(q);
> - kmem_cache_free(f->frags_cachep, q);
> + call_rcu(&fq->rcu, inet_frag_destroy_rcu);
>
> - sub_frag_mem_limit(nf, sum);
> + sub_frag_mem_limit(nf, sum_truesize);
> }
> EXPORT_SYMBOL(inet_frag_destroy);
>
> -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
> - struct inet_frag_queue *qp_in,
> - struct inet_frags *f,
> +static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
> void *arg)
> {
> - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
> - struct inet_frag_queue *qp;
> -
> -#ifdef CONFIG_SMP
> - /* With SMP race we have to recheck hash table, because
> - * such entry could have been created on other cpu before
> - * we acquired hash bucket lock.
> - */
> - hlist_for_each_entry(qp, &hb->chain, list) {
> - if (qp->net == nf && f->match(qp, arg)) {
> - refcount_inc(&qp->refcnt);
> - spin_unlock(&hb->chain_lock);
> - qp_in->flags |= INET_FRAG_COMPLETE;
> - inet_frag_put(qp_in);
> - return qp;
> - }
> - }
> -#endif
> - qp = qp_in;
> - if (!mod_timer(&qp->timer, jiffies + nf->timeout))
> - refcount_inc(&qp->refcnt);
> -
> - refcount_inc(&qp->refcnt);
> - hlist_add_head(&qp->list, &hb->chain);
> -
> - spin_unlock(&hb->chain_lock);
> -
> - return qp;
> -}
> -
> -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
> - struct inet_frags *f,
> - void *arg)
> -{
> - struct inet_frag_queue *q;
> + struct inet_frags *f = nf->f;
> + struct inet_frag_queue *fq;
> + int err;
>
> - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
> - inet_frag_schedule_worker(f);
> + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
> return NULL;
> - }
>
> - q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
> - if (!q)
> + fq = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
> + if (!fq)
> return NULL;
Here we also renamed a variable and merged two functions: inet_frag_alloc()
into inet_frag_create(). Can't we do that in a separate patch?
Also, note that git (at least mine) generates a better diff for this hunk
with the following options in .gitconfig:
[diff "default"]
algorithm = patience
[diff]
algorithm = patience
>
> - q->net = nf;
> - f->constructor(q, arg);
> - add_frag_mem_limit(nf, f->qsize);
> -
> - timer_setup(&q->timer, f->frag_expire, 0);
> - spin_lock_init(&q->lock);
> - refcount_set(&q->refcnt, 1);
> -
> - return q;
> -}
> + fq->net = nf;
> + f->constructor(fq, arg);
Also renaming.
>
> -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
> - struct inet_frags *f,
> - void *arg)
> -{
> - struct inet_frag_queue *q;
> + timer_setup(&fq->timer, f->frag_expire, 0);
> + spin_lock_init(&fq->lock);
> + refcount_set(&fq->refcnt, 3);
> + mod_timer(&fq->timer, jiffies + nf->timeout);
>
> - q = inet_frag_alloc(nf, f, arg);
> - if (!q)
> + err = rhashtable_insert_fast(&nf->rhashtable, &fq->node,
> + f->rhash_params);
> + add_frag_mem_limit(nf, f->qsize);
> + if (err < 0) {
> + fq->flags |= INET_FRAG_COMPLETE;
> + inet_frag_kill(fq);
> + inet_frag_destroy(fq);
> return NULL;
> -
> - return inet_frag_intern(nf, q, f, arg);
> + }
> + return fq;
> }
>
> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> - struct inet_frags *f, void *key,
> - unsigned int hash)
> +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
> {
> - struct inet_frag_bucket *hb;
> - struct inet_frag_queue *q;
> - int depth = 0;
> -
> - if (frag_mem_limit(nf) > nf->low_thresh)
> - inet_frag_schedule_worker(f);
> -
> - hash &= (INETFRAGS_HASHSZ - 1);
> - hb = &f->hash[hash];
> -
> - spin_lock(&hb->chain_lock);
> - hlist_for_each_entry(q, &hb->chain, list) {
> - if (q->net == nf && f->match(q, key)) {
> - refcount_inc(&q->refcnt);
> - spin_unlock(&hb->chain_lock);
> - return q;
> - }
> - depth++;
> - }
> - spin_unlock(&hb->chain_lock);
> + struct inet_frag_queue *fq;
>
> - if (depth <= INETFRAGS_MAXDEPTH)
> - return inet_frag_create(nf, f, key);
> + rcu_read_lock();
>
> - if (inet_frag_may_rebuild(f)) {
> - if (!f->rebuild)
> - f->rebuild = true;
> - inet_frag_schedule_worker(f);
> + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
> + if (fq) {
> + if (!refcount_inc_not_zero(&fq->refcnt))
> + fq = NULL;
> + rcu_read_unlock();
> + return fq;
> }
> + rcu_read_unlock();
>
> - return ERR_PTR(-ENOBUFS);
> + return inet_frag_create(nf, key);
> }
> EXPORT_SYMBOL(inet_frag_find);
> -
> -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
> - const char *prefix)
> -{
> - static const char msg[] = "inet_frag_find: Fragment hash bucket"
> - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
> - ". Dropping fragment.\n";
> -
> - if (PTR_ERR(q) == -ENOBUFS)
> - net_dbg_ratelimited("%s%s", prefix, msg);
> -}
> -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index cd2b4c9419fc1552d367b572926e314b11cb6c00..1a7423e8ec0a8f88782ad8c945dc0cd6046f79f0 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -69,15 +69,9 @@ struct ipfrag_skb_cb
> struct ipq {
> struct inet_frag_queue q;
>
> - u32 user;
> - __be32 saddr;
> - __be32 daddr;
> - __be16 id;
> - u8 protocol;
> u8 ecn; /* RFC3168 support */
> u16 max_df_size; /* largest frag with DF set seen */
> int iif;
> - int vif; /* L3 master device index */
> unsigned int rid;
> struct inet_peer *peer;
> };
> @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net)
> static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
> struct net_device *dev);
>
> -struct ip4_create_arg {
> - struct iphdr *iph;
> - u32 user;
> - int vif;
> -};
> -
> -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
> -{
> - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
> - return jhash_3words((__force u32)id << 16 | prot,
> - (__force u32)saddr, (__force u32)daddr,
> - ip4_frags.rnd);
> -}
> -
> -static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
> -{
> - const struct ipq *ipq;
> -
> - ipq = container_of(q, struct ipq, q);
> - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
> -}
> -
> -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> - const struct ipq *qp;
> - const struct ip4_create_arg *arg = a;
> -
> - qp = container_of(q, struct ipq, q);
> - return qp->id == arg->iph->id &&
> - qp->saddr == arg->iph->saddr &&
> - qp->daddr == arg->iph->daddr &&
> - qp->protocol == arg->iph->protocol &&
> - qp->user == arg->user &&
> - qp->vif == arg->vif;
> -}
>
> static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
> {
> @@ -140,37 +99,23 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
> frags);
> struct net *net = container_of(ipv4, struct net, ipv4);
>
> - const struct ip4_create_arg *arg = a;
> + const struct frag_v4_compare_key *key = a;
>
> - qp->protocol = arg->iph->protocol;
> - qp->id = arg->iph->id;
> - qp->ecn = ip4_frag_ecn(arg->iph->tos);
> - qp->saddr = arg->iph->saddr;
> - qp->daddr = arg->iph->daddr;
> - qp->vif = arg->vif;
> - qp->user = arg->user;
> + q->key.v4 = *key;
> + qp->ecn = 0;
> qp->peer = q->net->max_dist ?
> - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
> + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
> NULL;
> }
>
> -static void ip4_frag_free(struct inet_frag_queue *q)
> +static void ip4_frag_destructor(struct inet_frag_queue *q)
This just renames the function
> {
> - struct ipq *qp;
> + struct ipq *qp = container_of(q, struct ipq, q);
>
> - qp = container_of(q, struct ipq, q);
This is also just a refactoring
> if (qp->peer)
> inet_putpeer(qp->peer);
> }
>
> -
> -/* Destruction primitives. */
> -
> -static void ipq_put(struct ipq *ipq)
> -{
> - inet_frag_put(&ipq->q);
> -}
> -
> /* Kill ipq entry. It is not destroyed immediately,
> * because caller (and someone more) holds reference count.
> */
> @@ -198,25 +143,25 @@ static void ip_expire(struct timer_list *t)
> struct net *net;
>
> qp = container_of(frag, struct ipq, q);
> - net = container_of(qp->q.net, struct net, ipv4.frags);
> + net = container_of(frag->net, struct net, ipv4.frags);
>
> rcu_read_lock();
> - spin_lock(&qp->q.lock);
> + spin_lock(&frag->lock);
>
> - if (qp->q.flags & INET_FRAG_COMPLETE)
> + if (frag->flags & INET_FRAG_COMPLETE)
> goto out;
>
> ipq_kill(qp);
> __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
>
> - if (!inet_frag_evicting(&qp->q)) {
> - struct sk_buff *clone, *head = qp->q.fragments;
> + if (true) {
This does not look good to me. It would be better to move this
into a separate function.
> + struct sk_buff *clone, *head = frag->fragments;
> const struct iphdr *iph;
> int err;
>
> __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
>
> - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
> + if (!(frag->flags & INET_FRAG_FIRST_IN) || !frag->fragments)
> goto out;
>
> head->dev = dev_get_by_index_rcu(net, qp->iif);
> @@ -234,7 +179,7 @@ static void ip_expire(struct timer_list *t)
> /* Only an end host needs to send an ICMP
> * "Fragment Reassembly Timeout" message, per RFC792.
> */
> - if (frag_expire_skip_icmp(qp->user) &&
> + if (frag_expire_skip_icmp(frag->key.v4.user) &&
> (skb_rtable(head)->rt_type != RTN_LOCAL))
> goto out;
>
> @@ -242,7 +187,7 @@ static void ip_expire(struct timer_list *t)
>
> /* Send an ICMP "Fragment Reassembly Timeout" message. */
> if (clone) {
> - spin_unlock(&qp->q.lock);
> + spin_unlock(&frag->lock);
> icmp_send(clone, ICMP_TIME_EXCEEDED,
> ICMP_EXC_FRAGTIME, 0);
> consume_skb(clone);
> @@ -250,33 +195,32 @@ static void ip_expire(struct timer_list *t)
> }
> }
> out:
> - spin_unlock(&qp->q.lock);
> + spin_unlock(&frag->lock);
> out_rcu_unlock:
> rcu_read_unlock();
> - ipq_put(qp);
> + inet_frag_put(frag);
> }
>
> /* Find the correct entry in the "incomplete datagrams" queue for
> * this IP datagram, and create new one, if nothing is found.
> */
> -static struct ipq *ip_find(struct net *net, struct iphdr *iph,
> +static struct ipq *ip_find(struct net *net, const struct iphdr *iph,
> u32 user, int vif)
> {
> + struct frag_v4_compare_key key = {
> + .saddr = iph->saddr,
> + .daddr = iph->daddr,
> + .user = user,
> + .vif = vif,
> + .id = iph->id,
> + .protocol = iph->protocol,
> + };
> struct inet_frag_queue *q;
> - struct ip4_create_arg arg;
> - unsigned int hash;
>
> - arg.iph = iph;
> - arg.user = user;
> - arg.vif = vif;
> -
> - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
> -
> - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
> - if (IS_ERR_OR_NULL(q)) {
> - inet_frag_maybe_warn_overflow(q, pr_fmt());
> + q = inet_frag_find(&net->ipv4.frags, &key);
> + if (!q)
> return NULL;
> - }
> +
> return container_of(q, struct ipq, q);
> }
>
> @@ -310,8 +254,8 @@ static int ip_frag_too_far(struct ipq *qp)
>
> static int ip_frag_reinit(struct ipq *qp)
> {
> - struct sk_buff *fp;
> unsigned int sum_truesize = 0;
> + struct sk_buff *fp;
This just moves the line...
>
> if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
> refcount_inc(&qp->q.refcnt);
> @@ -652,11 +596,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
> return 0;
>
> out_nomem:
> - net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
> + net_dbg_ratelimited("queue_glue: no memory for gluing queue\n");
Since this is not a parameter type change but a complete removal of the
parameter, we could do that in a refactoring patch (together with the above).
> err = -ENOMEM;
> goto out_fail;
> out_oversize:
> - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
> + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
> out_fail:
> __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
> return err;
> @@ -682,7 +626,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
> ret = ip_frag_queue(qp, skb);
>
> spin_unlock(&qp->q.lock);
> - ipq_put(qp);
> + inet_frag_put(&qp->q);
> return ret;
> }
>
> @@ -894,17 +838,52 @@ static struct pernet_operations ip4_frags_ops = {
> .exit = ipv4_frags_exit_net,
> };
>
> +
> +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> + return jhash2(data,
> + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> + const struct inet_frag_queue *fq = data;
> +
> + return jhash2((const u32 *)&fq->key.v4,
> + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> + const struct frag_v4_compare_key *key = arg->key;
> + const struct inet_frag_queue *fq = ptr;
> +
> + return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +static const struct rhashtable_params ip4_rhash_params = {
> + .head_offset = offsetof(struct inet_frag_queue, node),
> + .key_offset = offsetof(struct inet_frag_queue, key),
> + .key_len = sizeof(struct frag_v4_compare_key),
> + .hashfn = ip4_key_hashfn,
> + .obj_hashfn = ip4_obj_hashfn,
> + .obj_cmpfn = ip4_obj_cmpfn,
> + .automatic_shrinking = true,
> +};
> +
> void __init ipfrag_init(void)
> {
> - ip4_frags_ctl_register();
> - register_pernet_subsys(&ip4_frags_ops);
> - ip4_frags.hashfn = ip4_hashfn;
> ip4_frags.constructor = ip4_frag_init;
> - ip4_frags.destructor = ip4_frag_free;
> + ip4_frags.destructor = ip4_frag_destructor;
This just reflects the fact that we removed the function.
> ip4_frags.qsize = sizeof(struct ipq);
> - ip4_frags.match = ip4_frag_match;
> ip4_frags.frag_expire = ip_expire;
> ip4_frags.frags_cache_name = ip_frag_cache_name;
> + ip4_frags.rhash_params = ip4_rhash_params;
> +
> if (inet_frags_init(&ip4_frags))
> panic("IP: failed to allocate ip4_frags cache\n");
> +
> + ip4_frags_ctl_register();
> + register_pernet_subsys(&ip4_frags_ops);
We may consider doing this move in a refactoring patch.
> +
> }
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index f69b7ca52727c814eb2887c9deb9f356c56e5442..53859311dea96c03fa5ae8456de32de25009efbe 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
> return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
> }
>
> -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
> - const struct in6_addr *daddr)
> -{
> - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
> - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
> - (__force u32)id, nf_frags.rnd);
> -}
> -
> -
> -static unsigned int nf_hashfn(const struct inet_frag_queue *q)
> -{
> - const struct frag_queue *nq;
> -
> - nq = container_of(q, struct frag_queue, q);
> - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
> -}
> -
> static void nf_ct_frag6_expire(struct timer_list *t)
> {
> struct inet_frag_queue *frag = from_timer(frag, t, timer);
> @@ -178,34 +161,26 @@ static void nf_ct_frag6_expire(struct timer_list *t)
> fq = container_of(frag, struct frag_queue, q);
> net = container_of(fq->q.net, struct net, nf_frag.frags);
>
> - ip6_expire_frag_queue(net, fq, &nf_frags);
> + ip6_expire_frag_queue(net, fq);
> }
>
> /* Creation primitives. */
> -static inline struct frag_queue *fq_find(struct net *net, __be32 id,
> - u32 user, struct in6_addr *src,
> - struct in6_addr *dst, int iif, u8 ecn)
> +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
> + const struct ipv6hdr *hdr, int iif)
> {
> + struct frag_v6_compare_key key = {
> + .id = id,
> + .saddr = hdr->saddr,
> + .daddr = hdr->daddr,
> + .user = user,
> + .iif = iif,
> + };
> struct inet_frag_queue *q;
> - struct ip6_create_arg arg;
> - unsigned int hash;
> -
> - arg.id = id;
> - arg.user = user;
> - arg.src = src;
> - arg.dst = dst;
> - arg.iif = iif;
> - arg.ecn = ecn;
> -
> - local_bh_disable();
> - hash = nf_hash_frag(id, src, dst);
> -
> - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
> - local_bh_enable();
> - if (IS_ERR_OR_NULL(q)) {
> - inet_frag_maybe_warn_overflow(q, pr_fmt());
> +
> + q = inet_frag_find(&net->nf_frag.frags, &key);
> + if (!q)
> return NULL;
> - }
> +
> return container_of(q, struct frag_queue, q);
> }
>
> @@ -593,8 +568,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
> fhdr = (struct frag_hdr *)skb_transport_header(skb);
>
> skb_orphan(skb);
> - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
> - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> + fq = fq_find(net, fhdr->identification, user, hdr,
> + skb->dev ? skb->dev->ifindex : 0);
> if (fq == NULL) {
> pr_debug("Can't find and can't create new queue\n");
> return -ENOMEM;
> @@ -656,17 +631,18 @@ static struct pernet_operations nf_ct_net_ops = {
> .exit = nf_ct_net_exit,
> };
>
> +extern const struct rhashtable_params ip6_rhash_params;
Can't we put this declaration in a header file?
> +
> int nf_ct_frag6_init(void)
> {
> int ret = 0;
>
> - nf_frags.hashfn = nf_hashfn;
> nf_frags.constructor = ip6_frag_init;
> nf_frags.destructor = NULL;
> nf_frags.qsize = sizeof(struct frag_queue);
> - nf_frags.match = ip6_frag_match;
> nf_frags.frag_expire = nf_ct_frag6_expire;
> nf_frags.frags_cache_name = nf_frags_cache_name;
> + nf_frags.rhash_params = ip6_rhash_params;
> ret = inet_frags_init(&nf_frags);
> if (ret)
> goto out;
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index 8cfea13a179c6f048177ac91fe26c8a5565e5820..737b0921ab0c9af198fefdf06d8f4ede91c7f3f6 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -79,59 +79,19 @@ static struct inet_frags ip6_frags;
> static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
> struct net_device *dev);
>
> -/*
> - * callers should be careful not to use the hash value outside the ipfrag_lock
> - * as doing so could race with ipfrag_hash_rnd being recalculated.
> - */
> -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
> - const struct in6_addr *daddr)
> -{
> - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
> - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
> - (__force u32)id, ip6_frags.rnd);
> -}
> -
> -static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
> -{
> - const struct frag_queue *fq;
> -
> - fq = container_of(q, struct frag_queue, q);
> - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
> -}
> -
> -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> - const struct frag_queue *fq;
> - const struct ip6_create_arg *arg = a;
> -
> - fq = container_of(q, struct frag_queue, q);
> - return fq->id == arg->id &&
> - fq->user == arg->user &&
> - ipv6_addr_equal(&fq->saddr, arg->src) &&
> - ipv6_addr_equal(&fq->daddr, arg->dst) &&
> - (arg->iif == fq->iif ||
> - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
> - IPV6_ADDR_LINKLOCAL)));
> -}
> -EXPORT_SYMBOL(ip6_frag_match);
> -
> void ip6_frag_init(struct inet_frag_queue *q, const void *a)
> {
> struct frag_queue *fq = container_of(q, struct frag_queue, q);
> - const struct ip6_create_arg *arg = a;
> + const struct frag_v6_compare_key *key = a;
>
> - fq->id = arg->id;
> - fq->user = arg->user;
> - fq->saddr = *arg->src;
> - fq->daddr = *arg->dst;
> - fq->ecn = arg->ecn;
> + q->key.v6 = *key;
> + fq->ecn = 0;
> }
> EXPORT_SYMBOL(ip6_frag_init);
>
> -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
> - struct inet_frags *frags)
> +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
> {
> - struct net_device *dev = NULL;
> + struct net_device *dev;
>
> spin_lock(&fq->q.lock);
>
> @@ -146,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
> goto out_rcu_unlock;
>
> __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
> -
> - if (inet_frag_evicting(&fq->q))
> - goto out_rcu_unlock;
> -
> __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
>
> /* Don't send error if the first segment did not arrive. */
> @@ -179,31 +135,29 @@ static void ip6_frag_expire(struct timer_list *t)
> fq = container_of(frag, struct frag_queue, q);
> net = container_of(fq->q.net, struct net, ipv6.frags);
>
> - ip6_expire_frag_queue(net, fq, &ip6_frags);
> + ip6_expire_frag_queue(net, fq);
> }
>
> static struct frag_queue *
> -fq_find(struct net *net, __be32 id, const struct in6_addr *src,
> - const struct in6_addr *dst, int iif, u8 ecn)
> +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
> {
> + struct frag_v6_compare_key key = {
> + .id = id,
> + .saddr = hdr->saddr,
> + .daddr = hdr->daddr,
> + .user = IP6_DEFRAG_LOCAL_DELIVER,
> + .iif = iif,
> + };
> struct inet_frag_queue *q;
> - struct ip6_create_arg arg;
> - unsigned int hash;
>
> - arg.id = id;
> - arg.user = IP6_DEFRAG_LOCAL_DELIVER;
> - arg.src = src;
> - arg.dst = dst;
> - arg.iif = iif;
> - arg.ecn = ecn;
> + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
> + IPV6_ADDR_LINKLOCAL)))
> + key.iif = 0;
>
> - hash = inet6_hash_frag(id, src, dst);
> -
> - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
> - if (IS_ERR_OR_NULL(q)) {
> - inet_frag_maybe_warn_overflow(q, pr_fmt());
> + q = inet_frag_find(&net->ipv6.frags, &key);
> + if (!q)
> return NULL;
> - }
> +
> return container_of(q, struct frag_queue, q);
> }
>
> @@ -527,10 +481,11 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
>
> static int ipv6_frag_rcv(struct sk_buff *skb)
> {
> + struct net *net = dev_net(skb_dst(skb)->dev);
> + const struct ipv6hdr *hdr = ipv6_hdr(skb);
> struct frag_hdr *fhdr;
> struct frag_queue *fq;
> - const struct ipv6hdr *hdr = ipv6_hdr(skb);
> - struct net *net = dev_net(skb_dst(skb)->dev);
> + int iif;
>
> if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
> goto fail_hdr;
> @@ -559,13 +514,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
> return 1;
> }
>
> - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
> - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> + iif = skb->dev ? skb->dev->ifindex : 0;
> + fq = fq_find(net, fhdr->identification, hdr, iif);
> if (fq) {
> int ret;
>
> spin_lock(&fq->q.lock);
>
> + fq->iif = iif;
> ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
>
> spin_unlock(&fq->q.lock);
> @@ -718,6 +674,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
> net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
> net->ipv6.frags.f = &ip6_frags;
>
> + net->ipv6.frags.f = &ip6_frags;
This assignment is already made above; we shouldn't do it twice.
> res = inet_frags_init_net(&net->ipv6.frags);
> if (res < 0)
> return res;
> @@ -739,14 +696,55 @@ static struct pernet_operations ip6_frags_ops = {
> .exit = ipv6_frags_exit_net,
> };
>
> +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> + return jhash2(data,
> + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> + const struct inet_frag_queue *fq = data;
> +
> + return jhash2((const u32 *)&fq->key.v6,
> + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> + const struct frag_v6_compare_key *key = arg->key;
> + const struct inet_frag_queue *fq = ptr;
> +
> + return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +const struct rhashtable_params ip6_rhash_params = {
> + .head_offset = offsetof(struct inet_frag_queue, node),
> + .hashfn = ip6_key_hashfn,
> + .obj_hashfn = ip6_obj_hashfn,
> + .obj_cmpfn = ip6_obj_cmpfn,
> + .automatic_shrinking = true,
> +};
> +EXPORT_SYMBOL(ip6_rhash_params);
> +
> int __init ipv6_frag_init(void)
> {
> int ret;
>
> - ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
> + ip6_frags.constructor = ip6_frag_init;
> + ip6_frags.destructor = NULL;
> + ip6_frags.qsize = sizeof(struct frag_queue);
> + ip6_frags.frag_expire = ip6_frag_expire;
> + ip6_frags.frags_cache_name = ip6_frag_cache_name;
> + ip6_frags.rhash_params = ip6_rhash_params;
> + ret = inet_frags_init(&ip6_frags);
> if (ret)
> goto out;
>
> + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
> + if (ret)
> + goto err_protocol;
> +
> ret = ip6_frags_sysctl_register();
> if (ret)
> goto err_sysctl;
> @@ -755,16 +753,6 @@ int __init ipv6_frag_init(void)
> if (ret)
> goto err_pernet;
>
> - ip6_frags.hashfn = ip6_hashfn;
> - ip6_frags.constructor = ip6_frag_init;
> - ip6_frags.destructor = NULL;
> - ip6_frags.qsize = sizeof(struct frag_queue);
> - ip6_frags.match = ip6_frag_match;
> - ip6_frags.frag_expire = ip6_frag_expire;
> - ip6_frags.frags_cache_name = ip6_frag_cache_name;
> - ret = inet_frags_init(&ip6_frags);
> - if (ret)
> - goto err_pernet;
> out:
> return ret;
>
> @@ -772,6 +760,8 @@ int __init ipv6_frag_init(void)
> ip6_frags_sysctl_unregister();
> err_sysctl:
> inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
> +err_protocol:
> + inet_frags_fini(&ip6_frags);
> goto out;
> }
Thanks,
Kirill
Powered by blists - more mailing lists