Date:   Sat, 31 Mar 2018 01:44:58 +0300
From:   Kirill Tkhai <ktkhai@...tuozzo.com>
To:     Eric Dumazet <edumazet@...gle.com>,
        "David S . Miller" <davem@...emloft.net>
Cc:     netdev <netdev@...r.kernel.org>, Florian Westphal <fw@...len.de>,
        Herbert Xu <herbert@...dor.apana.org.au>,
        Thomas Graf <tgraf@...g.ch>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Alexander Aring <alex.aring@...il.com>,
        Stefan Schmidt <stefan@....samsung.com>,
        Eric Dumazet <eric.dumazet@...il.com>,
        Nikolay Aleksandrov <nikolay@...hat.com>
Subject: Re: [PATCH v2 net-next 08/12] inet: frags: use rhashtables for
 reassembly units

Hi, Eric,

Thanks for the additional small patches in v2. One comment below.

On 30.03.2018 23:42, Eric Dumazet wrote:
> Some applications still rely on IP fragmentation, and to be fair, the Linux
> reassembly unit does not hold up under any serious load.
> 
> It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)
> 
> A work queue is supposed to garbage-collect items when the host is under memory
> pressure, and to perform a hash rebuild, changing the seed used in hash computations.
> 
> This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
> occurring every 5 seconds if the host is under fire.
> 
> Then there is the problem of sharing this hash table across all netns.
> 
> It is time to switch to rhashtables, and to allocate one of them per netns
> to speed up netns dismantle, since this is a critical metric these days.
> 
> Lookup is now using RCU. A followup patch will even remove
> the refcount hold/release left over from the prior implementation and save
> a couple of atomic operations.
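
The RCU lookup mentioned here is the classic rhashtable_lookup() +
refcount_inc_not_zero() idiom. Roughly, a sketch with hypothetical names,
not the exact code from the patch:

struct foo {
	struct rhash_head node;
	refcount_t refcnt;
};

static struct foo *foo_find(struct rhashtable *ht, const void *key,
			    const struct rhashtable_params *params)
{
	struct foo *f;

	rcu_read_lock();
	/* rhashtable_lookup() requires the RCU read lock to be held */
	f = rhashtable_lookup(ht, key, *params);
	/* only take a reference if the object is still live */
	if (f && !refcount_inc_not_zero(&f->refcnt))
		f = NULL;
	rcu_read_unlock();
	return f;
}
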
> 
> Before this patch, 16 cpus (16 RX queue NIC) could not handle more
> than 1 Mpps of frag DDoS traffic.
> 
> After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
> of storage for the fragments.
> 
> $ grep FRAG /proc/net/sockstat
> FRAG: inuse 1966916 memory 2140004608
> 
> A followup patch will change the limits for 64bit arches.
> 
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> Cc: Florian Westphal <fw@...len.de>
> Cc: Nikolay Aleksandrov <nikolay@...hat.com>
> Cc: Jesper Dangaard Brouer <brouer@...hat.com>
> Cc: Alexander Aring <alex.aring@...il.com>
> Cc: Stefan Schmidt <stefan@....samsung.com>
> ---
>  Documentation/networking/ip-sysctl.txt  |   7 +-
>  include/net/inet_frag.h                 |  81 +++---
>  include/net/ipv6.h                      |  16 +-
>  net/ieee802154/6lowpan/6lowpan_i.h      |  26 +-
>  net/ieee802154/6lowpan/reassembly.c     |  93 +++----
>  net/ipv4/inet_fragment.c                | 352 +++++-------------------
>  net/ipv4/ip_fragment.c                  | 112 ++++----
>  net/ipv6/netfilter/nf_conntrack_reasm.c |  51 +---
>  net/ipv6/reassembly.c                   | 110 ++++----
>  9 files changed, 269 insertions(+), 579 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 33f35f049ad57ad6c06ed6e089966e346d72d108..6f2a3670e44b6662ce53c16cb7ca1e4f61274c15 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -134,13 +134,10 @@ min_adv_mss - INTEGER
>  IP Fragmentation:
>  
>  ipfrag_high_thresh - INTEGER
> -	Maximum memory used to reassemble IP fragments. When
> -	ipfrag_high_thresh bytes of memory is allocated for this purpose,
> -	the fragment handler will toss packets until ipfrag_low_thresh
> -	is reached. This also serves as a maximum limit to namespaces
> -	different from the initial one.
> +	Maximum memory used to reassemble IP fragments.
>  
>  ipfrag_low_thresh - INTEGER
> +	(Obsolete since linux-4.17)
>  	Maximum memory used to reassemble IP fragments before the kernel
>  	begins to remove incomplete fragment queues to free up resources.
>  	The kernel still accepts new fragments for defragmentation.
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index 69e531ed81894393e07cac9e953825fcb55ef42a..3fec0d3a0d0186e98afb951784e1fe7329ba6d77 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -2,7 +2,11 @@
>  #ifndef __NET_FRAG_H__
>  #define __NET_FRAG_H__
>  
> +#include <linux/rhashtable.h>
> +
>  struct netns_frags {
> +	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
> +
>  	/* Keep atomic mem on separate cachelines in structs that include it */
>  	atomic_t		mem ____cacheline_aligned_in_smp;
>  	/* sysctls */
> @@ -26,12 +30,30 @@ enum {
>  	INET_FRAG_COMPLETE	= BIT(2),
>  };
>  
> +struct frag_v4_compare_key {
> +	__be32		saddr;
> +	__be32		daddr;
> +	u32		user;
> +	u32		vif;
> +	__be16		id;
> +	u16		protocol;
> +};
> +
> +struct frag_v6_compare_key {
> +	struct in6_addr	saddr;
> +	struct in6_addr	daddr;
> +	u32		user;
> +	__be32		id;
> +	u32		iif;
> +};
> +
>  /**
>   * struct inet_frag_queue - fragment queue
>   *
> - * @lock: spinlock protecting the queue
> + * @node: rhash node
> + * @key: keys identifying this frag.
>   * @timer: queue expiration timer
> - * @list: hash bucket list
> + * @lock: spinlock protecting this frag
>   * @refcnt: reference count of the queue
>   * @fragments: received fragments head
>   * @fragments_tail: received fragments tail
> @@ -41,12 +63,16 @@ enum {
>   * @flags: fragment queue flags
>   * @max_size: maximum received fragment size
>   * @net: namespace that this frag belongs to
> - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
> + * @rcu: rcu head for deferred freeing
>   */
>  struct inet_frag_queue {
> -	spinlock_t		lock;
> +	struct rhash_head	node;
> +	union {
> +		struct frag_v4_compare_key v4;
> +		struct frag_v6_compare_key v6;
> +	} key;
>  	struct timer_list	timer;
> -	struct hlist_node	list;
> +	spinlock_t		lock;
>  	refcount_t		refcnt;
>  	struct sk_buff		*fragments;
>  	struct sk_buff		*fragments_tail;
> @@ -55,51 +81,20 @@ struct inet_frag_queue {
>  	int			meat;
>  	__u8			flags;
>  	u16			max_size;
> -	struct netns_frags	*net;
> -	struct hlist_node	list_evictor;
> -};
> -
> -#define INETFRAGS_HASHSZ	1024
> -
> -/* averaged:
> - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
> - *	       rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
> - *	       struct frag_queue))
> - */
> -#define INETFRAGS_MAXDEPTH	128
> -
> -struct inet_frag_bucket {
> -	struct hlist_head	chain;
> -	spinlock_t		chain_lock;
> +	struct netns_frags      *net;
> +	struct rcu_head		rcu;

inet_frag_destroy() calls call_rcu() after frags are destroyed.
It looks like we may place this rcu in a union with fragments and
fragments_tail to save some memory.

>  };
>  
>  struct inet_frags {
> -	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
> -
> -	struct work_struct	frags_work;
> -	unsigned int next_bucket;
> -	unsigned long last_rebuild_jiffies;
> -	bool rebuild;
> -
> -	/* The first call to hashfn is responsible to initialize
> -	 * rnd. This is best done with net_get_random_once.
> -	 *
> -	 * rnd_seqlock is used to let hash insertion detect
> -	 * when it needs to re-lookup the hash chain to use.
> -	 */
> -	u32			rnd;
> -	seqlock_t		rnd_seqlock;
>  	unsigned int		qsize;
>  
> -	unsigned int		(*hashfn)(const struct inet_frag_queue *);
> -	bool			(*match)(const struct inet_frag_queue *q,
> -					 const void *arg);
>  	void			(*constructor)(struct inet_frag_queue *q,
>  					       const void *arg);
>  	void			(*destructor)(struct inet_frag_queue *);
>  	void			(*frag_expire)(struct timer_list *t);
>  	struct kmem_cache	*frags_cachep;
>  	const char		*frags_cache_name;
> +	struct rhashtable_params rhash_params;
>  };
>  
>  int inet_frags_init(struct inet_frags *);
> @@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *);
>  static inline int inet_frags_init_net(struct netns_frags *nf)
>  {
>  	atomic_set(&nf->mem, 0);
> -	return 0;
> +	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
>  }
>  void inet_frags_exit_net(struct netns_frags *nf);
>  
>  void inet_frag_kill(struct inet_frag_queue *q);
>  void inet_frag_destroy(struct inet_frag_queue *q);
> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> -		struct inet_frags *f, void *key, unsigned int hash);
> -
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
>  void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
>  				   const char *prefix);
>  
> @@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
>  
>  static inline bool inet_frag_evicting(struct inet_frag_queue *q)
>  {
> -	return !hlist_unhashed(&q->list_evictor);
> +	return false;
>  }
>  
>  /* Memory Tracking Functions. */
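
One more note on the new compare keys above: jhash2() (used by the hash
functions later in the patch) walks the key as an array of u32 words, so
sizeof() of each key struct must be a multiple of sizeof(u32), and any
padding must be initialized, or hashing and memcmp() would see garbage.
A guard one could add (hypothetical, not part of this patch), e.g. in
inet_frags_init():

	/* keys are hashed with jhash2(), i.e. as arrays of u32 words */
	BUILD_BUG_ON(sizeof(struct frag_v4_compare_key) % sizeof(u32));
	BUILD_BUG_ON(sizeof(struct frag_v6_compare_key) % sizeof(u32));

Both structs currently satisfy this (20 and 44 bytes respectively).
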
> diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> index 57b7fe43d2ab8e0ef3d663b7a5ee201affd5ca1f..6fa9a2bc589665dfa9ce84813f33e5e86e12fd74 100644
> --- a/include/net/ipv6.h
> +++ b/include/net/ipv6.h
> @@ -579,17 +579,8 @@ enum ip6_defrag_users {
>  	__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
>  };
>  
> -struct ip6_create_arg {
> -	__be32 id;
> -	u32 user;
> -	const struct in6_addr *src;
> -	const struct in6_addr *dst;
> -	int iif;
> -	u8 ecn;
> -};
> -
>  void ip6_frag_init(struct inet_frag_queue *q, const void *a);
> -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
> +extern const struct rhashtable_params ip6_rhash_params;
>  
>  /*
>   *	Equivalent of ipv4 struct ip
> @@ -597,11 +588,6 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
>  struct frag_queue {
>  	struct inet_frag_queue	q;
>  
> -	__be32			id;		/* fragment id		*/
> -	u32			user;
> -	struct in6_addr		saddr;
> -	struct in6_addr		daddr;
> -
>  	int			iif;
>  	__u16			nhoffset;
>  	u8			ecn;
> diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
> index d8de3bcfb1032a1133402cb2a4c50a2448133846..b8d95cb71c25dd69c8a88b2c886a3f0d2ce1174f 100644
> --- a/net/ieee802154/6lowpan/6lowpan_i.h
> +++ b/net/ieee802154/6lowpan/6lowpan_i.h
> @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
>  #define LOWPAN_DISPATCH_FRAG1           0xc0
>  #define LOWPAN_DISPATCH_FRAGN           0xe0
>  
> -struct lowpan_create_arg {
> +struct frag_lowpan_compare_key {
>  	u16 tag;
>  	u16 d_size;
> -	const struct ieee802154_addr *src;
> -	const struct ieee802154_addr *dst;
> +	const struct ieee802154_addr src;
> +	const struct ieee802154_addr dst;
>  };
>  
> -/* Equivalent of ipv4 struct ip
> +/* Equivalent of ipv4 struct ipq
>   */
>  struct lowpan_frag_queue {
>  	struct inet_frag_queue	q;
> -
> -	u16			tag;
> -	u16			d_size;
> -	struct ieee802154_addr	saddr;
> -	struct ieee802154_addr	daddr;
>  };
>  
> -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
> -{
> -	switch (a->mode) {
> -	case IEEE802154_ADDR_LONG:
> -		return (((__force u64)a->extended_addr) >> 32) ^
> -			(((__force u64)a->extended_addr) & 0xffffffff);
> -	case IEEE802154_ADDR_SHORT:
> -		return (__force u32)(a->short_addr + (a->pan_id << 16));
> -	default:
> -		return 0;
> -	}
> -}
> -
>  int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
>  void lowpan_net_frag_exit(void);
>  int lowpan_net_frag_init(void);
> diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
> index ddada12a044de293f904a1dc7a5ff398d089d101..dd743c287bc229b1ba354e834af7bec34dcb8643 100644
> --- a/net/ieee802154/6lowpan/reassembly.c
> +++ b/net/ieee802154/6lowpan/reassembly.c
> @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
>  static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
>  			     struct sk_buff *prev, struct net_device *ldev);
>  
> -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
> -				     const struct ieee802154_addr *saddr,
> -				     const struct ieee802154_addr *daddr)
> -{
> -	net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
> -	return jhash_3words(ieee802154_addr_hash(saddr),
> -			    ieee802154_addr_hash(daddr),
> -			    (__force u32)(tag + (d_size << 16)),
> -			    lowpan_frags.rnd);
> -}
> -
> -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct lowpan_frag_queue *fq;
> -
> -	fq = container_of(q, struct lowpan_frag_queue, q);
> -	return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
> -}
> -
> -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> -	const struct lowpan_frag_queue *fq;
> -	const struct lowpan_create_arg *arg = a;
> -
> -	fq = container_of(q, struct lowpan_frag_queue, q);
> -	return	fq->tag == arg->tag && fq->d_size == arg->d_size &&
> -		ieee802154_addr_equal(&fq->saddr, arg->src) &&
> -		ieee802154_addr_equal(&fq->daddr, arg->dst);
> -}
> -
>  static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
>  {
> -	const struct lowpan_create_arg *arg = a;
> +	const struct frag_lowpan_compare_key *key = a;
>  	struct lowpan_frag_queue *fq;
>  
>  	fq = container_of(q, struct lowpan_frag_queue, q);
>  
> -	fq->tag = arg->tag;
> -	fq->d_size = arg->d_size;
> -	fq->saddr = *arg->src;
> -	fq->daddr = *arg->dst;
> +	BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
> +	memcpy(&q->key, key, sizeof(*key));
>  }
>  
>  static void lowpan_frag_expire(struct timer_list *t)
> @@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
>  	const struct ieee802154_addr *src,
>  	const struct ieee802154_addr *dst)
>  {
> +	struct netns_ieee802154_lowpan *ieee802154_lowpan =
> +		net_ieee802154_lowpan(net);
> +	struct frag_lowpan_compare_key key = {
> +		.tag = cb->d_tag,
> +		.d_size = cb->d_size,
> +		.src = *src,
> +		.dst = *dst,
> +	};
>  	struct inet_frag_queue *q;
> -	struct lowpan_create_arg arg;
> -	unsigned int hash;
> -	struct netns_ieee802154_lowpan *ieee802154_lowpan =
> -		net_ieee802154_lowpan(net);
>  
> -	arg.tag = cb->d_tag;
> -	arg.d_size = cb->d_size;
> -	arg.src = src;
> -	arg.dst = dst;
> -
> -	hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
> -
> -	q = inet_frag_find(&ieee802154_lowpan->frags,
> -			   &lowpan_frags, &arg, hash);
> +	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
>  	if (IS_ERR_OR_NULL(q)) {
>  		inet_frag_maybe_warn_overflow(q, pr_fmt());
>  		return NULL;
> @@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = {
>  	.exit = lowpan_frags_exit_net,
>  };
>  
> +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	return jhash2(data,
> +		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct inet_frag_queue *fq = data;
> +
> +	return jhash2((const u32 *)&fq->key,
> +		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> +	const struct frag_lowpan_compare_key *key = arg->key;
> +	const struct inet_frag_queue *fq = ptr;
> +
> +	return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +const struct rhashtable_params lowpan_rhash_params = {
> +	.head_offset		= offsetof(struct inet_frag_queue, node),
> +	.hashfn			= lowpan_key_hashfn,
> +	.obj_hashfn		= lowpan_obj_hashfn,
> +	.obj_cmpfn		= lowpan_obj_cmpfn,
> +	.automatic_shrinking	= true,
> +};
> +
>  int __init lowpan_net_frag_init(void)
>  {
>  	int ret;
>  
> -	lowpan_frags.hashfn = lowpan_hashfn;
>  	lowpan_frags.constructor = lowpan_frag_init;
>  	lowpan_frags.destructor = NULL;
>  	lowpan_frags.qsize = sizeof(struct frag_queue);
> -	lowpan_frags.match = lowpan_frag_match;
>  	lowpan_frags.frag_expire = lowpan_frag_expire;
>  	lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
> +	lowpan_frags.rhash_params = lowpan_rhash_params;
>  	ret = inet_frags_init(&lowpan_frags);
>  	if (ret)
>  		goto out;
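
To recap how the three callbacks in lowpan_rhash_params are consumed (my
reading of lib/rhashtable.c): .hashfn hashes a bare lookup key, .obj_hashfn
hashes an object already in the table (needed on insert and resize, when
only the object is available), and .obj_cmpfn matches a table object
against a lookup key, returning 0 on a match. A hypothetical minimal
lifecycle with such params, error handling trimmed:

static int example(void)
{
	struct rhashtable ht;
	struct inet_frag_queue q = {};
	int err;

	err = rhashtable_init(&ht, &lowpan_rhash_params);
	if (err)
		return err;

	/* insert hashes &q via .obj_hashfn, i.e. via q.key */
	err = rhashtable_insert_fast(&ht, &q.node, lowpan_rhash_params);
	if (!err)
		rhashtable_remove_fast(&ht, &q.node, lowpan_rhash_params);

	rhashtable_destroy(&ht);
	return err;
}
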
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 1ac69f65d0dee600d0ab4db20ff5942952932c40..bbb0ff2c262e2d73630b441a088a036397df6f28 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -25,12 +25,6 @@
>  #include <net/inet_frag.h>
>  #include <net/inet_ecn.h>
>  
> -#define INETFRAGS_EVICT_BUCKETS   128
> -#define INETFRAGS_EVICT_MAX	  512
> -
> -/* don't rebuild inetfrag table with new secret more often than this */
> -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
> -
>  /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
>   * Value : 0xff if frame should be dropped.
>   *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
> @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
>  };
>  EXPORT_SYMBOL(ip_frag_ecn_table);
>  
> -static unsigned int
> -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
> -{
> -	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
> -}
> -
> -static bool inet_frag_may_rebuild(struct inet_frags *f)
> -{
> -	return time_after(jiffies,
> -	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
> -}
> -
> -static void inet_frag_secret_rebuild(struct inet_frags *f)
> -{
> -	int i;
> -
> -	write_seqlock_bh(&f->rnd_seqlock);
> -
> -	if (!inet_frag_may_rebuild(f))
> -		goto out;
> -
> -	get_random_bytes(&f->rnd, sizeof(u32));
> -
> -	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> -		struct inet_frag_bucket *hb;
> -		struct inet_frag_queue *q;
> -		struct hlist_node *n;
> -
> -		hb = &f->hash[i];
> -		spin_lock(&hb->chain_lock);
> -
> -		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
> -			unsigned int hval = inet_frag_hashfn(f, q);
> -
> -			if (hval != i) {
> -				struct inet_frag_bucket *hb_dest;
> -
> -				hlist_del(&q->list);
> -
> -				/* Relink to new hash chain. */
> -				hb_dest = &f->hash[hval];
> -
> -				/* This is the only place where we take
> -				 * another chain_lock while already holding
> -				 * one.  As this will not run concurrently,
> -				 * we cannot deadlock on hb_dest lock below, if its
> -				 * already locked it will be released soon since
> -				 * other caller cannot be waiting for hb lock
> -				 * that we've taken above.
> -				 */
> -				spin_lock_nested(&hb_dest->chain_lock,
> -						 SINGLE_DEPTH_NESTING);
> -				hlist_add_head(&q->list, &hb_dest->chain);
> -				spin_unlock(&hb_dest->chain_lock);
> -			}
> -		}
> -		spin_unlock(&hb->chain_lock);
> -	}
> -
> -	f->rebuild = false;
> -	f->last_rebuild_jiffies = jiffies;
> -out:
> -	write_sequnlock_bh(&f->rnd_seqlock);
> -}
> -
> -static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
> -{
> -	if (!hlist_unhashed(&q->list_evictor))
> -		return false;
> -
> -	return q->net->low_thresh == 0 ||
> -	       frag_mem_limit(q->net) >= q->net->low_thresh;
> -}
> -
> -static unsigned int
> -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
> -{
> -	struct inet_frag_queue *fq;
> -	struct hlist_node *n;
> -	unsigned int evicted = 0;
> -	HLIST_HEAD(expired);
> -
> -	spin_lock(&hb->chain_lock);
> -
> -	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
> -		if (!inet_fragq_should_evict(fq))
> -			continue;
> -
> -		if (!del_timer(&fq->timer))
> -			continue;
> -
> -		hlist_add_head(&fq->list_evictor, &expired);
> -		++evicted;
> -	}
> -
> -	spin_unlock(&hb->chain_lock);
> -
> -	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
> -		f->frag_expire(&fq->timer);
> -
> -	return evicted;
> -}
> -
> -static void inet_frag_worker(struct work_struct *work)
> -{
> -	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
> -	unsigned int i, evicted = 0;
> -	struct inet_frags *f;
> -
> -	f = container_of(work, struct inet_frags, frags_work);
> -
> -	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
> -
> -	local_bh_disable();
> -
> -	for (i = READ_ONCE(f->next_bucket); budget; --budget) {
> -		evicted += inet_evict_bucket(f, &f->hash[i]);
> -		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
> -		if (evicted > INETFRAGS_EVICT_MAX)
> -			break;
> -	}
> -
> -	f->next_bucket = i;
> -
> -	local_bh_enable();
> -
> -	if (f->rebuild && inet_frag_may_rebuild(f))
> -		inet_frag_secret_rebuild(f);
> -}
> -
> -static void inet_frag_schedule_worker(struct inet_frags *f)
> -{
> -	if (unlikely(!work_pending(&f->frags_work)))
> -		schedule_work(&f->frags_work);
> -}
> -
>  int inet_frags_init(struct inet_frags *f)
>  {
> -	int i;
> -
> -	INIT_WORK(&f->frags_work, inet_frag_worker);
> -
> -	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> -		struct inet_frag_bucket *hb = &f->hash[i];
> -
> -		spin_lock_init(&hb->chain_lock);
> -		INIT_HLIST_HEAD(&hb->chain);
> -	}
> -
> -	seqlock_init(&f->rnd_seqlock);
> -	f->last_rebuild_jiffies = 0;
>  	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
>  					    NULL);
>  	if (!f->frags_cachep)
> @@ -214,79 +59,66 @@ EXPORT_SYMBOL(inet_frags_init);
>  
>  void inet_frags_fini(struct inet_frags *f)
>  {
> -	cancel_work_sync(&f->frags_work);
> +	/* We must wait until all inet_frag_destroy_rcu() callbacks have completed. */
> +	rcu_barrier();
> +
>  	kmem_cache_destroy(f->frags_cachep);
> +	f->frags_cachep = NULL;
>  }
>  EXPORT_SYMBOL(inet_frags_fini);
>  
> +static void inet_frags_free_cb(void *ptr, void *arg)
> +{
> +	struct inet_frag_queue *fq = ptr;
> +
> +	if (refcount_inc_not_zero(&fq->refcnt)) {
> +		spin_lock_bh(&fq->lock);
> +		if (del_timer(&fq->timer))
> +			refcount_dec(&fq->refcnt);
> +
> +		if (!(fq->flags & INET_FRAG_COMPLETE)) {
> +			fq->flags |= INET_FRAG_COMPLETE;
> +			refcount_dec(&fq->refcnt);
> +		}
> +		spin_unlock_bh(&fq->lock);
> +		inet_frag_put(fq);
> +	}
> +}
> +
>  void inet_frags_exit_net(struct netns_frags *nf)
>  {
> -	struct inet_frags *f =nf->f;
> -	unsigned int seq;
> -	int i;
> +	nf->low_thresh = 0; /* prevent creation of new frags */
>  
> -	nf->low_thresh = 0;
> -
> -evict_again:
> -	local_bh_disable();
> -	seq = read_seqbegin(&f->rnd_seqlock);
> -
> -	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
> -		inet_evict_bucket(f, &f->hash[i]);
> -
> -	local_bh_enable();
> -	cond_resched();
> -
> -	if (read_seqretry(&f->rnd_seqlock, seq) ||
> -	    sum_frag_mem_limit(nf))
> -		goto evict_again;
> +	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
>  }
>  EXPORT_SYMBOL(inet_frags_exit_net);
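
The teardown path deserves a note: rhashtable_free_and_destroy() invokes
inet_frags_free_cb() once per remaining queue, and the callback drops the
timer and table references so the final put frees each queue via call_rcu().
As I read it, the ordering is:

/*
 * inet_frags_exit_net(nf)
 *   -> rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL)
 *        -> inet_frags_free_cb() drops the timer and table references
 *             -> inet_frag_put() -> inet_frag_destroy()
 *                  -> call_rcu(&q->rcu, inet_frag_destroy_rcu)
 *
 * inet_frags_fini(f)
 *   -> rcu_barrier()   waits for all inet_frag_destroy_rcu() callbacks
 *   -> kmem_cache_destroy(f->frags_cachep)   now safe
 */
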
>  
> -static struct inet_frag_bucket *
> -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
> -__acquires(hb->chain_lock)
> -{
> -	struct inet_frag_bucket *hb;
> -	unsigned int seq, hash;
> -
> - restart:
> -	seq = read_seqbegin(&f->rnd_seqlock);
> -
> -	hash = inet_frag_hashfn(f, fq);
> -	hb = &f->hash[hash];
> -
> -	spin_lock(&hb->chain_lock);
> -	if (read_seqretry(&f->rnd_seqlock, seq)) {
> -		spin_unlock(&hb->chain_lock);
> -		goto restart;
> -	}
> -
> -	return hb;
> -}
> -
> -static inline void fq_unlink(struct inet_frag_queue *fq)
> -{
> -	struct inet_frag_bucket *hb;
> -
> -	hb = get_frag_bucket_locked(fq, fq->net->f);
> -	hlist_del(&fq->list);
> -	fq->flags |= INET_FRAG_COMPLETE;
> -	spin_unlock(&hb->chain_lock);
> -}
> -
>  void inet_frag_kill(struct inet_frag_queue *fq)
>  {
>  	if (del_timer(&fq->timer))
>  		refcount_dec(&fq->refcnt);
>  
>  	if (!(fq->flags & INET_FRAG_COMPLETE)) {
> -		fq_unlink(fq);
> +		struct netns_frags *nf = fq->net;
> +
> +		fq->flags |= INET_FRAG_COMPLETE;
> +		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
>  		refcount_dec(&fq->refcnt);
>  	}
>  }
>  EXPORT_SYMBOL(inet_frag_kill);
>  
> +static void inet_frag_destroy_rcu(struct rcu_head *head)
> +{
> +	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
> +						 rcu);
> +	struct inet_frags *f = q->net->f;
> +
> +	if (f->destructor)
> +		f->destructor(q);
> +	kmem_cache_free(f->frags_cachep, q);
> +}
> +
>  void inet_frag_destroy(struct inet_frag_queue *q)
>  {
>  	struct sk_buff *fp;
> @@ -310,59 +142,20 @@ void inet_frag_destroy(struct inet_frag_queue *q)
>  	}
>  	sum = sum_truesize + f->qsize;
>  
> -	if (f->destructor)
> -		f->destructor(q);
> -	kmem_cache_free(f->frags_cachep, q);
> +	call_rcu(&q->rcu, inet_frag_destroy_rcu);
>  
>  	sub_frag_mem_limit(nf, sum);
>  }
>  EXPORT_SYMBOL(inet_frag_destroy);
>  
> -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
> -						struct inet_frag_queue *qp_in,
> -						struct inet_frags *f,
> -						void *arg)
> -{
> -	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
> -	struct inet_frag_queue *qp;
> -
> -#ifdef CONFIG_SMP
> -	/* With SMP race we have to recheck hash table, because
> -	 * such entry could have been created on other cpu before
> -	 * we acquired hash bucket lock.
> -	 */
> -	hlist_for_each_entry(qp, &hb->chain, list) {
> -		if (qp->net == nf && f->match(qp, arg)) {
> -			refcount_inc(&qp->refcnt);
> -			spin_unlock(&hb->chain_lock);
> -			qp_in->flags |= INET_FRAG_COMPLETE;
> -			inet_frag_put(qp_in);
> -			return qp;
> -		}
> -	}
> -#endif
> -	qp = qp_in;
> -	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
> -		refcount_inc(&qp->refcnt);
> -
> -	refcount_inc(&qp->refcnt);
> -	hlist_add_head(&qp->list, &hb->chain);
> -
> -	spin_unlock(&hb->chain_lock);
> -
> -	return qp;
> -}
> -
>  static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>  					       struct inet_frags *f,
>  					       void *arg)
>  {
>  	struct inet_frag_queue *q;
>  
> -	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
> -		inet_frag_schedule_worker(f);
> +	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
>  		return NULL;
> -	}
>  
>  	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
>  	if (!q)
> @@ -374,59 +167,53 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>  
>  	timer_setup(&q->timer, f->frag_expire, 0);
>  	spin_lock_init(&q->lock);
> -	refcount_set(&q->refcnt, 1);
> +	refcount_set(&q->refcnt, 3);
>  
>  	return q;
>  }
>  
>  static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
> -						struct inet_frags *f,
>  						void *arg)
>  {
> +	struct inet_frags *f = nf->f;
>  	struct inet_frag_queue *q;
> +	int err;
>  
>  	q = inet_frag_alloc(nf, f, arg);
>  	if (!q)
>  		return NULL;
>  
> -	return inet_frag_intern(nf, q, f, arg);
> +	mod_timer(&q->timer, jiffies + nf->timeout);
> +
> +	err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
> +				     f->rhash_params);
> +	add_frag_mem_limit(nf, f->qsize);
> +	if (err < 0) {
> +		q->flags |= INET_FRAG_COMPLETE;
> +		inet_frag_kill(q);
> +		inet_frag_destroy(q);
> +		return NULL;
> +	}
> +	return q;
>  }
>  
> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> -				       struct inet_frags *f, void *key,
> -				       unsigned int hash)
> +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
>  {
> -	struct inet_frag_bucket *hb;
> -	struct inet_frag_queue *q;
> -	int depth = 0;
> +	struct inet_frag_queue *fq;
>  
> -	if (frag_mem_limit(nf) > nf->low_thresh)
> -		inet_frag_schedule_worker(f);
> +	rcu_read_lock();
>  
> -	hash &= (INETFRAGS_HASHSZ - 1);
> -	hb = &f->hash[hash];
> -
> -	spin_lock(&hb->chain_lock);
> -	hlist_for_each_entry(q, &hb->chain, list) {
> -		if (q->net == nf && f->match(q, key)) {
> -			refcount_inc(&q->refcnt);
> -			spin_unlock(&hb->chain_lock);
> -			return q;
> -		}
> -		depth++;
> -	}
> -	spin_unlock(&hb->chain_lock);
> -
> -	if (depth <= INETFRAGS_MAXDEPTH)
> -		return inet_frag_create(nf, f, key);
> -
> -	if (inet_frag_may_rebuild(f)) {
> -		if (!f->rebuild)
> -			f->rebuild = true;
> -		inet_frag_schedule_worker(f);
> +	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
> +	if (fq) {
> +		if (!refcount_inc_not_zero(&fq->refcnt))
> +			fq = NULL;
> +		rcu_read_unlock();
> +		return fq;
>  	}
> +	rcu_read_unlock();
>  
> -	return ERR_PTR(-ENOBUFS);
> +	return inet_frag_create(nf, key);
>  }
>  EXPORT_SYMBOL(inet_frag_find);
>  
> @@ -434,8 +221,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
>  				   const char *prefix)
>  {
>  	static const char msg[] = "inet_frag_find: Fragment hash bucket"
> -		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
> -		". Dropping fragment.\n";
> +		" list length grew over limit. Dropping fragment.\n";
>  
>  	if (PTR_ERR(q) == -ENOBUFS)
>  		net_dbg_ratelimited("%s%s", prefix, msg);
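
While we are at inet_fragment.c: the refcount_set(&q->refcnt, 3) in
inet_frag_alloc() is worth spelling out. As I understand the new scheme:

/*
 * The three initial references on a new queue:
 *   1. the rhashtable entry   - dropped by inet_frag_kill()
 *   2. the armed timer        - dropped when del_timer() succeeds
 *   3. the caller's reference - dropped by inet_frag_put()
 *
 * The last inet_frag_put() calls inet_frag_destroy(), which defers the
 * actual kmem_cache_free() through call_rcu() so that concurrent RCU
 * lookups never dereference freed memory.
 */
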
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index 1a3bc85d6f5ea8f36b8f3d221cad632906b317a2..4021820db6f291b255cc53aeca91dd74aef29934 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -69,15 +69,9 @@ struct ipfrag_skb_cb
>  struct ipq {
>  	struct inet_frag_queue q;
>  
> -	u32		user;
> -	__be32		saddr;
> -	__be32		daddr;
> -	__be16		id;
> -	u8		protocol;
>  	u8		ecn; /* RFC3168 support */
>  	u16		max_df_size; /* largest frag with DF set seen */
>  	int             iif;
> -	int             vif;   /* L3 master device index */
>  	unsigned int    rid;
>  	struct inet_peer *peer;
>  };
> @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net)
>  static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
>  			 struct net_device *dev);
>  
> -struct ip4_create_arg {
> -	struct iphdr *iph;
> -	u32 user;
> -	int vif;
> -};
> -
> -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
> -{
> -	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
> -	return jhash_3words((__force u32)id << 16 | prot,
> -			    (__force u32)saddr, (__force u32)daddr,
> -			    ip4_frags.rnd);
> -}
> -
> -static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct ipq *ipq;
> -
> -	ipq = container_of(q, struct ipq, q);
> -	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
> -}
> -
> -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> -	const struct ipq *qp;
> -	const struct ip4_create_arg *arg = a;
> -
> -	qp = container_of(q, struct ipq, q);
> -	return	qp->id == arg->iph->id &&
> -		qp->saddr == arg->iph->saddr &&
> -		qp->daddr == arg->iph->daddr &&
> -		qp->protocol == arg->iph->protocol &&
> -		qp->user == arg->user &&
> -		qp->vif == arg->vif;
> -}
>  
>  static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
>  {
> @@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
>  					       frags);
>  	struct net *net = container_of(ipv4, struct net, ipv4);
>  
> -	const struct ip4_create_arg *arg = a;
> +	const struct frag_v4_compare_key *key = a;
>  
> -	qp->protocol = arg->iph->protocol;
> -	qp->id = arg->iph->id;
> -	qp->ecn = ip4_frag_ecn(arg->iph->tos);
> -	qp->saddr = arg->iph->saddr;
> -	qp->daddr = arg->iph->daddr;
> -	qp->vif = arg->vif;
> -	qp->user = arg->user;
> +	q->key.v4 = *key;
> +	qp->ecn = 0;
>  	qp->peer = q->net->max_dist ?
> -		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
> +		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
>  		NULL;
>  }
>  
> @@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t)
>  		/* Only an end host needs to send an ICMP
>  		 * "Fragment Reassembly Timeout" message, per RFC792.
>  		 */
> -		if (frag_expire_skip_icmp(qp->user) &&
> +		if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
>  		    (skb_rtable(head)->rt_type != RTN_LOCAL))
>  			goto out;
>  
> @@ -262,17 +216,17 @@ static void ip_expire(struct timer_list *t)
>  static struct ipq *ip_find(struct net *net, struct iphdr *iph,
>  			   u32 user, int vif)
>  {
> +	struct frag_v4_compare_key key = {
> +		.saddr = iph->saddr,
> +		.daddr = iph->daddr,
> +		.user = user,
> +		.vif = vif,
> +		.id = iph->id,
> +		.protocol = iph->protocol,
> +	};
>  	struct inet_frag_queue *q;
> -	struct ip4_create_arg arg;
> -	unsigned int hash;
>  
> -	arg.iph = iph;
> -	arg.user = user;
> -	arg.vif = vif;
> -
> -	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
> -
> -	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
> +	q = inet_frag_find(&net->ipv4.frags, &key);
>  	if (IS_ERR_OR_NULL(q)) {
>  		inet_frag_maybe_warn_overflow(q, pr_fmt());
>  		return NULL;
> @@ -656,7 +610,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
>  	err = -ENOMEM;
>  	goto out_fail;
>  out_oversize:
> -	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
> +	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
>  out_fail:
>  	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
>  	return err;
> @@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = {
>  	.exit = ipv4_frags_exit_net,
>  };
>  
> +
> +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	return jhash2(data,
> +		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct inet_frag_queue *fq = data;
> +
> +	return jhash2((const u32 *)&fq->key.v4,
> +		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> +	const struct frag_v4_compare_key *key = arg->key;
> +	const struct inet_frag_queue *fq = ptr;
> +
> +	return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +static const struct rhashtable_params ip4_rhash_params = {
> +	.head_offset		= offsetof(struct inet_frag_queue, node),
> +	.key_offset		= offsetof(struct inet_frag_queue, key),
> +	.key_len		= sizeof(struct frag_v4_compare_key),
> +	.hashfn			= ip4_key_hashfn,
> +	.obj_hashfn		= ip4_obj_hashfn,
> +	.obj_cmpfn		= ip4_obj_cmpfn,
> +	.automatic_shrinking	= true,
> +};
> +
>  void __init ipfrag_init(void)
>  {
> -	ip4_frags.hashfn = ip4_hashfn;
>  	ip4_frags.constructor = ip4_frag_init;
>  	ip4_frags.destructor = ip4_frag_free;
>  	ip4_frags.qsize = sizeof(struct ipq);
> -	ip4_frags.match = ip4_frag_match;
>  	ip4_frags.frag_expire = ip_expire;
>  	ip4_frags.frags_cache_name = ip_frag_cache_name;
> +	ip4_frags.rhash_params = ip4_rhash_params;
>  	if (inet_frags_init(&ip4_frags))
>  		panic("IP: failed to allocate ip4_frags cache\n");
>  	ip4_frags_ctl_register();
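
A small inconsistency I noticed: ip4_rhash_params sets .key_offset and
.key_len, while lowpan_rhash_params and ip6_rhash_params do not. Since all
three supply both .obj_hashfn and .obj_cmpfn, rhashtable should never fall
back to the default memcmp() over key_offset/key_len, so the two fields
look redundant here (if I read rhashtable_init() right, key_len is only
mandatory when obj_hashfn is absent). The equivalent minimal form would be:

/* hypothetical: same behaviour with the redundant fields dropped */
static const struct rhashtable_params ip4_rhash_params_min = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

Not a bug either way, just noting the asymmetry.
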
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index c4b40fdee838300f3723cece1e6b5b9d03fa9249..0ad3df551d9884ba30f2d40658ee81a61720e947 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
>  	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
>  }
>  
> -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
> -				 const struct in6_addr *daddr)
> -{
> -	net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
> -	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
> -			    (__force u32)id, nf_frags.rnd);
> -}
> -
> -
> -static unsigned int nf_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct frag_queue *nq;
> -
> -	nq = container_of(q, struct frag_queue, q);
> -	return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
> -}
> -
>  static void nf_ct_frag6_expire(struct timer_list *t)
>  {
>  	struct inet_frag_queue *frag = from_timer(frag, t, timer);
> @@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t)
>  }
>  
>  /* Creation primitives. */
> -static inline struct frag_queue *fq_find(struct net *net, __be32 id,
> -					 u32 user, struct in6_addr *src,
> -					 struct in6_addr *dst, int iif, u8 ecn)
> +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
> +				  const struct ipv6hdr *hdr, int iif)
>  {
> +	struct frag_v6_compare_key key = {
> +		.id = id,
> +		.saddr = hdr->saddr,
> +		.daddr = hdr->daddr,
> +		.user = user,
> +		.iif = iif,
> +	};
>  	struct inet_frag_queue *q;
> -	struct ip6_create_arg arg;
> -	unsigned int hash;
>  
> -	arg.id = id;
> -	arg.user = user;
> -	arg.src = src;
> -	arg.dst = dst;
> -	arg.iif = iif;
> -	arg.ecn = ecn;
> -
> -	local_bh_disable();
> -	hash = nf_hash_frag(id, src, dst);
> -
> -	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
> -	local_bh_enable();
> +	q = inet_frag_find(&net->nf_frag.frags, &key);
>  	if (IS_ERR_OR_NULL(q)) {
>  		inet_frag_maybe_warn_overflow(q, pr_fmt());
>  		return NULL;
> @@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
>  	fhdr = (struct frag_hdr *)skb_transport_header(skb);
>  
>  	skb_orphan(skb);
> -	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
> -		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> +	fq = fq_find(net, fhdr->identification, user, hdr,
> +		     skb->dev ? skb->dev->ifindex : 0);
>  	if (fq == NULL) {
>  		pr_debug("Can't find and can't create new queue\n");
>  		return -ENOMEM;
> @@ -660,13 +636,12 @@ int nf_ct_frag6_init(void)
>  {
>  	int ret = 0;
>  
> -	nf_frags.hashfn = nf_hashfn;
>  	nf_frags.constructor = ip6_frag_init;
>  	nf_frags.destructor = NULL;
>  	nf_frags.qsize = sizeof(struct frag_queue);
> -	nf_frags.match = ip6_frag_match;
>  	nf_frags.frag_expire = nf_ct_frag6_expire;
>  	nf_frags.frags_cache_name = nf_frags_cache_name;
> +	nf_frags.rhash_params = ip6_rhash_params;
>  	ret = inet_frags_init(&nf_frags);
>  	if (ret)
>  		goto out;
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index f0071b113a92fcff15ac57610170c12b17cb59ba..3fc853e4492abb109062d662296c0b470763042a 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -79,52 +79,13 @@ static struct inet_frags ip6_frags;
>  static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
>  			  struct net_device *dev);
>  
> -/*
> - * callers should be careful not to use the hash value outside the ipfrag_lock
> - * as doing so could race with ipfrag_hash_rnd being recalculated.
> - */
> -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
> -				    const struct in6_addr *daddr)
> -{
> -	net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
> -	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
> -			    (__force u32)id, ip6_frags.rnd);
> -}
> -
> -static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct frag_queue *fq;
> -
> -	fq = container_of(q, struct frag_queue, q);
> -	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
> -}
> -
> -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> -	const struct frag_queue *fq;
> -	const struct ip6_create_arg *arg = a;
> -
> -	fq = container_of(q, struct frag_queue, q);
> -	return	fq->id == arg->id &&
> -		fq->user == arg->user &&
> -		ipv6_addr_equal(&fq->saddr, arg->src) &&
> -		ipv6_addr_equal(&fq->daddr, arg->dst) &&
> -		(arg->iif == fq->iif ||
> -		 !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
> -					       IPV6_ADDR_LINKLOCAL)));
> -}
> -EXPORT_SYMBOL(ip6_frag_match);
> -
>  void ip6_frag_init(struct inet_frag_queue *q, const void *a)
>  {
>  	struct frag_queue *fq = container_of(q, struct frag_queue, q);
> -	const struct ip6_create_arg *arg = a;
> +	const struct frag_v6_compare_key *key = a;
>  
> -	fq->id = arg->id;
> -	fq->user = arg->user;
> -	fq->saddr = *arg->src;
> -	fq->daddr = *arg->dst;
> -	fq->ecn = arg->ecn;
> +	q->key.v6 = *key;
> +	fq->ecn = 0;
>  }
>  EXPORT_SYMBOL(ip6_frag_init);
>  
> @@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t)
>  }
>  
>  static struct frag_queue *
> -fq_find(struct net *net, __be32 id, const struct in6_addr *src,
> -	const struct in6_addr *dst, int iif, u8 ecn)
> +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
>  {
> +	struct frag_v6_compare_key key = {
> +		.id = id,
> +		.saddr = hdr->saddr,
> +		.daddr = hdr->daddr,
> +		.user = IP6_DEFRAG_LOCAL_DELIVER,
> +		.iif = iif,
> +	};
>  	struct inet_frag_queue *q;
> -	struct ip6_create_arg arg;
> -	unsigned int hash;
>  
> -	arg.id = id;
> -	arg.user = IP6_DEFRAG_LOCAL_DELIVER;
> -	arg.src = src;
> -	arg.dst = dst;
> -	arg.iif = iif;
> -	arg.ecn = ecn;
> +	if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
> +					    IPV6_ADDR_LINKLOCAL)))
> +		key.iif = 0;
>  
> -	hash = inet6_hash_frag(id, src, dst);
> -
> -	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
> +	q = inet_frag_find(&net->ipv6.frags, &key);
>  	if (IS_ERR_OR_NULL(q)) {
>  		inet_frag_maybe_warn_overflow(q, pr_fmt());
>  		return NULL;
> @@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  	struct frag_queue *fq;
>  	const struct ipv6hdr *hdr = ipv6_hdr(skb);
>  	struct net *net = dev_net(skb_dst(skb)->dev);
> +	int iif;
>  
>  	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
>  		goto fail_hdr;
> @@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  		return 1;
>  	}
>  
> -	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
> -		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> +	iif = skb->dev ? skb->dev->ifindex : 0;
> +	fq = fq_find(net, fhdr->identification, hdr, iif);
>  	if (fq) {
>  		int ret;
>  
>  		spin_lock(&fq->q.lock);
>  
> +		fq->iif = iif;
>  		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
>  
>  		spin_unlock(&fq->q.lock);
> @@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = {
>  	.exit = ipv6_frags_exit_net,
>  };
>  
> +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	return jhash2(data,
> +		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct inet_frag_queue *fq = data;
> +
> +	return jhash2((const u32 *)&fq->key.v6,
> +		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> +	const struct frag_v6_compare_key *key = arg->key;
> +	const struct inet_frag_queue *fq = ptr;
> +
> +	return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +const struct rhashtable_params ip6_rhash_params = {
> +	.head_offset		= offsetof(struct inet_frag_queue, node),
> +	.hashfn			= ip6_key_hashfn,
> +	.obj_hashfn		= ip6_obj_hashfn,
> +	.obj_cmpfn		= ip6_obj_cmpfn,
> +	.automatic_shrinking	= true,
> +};
> +EXPORT_SYMBOL(ip6_rhash_params);
> +
>  int __init ipv6_frag_init(void)
>  {
>  	int ret;
>  
> -	ip6_frags.hashfn = ip6_hashfn;
>  	ip6_frags.constructor = ip6_frag_init;
>  	ip6_frags.destructor = NULL;
>  	ip6_frags.qsize = sizeof(struct frag_queue);
> -	ip6_frags.match = ip6_frag_match;
>  	ip6_frags.frag_expire = ip6_frag_expire;
>  	ip6_frags.frags_cache_name = ip6_frag_cache_name;
> +	ip6_frags.rhash_params = ip6_rhash_params;
>  	ret = inet_frags_init(&ip6_frags);
>  	if (ret)
>  		goto out;
> 

Kirill
