[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20130418213805.14296.21621.stgit@dragon>
Date: Thu, 18 Apr 2013 23:39:01 +0200
From: Jesper Dangaard Brouer <brouer@...hat.com>
To: Eric Dumazet <eric.dumazet@...il.com>,
"David S. Miller" <davem@...emloft.net>,
Hannes Frederic Sowa <hannes@...essinduktion.org>
Cc: Jesper Dangaard Brouer <brouer@...hat.com>, netdev@...r.kernel.org
Subject: [RFC net-next PATCH 3/3] net: remove fragmentation LRU list system
WORK-IN-PROGRESS
I have a bug in inet_frag_cleanup_netns(), which cleans up/deletes fragments
when a network namespace is taken down.
TODO:
- netns mem limits and hash cleaning have conflicts
- How can we re-add the IPSTATS_MIB_REASMFAILS stats?
- Remove nf->low_thresh
- Document the new max_hash_depth
NOT-signed-off
---
include/net/inet_frag.h | 36 ++++++------
include/net/ipv6.h | 2 -
net/ipv4/inet_fragment.c | 96 +++++++++++++++++--------------
net/ipv4/ip_fragment.c | 19 +-----
net/ipv6/netfilter/nf_conntrack_reasm.c | 5 --
net/ipv6/reassembly.c | 8 ---
6 files changed, 75 insertions(+), 91 deletions(-)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index b2cd72a..276900b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,9 +4,7 @@
#include <linux/percpu_counter.h>
struct netns_frags {
- int nqueues;
- struct list_head lru_list;
- spinlock_t lru_lock;
+ struct percpu_counter nqueues;
/* The percpu_counter "mem" need to be cacheline aligned.
* mem.count must not share cacheline with other writers
@@ -133,28 +131,30 @@ static inline int sum_frag_mem_limit(struct netns_frags *nf)
return res;
}
-static inline void inet_frag_lru_move(struct inet_frag_queue *q)
+static inline void dec_frag_nqueues(struct inet_frag_queue *q)
{
- spin_lock(&q->net->lru_lock);
- list_move_tail(&q->lru_list, &q->net->lru_list);
- spin_unlock(&q->net->lru_lock);
+ percpu_counter_dec(&q->net->nqueues);
}
-static inline void inet_frag_lru_del(struct inet_frag_queue *q)
+static inline void inc_frag_nqueues(struct inet_frag_queue *q)
{
- spin_lock(&q->net->lru_lock);
- list_del(&q->lru_list);
- q->net->nqueues--;
- spin_unlock(&q->net->lru_lock);
+ percpu_counter_inc(&q->net->nqueues);
}
-static inline void inet_frag_lru_add(struct netns_frags *nf,
- struct inet_frag_queue *q)
+static inline void init_frag_nqueues(struct netns_frags *nf)
{
- spin_lock(&nf->lru_lock);
- list_add_tail(&q->lru_list, &nf->lru_list);
- q->net->nqueues++;
- spin_unlock(&nf->lru_lock);
+ percpu_counter_init(&nf->nqueues, 0);
+}
+
+static inline int sum_frag_nqueues(struct netns_frags *nf)
+{
+ int res;
+
+ local_bh_disable();
+ res = percpu_counter_sum_positive(&nf->nqueues);
+ local_bh_enable();
+
+ return res;
}
/* RFC 3168 support :
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0810aa5..c35873c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -286,7 +286,7 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
#if IS_ENABLED(CONFIG_IPV6)
static inline int ip6_frag_nqueues(struct net *net)
{
- return net->ipv6.frags.nqueues;
+ return sum_frag_nqueues(&net->ipv6.frags);
}
static inline int ip6_frag_mem(struct net *net)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index fd9fe5c..8cb46fa 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(inet_frags_init);
void inet_frags_init_net(struct netns_frags *nf)
{
- nf->nqueues = 0;
+ init_frag_nqueues(nf);
init_frag_mem_limit(nf);
- INIT_LIST_HEAD(&nf->lru_list);
- spin_lock_init(&nf->lru_lock);
}
EXPORT_SYMBOL(inet_frags_init_net);
@@ -118,18 +116,6 @@ void inet_frags_fini(struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frags_fini);
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
- nf->low_thresh = 0;
-
- local_bh_disable();
- inet_frag_evictor(nf, f, true);
- local_bh_enable();
-
- percpu_counter_destroy(&nf->mem);
-}
-EXPORT_SYMBOL(inet_frags_exit_net);
-
static inline void fq_unlink_hash(struct inet_frag_queue *fq,
struct inet_frags *f)
{
@@ -160,7 +146,7 @@ void __inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f,
if (!(fq->last_in & INET_FRAG_COMPLETE)) {
if (unlink_hash)
fq_unlink_hash(fq, f);
- inet_frag_lru_del(fq);
+ dec_frag_nqueues(fq);
atomic_dec(&fq->refcnt);
fq->last_in |= INET_FRAG_COMPLETE;
}
@@ -212,46 +198,60 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
}
EXPORT_SYMBOL(inet_frag_destroy);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
+int inet_frag_cleanup_netns(struct netns_frags *nf, struct inet_frags *f)
{
+ struct hlist_head kill_list;
struct inet_frag_queue *q;
- int work, evicted = 0;
+ struct hlist_node *n;
+ int i, evicted = 0;
- if (!force) {
- if (frag_mem_limit(nf) <= nf->high_thresh)
- return 0;
- }
+ /* Move inet_frag_queue's from hash table to the kill_list */
+ read_lock(&f->lock);
+ for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+ struct inet_frag_bucket *hb;
- work = frag_mem_limit(nf) - nf->low_thresh;
- while (work > 0) {
- spin_lock(&nf->lru_lock);
+ hb = &f->hash[i];
+ spin_lock(&hb->chain_lock);
+ hlist_for_each_entry_safe(q, n, &hb->chain, list) {
- if (list_empty(&nf->lru_list)) {
- spin_unlock(&nf->lru_lock);
- break;
+ if (q->net == nf) { /* Only for specific netns */
+ atomic_inc(&q->refcnt);
+ hlist_del(&q->list);
+ hlist_add_head(&q->list, &kill_list);
+ }
}
+ spin_unlock(&hb->chain_lock);
+ }
+ read_unlock(&f->lock);
- q = list_first_entry(&nf->lru_list,
- struct inet_frag_queue, lru_list);
- atomic_inc(&q->refcnt);
- /* Remove q from list to avoid several CPUs grabbing it */
- list_del_init(&q->lru_list);
-
- spin_unlock(&nf->lru_lock);
-
+ /* Kill off the inet_frag_queue's */
+ hlist_for_each_entry_safe(q, n, &kill_list, list) {
spin_lock(&q->lock);
if (!(q->last_in & INET_FRAG_COMPLETE))
inet_frag_kill(q, f);
spin_unlock(&q->lock);
if (atomic_dec_and_test(&q->refcnt))
- inet_frag_destroy(q, f, &work);
+ inet_frag_destroy(q, f, NULL);
+
evicted++;
}
return evicted;
}
-EXPORT_SYMBOL(inet_frag_evictor);
+
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+ nf->low_thresh = 0;
+
+ local_bh_disable();
+ //BROKEN: inet_frag_cleanup_netns(nf, f);
+ local_bh_enable();
+
+ percpu_counter_destroy(&nf->nqueues);
+ percpu_counter_destroy(&nf->mem);
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
struct inet_frag_queue *qp_in, struct inet_frags *f,
@@ -295,7 +295,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
atomic_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
- inet_frag_lru_add(nf, qp);
+ inc_frag_nqueues(qp);
spin_unlock(&hb->chain_lock);
read_unlock(&f->lock);
return qp;
@@ -356,9 +356,19 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
q_evict = q; /* candidate for eviction */
}
/* Not found situation */
- if (depth > f->max_hash_depth) {
- atomic_inc(&q_evict->refcnt);
- hlist_del_init(&q_evict->list);
+ if ((depth > f->max_hash_depth)
+ || (frag_mem_limit(nf) > nf->high_thresh)) {
+
+ /* This is in principle wrong, notice
+ * frag_mem_limit(nf) is per netns, and we might
+ * delete q_evict belonging to another netns, which is
+ * not going to lower the frag_mem_limit(nf) for this
+ * netns. What to do?
+ */
+ if (q_evict) {
+ atomic_inc(&q_evict->refcnt);
+ hlist_del_init(&q_evict->list);
+ }
} else
q_evict = NULL;
spin_unlock(&hb->chain_lock);
@@ -374,7 +384,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
inet_frag_destroy(q_evict, f, NULL);
LIMIT_NETDEBUG(KERN_WARNING "%s(): Fragment hash bucket"
- " list length grew over limit (len %d),"
+ " grew over lenght or mem limit (len %d),"
" Dropping another fragment.\n",
__func__, depth);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4e8489b..193e360 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -18,6 +18,7 @@
* John McDonald : 0 length frag bug.
* Alexey Kuznetsov: SMP races, threading, cleanup.
* Patrick McHardy : LRU queue of frag heads for evictor.
+ * Jesper D Brouer : Removed LRU queue for evictor.
*/
#define pr_fmt(fmt) "IPv4: " fmt
@@ -88,7 +89,7 @@ static struct inet_frags ip4_frags;
int ip_frag_nqueues(struct net *net)
{
- return net->ipv4.frags.nqueues;
+ return sum_frag_nqueues(&net->ipv4.frags);
}
int ip_frag_mem(struct net *net)
@@ -176,18 +177,6 @@ static void ipq_kill(struct ipq *ipq)
inet_frag_kill(&ipq->q, &ip4_frags);
}
-/* Memory limiting on fragments. Evictor trashes the oldest
- * fragment queue until we are back under the threshold.
- */
-static void ip_evictor(struct net *net)
-{
- int evicted;
-
- evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
- if (evicted)
- IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
-}
-
/*
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
*/
@@ -495,7 +484,6 @@ found:
qp->q.meat == qp->q.len)
return ip_frag_reasm(qp, prev, dev);
- inet_frag_lru_move(&qp->q);
return -EINPROGRESS;
err:
@@ -645,9 +633,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
- /* Start by cleaning up the memory. */
- ip_evictor(net);
-
/* Lookup (or create) queue header */
if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
int ret;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3713764..c865d3c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -338,7 +338,6 @@ found:
fq->q.last_in |= INET_FRAG_FIRST_IN;
}
- inet_frag_lru_move(&fq->q);
return 0;
discard_fq:
@@ -583,10 +582,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(clone);
fhdr = (struct frag_hdr *)skb_transport_header(clone);
- local_bh_disable();
- inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
- local_bh_enable();
-
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
ip6_frag_ecn(hdr));
if (fq == NULL) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 45de5eb..ab1c843 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -25,6 +25,7 @@
* David Stevens and
* YOSHIFUJI,H. @USAGI Always remove fragment header to
* calculate ICV correctly.
+ * Jesper Dangaard Brouer Removed LRU queue for evictor.
*/
#define pr_fmt(fmt) "IPv6: " fmt
@@ -343,7 +344,6 @@ found:
fq->q.meat == fq->q.len)
return ip6_frag_reasm(fq, prev, dev);
- inet_frag_lru_move(&fq->q);
return -1;
discard_fq:
@@ -512,7 +512,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
struct frag_queue *fq;
const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct net *net = dev_net(skb_dst(skb)->dev);
- int evicted;
IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
@@ -537,11 +536,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1;
}
- evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
- if (evicted)
- IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_REASMFAILS, evicted);
-
fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
ip6_frag_ecn(hdr));
if (fq != NULL) {
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists