Message-Id: <6512d94713d40f1d572d2023168c48990f0d9cf0.1530798211.git.pabeni@redhat.com>
Date:   Fri,  6 Jul 2018 12:10:28 +0200
From:   Paolo Abeni <pabeni@...hat.com>
To:     netdev@...r.kernel.org
Cc:     "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Florian Westphal <fw@...len.de>, NeilBrown <neilb@...e.com>
Subject: [RFC PATCH] ip: re-introduce fragments cache worker

Currently, the IP fragment cache is fragile under overload. With
flow control disabled:

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08
./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
28.66

Once the overload condition is reached, the system does not
recover until it is almost completely idle:

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
13.72
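
(super_netperf.sh is not included here; the usual wrapper by that
name is roughly the sketch below - an assumption, not the exact
script used - which starts N netperf instances in parallel with the
given arguments and sums the single throughput figure each prints:)

#!/bin/bash
# hypothetical super_netperf-style wrapper: $1 is the instance
# count, the remaining arguments are passed straight to netperf
run_netperf() {
	local loops=$1; shift
	for ((i = 0; i < loops; i++)); do
		netperf -P 0 -v 0 "$@" &
	done
	wait
}
run_netperf "$@" | awk '{ sum += $1 } END { print sum }'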

This is due to the removal of the fragment cache worker, which
was responsible for freeing some IP fragment cache memory when the
high threshold was reached, allowing the system to cope with
subsequent fragmented packets.
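
While the test runs, the fragment cache memory accounting and the
thresholds the worker acts on can be observed directly (a sketch,
not required by this patch):

# current IP fragment cache memory, from the FRAG line
grep FRAG /proc/net/sockstat
# eviction thresholds, in bytes
sysctl net.ipv4.ipfrag_low_thresh net.ipv4.ipfrag_high_thresh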

This commit re-introduces the worker, on a per-netns basis. Thanks
to rhashtable walkers we need to block BHs only for the removal of
a single entry.

After this commit (matching what we had before the IP frag worker
removal):

./super_netperf.sh 10  -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60
8599.77

./super_netperf.sh 200  -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
9623.12

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pabeni@...hat.com>
---
Note: tweaking the ipfrag sysctls does not completely solve the issue:
- raising ipfrag_high_thresh increases the number of parallel
  connections required to degrade the tput, but once the IP
  fragment cache capacity is reached the goodput still drops almost
  to 0; with the worker we get a much nicer behaviour.
- setting ipfrag_time to 2 increases the chance of recovering from
  overload (test #2 above), but over several runs of that test I got
  on average 50% of the expected tput, with a very large variance;
  with the worker we always see the expected/line-rate tput.
The relevant sysctl knobs are shown below for reference.
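
(A sketch of those knobs, with purely illustrative values; all of
them are per-netns:)

# IP fragment cache limits (bytes) and fragment expiry (seconds)
sysctl -w net.ipv4.ipfrag_high_thresh=16777216
sysctl -w net.ipv4.ipfrag_low_thresh=15728640
sysctl -w net.ipv4.ipfrag_time=2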
---
 include/net/inet_frag.h  |  8 ++---
 net/ipv4/inet_fragment.c | 72 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..1f12692d7f7d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -11,6 +11,8 @@ struct netns_frags {
 	int			timeout;
 	int			max_dist;
 	struct inet_frags	*f;
+	struct work_struct	frags_work;
+	struct rhashtable_iter	iter;
 
 	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 
@@ -101,11 +103,7 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline int inet_frags_init_net(struct netns_frags *nf)
-{
-	atomic_long_set(&nf->mem, 0);
-	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
-}
+int inet_frags_init_net(struct netns_frags *nf);
 void inet_frags_exit_net(struct netns_frags *nf);
 
 void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..0f5b29ce96de 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -88,10 +88,76 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 	inet_frag_put(fq);
 }
 
+static void inet_frag_schedule_worker(struct netns_frags *nf)
+{
+	if (unlikely(!work_pending(&nf->frags_work)))
+		schedule_work(&nf->frags_work);
+}
+
+#define INETFRAGS_EVICT_MAX	64
+static void inet_frag_worker(struct work_struct *work)
+{
+	struct netns_frags *nf;
+	bool reschedule;
+	int evicted = 0;
+
+	nf = container_of(work, struct netns_frags, frags_work);
+
+	rhashtable_walk_start(&nf->iter);
+
+	while ((reschedule = (frag_mem_limit(nf) > nf->low_thresh))) {
+		struct inet_frag_queue *fq = rhashtable_walk_next(&nf->iter);
+
+		if (IS_ERR(fq) && PTR_ERR(fq) == -EAGAIN)
+			continue;
+		if (!fq) {
+			/* end of table, restart the walk */
+			rhashtable_walk_stop(&nf->iter);
+			rhashtable_walk_exit(&nf->iter);
+			rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+			rhashtable_walk_start(&nf->iter);
+			continue;
+		}
+		if (!refcount_inc_not_zero(&fq->refcnt))
+			continue;
+
+		spin_lock_bh(&fq->lock);
+		inet_frag_kill(fq);
+		spin_unlock_bh(&fq->lock);
+		inet_frag_put(fq);
+
+		/* limit the amount of work we can do before a reschedule,
+	 * to avoid starving other queued works
+		 */
+		if (++evicted > INETFRAGS_EVICT_MAX)
+			break;
+	}
+
+	rhashtable_walk_stop(&nf->iter);
+
+	if (reschedule)
+		inet_frag_schedule_worker(nf);
+}
+
+int inet_frags_init_net(struct netns_frags *nf)
+{
+	int ret;
+
+	atomic_long_set(&nf->mem, 0);
+	INIT_WORK(&nf->frags_work, inet_frag_worker);
+	ret = rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+	if (ret)
+		return ret;
+	rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+	return ret;
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
 void inet_frags_exit_net(struct netns_frags *nf)
 {
 	nf->low_thresh = 0; /* prevent creation of new frags */
-
+	cancel_work_sync(&nf->frags_work);
+	rhashtable_walk_exit(&nf->iter);
 	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
@@ -157,8 +223,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
+		inet_frag_schedule_worker(nf);
 		return NULL;
+	}
 
 	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
 	if (!q)
-- 
2.17.1
