Message-Id: <6512d94713d40f1d572d2023168c48990f0d9cf0.1530798211.git.pabeni@redhat.com>
Date: Fri, 6 Jul 2018 12:10:28 +0200
From: Paolo Abeni <pabeni@...hat.com>
To: netdev@...r.kernel.org
Cc: "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Florian Westphal <fw@...len.de>, NeilBrown <neilb@...e.com>
Subject: [RFC PATCH] ip: re-introduce fragments cache worker

Currently, the IP fragment cache is fragile under overload. With
flow control disabled:
./super_netperf.sh 10 -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08
./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60
28.66

Once the overload condition is reached, the system does not
recover until it is almost completely idle:
./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
13.72

This is due to the removal of the fragment cache worker, which
was responsible for freeing some IP fragment cache memory when the
high threshold was reached, allowing the system to cope with
subsequent fragmented packets.

This commit re-introduces the worker, on a per-netns basis. Thanks
to the rhashtable walkers, BHs need to be disabled only around each
individual entry removal, as sketched below.
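
Concretely, the worker walks the per-netns rhashtable and takes the
per-queue lock, with BHs disabled, only around each single eviction.
A condensed, illustrative sketch of that pattern follows (the function
name is made up; the real worker in the patch below additionally
handles -EAGAIN from the walker, walk restarts and the
INETFRAGS_EVICT_MAX cap):

static void evict_sketch(struct netns_frags *nf)
{
	struct inet_frag_queue *fq;

	rhashtable_walk_start(&nf->iter);
	while (frag_mem_limit(nf) > nf->low_thresh) {
		fq = rhashtable_walk_next(&nf->iter);
		if (IS_ERR_OR_NULL(fq))
			break;
		if (!refcount_inc_not_zero(&fq->refcnt))
			continue;

		/* BHs are disabled only around this single removal */
		spin_lock_bh(&fq->lock);
		inet_frag_kill(fq);
		spin_unlock_bh(&fq->lock);
		inet_frag_put(fq);
	}
	rhashtable_walk_stop(&nf->iter);
}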

After this commit (matching the figures from before the IP frag
worker removal):
./super_netperf.sh 10 -H 192.168.101.2 -t UDP_STREAM -l 60
9618.08
./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60
8599.77
./super_netperf.sh 200 -H 192.168.101.2 -t UDP_STREAM -l 60 &
sleep 4; I=0;
for P in `pidof netperf`; do kill -9 $P; I=$((I+1)); [ $I -gt 190 ] && break; done
9623.12

Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units")
Signed-off-by: Paolo Abeni <pabeni@...hat.com>
---
Note: tweaking the ipfrag sysctls does not completely solve the issue:
- raising ipfrag_high_thresh increases the number of parallel
  connections required to degrade the tput, but once the IP
  fragment cache capacity is reached the goodput still drops almost
  to 0; with the worker we get a much nicer behaviour.
- setting ipfrag_time to 2 increases the chance to recover from
  overload (the 2nd test above), but across several runs of that
  test I got on average 50% of the expected tput with a very
  large variance; with the worker we always see the expected/
  line-rate tput.
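
For context, the sysctls above only tune the per-netns thresholds
that the worker consults; the per-netns wiring itself is unchanged.
A slightly simplified sketch of the existing ipv4 side (comments and
the max_dist default are omitted; treat it as illustrative rather
than verbatim):

static int __net_init ipv4_frags_init_net(struct net *net)
{
	int res;

	/* defaults, later overridable via the ipfrag_* sysctls */
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
	net->ipv4.frags.timeout = IP_FRAG_TIME;
	net->ipv4.frags.f = &ip4_frags;

	/* with this patch, this also sets up the per-netns worker
	 * and the rhashtable walker it uses
	 */
	res = inet_frags_init_net(&net->ipv4.frags);
	if (res < 0)
		return res;

	res = ip4_frags_ns_ctl_register(net);
	if (res < 0)
		inet_frags_exit_net(&net->ipv4.frags);
	return res;
}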
---
include/net/inet_frag.h | 8 ++---
net/ipv4/inet_fragment.c | 72 ++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 7 deletions(-)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..1f12692d7f7d 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -11,6 +11,8 @@ struct netns_frags {
int timeout;
int max_dist;
struct inet_frags *f;
+ struct work_struct frags_work;
+ struct rhashtable_iter iter;
struct rhashtable rhashtable ____cacheline_aligned_in_smp;
@@ -101,11 +103,7 @@ struct inet_frags {
int inet_frags_init(struct inet_frags *);
void inet_frags_fini(struct inet_frags *);
-static inline int inet_frags_init_net(struct netns_frags *nf)
-{
- atomic_long_set(&nf->mem, 0);
- return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
-}
+int inet_frags_init_net(struct netns_frags *nf);
void inet_frags_exit_net(struct netns_frags *nf);
void inet_frag_kill(struct inet_frag_queue *q);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..0f5b29ce96de 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -88,10 +88,76 @@ static void inet_frags_free_cb(void *ptr, void *arg)
inet_frag_put(fq);
}
+static void inet_frag_schedule_worker(struct netns_frags *nf)
+{
+ if (unlikely(!work_pending(&nf->frags_work)))
+ schedule_work(&nf->frags_work);
+}
+
+#define INETFRAGS_EVICT_MAX 64
+static void inet_frag_worker(struct work_struct *work)
+{
+ struct netns_frags *nf;
+ bool reschedule;
+ int evicted = 0;
+
+ nf = container_of(work, struct netns_frags, frags_work);
+
+ rhashtable_walk_start(&nf->iter);
+
+ while ((reschedule = (frag_mem_limit(nf) > nf->low_thresh))) {
+ struct inet_frag_queue *fq = rhashtable_walk_next(&nf->iter);
+
+ if (IS_ERR(fq) && PTR_ERR(fq) == -EAGAIN)
+ continue;
+ if (!fq) {
+ /* end of table, restart the walk */
+ rhashtable_walk_stop(&nf->iter);
+ rhashtable_walk_exit(&nf->iter);
+ rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+ rhashtable_walk_start(&nf->iter);
+ continue;
+ }
+ if (!refcount_inc_not_zero(&fq->refcnt))
+ continue;
+
+ spin_lock_bh(&fq->lock);
+ inet_frag_kill(fq);
+ spin_unlock_bh(&fq->lock);
+ inet_frag_put(fq);
+
+ /* limit the amount of work we can do before a reschedule,
+ * to avoid starving other queued work
+ */
+ if (++evicted > INETFRAGS_EVICT_MAX)
+ break;
+ }
+
+ rhashtable_walk_stop(&nf->iter);
+
+ if (reschedule)
+ inet_frag_schedule_worker(nf);
+}
+
+int inet_frags_init_net(struct netns_frags *nf)
+{
+ int ret;
+
+ atomic_long_set(&nf->mem, 0);
+ INIT_WORK(&nf->frags_work, inet_frag_worker);
+ ret = rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
+ if (ret)
+ return ret;
+ rhashtable_walk_enter(&nf->rhashtable, &nf->iter);
+ return ret;
+}
+EXPORT_SYMBOL(inet_frags_init_net);
+
void inet_frags_exit_net(struct netns_frags *nf)
{
nf->low_thresh = 0; /* prevent creation of new frags */
-
+ cancel_work_sync(&nf->frags_work);
+ rhashtable_walk_exit(&nf->iter);
rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
EXPORT_SYMBOL(inet_frags_exit_net);
@@ -157,8 +223,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
{
struct inet_frag_queue *q;
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+ if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
+ inet_frag_schedule_worker(nf);
return NULL;
+ }
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (!q)
--
2.17.1