[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <1269509574.3626.9.camel@edumazet-laptop>
Date: Thu, 25 Mar 2010 10:32:54 +0100
From: Eric Dumazet <eric.dumazet@...il.com>
To: Jorrit Kronjee <j.kronjee@...opact.nl>
Cc: netfilter-devel@...r.kernel.org, netdev <netdev@...r.kernel.org>,
Patrick McHardy <kaber@...sh.net>
Subject: Re: debugging kernel during packet drops
Le mercredi 24 mars 2010 à 17:22 +0100, Eric Dumazet a écrit :
> Le mercredi 24 mars 2010 à 16:20 +0100, Jorrit Kronjee a écrit :
> >
> > I hope this helps! Is there anything special I need to do to use RPS?
> >
>
> Sure this helps a lot !
>
> You might try RPS by doing :
>
> echo f >/sys/class/net/eth3/queues/rx-0/rps_cpus
>
> (But you'll also need a new xt_hashlimit module to make it more
> scalable, I can work on this this week if necessary)
>
Here is patch I cooked for xt_hashlimit (on top of net-next-2.6) to make
it use RCU and scale better in your case (allowing several concurrent
cpus once RPS is activated), but also on more general cases.
[PATCH] xt_hashlimit: RCU conversion
xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads.
After RCU conversion, central lock is only used when a writer wants to
add or delete an entry. For 'readers', updating an existing entry, they
use an individual lock per entry.
Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
---
net/netfilter/xt_hashlimit.c | 290 +++++++++++++++++----------------
1 file changed, 151 insertions(+), 139 deletions(-)
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9e9c489..a4f1b69 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -41,6 +41,54 @@ MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match");
MODULE_ALIAS("ipt_hashlimit");
MODULE_ALIAS("ip6t_hashlimit");
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+ this algorithm. The `average rate' in jiffies becomes your initial
+ amount of credit `credit' and the most credit you can ever have
+ `credit_cap'. The `peak rate' becomes the cost of passing the
+ test, `cost'.
+
+ `prev' tracks the last packet hit: you gain one credit per jiffy.
+ If you get credit balance more than this, the extra credit is
+ discarded. Every time the match passes, you lose `cost' credits;
+ if you don't have that many, the test fails.
+
+ See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+ To get the maximum range, we multiply by this factor (ie. you get N
+ credits per jiffy). We want to allow a rate as low as 1 per day
+ (slowest userspace tool allows), which means
+ CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/* Precision saver. */
+static inline u_int32_t
+user2credits(u_int32_t user)
+{
+ /* If multiplying would overflow... */
+ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+ /* Divide first. */
+ return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+ return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+}
+
struct hashlimit_net {
struct hlist_head htables;
struct proc_dir_entry *ipt_hashlimit;
@@ -80,12 +128,14 @@ struct dsthash_ent {
struct dsthash_dst dst;
/* modified structure members in the end */
+ spinlock_t lock;
unsigned long expires; /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
u_int32_t credit;
- u_int32_t credit_cap, cost;
+ u_int32_t cost;
} rateinfo;
+ struct rcu_head rcu;
};
struct xt_hashlimit_htable {
@@ -95,10 +145,12 @@ struct xt_hashlimit_htable {
bool rnd_initialized;
struct hashlimit_cfg1 cfg; /* config */
+ u_int32_t credit_cap;
/* used internally */
- spinlock_t lock; /* lock for list_head */
u_int32_t rnd; /* random seed for hash */
+
+ spinlock_t hlock; /* lock for writes to hash[]/count/rnd */
unsigned int count; /* number entries in table */
struct timer_list timer; /* timer for gc */
@@ -142,55 +194,27 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
u_int32_t hash = hash_dst(ht, dst);
if (!hlist_empty(&ht->hash[hash])) {
- hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
- if (dst_cmp(ent, dst))
+ hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+ if (dst_cmp(ent, dst)) {
+ spin_lock(&ent->lock);
return ent;
+ }
}
return NULL;
}
-/* allocate dsthash_ent, initialize dst, put in htable and lock it */
-static struct dsthash_ent *
-dsthash_alloc_init(struct xt_hashlimit_htable *ht,
- const struct dsthash_dst *dst)
-{
- struct dsthash_ent *ent;
-
- /* initialize hash with random val at the time we allocate
- * the first hashtable entry */
- if (!ht->rnd_initialized) {
- get_random_bytes(&ht->rnd, sizeof(ht->rnd));
- ht->rnd_initialized = true;
- }
-
- if (ht->cfg.max && ht->count >= ht->cfg.max) {
- /* FIXME: do something. question is what.. */
- if (net_ratelimit())
- printk(KERN_WARNING
- "xt_hashlimit: max count of %u reached\n",
- ht->cfg.max);
- return NULL;
- }
- ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
- if (!ent) {
- if (net_ratelimit())
- printk(KERN_ERR
- "xt_hashlimit: can't allocate dsthash_ent\n");
- return NULL;
- }
- memcpy(&ent->dst, dst, sizeof(ent->dst));
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+ struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
- hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
- ht->count++;
- return ent;
+ kmem_cache_free(hashlimit_cachep, ent);
}
-static inline void
-dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
+static void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
{
- hlist_del(&ent->node);
- kmem_cache_free(hashlimit_cachep, ent);
+ hlist_del_rcu(&ent->node);
+ call_rcu_bh(&ent->rcu, dsthash_free_rcu);
ht->count--;
}
static void htable_gc(unsigned long htlong);
@@ -243,11 +267,12 @@ static int htable_create_v0(struct net *net, struct xt_hashlimit_info *minfo, u_
for (i = 0; i < hinfo->cfg.size; i++)
INIT_HLIST_HEAD(&hinfo->hash[i]);
+ hinfo->credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
hinfo->use = 1;
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
- spin_lock_init(&hinfo->lock);
+ spin_lock_init(&hinfo->hlock);
hinfo->pde = proc_create_data(minfo->name, 0,
(family == NFPROTO_IPV4) ?
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
@@ -305,11 +330,12 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
for (i = 0; i < hinfo->cfg.size; i++)
INIT_HLIST_HEAD(&hinfo->hash[i]);
+ hinfo->credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
hinfo->use = 1;
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
- spin_lock_init(&hinfo->lock);
+ spin_lock_init(&hinfo->hlock);
hinfo->pde = proc_create_data(minfo->name, 0,
(family == NFPROTO_IPV4) ?
@@ -349,7 +375,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
unsigned int i;
/* lock hash table and iterate over it */
- spin_lock_bh(&ht->lock);
+ spin_lock_bh(&ht->hlock);
for (i = 0; i < ht->cfg.size; i++) {
struct dsthash_ent *dh;
struct hlist_node *pos, *n;
@@ -358,7 +384,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
dsthash_free(ht, dh);
}
}
- spin_unlock_bh(&ht->lock);
+ spin_unlock_bh(&ht->hlock);
}
/* hash table garbage collector, run by timer */
@@ -417,59 +443,12 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
mutex_unlock(&hashlimit_mutex);
}
-/* The algorithm used is the Simple Token Bucket Filter (TBF)
- * see net/sched/sch_tbf.c in the linux source tree
- */
-
-/* Rusty: This is my (non-mathematically-inclined) understanding of
- this algorithm. The `average rate' in jiffies becomes your initial
- amount of credit `credit' and the most credit you can ever have
- `credit_cap'. The `peak rate' becomes the cost of passing the
- test, `cost'.
-
- `prev' tracks the last packet hit: you gain one credit per jiffy.
- If you get credit balance more than this, the extra credit is
- discarded. Every time the match passes, you lose `cost' credits;
- if you don't have that many, the test fails.
-
- See Alexey's formal explanation in net/sched/sch_tbf.c.
-
- To get the maximum range, we multiply by this factor (ie. you get N
- credits per jiffy). We want to allow a rate as low as 1 per day
- (slowest userspace tool allows), which means
- CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
-*/
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
-
-/* Repeated shift and or gives us all 1s, final shift and add 1 gives
- * us the power of 2 below the theoretical max, so GCC simply does a
- * shift. */
-#define _POW2_BELOW2(x) ((x)|((x)>>1))
-#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
-#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
-#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
-#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
-#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
-
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
-
-/* Precision saver. */
-static inline u_int32_t
-user2credits(u_int32_t user)
-{
- /* If multiplying would overflow... */
- if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
- /* Divide first. */
- return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
-
- return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
-}
-
-static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+ struct xt_hashlimit_htable *hinfo)
{
dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY;
- if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
- dh->rateinfo.credit = dh->rateinfo.credit_cap;
+ if (dh->rateinfo.credit > hinfo->credit_cap)
+ dh->rateinfo.credit = hinfo->credit_cap;
dh->rateinfo.prev = now;
}
@@ -563,9 +542,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
&_ports);
break;
default:
- _ports[0] = _ports[1] = 0;
- ports = _ports;
- break;
+ return 0;
}
if (!ports)
return -1;
@@ -576,6 +553,48 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
return 0;
}
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+dsthash_alloc_init(struct xt_hashlimit_htable *ht,
+ const struct dsthash_dst *dst)
+{
+ struct dsthash_ent *ent = NULL;
+
+ spin_lock(&ht->hlock);
+ /* initialize hash with random val at the time we allocate
+ * the first hashtable entry */
+ if (unlikely(!ht->rnd_initialized)) {
+ get_random_bytes(&ht->rnd, sizeof(ht->rnd));
+ ht->rnd_initialized = true;
+ }
+
+ if (ht->cfg.max && ht->count >= ht->cfg.max) {
+ /* FIXME: do something. question is what.. */
+ if (net_ratelimit())
+ printk(KERN_WARNING
+ "xt_hashlimit: max count of %u reached\n",
+ ht->cfg.max);
+ } else
+ ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+ if (!ent) {
+ if (net_ratelimit())
+ pr_err("xt_hashlimit: can't allocate dsthash_ent\n");
+ } else {
+ memcpy(&ent->dst, dst, sizeof(ent->dst));
+ spin_lock_init(&ent->lock);
+ ent->expires = jiffies + msecs_to_jiffies(ht->cfg.expire);
+ ent->rateinfo.prev = jiffies;
+ ent->rateinfo.credit = ht->credit_cap;
+ ent->rateinfo.cost = user2credits(ht->cfg.avg);
+ spin_lock(&ent->lock);
+
+ hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+ ht->count++;
+ }
+ spin_unlock(&ht->hlock);
+ return ent;
+}
+
static bool
hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
{
@@ -588,38 +607,30 @@ hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
- spin_lock_bh(&hinfo->lock);
+ rcu_read_lock_bh();
dh = dsthash_find(hinfo, &dst);
if (!dh) {
dh = dsthash_alloc_init(hinfo, &dst);
if (!dh) {
- spin_unlock_bh(&hinfo->lock);
+ rcu_read_unlock_bh();
goto hotdrop;
}
-
- dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
- dh->rateinfo.prev = jiffies;
- dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now);
+ rateinfo_recalc(dh, now, hinfo);
}
if (dh->rateinfo.credit >= dh->rateinfo.cost) {
/* We're underlimit. */
dh->rateinfo.credit -= dh->rateinfo.cost;
- spin_unlock_bh(&hinfo->lock);
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
return true;
}
- spin_unlock_bh(&hinfo->lock);
-
- /* default case: we're overlimit, thus don't match */
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
return false;
hotdrop:
@@ -639,36 +650,31 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
- spin_lock_bh(&hinfo->lock);
+ rcu_read_lock_bh();
dh = dsthash_find(hinfo, &dst);
if (dh == NULL) {
dh = dsthash_alloc_init(hinfo, &dst);
if (dh == NULL) {
- spin_unlock_bh(&hinfo->lock);
+ rcu_read_unlock_bh();
goto hotdrop;
}
- dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
- dh->rateinfo.prev = jiffies;
- dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
- hinfo->cfg.burst);
- dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
} else {
/* update expiration timeout */
dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
- rateinfo_recalc(dh, now);
+ rateinfo_recalc(dh, now, hinfo);
}
if (dh->rateinfo.credit >= dh->rateinfo.cost) {
/* below the limit */
dh->rateinfo.credit -= dh->rateinfo.cost;
- spin_unlock_bh(&hinfo->lock);
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
}
- spin_unlock_bh(&hinfo->lock);
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
/* default match is underlimit - so over the limit, we need to invert */
return info->cfg.mode & XT_HASHLIMIT_INVERT;
@@ -843,12 +849,12 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
/* PROC stuff */
static void *dl_seq_start(struct seq_file *s, loff_t *pos)
- __acquires(htable->lock)
+ __acquires(htable->hlock)
{
struct xt_hashlimit_htable *htable = s->private;
unsigned int *bucket;
- spin_lock_bh(&htable->lock);
+ spin_lock_bh(&htable->hlock);
if (*pos >= htable->cfg.size)
return NULL;
@@ -874,46 +880,52 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void dl_seq_stop(struct seq_file *s, void *v)
- __releases(htable->lock)
+ __releases(htable->hlock)
{
struct xt_hashlimit_htable *htable = s->private;
unsigned int *bucket = (unsigned int *)v;
kfree(bucket);
- spin_unlock_bh(&htable->lock);
+ spin_unlock_bh(&htable->hlock);
}
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
- struct seq_file *s)
+ struct seq_file *s, struct xt_hashlimit_htable *htable)
{
+ int res = 0;
+
+ spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
- rateinfo_recalc(ent, jiffies);
+ rateinfo_recalc(ent, jiffies, htable);
switch (family) {
case NFPROTO_IPV4:
- return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+ res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip.src,
ntohs(ent->dst.src_port),
&ent->dst.ip.dst,
ntohs(ent->dst.dst_port),
- ent->rateinfo.credit, ent->rateinfo.credit_cap,
+ ent->rateinfo.credit, htable->credit_cap,
ent->rateinfo.cost);
+ break;
#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
case NFPROTO_IPV6:
- return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+ res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip6.src,
ntohs(ent->dst.src_port),
&ent->dst.ip6.dst,
ntohs(ent->dst.dst_port),
- ent->rateinfo.credit, ent->rateinfo.credit_cap,
+ ent->rateinfo.credit, htable->credit_cap,
ent->rateinfo.cost);
+ break;
#endif
default:
BUG();
- return 0;
}
+ spin_unlock(&ent->lock);
+ return res;
}
static int dl_seq_show(struct seq_file *s, void *v)
@@ -925,7 +937,7 @@ static int dl_seq_show(struct seq_file *s, void *v)
if (!hlist_empty(&htable->hash[*bucket])) {
hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node)
- if (dl_seq_real_show(ent, htable->family, s))
+ if (dl_seq_real_show(ent, htable->family, s, htable))
return -1;
}
return 0;
@@ -1037,9 +1049,9 @@ err1:
static void __exit hashlimit_mt_exit(void)
{
- kmem_cache_destroy(hashlimit_cachep);
xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
unregister_pernet_subsys(&hashlimit_net_ops);
+ kmem_cache_destroy(hashlimit_cachep);
}
module_init(hashlimit_mt_init);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists