[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <49E49C65.8060808@cosmosbay.com>
Date: Tue, 14 Apr 2009 16:23:33 +0200
From: Eric Dumazet <dada1@...mosbay.com>
To: Patrick McHardy <kaber@...sh.net>
CC: Stephen Hemminger <shemminger@...tta.com>,
paulmck@...ux.vnet.ibm.com, David Miller <davem@...emloft.net>,
paulus@...ba.org, mingo@...e.hu, torvalds@...ux-foundation.org,
laijs@...fujitsu.com, jeff.chua.linux@...il.com,
jengelh@...ozas.de, r000n@...0n.net, linux-kernel@...r.kernel.org,
netfilter-devel@...r.kernel.org, netdev@...r.kernel.org,
benh@...nel.crashing.org
Subject: Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
Patrick McHardy a écrit :
> Stephen Hemminger wrote:
>> This is an alternative version of ip/ip6/arp tables locking using
>> per-cpu locks. This avoids the overhead of synchronize_net() during
>> update but still removes the expensive rwlock in earlier versions.
>>
>> The idea for this came from an earlier version done by Eric Duzamet.
>> Locking is done per-cpu, the fast path locks on the current cpu
>> and updates counters. The slow case involves acquiring the locks on
>> all cpu's.
>>
>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
>> there already is a mutex for xt[af].mutex that is held.
>>
>> Tested basic functionality (add/remove/list), but don't have test cases
>> for stress, ip6tables or arptables.
>
> Thanks Stephen, I'll do some testing with ip6tables.
Here is the patch I cooked on top of Stephen one to get proper locking.
In the "iptables -L" case, we freeze updates on all cpus to get previous
RCU behavior (not sure it is mandatory, but anyway...)
And xt_replace_table() uses same logic to make sure a cpu wont try to parse rules
and update counters while a writer is replacing tables (and thus calling
vfree() and unmapping in-use pages)
Feel free to merge this patch to Stephen one before upstream submission
Thank you
Signed-off-by: Eric Dumazet <dada1@...mosbay.com>
---
include/linux/netfilter/x_tables.h | 5 ++
net/ipv4/netfilter/arp_tables.c | 20 +++------
net/ipv4/netfilter/ip_tables.c | 24 ++++-------
net/ipv6/netfilter/ip6_tables.c | 24 ++++-------
net/netfilter/x_tables.c | 55 ++++++++++++++++++++++++++-
5 files changed, 84 insertions(+), 44 deletions(-)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1ff1a76..a5840a4 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -426,6 +426,11 @@ extern struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name);
extern void xt_table_unlock(struct xt_table *t);
+extern void xt_tlock_lockall(void);
+extern void xt_tlock_unlockall(void);
+extern void xt_tlock_lock(void);
+extern void xt_tlock_unlock(void);
+
extern int xt_proto_init(struct net *net, u_int8_t af);
extern void xt_proto_fini(struct net *net, u_int8_t af);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c60cc11..b561e1e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -231,8 +231,6 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
return (struct arpt_entry *)(base + offset);
}
-static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
-
unsigned int arpt_do_table(struct sk_buff *skb,
unsigned int hook,
const struct net_device *in,
@@ -256,7 +254,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
outdev = out ? out->name : nulldevname;
local_bh_disable();
- spin_lock(&__get_cpu_var(arp_tables_lock));
+ xt_tlock_lock();
private = table->private;
table_base = private->entries[smp_processor_id()];
@@ -331,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
- spin_unlock(&__get_cpu_var(arp_tables_lock));
+ xt_tlock_unlock();
local_bh_enable();
if (hotdrop)
@@ -709,33 +707,31 @@ static void get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i = 0;
- unsigned int curcpu = raw_smp_processor_id();
+ unsigned int curcpu;
+ xt_tlock_lockall();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
- * We dont care about preemption here.
*/
- spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
+ curcpu = smp_processor_id();
ARPT_ENTRY_ITERATE(t->entries[curcpu],
t->size,
set_entry_to_counter,
counters,
&i);
- spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
- spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
- spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
}
+ xt_tlock_unlockall();
}
static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1181,14 +1177,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
/* Choose the copy that is on our node */
local_bh_disable();
curcpu = smp_processor_id();
- spin_lock(&__get_cpu_var(arp_tables_lock));
+ xt_tlock_lock();
loc_cpu_entry = private->entries[curcpu];
ARPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc,
&i);
- spin_unlock(&__get_cpu_var(arp_tables_lock));
+ xt_tlock_unlock();
local_bh_enable();
unlock_up_free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cb3b779..81d173e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -297,7 +297,6 @@ static void trace_packet(struct sk_buff *skb,
}
#endif
-static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
@@ -342,7 +341,7 @@ ipt_do_table(struct sk_buff *skb,
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
- spin_lock(&__get_cpu_var(ip_tables_lock));
+ xt_tlock_lock();
private = table->private;
table_base = private->entries[smp_processor_id()];
@@ -439,7 +438,7 @@ ipt_do_table(struct sk_buff *skb,
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
- spin_unlock(&__get_cpu_var(ip_tables_lock));
+ xt_tlock_unlock();
local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
@@ -895,34 +894,32 @@ get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i = 0;
- unsigned int curcpu = raw_smp_processor_id();
+ unsigned int curcpu;
+ xt_tlock_lockall();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
- * We dont care about preemption here.
*/
- spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
+ curcpu = smp_processor_id();
IPT_ENTRY_ITERATE(t->entries[curcpu],
t->size,
set_entry_to_counter,
counters,
&i);
- spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
- spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
- spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
}
+ xt_tlock_unlockall();
}
static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -1393,14 +1390,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
local_bh_disable();
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
- spin_lock(&__get_cpu_var(ip_tables_lock));
+ xt_tlock_lock();
loc_cpu_entry = private->entries[curcpu];
IPT_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc,
&i);
- spin_unlock(&__get_cpu_var(ip_tables_lock));
+ xt_tlock_unlock();
local_bh_enable();
unlock_up_free:
@@ -2220,10 +2217,7 @@ static struct pernet_operations ip_tables_net_ops = {
static int __init ip_tables_init(void)
{
- int cpu, ret;
-
- for_each_possible_cpu(cpu)
- spin_lock_init(&per_cpu(ip_tables_lock, cpu));
+ int ret;
ret = register_pernet_subsys(&ip_tables_net_ops);
if (ret < 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ac46ca4..d6ba69e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,7 +329,6 @@ static void trace_packet(struct sk_buff *skb,
}
#endif
-static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
@@ -368,7 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
- spin_lock(&__get_cpu_var(ip6_tables_lock));
+ xt_tlock_lock();
private = table->private;
table_base = private->entries[smp_processor_id()];
@@ -469,7 +468,7 @@ ip6t_do_table(struct sk_buff *skb,
#ifdef CONFIG_NETFILTER_DEBUG
((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
#endif
- spin_unlock(&__get_cpu_var(ip6_tables_lock));
+ xt_tlock_unlock();
local_bh_enable();
#ifdef DEBUG_ALLOW_ALL
@@ -925,33 +924,31 @@ get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i = 0;
- unsigned int curcpu = raw_smp_processor_id();
+ unsigned int curcpu;
+ xt_tlock_lockall();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
* with data used by 'current' CPU
- * We dont care about preemption here.
*/
- spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
+ curcpu = smp_processor_id();
IP6T_ENTRY_ITERATE(t->entries[curcpu],
t->size,
set_entry_to_counter,
counters,
&i);
- spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
- spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
IP6T_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
&i);
- spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
}
+ xt_tlock_unlockall();
}
static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1423,14 +1420,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
local_bh_disable();
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
- spin_lock(&__get_cpu_var(ip6_tables_lock));
+ xt_tlock_lock();
loc_cpu_entry = private->entries[curcpu];
IP6T_ENTRY_ITERATE(loc_cpu_entry,
private->size,
add_counter_to_entry,
paddc,
&i);
- spin_unlock(&__get_cpu_var(ip6_tables_lock));
+ xt_tlock_unlock();
local_bh_enable();
unlock_up_free:
@@ -2248,10 +2245,7 @@ static struct pernet_operations ip6_tables_net_ops = {
static int __init ip6_tables_init(void)
{
- int cpu, ret;
-
- for_each_possible_cpu(cpu)
- spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
+ int ret;
ret = register_pernet_subsys(&ip6_tables_net_ops);
if (ret < 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0d94020..f2ad79f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -670,19 +670,21 @@ xt_replace_table(struct xt_table *table,
{
struct xt_table_info *oldinfo, *private;
+ xt_tlock_lockall();
/* Do the substitution. */
private = table->private;
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
+ xt_tlock_unlockall();
*error = -EAGAIN;
return NULL;
}
oldinfo = private;
table->private = newinfo;
newinfo->initial_entries = oldinfo->initial_entries;
-
+ xt_tlock_unlockall();
return oldinfo;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1126,9 +1128,58 @@ static struct pernet_operations xt_net_ops = {
.init = xt_net_init,
};
+static DEFINE_PER_CPU(spinlock_t, xt_tables_lock);
+
+void xt_tlock_lockall(void)
+{
+ int cpu;
+
+ local_bh_disable();
+ preempt_disable();
+ for_each_possible_cpu(cpu) {
+ spin_lock(&per_cpu(xt_tables_lock, cpu));
+ /*
+ * avoid preempt counter overflow
+ */
+ preempt_enable_no_resched();
+ }
+}
+EXPORT_SYMBOL(xt_tlock_lockall);
+
+void xt_tlock_unlockall(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ preempt_disable();
+ spin_unlock(&per_cpu(xt_tables_lock, cpu));
+ }
+ preempt_enable();
+ local_bh_enable();
+}
+EXPORT_SYMBOL(xt_tlock_unlockall);
+
+/*
+ * preemption should be disabled by caller
+ */
+void xt_tlock_lock(void)
+{
+ spin_lock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_lock);
+
+void xt_tlock_unlock(void)
+{
+ spin_unlock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_unlock);
+
static int __init xt_init(void)
{
- int i, rv;
+ int i, rv, cpu;
+
+ for_each_possible_cpu(cpu)
+ spin_lock_init(&per_cpu(xt_tables_lock, cpu));
xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
if (!xt)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists