Make netfilter tables that use x_tables (iptables, ip6tables, arptables)
operate without locking on the receive path. Replace the existing
reader/writer lock with Read-Copy-Update to eliminate the overhead of a
read lock on each incoming packet. This should reduce the overhead of
iptables, especially on SMP systems.

The previous code used a reader-writer lock for two purposes. The first
was to ensure that the xt_table_info reference was not in the process of
being changed. Since xt_table_info is only freed via one routine, it was
a direct conversion to RCU.

The other use of the reader-writer lock was to block changes to counters
while they were being read. This is handled by swapping in a new set of
counter values and then summing the old ones. The sum is then restored
back on a single cpu.

Signed-off-by: Stephen Hemminger

---
 include/linux/netfilter/x_tables.h |   13 ++++
 net/ipv4/netfilter/arp_tables.c    |   92 ++++++++++++++++++++++++-----------
 net/ipv4/netfilter/ip_tables.c     |   97 ++++++++++++++++++++++++++-------------
 net/ipv6/netfilter/ip6_tables.c    |   97 +++++++++++++++++++++++--------------
 net/netfilter/x_tables.c           |   67 +++++++++++++++++++++----
 5 files changed, 258 insertions(+), 108 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
+++ b/include/linux/netfilter/x_tables.h	2009-02-03 15:44:21.743663216 -0800
@@ -353,7 +353,7 @@ struct xt_table
 	unsigned int valid_hooks;
 
 	/* Lock for the curtain */
-	rwlock_t lock;
+	struct mutex lock;
 
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
@@ -383,9 +383,15 @@ struct xt_table_info
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+	/* For the dustman... */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
+
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	char *entries[1];
+	void *entries[1];
 };
 
 #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
@@ -432,6 +438,9 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
+extern void xt_zero_table_entries(struct xt_table_info *info);
+extern void xt_swap_table_entries(struct xt_table_info *old,
+				  struct xt_table_info *new);
 
 #ifdef CONFIG_COMPAT
 #include
--- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
+++ b/net/ipv4/netfilter/ip_tables.c	2009-02-03 15:52:32.047583686 -0800
@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
 	mtpar.family = tgpar.family = NFPROTO_IPV4;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -924,13 +926,45 @@ get_counters(const struct xt_table_info
 				  counters,
 				  &i);
 	}
+
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IPT_ENTRY_ITERATE(t->entries[cpu],
+			  t->size,
+			  add_counter_to_entry,
+			  counters,
+			  &i);
+	local_bh_enable();
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *tmp;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
 
 	counters = vmalloc_node(countersize, numa_node_id());
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	tmp = xt_alloc_table_info(private->size);
+	if (!tmp)
+		goto free_counters;
+
+	xt_zero_table_entries(tmp);
+
+	mutex_lock(&table->lock);
+	xt_swap_table_entries(private, tmp);
+	synchronize_net();	/* Wait until smoke has cleared */
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	get_counters(tmp, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(tmp);
 
 	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1312,27 +1362,6 @@ do_replace(struct net *net, void __user
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1438,9 @@ do_add_counters(struct net *net, void __
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
 unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
 free:
--- a/net/netfilter/x_tables.c	2009-02-02 15:06:29.708249745 -0800
+++ b/net/netfilter/x_tables.c	2009-02-03 15:44:21.743663216 -0800
@@ -611,18 +611,61 @@ struct xt_table_info *xt_alloc_table_inf
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
 
-void xt_free_table_info(struct xt_table_info *info)
+/* Zero out entries */
+void xt_zero_table_entries(struct xt_table_info *info)
 {
-	int cpu;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		memset(info->entries[cpu], 0, info->size);
+}
+EXPORT_SYMBOL_GPL(xt_zero_table_entries);
+
+/* Swap existing entries with new zeroed ones */
+void xt_swap_table_entries(struct xt_table_info *oldinfo,
+			   struct xt_table_info *newinfo)
+{
+	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
+		void *p = oldinfo->entries[cpu];
+
+		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
+		newinfo->entries[cpu] = p;
 	}
+}
+EXPORT_SYMBOL_GPL(xt_swap_table_entries);
+
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
+{
+	int cpu;
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
+
+	for_each_possible_cpu(cpu)
+		vfree(info->entries[cpu]);
 	kfree(info);
 }
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+	if (info->size <= PAGE_SIZE) {
+		unsigned int cpu;
+		for_each_possible_cpu(cpu)
+			kfree(info->entries[cpu]);
+		kfree(info);
+	} else {
+		/* can't safely call vfree in current context */
+		INIT_WORK(&info->work, xt_free_table_info_work);
+		schedule_work(&info->work);
+	}
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+	call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
 EXPORT_SYMBOL(xt_free_table_info);
 
 /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
@@ -671,21 +714,22 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		write_unlock_bh(&table->lock);
+		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	table->private = newinfo;
+	rcu_assign_pointer(table->private, newinfo);
 	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);
+	synchronize_net();
 
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -719,7 +763,8 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	rwlock_init(&table->lock);
+	mutex_init(&table->lock);
+
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
--- a/net/ipv4/netfilter/arp_tables.c	2009-02-02 15:06:29.696250564 -0800
+++ b/net/ipv4/netfilter/arp_tables.c	2009-02-03 15:50:16.592791631 -0800
@@ -237,9 +237,10 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	read_lock_bh(&table->lock);
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
@@ -311,7 +312,8 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	read_unlock_bh(&table->lock);
+
+	rcu_read_unlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -714,11 +716,43 @@ static void get_counters(const struct xt
 	}
 }
 
-static inline struct xt_counters *alloc_counters(struct xt_table *table)
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	ARPT_ENTRY_ITERATE(t->entries[cpu],
+			   t->size,
+			   add_counter_to_entry,
+			   counters,
+			   &i);
+	local_bh_enable();
+}
+
+static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *tmp;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -728,14 +762,30 @@ static inline struct xt_counters *alloc_
 
 	counters = vmalloc_node(countersize, numa_node_id());
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	tmp = xt_alloc_table_info(private->size);
+	if (!tmp)
+		goto free_counters;
+
+	xt_zero_table_entries(tmp);
+
+	mutex_lock(&table->lock);
+	xt_swap_table_entries(private, tmp);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(tmp, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(tmp);
 
 	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1075,20 +1125,6 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
-				       const struct xt_counters addme[],
-				       unsigned int *i)
-{
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int do_add_counters(struct net *net, void __user *user,
			   unsigned int len, int compat)
@@ -1148,13 +1184,14 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1164,7 +1201,8 @@ static int do_add_counters(struct net *n
 			   paddc,
 			   &i);
 unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
+
 	xt_table_unlock(t);
 	module_put(t->me);
 free:
--- a/net/ipv6/netfilter/ip6_tables.c	2009-02-02 15:06:29.724250485 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-02-03 15:54:10.084493787 -0800
@@ -373,10 +373,12 @@ ip6t_do_table(struct sk_buff *skb,
 	mtpar.family = tgpar.family = NFPROTO_IPV6;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -474,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -955,11 +957,42 @@ get_counters(const struct xt_table_info
 	}
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IP6T_ENTRY_ITERATE(t->entries[cpu],
+			   t->size,
+			   add_counter_to_entry,
+			   counters,
+			   &i);
+	local_bh_enable();
+}
+
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *tmp;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -968,14 +1001,28 @@ static struct xt_counters *alloc_counter
 
 	counters = vmalloc_node(countersize, numa_node_id());
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	tmp = xt_alloc_table_info(private->size);
+	if (!tmp)
+		goto free_counters;
+
+	xt_zero_table_entries(tmp);
+
+	mutex_lock(&table->lock);
+	xt_swap_table_entries(private, tmp);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(tmp, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	xt_free_table_info(tmp);
 
-	return counters;
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1342,28 +1389,6 @@ do_replace(struct net *net, void __user
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1424,13 +1449,14 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1439,8 +1465,9 @@ do_add_counters(struct net *net, void __
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
 unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
 free:
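
P.S. For anyone who wants to play with the swap / wait / sum / restore
trick outside the kernel, here is a minimal userspace sketch built on
liburcu (the userspace RCU library), with synchronize_rcu() standing in
for the kernel's synchronize_net(). All the names here (struct counters,
count_packet, snapshot_counters) are invented for the example, and a
single shared counter pair stands in for the real per-cpu entry copies
that the patch walks with IPT_ENTRY_ITERATE; only the ordering of the
steps is the point.

/* build (assuming liburcu is installed):  gcc demo.c -o demo -lurcu */
#include <stdio.h>
#include <stdlib.h>
#include <urcu.h>		/* rcu_read_lock(), synchronize_rcu(), ... */

struct counters {
	unsigned long pcnt;	/* packets */
	unsigned long bcnt;	/* bytes */
};

static struct counters *table;	/* RCU-protected, like table->private */

/* receive path: no lock taken, analogous to ipt_do_table() */
static void count_packet(unsigned long bytes)
{
	struct counters *c;

	rcu_read_lock();
	c = rcu_dereference(table);
	c->pcnt++;		/* the kernel avoids making this a race  */
	c->bcnt += bytes;	/* by giving each cpu its own entry copy */
	rcu_read_unlock();
}

/* control path: swap in zeroed counters, wait for old readers to
 * drain, sum the old set, add the sum back; see alloc_counters() */
static void snapshot_counters(struct counters *snap)
{
	struct counters *old, *fresh;

	fresh = calloc(1, sizeof(*fresh));	/* xt_zero_table_entries() */
	old = table;				/* single updater here, no lock */
	rcu_assign_pointer(table, fresh);	/* xt_swap_table_entries() */
	synchronize_rcu();			/* wait until smoke has cleared */

	*snap = *old;				/* get_counters() */

	/* put_counters(): restore the totals onto the live set */
	fresh->pcnt += old->pcnt;
	fresh->bcnt += old->bcnt;

	free(old);				/* xt_free_table_info() */
}

int main(void)
{
	struct counters snap;

	rcu_register_thread();
	table = calloc(1, sizeof(*table));

	count_packet(100);
	count_packet(40);
	snapshot_counters(&snap);
	printf("%lu packets, %lu bytes\n", snap.pcnt, snap.bcnt);

	rcu_unregister_thread();
	return 0;
}

The ordering is what makes the snapshot trustworthy: once
synchronize_rcu() returns, no reader can still be updating the old
copy, so summing it needs no lock at all.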