[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4989071E.5000803@cosmosbay.com>
Date: Wed, 04 Feb 2009 04:10:22 +0100
From: Eric Dumazet <dada1@...mosbay.com>
To: Stephen Hemminger <shemminger@...tta.com>
CC: David Miller <davem@...emloft.net>,
Patrick McHardy <kaber@...sh.net>,
Rick Jones <rick.jones2@...com>,
"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
netdev@...r.kernel.org, netfilter-devel@...r.kernel.org
Subject: Re: [RFT 3/3] iptables: lock free counters
Stephen Hemminger a écrit :
> Make netfilter tables that use x_tables (iptables, ip6tables, arptables)
> operate without locking on the receive path.
> Replace existing reader/writer lock with Read-Copy-Update to
> eliminate the overhead of a read lock on each incoming packet.
> This should reduce the overhead of iptables especially on SMP
> systems.
>
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in
> process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
>
> The other use of the reader-writer lock was to block changes
> to counters while they were being read. This is handled by swapping in
> a new set of counter values and then summing the old ones. The sum
> is then restored back on a single cpu.
>
> Signed-off-by: Stephen Hemminger <shemminger@...tta.com>
>
> ---
> include/linux/netfilter/x_tables.h | 13 ++++
> net/ipv4/netfilter/arp_tables.c | 92 ++++++++++++++++++++++++-----------
> net/ipv4/netfilter/ip_tables.c | 97 ++++++++++++++++++++++++-------------
> net/ipv6/netfilter/ip6_tables.c | 97 +++++++++++++++++++++++--------------
> net/netfilter/x_tables.c | 67 +++++++++++++++++++++----
> 5 files changed, 258 insertions(+), 108 deletions(-)
>
> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h 2009-02-03 15:44:21.743663216 -0800
> @@ -353,7 +353,7 @@ struct xt_table
> unsigned int valid_hooks;
>
> /* Lock for the curtain */
> - rwlock_t lock;
> + struct mutex lock;
>
> /* Man behind the curtain... */
> struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
> unsigned int hook_entry[NF_INET_NUMHOOKS];
> unsigned int underflow[NF_INET_NUMHOOKS];
>
> + /* For the dustman... */
> + union {
> + struct rcu_head rcu;
> + struct work_struct work;
> + };
> +
> /* ipt_entry tables: one per CPU */
> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> - char *entries[1];
> + void *entries[1];
> };
>
> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> @@ -432,6 +438,9 @@ extern void xt_proto_fini(struct net *ne
>
> extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
> extern void xt_free_table_info(struct xt_table_info *info);
> +extern void xt_zero_table_entries(struct xt_table_info *info);
> +extern void xt_swap_table_entries(struct xt_table_info *old,
> + struct xt_table_info *new);
>
> #ifdef CONFIG_COMPAT
> #include <net/compat.h>
> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-03 15:52:32.047583686 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> mtpar.family = tgpar.family = NFPROTO_IPV4;
> tgpar.hooknum = hook;
>
> - read_lock_bh(&table->lock);
> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> - private = table->private;
> - table_base = (void *)private->entries[smp_processor_id()];
> +
> + rcu_read_lock_bh();
> + private = rcu_dereference(table->private);
> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> e = get_entry(table_base, private->hook_entry[hook]);
>
> /* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> }
> } while (!hotdrop);
>
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock_bh();
>
> #ifdef DEBUG_ALLOW_ALL
> return NF_ACCEPT;
> @@ -924,13 +926,45 @@ get_counters(const struct xt_table_info
> counters,
> &i);
> }
> +
> +}
> +
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> + const struct xt_counters addme[],
> + unsigned int *i)
> +{
> + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> + (*i)++;
> + return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> + const struct xt_counters counters[])
> +{
> + unsigned int i, cpu;
> +
> + local_bh_disable();
> + cpu = smp_processor_id();
> + i = 0;
> + IPT_ENTRY_ITERATE(t->entries[cpu],
> + t->size,
> + add_counter_to_entry,
> + counters,
> + &i);
> + local_bh_enable();
> }
>
> static struct xt_counters * alloc_counters(struct xt_table *table)
> {
> unsigned int countersize;
> struct xt_counters *counters;
> - const struct xt_table_info *private = table->private;
> + struct xt_table_info *private = table->private;
> + struct xt_table_info *tmp;
>
> /* We need atomic snapshot of counters: rest doesn't change
> (other than comefrom, which userspace doesn't care
> @@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
> counters = vmalloc_node(countersize, numa_node_id());
>
> if (counters == NULL)
> - return ERR_PTR(-ENOMEM);
> + goto nomem;
> +
> + tmp = xt_alloc_table_info(private->size);
> + if (!tmp)
> + goto free_counters;
> +
> + xt_zero_table_entries(tmp);
This is not correct. We must copy rules and zero counters on the copied stuff.
> +
> + mutex_lock(&table->lock);
> + xt_swap_table_entries(private, tmp);
> + synchronize_net(); /* Wait until smoke has cleared */
>
> - /* First, sum counters... */
> - write_lock_bh(&table->lock);
> - get_counters(private, counters);
> - write_unlock_bh(&table->lock);
> + get_counters(tmp, counters);
Yes, tmp now holds the previous pointers
> + put_counters(private, counters);
> + mutex_unlock(&table->lock);
> +
> + xt_free_table_info(tmp);
>
> return counters;
> +
> + free_counters:
> + vfree(counters);
> + nomem:
> + return ERR_PTR(-ENOMEM);
> }
>
> static int
> @@ -1312,27 +1362,6 @@ do_replace(struct net *net, void __user
> return ret;
> }
>
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> - const struct xt_counters addme[],
> - unsigned int *i)
> -{
> -#if 0
> - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
> - *i,
> - (long unsigned int)e->counters.pcnt,
> - (long unsigned int)e->counters.bcnt,
> - (long unsigned int)addme[*i].pcnt,
> - (long unsigned int)addme[*i].bcnt);
> -#endif
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> - (*i)++;
> - return 0;
> -}
>
> static int
> do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> + mutex_lock(&t->lock);
> private = t->private;
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> }
>
> + preempt_disable();
> i = 0;
> /* Choose the copy that is on our node */
> loc_cpu_entry = private->entries[raw_smp_processor_id()];
> @@ -1408,8 +1438,9 @@ do_add_counters(struct net *net, void __
> add_counter_to_entry,
> paddc,
> &i);
> + preempt_enable();
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + mutex_unlock(&t->lock);
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --- a/net/netfilter/x_tables.c 2009-02-02 15:06:29.708249745 -0800
> +++ b/net/netfilter/x_tables.c 2009-02-03 15:44:21.743663216 -0800
> @@ -611,18 +611,61 @@ struct xt_table_info *xt_alloc_table_inf
> }
> EXPORT_SYMBOL(xt_alloc_table_info);
>
> -void xt_free_table_info(struct xt_table_info *info)
> +/* Zero out entries */
> +void xt_zero_table_entries(struct xt_table_info *info)
> {
> - int cpu;
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu)
> + memset(info->entries[cpu], 0, info->size);
> +}
> +EXPORT_SYMBOL_GPL(xt_zero_table_entries);
Hum, you forgot entries[cpu] points to quite complex data set,
with iptables rules, containing counters...
Only counters must be zeroed, one by one.
You thus need an ITERATE invocation...
I won't be able to make the incremental patch (too busy @work at this moment),
I am sorry :(
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists