lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <49889440.60702@cosmosbay.com>
Date:	Tue, 03 Feb 2009 20:00:16 +0100
From:	Eric Dumazet <dada1@...mosbay.com>
To:	Stephen Hemminger <shemminger@...tta.com>
CC:	David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [PATCH 3/3] iptables: lock free counters (alternate version)

Stephen Hemminger a écrit :
> This is an alternative to earlier RCU/seqcount_t version of counters.
> The counters operate as usual without locking, but when counters are rotated
> around the CPU's entries.  RCU is used in two ways, first to handle the
> counter rotation, second for replace.
> 

Is it a working patch or just a prototype ?

> Signed-off-by: Stephen Hemminger <shemminger@...tta.com>
> 
> ---
>  include/linux/netfilter/x_tables.h |   10 +++-
>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
>  5 files changed, 197 insertions(+), 72 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
> @@ -353,7 +353,7 @@ struct xt_table
>  	unsigned int valid_hooks;
>  
>  	/* Lock for the curtain */
> -	rwlock_t lock;
> +	struct mutex lock;
>  
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>  	unsigned int underflow[NF_INET_NUMHOOKS];
>  
> +	/* For the dustman... */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> -	char *entries[1];
> +	void *entries[1];
>  };
>  
>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>  
> -	read_lock_bh(&table->lock);
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> -	table_base = (void *)private->entries[smp_processor_id()];
> +
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(table->private);
> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
>  	/* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>  		}
>  	} while (!hotdrop);
>  
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
>  	return 0;
>  }
>  
> +static inline int
> +set_counter_to_entry(struct ipt_entry *e,
> +		     const struct ipt_counters total[],
> +		     unsigned int *i)
> +{
> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
> +
>  static void
> -get_counters(const struct xt_table_info *t,
> +get_counters(struct xt_table_info *t,
>  	     struct xt_counters counters[])
>  {
>  	unsigned int cpu;
>  	unsigned int i;
>  	unsigned int curcpu;
> +	struct ipt_entry *e;
>  
> -	/* Instead of clearing (by a previous call to memset())
> -	 * the counters and using adds, we set the counters
> -	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
> -	 */
> +	preempt_disable();
>  	curcpu = raw_smp_processor_id();
> -
> +	e = t->entries[curcpu];
>  	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> +	IPT_ENTRY_ITERATE(e,
>  			  t->size,
>  			  set_entry_to_counter,

Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
set_entry_to_counter() might get garbage.
I suppose I already mentioned it :)

>  			  counters,
>  			  &i);
>  
>  	for_each_possible_cpu(cpu) {
> +		void *p;
> +
>  		if (cpu == curcpu)
>  			continue;
> +
> +		/* Swizzle the values and wait */
> +		e->counters = ((struct xt_counters) { 0, 0 });

I dont see what you want to do here...

e->counters is the counter associated with rule #0

> +		p = t->entries[cpu];
> +		rcu_assign_pointer(t->entries[cpu], e);
> +		synchronize_net();


Oh well, not this synchronize_net() :)

This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
on big machines (NR_CPUS >= 64)

What problem do we want to solve here ?


Current iptables is able to do an atomic snapshot because of the rwlock.

If we really want to keep this feature (but get rid of rwlock), we might do the reverse
with two seqlocks + RCU

One seqlock (seqlock_counters) to protect counter updates
One seqlock (seqlock_rules) to protect rules changes

Ie :

ipt_do_table() doing :
{
rcu_read_lock 
read_seqbegin(&table->seqlock_rules);
rcu fetch priv table pointer and work on it
 do {
 for all counters updates, use 
 do {
   seq = read_seqbegin(table->seqlock_counters);
   update counters
   }
 } while (!hotdrop);
rcu_read_unlock()
}

for get_counter() (iptables -L)
writer doing a write_seqlock(&table->seqlock_counters), waiting one RCU grace period, 
{
get / sum all counters (no updates of counters are allowed)
}
write_sequnlock();


for iptables_update/replace (iptables -A|I|D|R|Z|F...)
writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period, 
{
change rules/counters
}
write_sequnlock();


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ