lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <1267192638.9082.9.camel@edumazet-laptop>
Date:	Fri, 26 Feb 2010 14:57:18 +0100
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Simon Horman <horms@...ge.net.au>
Cc:	netdev@...r.kernel.org, lvs-devel@...r.kernel.org,
	Wensong Zhang <wensong@...ux-vs.org>,
	Julian Anastasov <ja@....bg>,
	Patrick McHardy <kaber@...sh.net>,
	"David S. Miller" <davem@...emloft.net>
Subject: Re: [RFC] IPVS: Convert connection table lock over to RCU

Le vendredi 26 février 2010 à 14:00 +1100, Simon Horman a écrit :
> Signed-off-by: Simon Horman <horms@...ge.net.au>
> 
> --- 
> 
> This seems to be a fairly clean conversion to me. But it's my journey
> into the world of RCU, so I would appreciate a careful review.
> 
> I have deliberately introduced some noise into this patch
> in the form of changing the name of some global variables and functions.
> This is in order to clearly highlight changes at the call-sites.
> 
> The table of 16 locks (4 bits) used for the connection table seems
> to be somewhat arbitrary to me, this patch intentionally leaves
> that as is.
> 
> Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c
> ===================================================================
> --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c	2010-02-26 10:42:16.000000000 +1100
> +++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c	2010-02-26 10:52:32.000000000 +1100
> @@ -35,6 +35,8 @@
>  #include <linux/seq_file.h>
>  #include <linux/jhash.h>
>  #include <linux/random.h>
> +#include <linux/spinlock.h>
> +#include <linux/rculist.h>
>  
>  #include <net/net_namespace.h>
>  #include <net/ip_vs.h>
> @@ -75,57 +77,37 @@ static unsigned int ip_vs_conn_rnd;
>  /*
>   *  Fine locking granularity for big connection hash table
>   */
> -#define CT_LOCKARRAY_BITS  4
> -#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
> -#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
> +#define CT_MUTEX_BITS  4
> +#define CT_MUTEX_SIZE  (1<<CT_MUTEX_BITS)
> +#define CT_MUTEX_MASK  (CT_MUTEX_SIZE-1)
>  
> -struct ip_vs_aligned_lock
> +struct ip_vs_aligned_spinlock
>  {
> -	rwlock_t	l;
> +	spinlock_t	l;
>  } __attribute__((__aligned__(SMP_CACHE_BYTES)));
>  
> -/* lock array for conn table */
> -static struct ip_vs_aligned_lock
> -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
> +/* mutex array for connection table */
> +static struct ip_vs_aligned_spinlock
> +__ip_vs_conntbl_mutex[CT_MUTEX_SIZE] __cacheline_aligned;
>  
> -static inline void ct_read_lock(unsigned key)
> +static inline void ct_mutex_lock(unsigned key)
>  {
> -	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +	spin_lock(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> -static inline void ct_read_unlock(unsigned key)
> +static inline void ct_mutex_unlock(unsigned key)
>  {
> -	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +	spin_unlock(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> -static inline void ct_write_lock(unsigned key)
> +static inline void ct_mutex_lock_bh(unsigned key)
>  {
> -	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +	spin_lock_bh(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> -static inline void ct_write_unlock(unsigned key)
> +static inline void ct_mutex_unlock_bh(unsigned key)
>  {
> -	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_read_lock_bh(unsigned key)
> -{
> -	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_read_unlock_bh(unsigned key)
> -{
> -	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_write_lock_bh(unsigned key)
> -{
> -	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_write_unlock_bh(unsigned key)
> -{
> -	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +	spin_unlock_bh(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> 
> @@ -155,27 +137,27 @@ static unsigned int ip_vs_conn_hashkey(i
>  static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
>  {
>  	unsigned hash;
> -	int ret;
>  
>  	/* Hash by protocol, client address and port */
>  	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
>  
> -	ct_write_lock(hash);
> +	ct_mutex_lock(hash);
>  
>  	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
> -		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
> +		list_add_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
>  		cp->flags |= IP_VS_CONN_F_HASHED;
>  		atomic_inc(&cp->refcnt);
> -		ret = 1;
> -	} else {
> -		pr_err("%s(): request for already hashed, called from %pF\n",
> -		       __func__, __builtin_return_address(0));
> -		ret = 0;
> +		ct_mutex_unlock(hash);
> +		synchronize_rcu();

Why is synchronize_rcu() necessary here?

When adding a new item to a list, you don't need any RCU grace period.

> +		return 1;
>  	}
>  
> -	ct_write_unlock(hash);
> +	ct_mutex_unlock(hash);
>  
> -	return ret;
> +	pr_err("%s(): request for already hashed, called from %pF\n",
> +	       __func__, __builtin_return_address(0));
> +
> +	return 0;
>  }
>  
> 
> @@ -186,24 +168,24 @@ static inline int ip_vs_conn_hash(struct
>  static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
>  {
>  	unsigned hash;
> -	int ret;
>  
>  	/* unhash it and decrease its reference counter */
>  	hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
>  
> -	ct_write_lock(hash);
> +	ct_mutex_lock(hash);
>  
>  	if (cp->flags & IP_VS_CONN_F_HASHED) {
> -		list_del(&cp->c_list);
> +		list_del_rcu(&cp->c_list);
>  		cp->flags &= ~IP_VS_CONN_F_HASHED;
>  		atomic_dec(&cp->refcnt);
> -		ret = 1;
> -	} else
> -		ret = 0;
> +		ct_mutex_unlock(hash);
> +		synchronize_rcu();

Are you sure we can afford a synchronize_rcu() call here?

This primitive takes a very long time to complete, and I bet this is not
acceptable for the IPVS use case.

> +		return 1;
> +	}
>  
> -	ct_write_unlock(hash);
> +	ct_mutex_unlock(hash);
>  
> -	return ret;
> +	return 0;
>  }
>  
> 
> @@ -222,9 +204,9 @@ static inline struct ip_vs_conn *__ip_vs
>  
>  	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
>  
> -	ct_read_lock(hash);
> +	rcu_read_lock();
>  
> -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +	list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
>  		if (cp->af == af &&
>  		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
>  		    ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
> @@ -233,12 +215,12 @@ static inline struct ip_vs_conn *__ip_vs
>  		    protocol == cp->protocol) {
>  			/* HIT */
>  			atomic_inc(&cp->refcnt);
> -			ct_read_unlock(hash);
> +			rcu_read_unlock();
>  			return cp;
>  		}
>  	}
>  
> -	ct_read_unlock(hash);
> +	rcu_read_unlock();
>  
>  	return NULL;
>  }
> @@ -273,9 +255,9 @@ struct ip_vs_conn *ip_vs_ct_in_get
>  
>  	hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
>  
> -	ct_read_lock(hash);
> +	rcu_read_lock();
>  
> -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +	list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
>  		if (cp->af == af &&
>  		    ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
>  		    /* protocol should only be IPPROTO_IP if
> @@ -293,7 +275,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
>  	cp = NULL;
>  
>    out:
> -	ct_read_unlock(hash);
> +	rcu_read_unlock();
>  
>  	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
>  		      ip_vs_proto_name(protocol),
> @@ -322,9 +304,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
>  	 */
>  	hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
>  
> -	ct_read_lock(hash);
> +	rcu_read_lock();
>  
> -	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +	list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
>  		if (cp->af == af &&
>  		    ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
>  		    ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
> @@ -337,7 +319,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
>  		}
>  	}
>  
> -	ct_read_unlock(hash);
> +	rcu_read_unlock();
>  
>  	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
>  		      ip_vs_proto_name(protocol),
> @@ -776,14 +758,16 @@ static void *ip_vs_conn_array(struct seq
>  	struct ip_vs_conn *cp;
>  
>  	for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> -		ct_read_lock_bh(idx);
> -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> +		rcu_read_lock_bh();
> +		list_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
>  			if (pos-- == 0) {
>  				seq->private = &ip_vs_conn_tab[idx];
> +				/* N.B: no rcu_read_unlock_bh() here
> +				 *      Seems really horrible :-( */
>  				return cp;
>  			}
>  		}
> -		ct_read_unlock_bh(idx);
> +		rcu_read_unlock_bh();
>  	}
>  
>  	return NULL;
> @@ -807,19 +791,22 @@ static void *ip_vs_conn_seq_next(struct
>  
>  	/* more on same hash chain? */
>  	if ((e = cp->c_list.next) != l)
> -		return list_entry(e, struct ip_vs_conn, c_list);
> +		return list_entry_rcu(e, struct ip_vs_conn, c_list);
>  
>  	idx = l - ip_vs_conn_tab;
> -	ct_read_unlock_bh(idx);
> +	rcu_read_unlock_bh();
>  
>  	while (++idx < ip_vs_conn_tab_size) {
> -		ct_read_lock_bh(idx);
> -		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> +		rcu_read_lock_bh();
> +		list_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
>  			seq->private = &ip_vs_conn_tab[idx];
> +			/* N.B: no rcu_read_unlock_bh() here
> +			 *      Seems really horrible :-( */

... if you add a comment, please explain why you need to keep the RCU read lock held
... or don't add a comment, since this construct is quite common.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ