[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <1267192638.9082.9.camel@edumazet-laptop>
Date: Fri, 26 Feb 2010 14:57:18 +0100
From: Eric Dumazet <eric.dumazet@...il.com>
To: Simon Horman <horms@...ge.net.au>
Cc: netdev@...r.kernel.org, lvs-devel@...r.kernel.org,
Wensong Zhang <wensong@...ux-vs.org>,
Julian Anastasov <ja@....bg>,
Patrick McHardy <kaber@...sh.net>,
"David S. Miller" <davem@...emloft.net>
Subject: Re: [RFC] IPVS: Convert connection table lock over to RCU
Le vendredi 26 février 2010 à 14:00 +1100, Simon Horman a écrit :
> Signed-off-by: Simon Horman <horms@...ge.net.au>
>
> ---
>
> This seems to be a fairly clean conversion to me. But its my journey
> into the world of RCU, so I would appreciate a careful review.
>
> I have deliberately introduced some noise into this patch
> in the form of changing the name of some global variables and functions.
> This is in order to clearly highlight changes at the call-sites.
>
> The table of 16 locks (4 bits) used for the connection table seems
> to be somewhat arbitrary to me, this patch intentionally leaves
> that as is.
>
> Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c
> ===================================================================
> --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c 2010-02-26 10:42:16.000000000 +1100
> +++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c 2010-02-26 10:52:32.000000000 +1100
> @@ -35,6 +35,8 @@
> #include <linux/seq_file.h>
> #include <linux/jhash.h>
> #include <linux/random.h>
> +#include <linux/spinlock.h>
> +#include <linux/rculist.h>
>
> #include <net/net_namespace.h>
> #include <net/ip_vs.h>
> @@ -75,57 +77,37 @@ static unsigned int ip_vs_conn_rnd;
> /*
> * Fine locking granularity for big connection hash table
> */
> -#define CT_LOCKARRAY_BITS 4
> -#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
> -#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
> +#define CT_MUTEX_BITS 4
> +#define CT_MUTEX_SIZE (1<<CT_MUTEX_BITS)
> +#define CT_MUTEX_MASK (CT_MUTEX_SIZE-1)
>
> -struct ip_vs_aligned_lock
> +struct ip_vs_aligned_spinlock
> {
> - rwlock_t l;
> + spinlock_t l;
> } __attribute__((__aligned__(SMP_CACHE_BYTES)));
>
> -/* lock array for conn table */
> -static struct ip_vs_aligned_lock
> -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
> +/* mutex array for connection table */
> +static struct ip_vs_aligned_spinlock
> +__ip_vs_conntbl_mutex[CT_MUTEX_SIZE] __cacheline_aligned;
>
> -static inline void ct_read_lock(unsigned key)
> +static inline void ct_mutex_lock(unsigned key)
> {
> - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> + spin_lock(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
> }
>
> -static inline void ct_read_unlock(unsigned key)
> +static inline void ct_mutex_unlock(unsigned key)
> {
> - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> + spin_unlock(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
> }
>
> -static inline void ct_write_lock(unsigned key)
> +static inline void ct_mutex_lock_bh(unsigned key)
> {
> - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> + spin_lock_bh(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
> }
>
> -static inline void ct_write_unlock(unsigned key)
> +static inline void ct_mutex_unlock_bh(unsigned key)
> {
> - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_read_lock_bh(unsigned key)
> -{
> - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_read_unlock_bh(unsigned key)
> -{
> - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_write_lock_bh(unsigned key)
> -{
> - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_write_unlock_bh(unsigned key)
> -{
> - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> + spin_unlock_bh(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
> }
>
>
> @@ -155,27 +137,27 @@ static unsigned int ip_vs_conn_hashkey(i
> static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
> {
> unsigned hash;
> - int ret;
>
> /* Hash by protocol, client address and port */
> hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
>
> - ct_write_lock(hash);
> + ct_mutex_lock(hash);
>
> if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
> - list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
> + list_add_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
> cp->flags |= IP_VS_CONN_F_HASHED;
> atomic_inc(&cp->refcnt);
> - ret = 1;
> - } else {
> - pr_err("%s(): request for already hashed, called from %pF\n",
> - __func__, __builtin_return_address(0));
> - ret = 0;
> + ct_mutex_unlock(hash);
> + synchronize_rcu();
Why is synchronize_rcu() necessary here ?
When adding a new item in a list, you dont need any rcu grace period.
> + return 1;
> }
>
> - ct_write_unlock(hash);
> + ct_mutex_unlock(hash);
>
> - return ret;
> + pr_err("%s(): request for already hashed, called from %pF\n",
> + __func__, __builtin_return_address(0));
> +
> + return 0;
> }
>
>
> @@ -186,24 +168,24 @@ static inline int ip_vs_conn_hash(struct
> static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
> {
> unsigned hash;
> - int ret;
>
> /* unhash it and decrease its reference counter */
> hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
>
> - ct_write_lock(hash);
> + ct_mutex_lock(hash);
>
> if (cp->flags & IP_VS_CONN_F_HASHED) {
> - list_del(&cp->c_list);
> + list_del_rcu(&cp->c_list);
> cp->flags &= ~IP_VS_CONN_F_HASHED;
> atomic_dec(&cp->refcnt);
> - ret = 1;
> - } else
> - ret = 0;
> + ct_mutex_unlock(hash);
> + synchronize_rcu();
Are you sure we can afford a synchronize_rcu() call here ?
This is a very long primitive, and I bet this is not acceptable for IPVS
use case.
> + return 1;
> + }
>
> - ct_write_unlock(hash);
> + ct_mutex_unlock(hash);
>
> - return ret;
> + return 0;
> }
>
>
> @@ -222,9 +204,9 @@ static inline struct ip_vs_conn *__ip_vs
>
> hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
>
> - ct_read_lock(hash);
> + rcu_read_lock();
>
> - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> + list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> if (cp->af == af &&
> ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
> ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
> @@ -233,12 +215,12 @@ static inline struct ip_vs_conn *__ip_vs
> protocol == cp->protocol) {
> /* HIT */
> atomic_inc(&cp->refcnt);
> - ct_read_unlock(hash);
> + rcu_read_unlock();
> return cp;
> }
> }
>
> - ct_read_unlock(hash);
> + rcu_read_unlock();
>
> return NULL;
> }
> @@ -273,9 +255,9 @@ struct ip_vs_conn *ip_vs_ct_in_get
>
> hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
>
> - ct_read_lock(hash);
> + rcu_read_lock();
>
> - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> + list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> if (cp->af == af &&
> ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
> /* protocol should only be IPPROTO_IP if
> @@ -293,7 +275,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
> cp = NULL;
>
> out:
> - ct_read_unlock(hash);
> + rcu_read_unlock();
>
> IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
> ip_vs_proto_name(protocol),
> @@ -322,9 +304,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
> */
> hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
>
> - ct_read_lock(hash);
> + rcu_read_lock();
>
> - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> + list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
> if (cp->af == af &&
> ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
> ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
> @@ -337,7 +319,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
> }
> }
>
> - ct_read_unlock(hash);
> + rcu_read_unlock();
>
> IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
> ip_vs_proto_name(protocol),
> @@ -776,14 +758,16 @@ static void *ip_vs_conn_array(struct seq
> struct ip_vs_conn *cp;
>
> for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> - ct_read_lock_bh(idx);
> - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> + rcu_read_lock_bh();
> + list_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
> if (pos-- == 0) {
> seq->private = &ip_vs_conn_tab[idx];
> + /* N.B: no rcu_read_unlock_bh() here
> + * Seems really horrible :-( */
> return cp;
> }
> }
> - ct_read_unlock_bh(idx);
> + rcu_read_unlock_bh();
> }
>
> return NULL;
> @@ -807,19 +791,22 @@ static void *ip_vs_conn_seq_next(struct
>
> /* more on same hash chain? */
> if ((e = cp->c_list.next) != l)
> - return list_entry(e, struct ip_vs_conn, c_list);
> + return list_entry_rcu(e, struct ip_vs_conn, c_list);
>
> idx = l - ip_vs_conn_tab;
> - ct_read_unlock_bh(idx);
> + rcu_read_unlock_bh();
>
> while (++idx < ip_vs_conn_tab_size) {
> - ct_read_lock_bh(idx);
> - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> + rcu_read_lock_bh();
> + list_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
> seq->private = &ip_vs_conn_tab[idx];
> + /* N.B: no rcu_read_unlock_bh() here
> + * Seems really horrible :-( */
... if you add a comment, please write why you need to keep rcu locked
... or dont add a comment, since this construct is quite common.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists