[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20080911212459.GL6693@linux.vnet.ibm.com>
Date: Thu, 11 Sep 2008 14:24:59 -0700
From: "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
To: Herbert Xu <herbert@...dor.apana.org.au>
Cc: David Miller <davem@...emloft.net>, timo.teras@....fi,
netdev@...r.kernel.org
Subject: Re: xfrm_state locking regression...
On Wed, Sep 10, 2008 at 12:33:12AM +1000, Herbert Xu wrote:
> On Mon, Sep 08, 2008 at 05:25:13PM -0700, David Miller wrote:
> >
> > The only comment I would make is that maybe it's a bit excessive
> > to trigger the GC worker every time we walk the states.
>
> Good point!
>
> I've avoided the memory barrier by simply extending the mutexed
> section in the GC to cover the list splicing. Here's the updated
> patch:
>
> ipsec: Use RCU-like construct for saved state within a walk
>
> Now that we save states within a walk we need synchronisation
> so that the list the saved state is on doesn't disappear from
> under us.
>
> As it stands this is done by keeping the state on the list which
> is bad because it gets in the way of the management of the state
> life-cycle.
>
> An alternative is to make our own pseudo-RCU system where we use
> counters to indicate which state can't be freed immediately as
> it may be referenced by an ongoing walk when that resumes.
There is only one reader at a time, right? Otherwise, I don't see how
the increments and reads of xfrm_state_walk_completed line up.
Thanx, Paul
> Signed-off-by: Herbert Xu <herbert@...dor.apana.org.au>
>
> diff --git a/include/net/xfrm.h b/include/net/xfrm.h
> index 2933d74..4bb9499 100644
> --- a/include/net/xfrm.h
> +++ b/include/net/xfrm.h
> @@ -120,9 +120,11 @@ extern struct mutex xfrm_cfg_mutex;
> /* Full description of state of transformer. */
> struct xfrm_state
> {
> - /* Note: bydst is re-used during gc */
> struct list_head all;
> - struct hlist_node bydst;
> + union {
> + struct list_head gclist;
> + struct hlist_node bydst;
> + };
> struct hlist_node bysrc;
> struct hlist_node byspi;
>
> @@ -1286,16 +1288,9 @@ static inline void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto)
> walk->count = 0;
> }
>
> -static inline void xfrm_state_walk_done(struct xfrm_state_walk *walk)
> -{
> - if (walk->state != NULL) {
> - xfrm_state_put(walk->state);
> - walk->state = NULL;
> - }
> -}
> -
> extern int xfrm_state_walk(struct xfrm_state_walk *walk,
> int (*func)(struct xfrm_state *, int, void*), void *);
> +extern void xfrm_state_walk_done(struct xfrm_state_walk *walk);
> extern struct xfrm_state *xfrm_state_alloc(void);
> extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
> struct flowi *fl, struct xfrm_tmpl *tmpl,
> diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
> index 7bd62f6..d90f936 100644
> --- a/net/xfrm/xfrm_state.c
> +++ b/net/xfrm/xfrm_state.c
> @@ -59,6 +59,11 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
> static unsigned int xfrm_state_num;
> static unsigned int xfrm_state_genid;
>
> +/* Counter indicating ongoing walk, protected by xfrm_state_lock. */
> +static unsigned long xfrm_state_walk_ongoing;
> +/* Counter indicating walk completion, protected by xfrm_cfg_mutex. */
> +static unsigned long xfrm_state_walk_completed;
> +
> static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
> static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
>
> @@ -191,7 +196,8 @@ static DEFINE_RWLOCK(xfrm_state_afinfo_lock);
> static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
>
> static struct work_struct xfrm_state_gc_work;
> -static HLIST_HEAD(xfrm_state_gc_list);
> +static LIST_HEAD(xfrm_state_gc_leftovers);
> +static LIST_HEAD(xfrm_state_gc_list);
> static DEFINE_SPINLOCK(xfrm_state_gc_lock);
>
> int __xfrm_state_delete(struct xfrm_state *x);
> @@ -403,17 +409,22 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
>
> static void xfrm_state_gc_task(struct work_struct *data)
> {
> - struct xfrm_state *x;
> - struct hlist_node *entry, *tmp;
> - struct hlist_head gc_list;
> + struct xfrm_state *x, *tmp;
> + unsigned long completed;
>
> + mutex_lock(&xfrm_cfg_mutex);
> spin_lock_bh(&xfrm_state_gc_lock);
> - gc_list.first = xfrm_state_gc_list.first;
> - INIT_HLIST_HEAD(&xfrm_state_gc_list);
> + list_splice_tail_init(&xfrm_state_gc_list, &xfrm_state_gc_leftovers);
> spin_unlock_bh(&xfrm_state_gc_lock);
>
> - hlist_for_each_entry_safe(x, entry, tmp, &gc_list, bydst)
> + completed = xfrm_state_walk_completed;
> + mutex_unlock(&xfrm_cfg_mutex);
> +
> + list_for_each_entry_safe(x, tmp, &xfrm_state_gc_leftovers, gclist) {
> + if ((long)(x->lastused - completed) > 0)
> + break;
> xfrm_state_gc_destroy(x);
> + }
>
> wake_up(&km_waitq);
> }
> @@ -540,12 +551,8 @@ void __xfrm_state_destroy(struct xfrm_state *x)
> {
> WARN_ON(x->km.state != XFRM_STATE_DEAD);
>
> - spin_lock_bh(&xfrm_state_lock);
> - list_del(&x->all);
> - spin_unlock_bh(&xfrm_state_lock);
> -
> spin_lock_bh(&xfrm_state_gc_lock);
> - hlist_add_head(&x->bydst, &xfrm_state_gc_list);
> + list_add_tail(&x->gclist, &xfrm_state_gc_list);
> spin_unlock_bh(&xfrm_state_gc_lock);
> schedule_work(&xfrm_state_gc_work);
> }
> @@ -558,6 +565,8 @@ int __xfrm_state_delete(struct xfrm_state *x)
> if (x->km.state != XFRM_STATE_DEAD) {
> x->km.state = XFRM_STATE_DEAD;
> spin_lock(&xfrm_state_lock);
> + x->lastused = xfrm_state_walk_ongoing;
> + list_del_rcu(&x->all);
> hlist_del(&x->bydst);
> hlist_del(&x->bysrc);
> if (x->id.spi)
> @@ -1572,6 +1581,7 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
> if (err) {
> xfrm_state_hold(last);
> walk->state = last;
> + xfrm_state_walk_ongoing++;
> goto out;
> }
> }
> @@ -1586,12 +1596,28 @@ int xfrm_state_walk(struct xfrm_state_walk *walk,
> err = func(last, 0, data);
> out:
> spin_unlock_bh(&xfrm_state_lock);
> - if (old != NULL)
> + if (old != NULL) {
> xfrm_state_put(old);
> + xfrm_state_walk_completed++;
> + if (!list_empty(&xfrm_state_gc_leftovers))
> + schedule_work(&xfrm_state_gc_work);
> + }
> return err;
> }
> EXPORT_SYMBOL(xfrm_state_walk);
>
> +void xfrm_state_walk_done(struct xfrm_state_walk *walk)
> +{
> + if (walk->state != NULL) {
> + xfrm_state_put(walk->state);
> + walk->state = NULL;
> + xfrm_state_walk_completed++;
> + if (!list_empty(&xfrm_state_gc_leftovers))
> + schedule_work(&xfrm_state_gc_work);
> + }
> +}
> +EXPORT_SYMBOL(xfrm_state_walk_done);
> +
>
> void xfrm_replay_notify(struct xfrm_state *x, int event)
> {
>
> Thanks,
> --
> Visit Openswan at http://www.openswan.org/
> Email: Herbert Xu ~{PmV>HI~} <herbert@...dor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists