[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1238487448.28248.1805.camel@twins>
Date:	Tue, 31 Mar 2009 10:17:28 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	"Eric W. Biederman" <ebiederm@...ssion.com>
Cc:	Andrew Morton <akpm@...ux-foundation.org>,
	Ingo Molnar <mingo@...e.hu>, Jan Beulich <jbeulich@...ell.com>,
	tglx@...utronix.de, mingo@...hat.com, hpa@...or.com,
	linux-kernel@...r.kernel.org, Gautham R Shenoy <ego@...ibm.com>,
	Alexey Dobriyan <adobriyan@...il.com>, netdev@...r.kernel.org
Subject: Re: [PATCH 2/2] sysctl:  lockdep support for sysctl reference
 counting.
On Sat, 2009-03-21 at 00:42 -0700, Eric W. Biederman wrote:
> It is possible for get lock ordering deadlocks between locks
> and waiting for the sysctl used count to drop to zero.  We have
> recently observed one of these in the networking code.
> 
> So teach the sysctl code how to speak lockdep so the kernel
> can warn about these kinds of rare issues proactively.
It would be very good to extend this changelog with a more detailed
explanation of the deadlock in question.
Let me see if I got it right:
We're holding a lock, while waiting for the refcount to drop to 0.
Dropping that refcount is blocked on that lock.
Something like that?
> Signed-off-by: Eric Biederman <ebiederm@...stanetworks.com>
> ---
>  include/linux/sysctl.h |    4 ++
>  kernel/sysctl.c        |  108 ++++++++++++++++++++++++++++++++++++++---------
>  2 files changed, 91 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 39d471d..ec9b1dd 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -28,6 +28,7 @@
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/compiler.h>
> +#include <linux/lockdep.h>
>  
>  struct file;
>  struct completion;
> @@ -1087,6 +1088,9 @@ struct ctl_table_header
>  	struct ctl_table *attached_by;
>  	struct ctl_table *attached_to;
>  	struct ctl_table_header *parent;
> +#ifdef CONFIG_PROVE_LOCKING
> +	struct lockdep_map dep_map;
> +#endif
>  };
>  
>  /* struct ctl_path describes where in the hierarchy a table is added */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index c5ef44f..ea8cc39 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1454,12 +1454,63 @@ static struct ctl_table dev_table[] = {
>  
>  static DEFINE_SPINLOCK(sysctl_lock);
>  
> +#ifndef CONFIG_PROVE_LOCKING
> +
> +# define lock_sysctl() spin_lock(&sysctl_lock)
> +# define unlock_sysctl() spin_unlock(&sysctl_lock)
> +
> +static inline void table_acquire_use(struct ctl_table_header *hdr) { }
> +static inline void table_release_use(struct ctl_table_header *hdr) { }
> +static inline void table_acquire(struct ctl_table_header *hdr) { }
> +static inline void table_contended(struct ctl_table_header *hdr) { }
> +static inline void table_acquired(struct ctl_table_header *hdr) { }
> +static inline void table_release(struct ctl_table_header *hdr) { }
> +
> +#else	/* CONFIG_PROVE_LOCKING */
> +
> +#  define lock_sysctl() __raw_spin_lock(&sysctl_lock.raw_lock)
> +#  define unlock_sysctl() __raw_spin_unlock(&sysctl_lock.raw_lock)
Uhmm, Please explain that -- without a proper explanation this is a NAK.
> +static inline void table_acquire_use(struct ctl_table_header *hdr)
> +{
> +	lock_acquire(&hdr->dep_map, 0, 0, 1, 2, NULL, _RET_IP_);
> +	lock_acquired(&hdr->dep_map, _RET_IP_);
> +}
> +
> +static inline void table_release_use(struct ctl_table_header *hdr)
> +{
> +	lock_release(&hdr->dep_map, 0, _RET_IP_);
> +}
> +
> +static inline void table_acquire(struct ctl_table_header *hdr)
> +{
> +	lock_acquire(&hdr->dep_map, 0, 0, 0, 2, NULL, _RET_IP_);
> +}
> +
> +static inline void table_contended(struct ctl_table_header *hdr)
> +{
> +	lock_contended(&hdr->dep_map, _RET_IP_);
> +}
> +
> +static inline void table_acquired(struct ctl_table_header *hdr)
> +{
> +	lock_acquired(&hdr->dep_map, _RET_IP_);
> +}
> +
> +static inline void table_release(struct ctl_table_header *hdr)
> +{
> +	lock_release(&hdr->dep_map, 0, _RET_IP_);
> +}
> +
> +#endif /* CONFIG_PROVE_LOCKING */
> +
>  /* called under sysctl_lock */
>  static int use_table(struct ctl_table_header *p)
>  {
>  	if (unlikely(p->unregistering))
>  		return 0;
>  	p->used++;
> +	table_acquire_use(p);
>  	return 1;
>  }
>  
> @@ -1469,6 +1520,8 @@ static void unuse_table(struct ctl_table_header *p)
>  	if (!--p->used)
>  		if (unlikely(p->unregistering))
>  			complete(p->unregistering);
> +
> +	table_release_use(p);
>  }
>  
>  /* called under sysctl_lock, will reacquire if has to wait */
> @@ -1478,47 +1531,54 @@ static void start_unregistering(struct ctl_table_header *p)
>  	 * if p->used is 0, nobody will ever touch that entry again;
>  	 * we'll eliminate all paths to it before dropping sysctl_lock
>  	 */
> +	table_acquire(p);
>  	if (unlikely(p->used)) {
>  		struct completion wait;
> +		table_contended(p);
> +
>  		init_completion(&wait);
>  		p->unregistering = &wait;
> -		spin_unlock(&sysctl_lock);
> +		unlock_sysctl();
>  		wait_for_completion(&wait);
> -		spin_lock(&sysctl_lock);
> +		lock_sysctl();
>  	} else {
>  		/* anything non-NULL; we'll never dereference it */
>  		p->unregistering = ERR_PTR(-EINVAL);
>  	}
> +	table_acquired(p);
> +
>  	/*
>  	 * do not remove from the list until nobody holds it; walking the
>  	 * list in do_sysctl() relies on that.
>  	 */
>  	list_del_init(&p->ctl_entry);
> +
> +	table_release(p);
>  }
>  
> @@ -1951,7 +2011,13 @@ struct ctl_table_header *__register_sysctl_paths(
>  		return NULL;
>  	}
>  #endif
> -	spin_lock(&sysctl_lock);
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	{
> +		static struct lock_class_key __key;
> +		lockdep_init_map(&header->dep_map, "sysctl_used", &__key, 0);
> +	}
> +#endif	
This means every sysctl thingy gets the same class, is that
intended/desired?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists
 
