lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d358cb91-aa72-5fbe-47e0-b7b978ac65b5@yandex-team.ru>
Date:   Sat, 18 Feb 2017 21:55:28 +0300
From:   Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
To:     linux-kernel@...r.kernel.org, Al Viro <viro@...iv.linux.org.uk>
Cc:     Andrew Morton <akpm@...ux-foundation.org>,
        "Eric W. Biederman" <ebiederm@...ssion.com>
Subject: Re: [PATCH] proc/sysctl: prune stale dentries during unregistering

This patch has locking problem. I've got lockdep splat under LTP.

[ 6633.115456] ======================================================
[ 6633.115502] [ INFO: possible circular locking dependency detected ]
[ 6633.115553] 4.9.10-debug+ #9 Tainted: G             L
[ 6633.115584] -------------------------------------------------------
[ 6633.115627] ksm02/284980 is trying to acquire lock:
[ 6633.115659]  (&sb->s_type->i_lock_key#4){+.+...}, at: [<ffffffff816bc1ce>] igrab+0x1e/0x80
[ 6633.115834] but task is already holding lock:
[ 6633.115882]  (sysctl_lock){+.+...}, at: [<ffffffff817e379b>] unregister_sysctl_table+0x6b/0x110
[ 6633.116026] which lock already depends on the new lock.
[ 6633.116026]
[ 6633.116080]
[ 6633.116080] the existing dependency chain (in reverse order) is:
[ 6633.116117]
-> #2 (sysctl_lock){+.+...}:
-> #1 (&(&dentry->d_lockref.lock)->rlock){+.+...}:
-> #0 (&sb->s_type->i_lock_key#4){+.+...}:

d_lock nests inside i_lock
sysctl_lock nests inside d_lock in d_compare

This patch adds i_lock nesting inside sysctl_lock.

On 10.02.2017 10:35, Konstantin Khlebnikov wrote:
> Currently unregistering sysctl table does not prune its dentries.
> Stale dentries could slowdown sysctl operations significantly.
>
> For example, command:
>
> # for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done
>
> creates a millions of stale denties around sysctls of loopback interface:
>
> # sysctl fs.dentry-state
> fs.dentry-state = 25812579  24724135        45      0       0       0
>
> All of them have matching names thus lookup have to scan though whole
> hash chain and call d_compare (proc_sys_compare) which checks them
> under system-wide spinlock (sysctl_lock).
>
> # time sysctl -a > /dev/null
> real    1m12.806s
> user    0m0.016s
> sys     1m12.400s
>
> Currently only memory reclaimer could remove this garbage.
> But without significant memory pressure this never happens.
>
> This patch collects sysctl inodes into list on sysctl table header and
> prunes all their dentries once that table unregisters.
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
> Suggested-by: Al Viro <viro@...iv.linux.org.uk>
> ---
>  fs/proc/inode.c        |    3 ++
>  fs/proc/internal.h     |    7 ++++--
>  fs/proc/proc_sysctl.c  |   59 +++++++++++++++++++++++++++++++++++-------------
>  include/linux/sysctl.h |    1 +
>  4 files changed, 51 insertions(+), 19 deletions(-)
>
> diff --git a/fs/proc/inode.c b/fs/proc/inode.c
> index 842a5ff5b85c..7ad9ed7958af 100644
> --- a/fs/proc/inode.c
> +++ b/fs/proc/inode.c
> @@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
>  	de = PDE(inode);
>  	if (de)
>  		pde_put(de);
> +
>  	head = PROC_I(inode)->sysctl;
>  	if (head) {
>  		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
> -		sysctl_head_put(head);
> +		proc_sys_evict_inode(inode, head);
>  	}
>  }
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 2de5194ba378..ed1d762160e6 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -65,6 +65,7 @@ struct proc_inode {
>  	struct proc_dir_entry *pde;
>  	struct ctl_table_header *sysctl;
>  	struct ctl_table *sysctl_entry;
> +	struct list_head sysctl_inodes;
>  	const struct proc_ns_operations *ns_ops;
>  	struct inode vfs_inode;
>  };
> @@ -249,10 +250,12 @@ extern void proc_thread_self_init(void);
>   */
>  #ifdef CONFIG_PROC_SYSCTL
>  extern int proc_sys_init(void);
> -extern void sysctl_head_put(struct ctl_table_header *);
> +extern void proc_sys_evict_inode(struct inode *inode,
> +				 struct ctl_table_header *head);
>  #else
>  static inline void proc_sys_init(void) { }
> -static inline void sysctl_head_put(struct ctl_table_header *head) { }
> +static inline void proc_sys_evict_inode(struct  inode *inode,
> +					struct ctl_table_header *head) { }
>  #endif
>
>  /*
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index d4e37acd4821..8efb1e10b025 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
>  	head->set = set;
>  	head->parent = NULL;
>  	head->node = node;
> +	INIT_LIST_HEAD(&head->inodes);
>  	if (node) {
>  		struct ctl_table *entry;
>  		for (entry = table; entry->procname; entry++, node++)
> @@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table_header *p)
>  			complete(p->unregistering);
>  }
>
> +/* called under sysctl_lock */
> +static void proc_sys_prune_dcache(struct ctl_table_header *head)
> +{
> +	struct inode *inode, *prev = NULL;
> +	struct proc_inode *ei;
> +
> +	list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
> +		inode = igrab(&ei->vfs_inode);
> +		if (inode) {
> +			spin_unlock(&sysctl_lock);
> +			iput(prev);
> +			prev = inode;
> +			d_prune_aliases(inode);
> +			spin_lock(&sysctl_lock);
> +		}
> +	}
> +	if (prev) {
> +		spin_unlock(&sysctl_lock);
> +		iput(prev);
> +		spin_lock(&sysctl_lock);
> +	}
> +}
> +
>  /* called under sysctl_lock, will reacquire if has to wait */
>  static void start_unregistering(struct ctl_table_header *p)
>  {
> @@ -278,27 +302,17 @@ static void start_unregistering(struct ctl_table_header *p)
>  		p->unregistering = ERR_PTR(-EINVAL);
>  	}
>  	/*
> +	 * Prune dentries for unregistered sysctls: namespaced sysctls
> +	 * can have duplicate names and contaminate dcache very badly.
> +	 */
> +	proc_sys_prune_dcache(p);
> +	/*
>  	 * do not remove from the list until nobody holds it; walking the
>  	 * list in do_sysctl() relies on that.
>  	 */
>  	erase_header(p);
>  }
>
> -static void sysctl_head_get(struct ctl_table_header *head)
> -{
> -	spin_lock(&sysctl_lock);
> -	head->count++;
> -	spin_unlock(&sysctl_lock);
> -}
> -
> -void sysctl_head_put(struct ctl_table_header *head)
> -{
> -	spin_lock(&sysctl_lock);
> -	if (!--head->count)
> -		kfree_rcu(head, rcu);
> -	spin_unlock(&sysctl_lock);
> -}
> -
>  static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
>  {
>  	BUG_ON(!head);
> @@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
>
>  	inode->i_ino = get_next_ino();
>
> -	sysctl_head_get(head);
>  	ei = PROC_I(inode);
>  	ei->sysctl = head;
>  	ei->sysctl_entry = table;
>
> +	spin_lock(&sysctl_lock);
> +	list_add(&ei->sysctl_inodes, &head->inodes);
> +	head->count++;
> +	spin_unlock(&sysctl_lock);
> +
>  	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
>  	inode->i_mode = table->mode;
>  	if (!S_ISDIR(table->mode)) {
> @@ -466,6 +484,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
>  	return inode;
>  }
>
> +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
> +{
> +	spin_lock(&sysctl_lock);
> +	list_del(&PROC_I(inode)->sysctl_inodes);
> +	if (!--head->count)
> +		kfree_rcu(head, rcu);
> +	spin_unlock(&sysctl_lock);
> +}
> +
>  static struct ctl_table_header *grab_header(struct inode *inode)
>  {
>  	struct ctl_table_header *head = PROC_I(inode)->sysctl;
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index adf4e51cf597..b7e82049fec7 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -143,6 +143,7 @@ struct ctl_table_header
>  	struct ctl_table_set *set;
>  	struct ctl_dir *parent;
>  	struct ctl_node *node;
> +	struct list_head inodes; /* head for proc_inode->sysctl_inodes */
>  };
>
>  struct ctl_dir {
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ