[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d358cb91-aa72-5fbe-47e0-b7b978ac65b5@yandex-team.ru>
Date: Sat, 18 Feb 2017 21:55:28 +0300
From: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
To: linux-kernel@...r.kernel.org, Al Viro <viro@...iv.linux.org.uk>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
"Eric W. Biederman" <ebiederm@...ssion.com>
Subject: Re: [PATCH] proc/sysctl: prune stale dentries during unregistering
This patch has locking problem. I've got lockdep splat under LTP.
[ 6633.115456] ======================================================
[ 6633.115502] [ INFO: possible circular locking dependency detected ]
[ 6633.115553] 4.9.10-debug+ #9 Tainted: G L
[ 6633.115584] -------------------------------------------------------
[ 6633.115627] ksm02/284980 is trying to acquire lock:
[ 6633.115659] (&sb->s_type->i_lock_key#4){+.+...}, at: [<ffffffff816bc1ce>] igrab+0x1e/0x80
[ 6633.115834] but task is already holding lock:
[ 6633.115882] (sysctl_lock){+.+...}, at: [<ffffffff817e379b>] unregister_sysctl_table+0x6b/0x110
[ 6633.116026] which lock already depends on the new lock.
[ 6633.116026]
[ 6633.116080]
[ 6633.116080] the existing dependency chain (in reverse order) is:
[ 6633.116117]
-> #2 (sysctl_lock){+.+...}:
-> #1 (&(&dentry->d_lockref.lock)->rlock){+.+...}:
-> #0 (&sb->s_type->i_lock_key#4){+.+...}:
d_lock nests inside i_lock
sysctl_lock nests inside d_lock in d_compare
This patch adds i_lock nesting inside sysctl_lock.
On 10.02.2017 10:35, Konstantin Khlebnikov wrote:
> Currently unregistering sysctl table does not prune its dentries.
> Stale dentries could slowdown sysctl operations significantly.
>
> For example, command:
>
> # for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done
>
> creates a millions of stale denties around sysctls of loopback interface:
>
> # sysctl fs.dentry-state
> fs.dentry-state = 25812579 24724135 45 0 0 0
>
> All of them have matching names thus lookup have to scan though whole
> hash chain and call d_compare (proc_sys_compare) which checks them
> under system-wide spinlock (sysctl_lock).
>
> # time sysctl -a > /dev/null
> real 1m12.806s
> user 0m0.016s
> sys 1m12.400s
>
> Currently only memory reclaimer could remove this garbage.
> But without significant memory pressure this never happens.
>
> This patch collects sysctl inodes into list on sysctl table header and
> prunes all their dentries once that table unregisters.
>
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
> Suggested-by: Al Viro <viro@...iv.linux.org.uk>
> ---
> fs/proc/inode.c | 3 ++
> fs/proc/internal.h | 7 ++++--
> fs/proc/proc_sysctl.c | 59 +++++++++++++++++++++++++++++++++++-------------
> include/linux/sysctl.h | 1 +
> 4 files changed, 51 insertions(+), 19 deletions(-)
>
> diff --git a/fs/proc/inode.c b/fs/proc/inode.c
> index 842a5ff5b85c..7ad9ed7958af 100644
> --- a/fs/proc/inode.c
> +++ b/fs/proc/inode.c
> @@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
> de = PDE(inode);
> if (de)
> pde_put(de);
> +
> head = PROC_I(inode)->sysctl;
> if (head) {
> RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
> - sysctl_head_put(head);
> + proc_sys_evict_inode(inode, head);
> }
> }
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 2de5194ba378..ed1d762160e6 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -65,6 +65,7 @@ struct proc_inode {
> struct proc_dir_entry *pde;
> struct ctl_table_header *sysctl;
> struct ctl_table *sysctl_entry;
> + struct list_head sysctl_inodes;
> const struct proc_ns_operations *ns_ops;
> struct inode vfs_inode;
> };
> @@ -249,10 +250,12 @@ extern void proc_thread_self_init(void);
> */
> #ifdef CONFIG_PROC_SYSCTL
> extern int proc_sys_init(void);
> -extern void sysctl_head_put(struct ctl_table_header *);
> +extern void proc_sys_evict_inode(struct inode *inode,
> + struct ctl_table_header *head);
> #else
> static inline void proc_sys_init(void) { }
> -static inline void sysctl_head_put(struct ctl_table_header *head) { }
> +static inline void proc_sys_evict_inode(struct inode *inode,
> + struct ctl_table_header *head) { }
> #endif
>
> /*
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index d4e37acd4821..8efb1e10b025 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
> head->set = set;
> head->parent = NULL;
> head->node = node;
> + INIT_LIST_HEAD(&head->inodes);
> if (node) {
> struct ctl_table *entry;
> for (entry = table; entry->procname; entry++, node++)
> @@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table_header *p)
> complete(p->unregistering);
> }
>
> +/* called under sysctl_lock */
> +static void proc_sys_prune_dcache(struct ctl_table_header *head)
> +{
> + struct inode *inode, *prev = NULL;
> + struct proc_inode *ei;
> +
> + list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
> + inode = igrab(&ei->vfs_inode);
> + if (inode) {
> + spin_unlock(&sysctl_lock);
> + iput(prev);
> + prev = inode;
> + d_prune_aliases(inode);
> + spin_lock(&sysctl_lock);
> + }
> + }
> + if (prev) {
> + spin_unlock(&sysctl_lock);
> + iput(prev);
> + spin_lock(&sysctl_lock);
> + }
> +}
> +
> /* called under sysctl_lock, will reacquire if has to wait */
> static void start_unregistering(struct ctl_table_header *p)
> {
> @@ -278,27 +302,17 @@ static void start_unregistering(struct ctl_table_header *p)
> p->unregistering = ERR_PTR(-EINVAL);
> }
> /*
> + * Prune dentries for unregistered sysctls: namespaced sysctls
> + * can have duplicate names and contaminate dcache very badly.
> + */
> + proc_sys_prune_dcache(p);
> + /*
> * do not remove from the list until nobody holds it; walking the
> * list in do_sysctl() relies on that.
> */
> erase_header(p);
> }
>
> -static void sysctl_head_get(struct ctl_table_header *head)
> -{
> - spin_lock(&sysctl_lock);
> - head->count++;
> - spin_unlock(&sysctl_lock);
> -}
> -
> -void sysctl_head_put(struct ctl_table_header *head)
> -{
> - spin_lock(&sysctl_lock);
> - if (!--head->count)
> - kfree_rcu(head, rcu);
> - spin_unlock(&sysctl_lock);
> -}
> -
> static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
> {
> BUG_ON(!head);
> @@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
>
> inode->i_ino = get_next_ino();
>
> - sysctl_head_get(head);
> ei = PROC_I(inode);
> ei->sysctl = head;
> ei->sysctl_entry = table;
>
> + spin_lock(&sysctl_lock);
> + list_add(&ei->sysctl_inodes, &head->inodes);
> + head->count++;
> + spin_unlock(&sysctl_lock);
> +
> inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
> inode->i_mode = table->mode;
> if (!S_ISDIR(table->mode)) {
> @@ -466,6 +484,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
> return inode;
> }
>
> +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
> +{
> + spin_lock(&sysctl_lock);
> + list_del(&PROC_I(inode)->sysctl_inodes);
> + if (!--head->count)
> + kfree_rcu(head, rcu);
> + spin_unlock(&sysctl_lock);
> +}
> +
> static struct ctl_table_header *grab_header(struct inode *inode)
> {
> struct ctl_table_header *head = PROC_I(inode)->sysctl;
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index adf4e51cf597..b7e82049fec7 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -143,6 +143,7 @@ struct ctl_table_header
> struct ctl_table_set *set;
> struct ctl_dir *parent;
> struct ctl_node *node;
> + struct list_head inodes; /* head for proc_inode->sysctl_inodes */
> };
>
> struct ctl_dir {
>
Powered by blists - more mailing lists