linux-kernel - [patch] fs: scale vfsmount refcount (was Re: rcu-walk and dcache scaling tree update and status)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20101213024217.GC6522@amd>
Date:	Mon, 13 Dec 2010 13:42:17 +1100
From:	Nick Piggin <npiggin@...nel.dk>
To:	Nick Piggin <npiggin@...nel.dk>
Cc:	Linus Torvalds <torvalds@...ux-foundation.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Al Viro <viro@...IV.linux.org.uk>,
	Stephen Rothwell <sfr@...b.auug.org.au>,
	linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [patch] fs: scale vfsmount refcount (was Re: rcu-walk and dcache
 scaling tree update and status)

On Mon, Dec 13, 2010 at 01:37:33PM +1100, Nick Piggin wrote:
> Final note:
> You won't be able to reproduce the parallel path walk scalability
> numbers that I've posted, because the vfsmount refcounting scalability
> patch is not included. I have a new idea for that now, so I'll be asking
> for comments with that soon.

Here is the patch I've been using, which works but has the problem
described in the changelog. But it works nicely for testing.

As I said, I have a promising approach to solving the problem.

fs: scale mntget/mntput

Improve scalability of mntget/mntput by using per-cpu counters protected by the
reader side of the brlock vfsmount_lock. If the mnt_hash field of the vfsmount
structure is attached to a list, then the vfsmount is mounted, and being mounted
contributes to its refcount, so the per-cpu counters need not be summed.

MNT_PSEUDO keeps track of whether the vfsmount is actually a pseudo filesystem
that will never be attached (such as sockfs).

No extra atomics in the common case because atomic mnt refcount is now replaced
with per-CPU spinlock. Code will be bigger and more complex however. With the
previous per-cpu locking patch, mount lookups and common case refcounting are
now per-cpu and should be ideally scalable. path lookups (and hence
path_get/path_put) within the same vfsmount should now be more scalable,
however this will often be hidden by dcache_lock on final dput, and d_lock on
common path elements (eg. cwd or root dentry).

Signed-off-by: Nick Piggin <npiggin@...nel.dk>

[Note: this is not for merging. Unattached operation (lazy umount) may not be
 uncommon, and it will be slowed down and actually have worse scalability after
 this patch. I need to think about how to do fast refcounting with unattached
 mounts.]

---
 drivers/mtd/mtdchar.c |    1 
 fs/internal.h         |    1 
 fs/libfs.c            |    1 
 fs/namespace.c        |  167 +++++++++++++++++++++++++++++++++++++++++++-------
 fs/pnode.c            |    4 -
 include/linux/mount.h |   26 +------
 6 files changed, 154 insertions(+), 46 deletions(-)

Index: linux-2.6/fs/namespace.c
===================================================================
--- linux-2.6.orig/fs/namespace.c	2010-12-12 03:48:57.000000000 +1100
+++ linux-2.6/fs/namespace.c	2010-12-12 03:51:52.000000000 +1100
@@ -138,6 +138,64 @@ void mnt_release_group_id(struct vfsmoun
 	mnt->mnt_group_id = 0;
 }
 
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void add_mnt_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_count, smp_processor_id())) += n;
+#else
+	mnt->mnt_count += n;
+#endif
+}
+
+static inline void set_mnt_count(struct vfsmount *mnt, int n)
+{
+#ifdef CONFIG_SMP
+	preempt_disable();
+	(*per_cpu_ptr(mnt->mnt_count, smp_processor_id())) = n;
+	preempt_enable();
+#else
+	mnt->mnt_count = n;
+#endif
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void inc_mnt_count(struct vfsmount *mnt)
+{
+	add_mnt_count(mnt, 1);
+}
+
+/*
+ * vfsmount lock must be held for read
+ */
+static inline void dec_mnt_count(struct vfsmount *mnt)
+{
+	add_mnt_count(mnt, -1);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+unsigned int count_mnt_count(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		count += *per_cpu_ptr(mnt->mnt_count, cpu);
+	}
+
+	return count;
+#else
+	return mnt->mnt_count;
+#endif
+}
+
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -154,7 +212,15 @@ struct vfsmount *alloc_vfsmnt(const char
 				goto out_free_id;
 		}
 
-		atomic_set(&mnt->mnt_count, 1);
+#ifdef CONFIG_SMP
+		mnt->mnt_count = alloc_percpu(int);
+		if (!mnt->mnt_count)
+			goto out_free_devname;
+#else
+		mnt->mnt_count = 0;
+#endif
+		set_mnt_count(mnt, 1);
+
 		INIT_LIST_HEAD(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
@@ -169,7 +235,7 @@ struct vfsmount *alloc_vfsmnt(const char
 #ifdef CONFIG_SMP
 		mnt->mnt_writers = alloc_percpu(int);
 		if (!mnt->mnt_writers)
-			goto out_free_devname;
+			goto out_free_mntcount;
 #else
 		mnt->mnt_writers = 0;
 #endif
@@ -177,6 +243,8 @@ struct vfsmount *alloc_vfsmnt(const char
 	return mnt;
 
 #ifdef CONFIG_SMP
+out_free_mntcount:
+	free_percpu(mnt->mnt_count);
 out_free_devname:
 	kfree(mnt->mnt_devname);
 #endif
@@ -662,8 +730,8 @@ static inline void __mntput(struct vfsmo
 	 * to make r/w->r/o transitions.
 	 */
 	/*
-	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
-	 * provides barriers, so count_mnt_writers() below is safe.  AV
+	 * The locking used to deal with mnt_count decrement provides barriers,
+	 * so count_mnt_writers() below is safe.
 	 */
 	WARN_ON(count_mnt_writers(mnt));
 	fsnotify_vfsmount_delete(mnt);
@@ -675,45 +743,76 @@ static inline void __mntput(struct vfsmo
 void mntput_no_expire(struct vfsmount *mnt)
 {
 repeat:
-	if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+	if (likely(!list_empty(&mnt->mnt_hash) ||
+				mnt->mnt_flags & MNT_PSEUDO)) {
+		br_read_lock(vfsmount_lock);
+		if (unlikely(list_empty(&mnt->mnt_hash) &&
+				(!(mnt->mnt_flags & MNT_PSEUDO)))) {
+			br_read_unlock(vfsmount_lock);
+			goto repeat;
+		}
+		dec_mnt_count(mnt);
+		br_read_unlock(vfsmount_lock);
 		return;
+	}
+
 	br_write_lock(vfsmount_lock);
-	if (!atomic_dec_and_test(&mnt->mnt_count)) {
+	dec_mnt_count(mnt);
+	if (count_mnt_count(mnt)) {
 		br_write_unlock(vfsmount_lock);
 		return;
 	}
-	if (likely(!mnt->mnt_pinned)) {
+	if (unlikely(mnt->mnt_pinned)) {
+		add_mnt_count(mnt, mnt->mnt_pinned + 1);
+		mnt->mnt_pinned = 0;
 		br_write_unlock(vfsmount_lock);
-		__mntput(mnt);
-		return;
+		acct_auto_close_mnt(mnt);
+		goto repeat;
 	}
-	atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-	mnt->mnt_pinned = 0;
 	br_write_unlock(vfsmount_lock);
-	acct_auto_close_mnt(mnt);
-	goto repeat;
+	__mntput(mnt);
 }
 EXPORT_SYMBOL(mntput_no_expire);
 
+void mntput(struct vfsmount *mnt)
+{
+	if (mnt) {
+		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+		if (unlikely(mnt->mnt_expiry_mark))
+			mnt->mnt_expiry_mark = 0;
+		mntput_no_expire(mnt);
+	}
+}
+EXPORT_SYMBOL(mntput);
+
+struct vfsmount *mntget(struct vfsmount *mnt)
+{
+	if (mnt) {
+		preempt_disable();
+		inc_mnt_count(mnt);
+		preempt_enable();
+	}
+	return mnt;
+}
+EXPORT_SYMBOL(mntget);
+
 void mnt_pin(struct vfsmount *mnt)
 {
 	br_write_lock(vfsmount_lock);
 	mnt->mnt_pinned++;
 	br_write_unlock(vfsmount_lock);
 }
-
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
 	br_write_lock(vfsmount_lock);
 	if (mnt->mnt_pinned) {
-		atomic_inc(&mnt->mnt_count);
+		inc_mnt_count(mnt);
 		mnt->mnt_pinned--;
 	}
 	br_write_unlock(vfsmount_lock);
 }
-
 EXPORT_SYMBOL(mnt_unpin);
 
 static inline void mangle(struct seq_file *m, const char *s)
@@ -1008,12 +1107,13 @@ int may_umount_tree(struct vfsmount *mnt
 	int minimum_refs = 0;
 	struct vfsmount *p;
 
-	br_read_lock(vfsmount_lock);
+	/* write lock needed for count_mnt_count */
+	br_write_lock(vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		actual_refs += atomic_read(&p->mnt_count);
+		actual_refs += count_mnt_count(p);
 		minimum_refs += 2;
 	}
-	br_read_unlock(vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -1040,10 +1140,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	br_read_lock(vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (propagate_mount_busy(mnt, 2))
 		ret = 0;
-	br_read_unlock(vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1125,8 +1225,16 @@ static int do_umount(struct vfsmount *mn
 		    flags & (MNT_FORCE | MNT_DETACH))
 			return -EINVAL;
 
-		if (atomic_read(&mnt->mnt_count) != 2)
+		/*
+		 * probably don't strictly need the lock here if we examined
+		 * all race cases, but it's a slowpath.
+		 */
+		br_write_lock(vfsmount_lock);
+		if (count_mnt_count(mnt) != 2) {
+			br_write_unlock(vfsmount_lock);
 			return -EBUSY;
+		}
+		br_write_unlock(vfsmount_lock);
 
 		if (!xchg(&mnt->mnt_expiry_mark, 1))
 			return -EAGAIN;
@@ -2350,6 +2458,12 @@ SYSCALL_DEFINE2(pivot_root, const char _
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
+
+	/* Drop MNT_PSEUDO from old, add it to new. See init_mount_tree */
+	BUG_ON(!(root.mnt->mnt_flags & MNT_PSEUDO));
+	root.mnt->mnt_flags &= ~MNT_PSEUDO;
+	new.mnt->mnt_flags |= MNT_PSEUDO;
+
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
@@ -2376,6 +2490,13 @@ static void __init init_mount_tree(void)
 	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
+	/*
+	 * MNT_PSEUDO tells mnt refcounting that we're pinned, so don't
+	 * bother checking for zero references. Give one of these to root
+	 * because it isn't "attached" to the tree. See mntput().
+	 */
+	mnt->mnt_flags |= MNT_PSEUDO;
+
 	ns = create_mnt_ns(mnt);
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
Index: linux-2.6/include/linux/mount.h
===================================================================
--- linux-2.6.orig/include/linux/mount.h	2010-12-12 03:27:08.000000000 +1100
+++ linux-2.6/include/linux/mount.h	2010-12-12 03:51:52.000000000 +1100
@@ -30,6 +30,7 @@ struct mnt_namespace;
 
 #define MNT_SHRINKABLE	0x100
 #define MNT_WRITE_HOLD	0x200
+#define MNT_PSEUDO	0x400
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -70,19 +71,15 @@ struct vfsmount {
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
 	int mnt_id;			/* mount identifier */
 	int mnt_group_id;		/* peer group identifier */
-	/*
-	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
-	 * to let these frequently modified fields in a separate cache line
-	 * (so that reads of mnt_flags wont ping-pong on SMP machines)
-	 */
-	atomic_t mnt_count;
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
 #ifdef CONFIG_SMP
 	int __percpu *mnt_writers;
+	int __percpu *mnt_count;
 #else
 	int mnt_writers;
+	int mnt_count;
 #endif
 };
 
@@ -95,13 +92,6 @@ static inline int *get_mnt_writers_ptr(s
 #endif
 }
 
-static inline struct vfsmount *mntget(struct vfsmount *mnt)
-{
-	if (mnt)
-		atomic_inc(&mnt->mnt_count);
-	return mnt;
-}
-
 struct file; /* forward dec */
 
 extern int mnt_want_write(struct vfsmount *mnt);
@@ -109,18 +99,12 @@ extern int mnt_want_write_file(struct fi
 extern int mnt_clone_write(struct vfsmount *mnt);
 extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mntput_no_expire(struct vfsmount *mnt);
+extern void mntput(struct vfsmount *mnt);
+extern struct vfsmount *mntget(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
 extern void mnt_unpin(struct vfsmount *mnt);
 extern int __mnt_is_readonly(struct vfsmount *mnt);
 
-static inline void mntput(struct vfsmount *mnt)
-{
-	if (mnt) {
-		mnt->mnt_expiry_mark = 0;
-		mntput_no_expire(mnt);
-	}
-}
-
 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
 				      const char *name, void *data);
 
Index: linux-2.6/fs/pnode.c
===================================================================
--- linux-2.6.orig/fs/pnode.c	2010-12-12 03:27:08.000000000 +1100
+++ linux-2.6/fs/pnode.c	2010-12-12 03:51:52.000000000 +1100
@@ -288,7 +288,7 @@ int propagate_mnt(struct vfsmount *dest_
  */
 static inline int do_refcount_check(struct vfsmount *mnt, int count)
 {
-	int mycount = atomic_read(&mnt->mnt_count) - mnt->mnt_ghosts;
+	int mycount = count_mnt_count(mnt) - mnt->mnt_ghosts;
 	return (mycount > count);
 }
 
@@ -300,7 +300,7 @@ static inline int do_refcount_check(stru
  * Check if any of these mounts that **do not have submounts**
  * have more references than 'refcnt'. If so return busy.
  *
- * vfsmount lock must be held for read or write
+ * vfsmount lock must be held for write
  */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
Index: linux-2.6/fs/internal.h
===================================================================
--- linux-2.6.orig/fs/internal.h	2010-12-12 03:27:08.000000000 +1100
+++ linux-2.6/fs/internal.h	2010-12-12 03:51:52.000000000 +1100
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void
 
 extern void free_vfsmnt(struct vfsmount *);
 extern struct vfsmount *alloc_vfsmnt(const char *);
+extern unsigned int count_mnt_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 				struct vfsmount *);
Index: linux-2.6/drivers/mtd/mtdchar.c
===================================================================
--- linux-2.6.orig/drivers/mtd/mtdchar.c	2010-12-12 03:27:08.000000000 +1100
+++ linux-2.6/drivers/mtd/mtdchar.c	2010-12-12 03:51:52.000000000 +1100
@@ -1201,6 +1201,7 @@ static int __init init_mtdchar(void)
 static void __exit cleanup_mtdchar(void)
 {
 	unregister_mtd_user(&mtdchar_notifier);
+	mtd_inode_mnt->mnt_flags &= ~MNT_PSEUDO;
 	mntput(mtd_inode_mnt);
 	unregister_filesystem(&mtd_inodefs_type);
 	__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
Index: linux-2.6/arch/ia64/kernel/perfmon.c
===================================================================
--- linux-2.6.orig/arch/ia64/kernel/perfmon.c	2010-12-12 03:48:57.000000000 +1100
+++ linux-2.6/arch/ia64/kernel/perfmon.c	2010-12-12 03:51:52.000000000 +1100
@@ -1553,8 +1553,10 @@ init_pfm_fs(void)
 		err = PTR_ERR(pfmfs_mnt);
 		if (IS_ERR(pfmfs_mnt))
 			unregister_filesystem(&pfm_fs_type);
-		else
+		else {
 			err = 0;
+			pfmfs_mnt->mnt_flags |= MNT_PSEUDO;
+		}
 	}
 	return err;
 }
Index: linux-2.6/fs/anon_inodes.c
===================================================================
--- linux-2.6.orig/fs/anon_inodes.c	2010-12-12 03:51:50.000000000 +1100
+++ linux-2.6/fs/anon_inodes.c	2010-12-12 03:51:52.000000000 +1100
@@ -223,6 +223,7 @@ static int __init anon_inode_init(void)
 		error = PTR_ERR(anon_inode_mnt);
 		goto err_unregister_filesystem;
 	}
+	anon_inode_mnt->mnt_flags |= MNT_PSEUDO;
 	anon_inode_inode = anon_inode_mkinode();
 	if (IS_ERR(anon_inode_inode)) {
 		error = PTR_ERR(anon_inode_inode);
@@ -232,6 +233,7 @@ static int __init anon_inode_init(void)
 	return 0;
 
 err_mntput:
+	anon_inode_mnt->mnt_flags &= ~MNT_PSEUDO;
 	mntput(anon_inode_mnt);
 err_unregister_filesystem:
 	unregister_filesystem(&anon_inode_fs_type);
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c	2010-12-12 03:27:08.000000000 +1100
+++ linux-2.6/fs/block_dev.c	2010-12-12 03:51:52.000000000 +1100
@@ -499,6 +499,7 @@ void __init bdev_cache_init(void)
 	bd_mnt = kern_mount(&bd_type);
 	if (IS_ERR(bd_mnt))
 		panic("Cannot create bdev pseudo-fs");
+	bd_mnt->mnt_flags |= MNT_PSEUDO;
 	/*
 	 * This vfsmount structure is only used to obtain the
 	 * blockdev_superblock, so tell kmemleak not to report it.
Index: linux-2.6/fs/pipe.c
===================================================================
--- linux-2.6.orig/fs/pipe.c	2010-12-12 03:51:50.000000000 +1100
+++ linux-2.6/fs/pipe.c	2010-12-12 03:51:52.000000000 +1100
@@ -1285,6 +1285,8 @@ static int __init init_pipe_fs(void)
 			err = PTR_ERR(pipe_mnt);
 			unregister_filesystem(&pipe_fs_type);
-		}
+		} else {
+			pipe_mnt->mnt_flags |= MNT_PSEUDO;
+		}
 	}
 	return err;
 }
@@ -1292,6 +1294,7 @@ static int __init init_pipe_fs(void)
 static void __exit exit_pipe_fs(void)
 {
 	unregister_filesystem(&pipe_fs_type);
+	pipe_mnt->mnt_flags &= ~MNT_PSEUDO;
 	mntput(pipe_mnt);
 }
 
Index: linux-2.6/net/socket.c
===================================================================
--- linux-2.6.orig/net/socket.c	2010-12-12 03:51:50.000000000 +1100
+++ linux-2.6/net/socket.c	2010-12-12 03:51:52.000000000 +1100
@@ -2375,6 +2375,8 @@ EXPORT_SYMBOL(sock_unregister);
 
 static int __init sock_init(void)
 {
+	int err;
+
 	/*
 	 *      Initialize sock SLAB cache.
 	 */
@@ -2391,8 +2393,16 @@ static int __init sock_init(void)
 	 */
 
 	init_inodecache();
-	register_filesystem(&sock_fs_type);
+
+	err = register_filesystem(&sock_fs_type);
+	if (err)
+		goto out_fs;
 	sock_mnt = kern_mount(&sock_fs_type);
+	if (IS_ERR(sock_mnt)) {
+		err = PTR_ERR(sock_mnt);
+		goto out_mount;
+	}
+	sock_mnt->mnt_flags |= MNT_PSEUDO;
 
 	/* The real protocol initialization is performed in later initcalls.
 	 */
@@ -2405,7 +2415,13 @@ static int __init sock_init(void)
 	skb_timestamping_init();
 #endif
 
-	return 0;
+out:
+	return err;
+
+out_mount:
+	unregister_filesystem(&sock_fs_type);
+out_fs:
+	goto out;
 }
 
 core_initcall(sock_init);	/* early initcall */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/