lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon,  3 Jan 2022 19:45:44 +1100
From:   Imran Khan <imran.f.khan@...cle.com>
To:     gregkh@...uxfoundation.org, tj@...nel.org
Cc:     linux-kernel@...r.kernel.org
Subject: [RFC PATCH v2 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem.

Right now a global per file system based rwsem (kernfs_rwsem)
synchronizes multiple kernfs operations. On a large system with
few hundred CPUs and few hundred applications simultaenously trying
to access sysfs, this results in multiple sys_open(s) contending on
kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate.

-   21.42%    21.34%  showgids   [kernel.kallsyms]     [k] up_read
     21.34% __libc_start_main
        __GI___libc_open
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        sys_open
        do_sys_open
        do_filp_open
      - path_openat
         - 20.05% link_path_walk
            - 9.76% walk_component
                 lookup_fast
               - d_revalidate.part.24
                  - 9.75% kernfs_dop_revalidate
                       up_read
            - 9.46% inode_permission
               - __inode_permission
                  - 9.46% kernfs_iop_permission
                       up_read
            - 0.83% kernfs_iop_get_link
                 up_read
         - 0.80% lookup_fast
              d_revalidate.part.24
              kernfs_dop_revalidate
              up_read

-   21.31%    21.21%  showgids   [kernel.kallsyms]    [k] down_read
     21.21% __libc_start_main
        __GI___libc_open
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        sys_open
        do_sys_open
        do_filp_open
      - path_openat
         - 19.78% link_path_walk
            - 10.62% inode_permission
               - __inode_permission
                  - 10.62% kernfs_iop_permission
                       down_read
            - 8.45% walk_component
                 lookup_fast
               - d_revalidate.part.24
                  - 8.45% kernfs_dop_revalidate
                       down_read
            - 0.71% kernfs_iop_get_link
                 down_read
         - 0.72% lookup_fast
            - d_revalidate.part.24
               - 0.72% kernfs_dop_revalidate
                    down_read
         - 0.71% may_open
              inode_permission
              __inode_permission
              kernfs_iop_permission
              down_read

Since permission is specific to a kernfs_node we can use a per
kernfs_node based lock to access/modify permission. Also use kernfs
reference counting to ensure we are accessing/modifying permissions
for an existing kernfs_node object.

Using this change brings down the above mentioned down_read/up_read
numbers to ~8%, thus indicating that contention around kernfs_rwsem
has reduced to about 1/3rd of earlier value.

Signed-off-by: Imran Khan <imran.f.khan@...cle.com>
---
 fs/kernfs/dir.c        |  4 ++++
 fs/kernfs/inode.c      | 25 +++++++++++++++++--------
 include/linux/kernfs.h |  1 +
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index cd68ac30f71b..a6846e5e2cab 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -750,11 +750,13 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	/* Update timestamps on the parent */
+	down_write(&parent->kernfs_iop_rwsem);
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
+	up_write(&parent->kernfs_iop_rwsem);
 
 	up_write(&root->kernfs_rwsem);
 
@@ -1380,8 +1382,10 @@ static void __kernfs_remove(struct kernfs_node *kn)
 
 			/* update timestamps on the parent */
 			if (ps_iattr) {
+				down_write(&pos->parent->kernfs_iop_rwsem);
 				ktime_get_real_ts64(&ps_iattr->ia_ctime);
 				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
+				up_write(&pos->parent->kernfs_iop_rwsem);
 			}
 
 			kernfs_put(pos);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..6d375f3b5e17 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -101,9 +101,11 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 	int ret;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	down_write(&kn->kernfs_iop_rwsem);
 	ret = __kernfs_setattr(kn, iattr);
-	up_write(&root->kernfs_rwsem);
+	up_write(&kn->kernfs_iop_rwsem);
+	kernfs_put(kn);
 	return ret;
 }
 
@@ -119,7 +121,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		return -EINVAL;
 
 	root = kernfs_root(kn);
-	down_write(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	down_write(&kn->kernfs_iop_rwsem);
 	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		goto out;
@@ -132,7 +135,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	setattr_copy(&init_user_ns, inode, iattr);
 
 out:
-	up_write(&root->kernfs_rwsem);
+	up_write(&kn->kernfs_iop_rwsem);
+	kernfs_put(kn);
 	return error;
 }
 
@@ -189,12 +193,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
 	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	down_read(&kn->kernfs_iop_rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	generic_fillattr(&init_user_ns, inode, stat);
 	spin_unlock(&inode->i_lock);
-	up_read(&root->kernfs_rwsem);
+	up_read(&kn->kernfs_iop_rwsem);
+	kernfs_put(kn);
 
 	return 0;
 }
@@ -207,6 +213,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
 	inode->i_op = &kernfs_iops;
 	inode->i_generation = kernfs_gen(kn);
 
+	init_rwsem(&kn->kernfs_iop_rwsem);
 	set_default_inode_attr(inode, kn->mode);
 	kernfs_refresh_inode(kn, inode);
 
@@ -287,12 +294,14 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 	kn = inode->i_private;
 	root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	down_read(&kn->kernfs_iop_rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	ret = generic_permission(&init_user_ns, inode, mask);
 	spin_unlock(&inode->i_lock);
-	up_read(&root->kernfs_rwsem);
+	up_read(&kn->kernfs_iop_rwsem);
+	kernfs_put(kn);
 
 	return ret;
 }
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 5ed4c9ed39af..c0fa319d266b 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -166,6 +166,7 @@ struct kernfs_node {
 	struct kernfs_iattrs	*iattr;
 	spinlock_t kernfs_open_node_lock;
 	struct mutex kernfs_open_file_mutex;
+	struct rw_semaphore	kernfs_iop_rwsem;
 };
 
 /*
-- 
2.30.2

Powered by blists - more mailing lists