lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220113104259.1584491-3-imran.f.khan@oracle.com>
Date:   Thu, 13 Jan 2022 21:42:59 +1100
From:   Imran Khan <imran.f.khan@...cle.com>
To:     tj@...nel.org, gregkh@...uxfoundation.org
Cc:     linux-kernel@...r.kernel.org
Subject: [PATCH v3 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem.

Right now a global per file system based rwsem (kernfs_rwsem)
synchronizes multiple kernfs operations. On a large system with
few hundred CPUs and few hundred applications simultaenously trying
to access sysfs, this results in multiple sys_open(s) contending on
kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate.

-   21.42%    21.34%  showgids   [kernel.kallsyms]     [k] up_read
     21.34% __libc_start_main
        __GI___libc_open
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        sys_open
        do_sys_open
        do_filp_open
      - path_openat
         - 20.05% link_path_walk
            - 9.76% walk_component
                 lookup_fast
               - d_revalidate.part.24
                  - 9.75% kernfs_dop_revalidate
                       up_read
            - 9.46% inode_permission
               - __inode_permission
                  - 9.46% kernfs_iop_permission
                       up_read
            - 0.83% kernfs_iop_get_link
                 up_read
         - 0.80% lookup_fast
              d_revalidate.part.24
              kernfs_dop_revalidate
              up_read

-   21.31%    21.21%  showgids   [kernel.kallsyms]    [k] down_read
     21.21% __libc_start_main
        __GI___libc_open
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        sys_open
        do_sys_open
        do_filp_open
      - path_openat
         - 19.78% link_path_walk
            - 10.62% inode_permission
               - __inode_permission
                  - 10.62% kernfs_iop_permission
                       down_read
            - 8.45% walk_component
                 lookup_fast
               - d_revalidate.part.24
                  - 8.45% kernfs_dop_revalidate
                       down_read
            - 0.71% kernfs_iop_get_link
                 down_read
         - 0.72% lookup_fast
            - d_revalidate.part.24
               - 0.72% kernfs_dop_revalidate
                    down_read
         - 0.71% may_open
              inode_permission
              __inode_permission
              kernfs_iop_permission
              down_read

Since permission is specific to a kernfs_node we can use a hashed
lock to access/modify permission. Also use kernfs reference counting
to ensure we are accessing/modifying permissions for an existing
kernfs_node object.

Using this change brings down the above mentioned down_read/up_read
numbers to ~8%, thus indicating that contention around kernfs_rwsem
has reduced to about 1/3rd of earlier value.

Signed-off-by: Imran Khan <imran.f.khan@...cle.com>
---
 fs/kernfs/dir.c             |  8 ++++++++
 fs/kernfs/inode.c           | 35 +++++++++++++++++++++++++----------
 fs/kernfs/kernfs-internal.h | 18 ++++++++++++++++--
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e6d9772ddb4c..37daaffda718 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -720,6 +720,7 @@ int kernfs_add_one(struct kernfs_node *kn)
 	struct kernfs_node *parent = kn->parent;
 	struct kernfs_root *root = kernfs_root(parent);
 	struct kernfs_iattrs *ps_iattr;
+	struct rw_semaphore *rwsem = NULL;
 	bool has_ns;
 	int ret;
 
@@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	/* Update timestamps on the parent */
+	rwsem = iattr_rwsem_ptr(parent);
+	down_write(rwsem);
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
+	up_write(rwsem);
 
 	up_write(&root->kernfs_rwsem);
 
@@ -1326,6 +1330,7 @@ void kernfs_activate(struct kernfs_node *kn)
 static void __kernfs_remove(struct kernfs_node *kn)
 {
 	struct kernfs_node *pos;
+	struct rw_semaphore *rwsem;
 
 	lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
 
@@ -1378,8 +1383,11 @@ static void __kernfs_remove(struct kernfs_node *kn)
 
 			/* update timestamps on the parent */
 			if (ps_iattr) {
+				rwsem = iattr_rwsem_ptr(pos->parent);
+				down_write(rwsem);
 				ktime_get_real_ts64(&ps_iattr->ia_ctime);
 				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
+				up_write(rwsem);
 			}
 
 			kernfs_put(pos);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..7a55ad440bf4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,11 +99,15 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
 	int ret;
-	struct kernfs_root *root = kernfs_root(kn);
+	struct rw_semaphore *rwsem = NULL;
 
-	down_write(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_write(rwsem);
 	ret = __kernfs_setattr(kn, iattr);
-	up_write(&root->kernfs_rwsem);
+	up_write(rwsem);
+	kernfs_put(kn);
+
 	return ret;
 }
 
@@ -112,6 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 {
 	struct inode *inode = d_inode(dentry);
 	struct kernfs_node *kn = inode->i_private;
+	struct rw_semaphore *rwsem = NULL;
 	struct kernfs_root *root;
 	int error;
 
@@ -119,7 +124,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		return -EINVAL;
 
 	root = kernfs_root(kn);
-	down_write(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_write(rwsem);
 	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		goto out;
@@ -132,7 +139,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	setattr_copy(&init_user_ns, inode, iattr);
 
 out:
-	up_write(&root->kernfs_rwsem);
+	up_write(rwsem);
+	kernfs_put(kn);
 	return error;
 }
 
@@ -187,14 +195,17 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct kernfs_node *kn = inode->i_private;
-	struct kernfs_root *root = kernfs_root(kn);
+	struct rw_semaphore *rwsem = NULL;
 
-	down_read(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_read(rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	generic_fillattr(&init_user_ns, inode, stat);
 	spin_unlock(&inode->i_lock);
-	up_read(&root->kernfs_rwsem);
+	up_read(rwsem);
+	kernfs_put(kn);
 
 	return 0;
 }
@@ -279,6 +290,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 {
 	struct kernfs_node *kn;
 	struct kernfs_root *root;
+	struct rw_semaphore *rwsem = NULL;
 	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
@@ -287,12 +299,15 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 	kn = inode->i_private;
 	root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_read(rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	ret = generic_permission(&init_user_ns, inode, mask);
 	spin_unlock(&inode->i_lock);
-	up_read(&root->kernfs_rwsem);
+	up_read(rwsem);
+	kernfs_put(kn);
 
 	return ret;
 }
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 4bdcf7a71845..f1977d72369a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -39,17 +39,23 @@ struct kernfs_open_file_mutex {
 	struct mutex lock;
 } ____cacheline_aligned_in_smp;
 
+struct kernfs_iattr_rwsem {
+	struct rw_semaphore rwsem;
+} ____cacheline_aligned_in_smp;
+
 /*
  * There's one kernfs_open_file for each open file and one kernfs_open_node
  * for each kernfs_node with one or more open files.
  *
  * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
- * protected by open_node_locks[i].
+ * protected by open_node_locks[i].lock.
  *
  * filp->private_data points to seq_file whose ->private points to
  * kernfs_open_file.  kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by open_file_mutex[i].
+ * kernfs_open_node->files, which is protected by open_file_mutex[i].lock.
  *
+ * kernfs_node->iattr points to kernfs_node's attributes  and is
+ * protected by iattr_rwsem[i].rwsem
  * To reduce possible contention in sysfs access, arising due to single
  * locks, use an array of locks and use kernfs_node object address as
  * hash keys to get the index of these locks.
@@ -58,6 +64,7 @@ struct kernfs_open_file_mutex {
 struct kernfs_global_locks {
 	struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
 	struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
+	struct kernfs_iattr_rwsem iattr_rwsem[NR_KERNFS_LOCKS];
 };
 
 static struct kernfs_global_locks kernfs_global_locks;
@@ -76,6 +83,13 @@ static inline struct mutex *open_file_mutex_ptr(struct kernfs_node *kn)
 	return &kernfs_global_locks.open_file_mutex[index].lock;
 }
 
+static inline struct rw_semaphore *iattr_rwsem_ptr(struct kernfs_node *kn)
+{
+	int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+	return &kernfs_global_locks.iattr_rwsem[index].rwsem;
+}
+
 struct kernfs_iattrs {
 	kuid_t			ia_uid;
 	kgid_t			ia_gid;
-- 
2.30.2

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ