lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230302043203.1695051-2-imran.f.khan@oracle.com>
Date:   Thu,  2 Mar 2023 15:32:01 +1100
From:   Imran Khan <imran.f.khan@...cle.com>
To:     tj@...nel.org, gregkh@...uxfoundation.org, viro@...iv.linux.org.uk
Cc:     linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
        joe.jin@...cle.com
Subject: [PATCH 1/3] kernfs: Introduce separate rwsem to protect inode attributes.

Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple
kernfs operations. On a large system with few hundred CPUs and few
hundred applications simultaneoulsy trying to access sysfs, this
results in multiple sys_open(s) contending on kernfs_rwsem via
kernfs_iop_permission and kernfs_dop_revalidate.

For example on a system with 384 cores, if I run 200 instances of an
application which is mostly executing the following loop:

  for (int loop = 0; loop <100 ; loop++)
  {
    for (int port_num = 1; port_num < 2; port_num++)
    {
      for (int gid_index = 0; gid_index < 254; gid_index++ )
      {
        char ret_buf[64], ret_buf_lo[64];
        char gid_file_path[1024];

        int      ret_len;
        int      ret_fd;
        ssize_t  ret_rd;

        ub4  i, saved_errno;

        memset(ret_buf, 0, sizeof(ret_buf));
        memset(gid_file_path, 0, sizeof(gid_file_path));

        ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
                           "/sys/class/infiniband/%s/ports/%d/gids/%d",
                           dev_name,
                           port_num,
                           gid_index);

        ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
        if (ret_fd < 0)
        {
          printf("Failed to open %s\n", gid_file_path);
          continue;
        }

        /* Read the GID */
        ret_rd = read(ret_fd, ret_buf, 40);

        if (ret_rd == -1)
        {
          printf("Failed to read from file %s, errno: %u\n",
                 gid_file_path, saved_errno);

          continue;
        }

        close(ret_fd);
      }
    }

I see contention around kernfs_rwsem as follows:

path_openat
|
|----link_path_walk.part.0.constprop.0
|      |
|      |--49.92%--inode_permission
|      |          |
|      |           --48.69%--kernfs_iop_permission
|      |                     |
|      |                     |--18.16%--down_read
|      |                     |
|      |                     |--15.38%--up_read
|      |                     |
|      |                      --14.58%--_raw_spin_lock
|      |                                |
|      |                                 -----
|      |
|      |--29.08%--walk_component
|      |          |
|      |           --29.02%--lookup_fast
|      |                     |
|      |                     |--24.26%--kernfs_dop_revalidate
|      |                     |          |
|      |                     |          |--14.97%--down_read
|      |                     |          |
|      |                     |           --9.01%--up_read
|      |                     |
|      |                      --4.74%--__d_lookup
|      |                                |
|      |                                 --4.64%--_raw_spin_lock
|      |                                           |
|      |                                            ----

Having a separate per-fs rwsem to protect kernfs inode attributes,
will avoid the above mentioned contention and result in better
performance as can bee seen below:

path_openat
|
|----link_path_walk.part.0.constprop.0
|     |
|     |
|     |--27.06%--inode_permission
|     |          |
|     |           --25.84%--kernfs_iop_permission
|     |                     |
|     |                     |--9.29%--up_read
|     |                     |
|     |                     |--8.19%--down_read
|     |                     |
|     |                      --7.89%--_raw_spin_lock
|     |                                |
|     |                                 ----
|     |
|     |--22.42%--walk_component
|     |          |
|     |           --22.36%--lookup_fast
|     |                     |
|     |                     |--16.07%--__d_lookup
|     |                     |          |
|     |                     |           --16.01%--_raw_spin_lock
|     |                     |                     |
|     |                     |                      ----
|     |                     |
|     |                      --6.28%--kernfs_dop_revalidate
|     |                                |
|     |                                |--3.76%--down_read
|     |                                |
|     |                                 --2.26%--up_read

As can be seen from the above data the overhead due to both
kerfs_iop_permission and kernfs_dop_revalidate have gone down and
this also reduces overall run time of the earlier mentioned loop.

Signed-off-by: Imran Khan <imran.f.khan@...cle.com>
---
 fs/kernfs/dir.c             |  7 +++++++
 fs/kernfs/inode.c           | 16 ++++++++--------
 fs/kernfs/kernfs-internal.h |  1 +
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index ef00b5fe8ceea..953b2717c60e6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	/* Update timestamps on the parent */
+	down_write(&root->kernfs_iattr_rwsem);
+
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
 
+	up_write(&root->kernfs_iattr_rwsem);
 	up_write(&root->kernfs_rwsem);
 
 	/*
@@ -940,6 +943,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 
 	idr_init(&root->ino_idr);
 	init_rwsem(&root->kernfs_rwsem);
+	init_rwsem(&root->kernfs_iattr_rwsem);
 	INIT_LIST_HEAD(&root->supers);
 
 	/*
@@ -1462,11 +1466,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
 				pos->parent ? pos->parent->iattr : NULL;
 
 			/* update timestamps on the parent */
+			down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+
 			if (ps_iattr) {
 				ktime_get_real_ts64(&ps_iattr->ia_ctime);
 				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 			}
 
+			up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
 			kernfs_put(pos);
 		}
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 30494dcb0df34..b22b74d1a1150 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 	int ret;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&root->kernfs_rwsem);
+	down_write(&root->kernfs_iattr_rwsem);
 	ret = __kernfs_setattr(kn, iattr);
-	up_write(&root->kernfs_rwsem);
+	up_write(&root->kernfs_iattr_rwsem);
 	return ret;
 }
 
@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		return -EINVAL;
 
 	root = kernfs_root(kn);
-	down_write(&root->kernfs_rwsem);
+	down_write(&root->kernfs_iattr_rwsem);
 	error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
 	if (error)
 		goto out;
@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	setattr_copy(&nop_mnt_idmap, inode, iattr);
 
 out:
-	up_write(&root->kernfs_rwsem);
+	up_write(&root->kernfs_iattr_rwsem);
 	return error;
 }
 
@@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
 	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	down_read(&root->kernfs_iattr_rwsem);
 	kernfs_refresh_inode(kn, inode);
 	generic_fillattr(&nop_mnt_idmap, inode, stat);
-	up_read(&root->kernfs_rwsem);
+	up_read(&root->kernfs_iattr_rwsem);
 
 	return 0;
 }
@@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
 	kn = inode->i_private;
 	root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	down_read(&root->kernfs_iattr_rwsem);
 	kernfs_refresh_inode(kn, inode);
 	ret = generic_permission(&nop_mnt_idmap, inode, mask);
-	up_read(&root->kernfs_rwsem);
+	up_read(&root->kernfs_iattr_rwsem);
 
 	return ret;
 }
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 236c3a6113f1e..3297093c920de 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -47,6 +47,7 @@ struct kernfs_root {
 
 	wait_queue_head_t	deactivate_waitq;
 	struct rw_semaphore	kernfs_rwsem;
+	struct rw_semaphore	kernfs_iattr_rwsem;
 };
 
 /* +1 to avoid triggering overflow warning when negating it */
-- 
2.34.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ