[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220103084544.1109829-3-imran.f.khan@oracle.com>
Date: Mon, 3 Jan 2022 19:45:44 +1100
From: Imran Khan <imran.f.khan@...cle.com>
To: gregkh@...uxfoundation.org, tj@...nel.org
Cc: linux-kernel@...r.kernel.org
Subject: [RFC PATCH v2 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem.
Right now a global per file system based rwsem (kernfs_rwsem)
synchronizes multiple kernfs operations. On a large system with
few hundred CPUs and few hundred applications simultaenously trying
to access sysfs, this results in multiple sys_open(s) contending on
kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate.
- 21.42% 21.34% showgids [kernel.kallsyms] [k] up_read
21.34% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 20.05% link_path_walk
- 9.76% walk_component
lookup_fast
- d_revalidate.part.24
- 9.75% kernfs_dop_revalidate
up_read
- 9.46% inode_permission
- __inode_permission
- 9.46% kernfs_iop_permission
up_read
- 0.83% kernfs_iop_get_link
up_read
- 0.80% lookup_fast
d_revalidate.part.24
kernfs_dop_revalidate
up_read
- 21.31% 21.21% showgids [kernel.kallsyms] [k] down_read
21.21% __libc_start_main
__GI___libc_open
entry_SYSCALL_64_after_hwframe
do_syscall_64
sys_open
do_sys_open
do_filp_open
- path_openat
- 19.78% link_path_walk
- 10.62% inode_permission
- __inode_permission
- 10.62% kernfs_iop_permission
down_read
- 8.45% walk_component
lookup_fast
- d_revalidate.part.24
- 8.45% kernfs_dop_revalidate
down_read
- 0.71% kernfs_iop_get_link
down_read
- 0.72% lookup_fast
- d_revalidate.part.24
- 0.72% kernfs_dop_revalidate
down_read
- 0.71% may_open
inode_permission
__inode_permission
kernfs_iop_permission
down_read
Since permission is specific to a kernfs_node we can use a per
kernfs_node based lock to access/modify permission. Also use kernfs
reference counting to ensure we are accessing/modifying permissions
for an existing kernfs_node object.
Using this change brings down the above mentioned down_read/up_read
numbers to ~8%, thus indicating that contention around kernfs_rwsem
has reduced to about 1/3rd of earlier value.
Signed-off-by: Imran Khan <imran.f.khan@...cle.com>
---
fs/kernfs/dir.c | 4 ++++
fs/kernfs/inode.c | 25 +++++++++++++++++--------
include/linux/kernfs.h | 1 +
3 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index cd68ac30f71b..a6846e5e2cab 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -750,11 +750,13 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
/* Update timestamps on the parent */
+ down_write(&parent->kernfs_iop_rwsem);
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
+ up_write(&parent->kernfs_iop_rwsem);
up_write(&root->kernfs_rwsem);
@@ -1380,8 +1382,10 @@ static void __kernfs_remove(struct kernfs_node *kn)
/* update timestamps on the parent */
if (ps_iattr) {
+ down_write(&pos->parent->kernfs_iop_rwsem);
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
+ up_write(&pos->parent->kernfs_iop_rwsem);
}
kernfs_put(pos);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..6d375f3b5e17 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -101,9 +101,11 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int ret;
struct kernfs_root *root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ down_write(&kn->kernfs_iop_rwsem);
ret = __kernfs_setattr(kn, iattr);
- up_write(&root->kernfs_rwsem);
+ up_write(&kn->kernfs_iop_rwsem);
+ kernfs_put(kn);
return ret;
}
@@ -119,7 +121,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return -EINVAL;
root = kernfs_root(kn);
- down_write(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ down_write(&kn->kernfs_iop_rwsem);
error = setattr_prepare(&init_user_ns, dentry, iattr);
if (error)
goto out;
@@ -132,7 +135,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
setattr_copy(&init_user_ns, inode, iattr);
out:
- up_write(&root->kernfs_rwsem);
+ up_write(&kn->kernfs_iop_rwsem);
+ kernfs_put(kn);
return error;
}
@@ -189,12 +193,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ down_read(&kn->kernfs_iop_rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read(&kn->kernfs_iop_rwsem);
+ kernfs_put(kn);
return 0;
}
@@ -207,6 +213,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
inode->i_op = &kernfs_iops;
inode->i_generation = kernfs_gen(kn);
+ init_rwsem(&kn->kernfs_iop_rwsem);
set_default_inode_attr(inode, kn->mode);
kernfs_refresh_inode(kn, inode);
@@ -287,12 +294,14 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
kn = inode->i_private;
root = kernfs_root(kn);
- down_read(&root->kernfs_rwsem);
+ kernfs_get(kn);
+ down_read(&kn->kernfs_iop_rwsem);
spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
spin_unlock(&inode->i_lock);
- up_read(&root->kernfs_rwsem);
+ up_read(&kn->kernfs_iop_rwsem);
+ kernfs_put(kn);
return ret;
}
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 5ed4c9ed39af..c0fa319d266b 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -166,6 +166,7 @@ struct kernfs_node {
struct kernfs_iattrs *iattr;
spinlock_t kernfs_open_node_lock;
struct mutex kernfs_open_file_mutex;
+ struct rw_semaphore kernfs_iop_rwsem;
};
/*
--
2.30.2
Powered by blists - more mailing lists