[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220214120322.2402628-7-imran.f.khan@oracle.com>
Date: Mon, 14 Feb 2022 23:03:21 +1100
From: Imran Khan <imran.f.khan@...cle.com>
To: tj@...nel.org, gregkh@...uxfoundation.org
Cc: linux-kernel@...r.kernel.org
Subject: [PATCH v6 6/7] kernfs: Introduce hashed rw-sem to replace per-fs kernfs_rwsem.
Having a single rwsem to synchronize all operations across a kernfs
based file system (cgroup, sysfs etc.) does not scale well. The contention
around this single rwsem becomes more apparent in large scale systems with
few hundred CPUs where most of the CPUs have running tasks that are
opening, accessing or closing sysfs files at any point of time.
Using hashed rwsems in place of a per-fs rwsem, can significantly reduce
contention around per-fs rwsem and hence provide better scalability.
Moreover as these hashed rwsems are not part of kernfs_node objects we will
not see any singnificant change in memory utilization of kernfs based file
systems like sysfs, cgroupfs etc.
This patch introduces hashed rwsems that can be used in place of per-fs
rwsem. It also provides interfaces to use hashed rwsems and interfaces for
lockdep annotation as well. The next patch makes use of these interfaces
and replaces per-fs rwsem with hashed ones.
Signed-off-by: Imran Khan <imran.f.khan@...cle.com>
---
fs/kernfs/dir.c | 1 -
fs/kernfs/kernfs-internal.h | 560 ++++++++++++++++++++++++++++++++++++
fs/kernfs/mount.c | 1 +
include/linux/kernfs.h | 2 +
4 files changed, 563 insertions(+), 1 deletion(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index dc769301ac96b..0dac58f8091c9 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -25,7 +25,6 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
static bool kernfs_active(struct kernfs_node *kn)
{
- lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
return atomic_read(&kn->active) >= 0;
}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 593395f325a18..ba89de378f240 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -19,6 +19,17 @@
#include <linux/kernfs.h>
#include <linux/fs_context.h>
+/*
+ * kernfs_rwsem locking pattern:
+ *
+ * KERNFS_RWSEM_LOCK_SELF: lock kernfs_node only.
+ * KERNFS_RWSEM_LOCK_SELF_AND_PARENT: lock kernfs_node and its parent.
+ */
+enum kernfs_rwsem_lock_pattern {
+ KERNFS_RWSEM_LOCK_SELF,
+ KERNFS_RWSEM_LOCK_SELF_AND_PARENT
+};
+
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
@@ -190,4 +201,553 @@ kernfs_open_node_spinlock(struct kernfs_node *kn)
return lock;
}
+static inline struct rw_semaphore *kernfs_rwsem_ptr(struct kernfs_node *kn)
+{
+ int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+ return &kernfs_locks->kernfs_rwsem[idx];
+}
+
+static inline void kernfs_rwsem_assert_held(struct kernfs_node *kn)
+{
+ lockdep_assert_held(kernfs_rwsem_ptr(kn));
+}
+
+static inline void kernfs_rwsem_assert_held_write(struct kernfs_node *kn)
+{
+ lockdep_assert_held_write(kernfs_rwsem_ptr(kn));
+}
+
+static inline void kernfs_rwsem_assert_held_read(struct kernfs_node *kn)
+{
+ lockdep_assert_held_read(kernfs_rwsem_ptr(kn));
+}
+
+/**
+ * down_write_kernfs_rwsem_for_two_nodes() - Acquire hashed rwsem for 2 nodes
+ *
+ * @kn1: kernfs_node for which hashed rwsem needs to be taken
+ * @kn2: kernfs_node for which hashed rwsem needs to be taken
+ *
+ * In certain cases we need to acquire hashed rwsem for 2 nodes that don't have a
+ * parent child relationship. This is one of the cases of nested locking involving
+ * hashed rwsem and rwsem with lower address is acquired first.
+ */
+static inline void down_write_kernfs_rwsem_for_two_nodes(struct kernfs_node *kn1,
+ struct kernfs_node *kn2)
+{
+ struct rw_semaphore *rwsem1 = kernfs_rwsem_ptr(kn1);
+ struct rw_semaphore *rwsem2 = kernfs_rwsem_ptr(kn2);
+
+ if (rwsem1 == rwsem2)
+ down_write_nested(rwsem1, 0);
+ else {
+ if (rwsem1 < rwsem2) {
+ down_write_nested(rwsem1, 0);
+ down_write_nested(rwsem2, 1);
+ } else {
+ down_write_nested(rwsem2, 0);
+ down_write_nested(rwsem1, 1);
+ }
+ }
+ kernfs_get(kn1);
+ kernfs_get(kn2);
+}
+
+/**
+ * up_write_kernfs_rwsem_for_two_nodes() - Release hashed rwsem for 2 nodes
+ *
+ * @kn1: kernfs_node for which hashed rwsem needs to be released
+ * @kn2: kernfs_node for which hashed rwsem needs to be released
+ *
+ * In case of nested locking, rwsem with higher address is released first.
+ */
+static inline void up_write_kernfs_rwsem_for_two_nodes(struct kernfs_node *kn1,
+ struct kernfs_node *kn2)
+{
+ struct rw_semaphore *rwsem1 = kernfs_rwsem_ptr(kn1);
+ struct rw_semaphore *rwsem2 = kernfs_rwsem_ptr(kn2);
+
+ if (rwsem1 == rwsem2)
+ up_write(rwsem1);
+ else {
+ if (rwsem1 > rwsem2) {
+ up_write(rwsem1);
+ up_write(rwsem2);
+ } else {
+ up_write(rwsem2);
+ up_write(rwsem1);
+ }
+ }
+
+ kernfs_put(kn1);
+ kernfs_put(kn2);
+}
+
+/**
+ * down_read_kernfs_rwsem_for_two_nodes() - Acquire hashed rwsem for 2 nodes
+ *
+ * @kn1: kernfs_node for which hashed rwsem needs to be taken
+ * @kn2: kernfs_node for which hashed rwsem needs to be taken
+ *
+ * In certain cases we need to acquire hashed rwsem for 2 nodes that don't have a
+ * parent child relationship. This is one of the cases of nested locking involving
+ * hashed rwsem and rwsem with lower address is acquired first.
+ */
+static inline void down_read_kernfs_rwsem_for_two_nodes(struct kernfs_node *kn1,
+ struct kernfs_node *kn2)
+{
+ struct rw_semaphore *rwsem1 = kernfs_rwsem_ptr(kn1);
+ struct rw_semaphore *rwsem2 = kernfs_rwsem_ptr(kn2);
+
+ if (rwsem1 == rwsem2)
+ down_read_nested(rwsem1, 0);
+ else {
+ if (rwsem1 < rwsem2) {
+ down_read_nested(rwsem1, 0);
+ down_read_nested(rwsem2, 1);
+ } else {
+ down_read_nested(rwsem2, 0);
+ down_read_nested(rwsem1, 1);
+ }
+ }
+ kernfs_get(kn1);
+ kernfs_get(kn2);
+}
+
+/**
+ * up_read_kernfs_rwsem_for_two_nodes() - Release hashed rwsem for 2 nodes
+ *
+ * @kn1: kernfs_node for which hashed rwsem needs to be released
+ * @kn2: kernfs_node for which hashed rwsem needs to be released
+ *
+ * In case of nested locking, rwsem with higher address is released first.
+ */
+static inline void up_read_kernfs_rwsem_for_two_nodes(struct kernfs_node *kn1,
+ struct kernfs_node *kn2)
+{
+ struct rw_semaphore *rwsem1 = kernfs_rwsem_ptr(kn1);
+ struct rw_semaphore *rwsem2 = kernfs_rwsem_ptr(kn2);
+
+ if (rwsem1 == rwsem2)
+ up_read(rwsem1);
+ else {
+ if (rwsem1 > rwsem2) {
+ up_read(rwsem1);
+ up_read(rwsem2);
+ } else {
+ up_read(rwsem2);
+ up_read(rwsem1);
+ }
+ }
+
+ kernfs_put(kn1);
+ kernfs_put(kn2);
+}
+
+/**
+ * down_write_kernfs_rwsem() - Acquire hashed rwsem
+ *
+ * @kn: kernfs_node for which hashed rwsem needs to be taken
+ * @ptrn: locking pattern i.e whether to lock only given node or to lock
+ * node and its parent as well
+ *
+ * In case of nested locking, rwsem with lower address is acquired first.
+ *
+ * Return: void
+ */
+static inline void down_write_kernfs_rwsem(struct kernfs_node *kn,
+ enum kernfs_rwsem_lock_pattern ptrn)
+{
+ struct rw_semaphore *p_rwsem = NULL;
+ struct rw_semaphore *rwsem = kernfs_rwsem_ptr(kn);
+ int lock_parent = 0;
+
+ if (ptrn == KERNFS_RWSEM_LOCK_SELF_AND_PARENT && kn->parent)
+ lock_parent = 1;
+
+ if (lock_parent)
+ p_rwsem = kernfs_rwsem_ptr(kn->parent);
+
+ if (!lock_parent || rwsem == p_rwsem) {
+ down_write_nested(rwsem, 0);
+ kernfs_get(kn);
+ kn->unlock_parent = 0;
+ } else {
+ /**
+ * In case of nested locking, locks are taken in order of their
+ * addresses. lock with lower address is taken first, followed
+ * by lock with higher address.
+ */
+ if (rwsem < p_rwsem) {
+ down_write_nested(rwsem, 0);
+ down_write_nested(p_rwsem, 1);
+ } else {
+ down_write_nested(p_rwsem, 0);
+ down_write_nested(rwsem, 1);
+ }
+ kernfs_get(kn);
+ kernfs_get(kn->parent);
+ kn->unlock_parent = 1;
+ }
+}
+
+/**
+ * up_write_kernfs_rwsem() - Release hashed rwsem
+ *
+ * @kn: kernfs_node for which hashed rwsem was taken
+ *
+ * In case of nested locking, rwsem with higher address is released first.
+ *
+ * Return: void
+ */
+static inline void up_write_kernfs_rwsem(struct kernfs_node *kn)
+{
+ struct rw_semaphore *p_rwsem = NULL;
+ struct rw_semaphore *rwsem = kernfs_rwsem_ptr(kn);
+
+ if (kn->unlock_parent) {
+ kn->unlock_parent = 0;
+ p_rwsem = kernfs_rwsem_ptr(kn->parent);
+ if (rwsem > p_rwsem) {
+ up_write(rwsem);
+ up_write(p_rwsem);
+ } else {
+ up_write(p_rwsem);
+ up_write(rwsem);
+ }
+ kernfs_put(kn->parent);
+ } else
+ up_write(rwsem);
+
+ kernfs_put(kn);
+}
+
+/**
+ * down_read_kernfs_rwsem() - Acquire hashed rwsem
+ *
+ * @kn: kernfs_node for which hashed rwsem needs to be taken
+ * @ptrn: locking pattern i.e whether to lock only given node or to lock
+ * node and its parent as well
+ *
+ * In case of nested locking, rwsem with lower address is acquired first.
+ *
+ * Return: void
+ */
+static inline void down_read_kernfs_rwsem(struct kernfs_node *kn,
+ enum kernfs_rwsem_lock_pattern ptrn)
+{
+ struct rw_semaphore *p_rwsem = NULL;
+ struct rw_semaphore *rwsem = kernfs_rwsem_ptr(kn);
+ int lock_parent = 0;
+
+ if (ptrn == KERNFS_RWSEM_LOCK_SELF_AND_PARENT && kn->parent)
+ lock_parent = 1;
+
+ if (lock_parent)
+ p_rwsem = kernfs_rwsem_ptr(kn->parent);
+
+ if (!lock_parent || rwsem == p_rwsem) {
+ down_read_nested(rwsem, 0);
+ kernfs_get(kn);
+ kn->unlock_parent = 0;
+ } else {
+ /**
+ * In case of nested locking, locks are taken in order of their
+ * addresses. lock with lower address is taken first, followed
+ * by lock with higher address.
+ */
+ if (rwsem < p_rwsem) {
+ down_read_nested(rwsem, 0);
+ down_read_nested(p_rwsem, 1);
+ } else {
+ down_read_nested(p_rwsem, 0);
+ down_read_nested(rwsem, 1);
+ }
+ kernfs_get(kn);
+ kernfs_get(kn->parent);
+ kn->unlock_parent = 1;
+ }
+}
+
+/**
+ * up_read_kernfs_rwsem() - Release hashed rwsem
+ *
+ * @kn: kernfs_node for which hashed rwsem was taken
+ *
+ * In case of nested locking, rwsem with higher address is released first.
+ *
+ * Return: void
+ */
+static inline void up_read_kernfs_rwsem(struct kernfs_node *kn)
+{
+ struct rw_semaphore *p_rwsem = NULL;
+ struct rw_semaphore *rwsem = kernfs_rwsem_ptr(kn);
+
+ if (kn->unlock_parent) {
+ kn->unlock_parent = 0;
+ p_rwsem = kernfs_rwsem_ptr(kn->parent);
+ if (rwsem > p_rwsem) {
+ up_read(rwsem);
+ up_read(p_rwsem);
+ } else {
+ up_read(p_rwsem);
+ up_read(rwsem);
+ }
+ kernfs_put(kn->parent);
+ } else
+ up_read(rwsem);
+
+ kernfs_put(kn);
+}
+
+static inline void kernfs_swap_rwsems(struct rw_semaphore **array, int i, int j)
+{
+ struct rw_semaphore *tmp;
+
+ tmp = array[i];
+ array[i] = array[j];
+ array[j] = tmp;
+}
+
+static inline void kernfs_sort_rwsems(struct rw_semaphore **array)
+{
+ if (array[0] > array[1])
+ kernfs_swap_rwsems(array, 0, 1);
+
+ if (array[0] > array[2])
+ kernfs_swap_rwsems(array, 0, 2);
+
+ if (array[1] > array[2])
+ kernfs_swap_rwsems(array, 1, 2);
+}
+
+/**
+ * down_write_kernfs_rwsem_rename_ns() - take hashed rwsem during
+ * rename or similar operations.
+ *
+ * @kn: kernfs_node of interest
+ * @current_parent: current parent of kernfs_node of interest
+ * @new_parent: about to become new parent of kernfs_node
+ *
+ * During rename or similar operations the parent of a node changes,
+ * and this means we will see different parents of a kernfs_node at
+ * the time of taking and releasing its or its parent's hashed rwsem.
+ * This function separately takes locks corresponding to node, and
+ * corresponding to its current and future parents (if needed).
+ *
+ * Return: void
+ */
+static inline void down_write_kernfs_rwsem_rename_ns(struct kernfs_node *kn,
+ struct kernfs_node *current_parent,
+ struct kernfs_node *new_parent)
+{
+ struct rw_semaphore *array[3];
+
+ array[0] = kernfs_rwsem_ptr(kn);
+ array[1] = kernfs_rwsem_ptr(current_parent);
+ array[2] = kernfs_rwsem_ptr(new_parent);
+
+ if (array[0] == array[1] && array[0] == array[2]) {
+ /* All 3 nodes hash to same rwsem */
+ down_write_nested(array[0], 0);
+ } else {
+ /**
+ * All 3 nodes are not hashing to the same rwsem, so sort the
+ * array.
+ */
+ kernfs_sort_rwsems(array);
+
+ if (array[0] == array[1] || array[1] == array[2]) {
+ /**
+ * Two nodes hash to same rwsem, and these
+ * will occupy consecutive places in array after
+ * sorting.
+ */
+ down_write_nested(array[0], 0);
+ down_write_nested(array[2], 1);
+ } else {
+ /* All 3 nodes hashe to different rwsems */
+ down_write_nested(array[0], 0);
+ down_write_nested(array[1], 1);
+ down_write_nested(array[2], 2);
+ }
+ }
+
+ kernfs_get(kn);
+ kernfs_get(current_parent);
+ kernfs_get(new_parent);
+}
+
+/**
+ * up_write_kernfs_rwsem_rename_ns() - release hashed rwsem during
+ * rename or similar operations.
+ *
+ * @kn: kernfs_node of interest
+ * @current_parent: current parent of kernfs_node of interest
+ * @old_parent: old parent of kernfs_node
+ *
+ * During rename or similar operations the parent of a node changes,
+ * and this means we will see different parents of a kernfs_node at
+ * the time of taking and releasing its or its parent's hashed rwsem.
+ * This function separately releases locks corresponding to node, and
+ * corresponding to its current and old parents (if needed).
+ *
+ * Return: void
+ */
+static inline void up_write_kernfs_rwsem_rename_ns(struct kernfs_node *kn,
+ struct kernfs_node *current_parent,
+ struct kernfs_node *old_parent)
+{
+ struct rw_semaphore *array[3];
+
+ array[0] = kernfs_rwsem_ptr(kn);
+ array[1] = kernfs_rwsem_ptr(current_parent);
+ array[2] = kernfs_rwsem_ptr(old_parent);
+
+ if (array[0] == array[1] && array[0] == array[2]) {
+ /* All 3 nodes hash to same rwsem */
+ up_write(array[0]);
+ } else {
+ /**
+ * All 3 nodes are not hashing to the same rwsem, so sort the
+ * array.
+ */
+ kernfs_sort_rwsems(array);
+
+ if (array[0] == array[1] || array[1] == array[2]) {
+ /**
+ * Two nodes hash to same rwsem, and these
+ * will occupy consecutive places in array after
+ * sorting.
+ */
+ up_write(array[2]);
+ up_write(array[0]);
+ } else {
+ /* All 3 nodes hashe to different rwsems */
+ up_write(array[2]);
+ up_write(array[1]);
+ up_write(array[0]);
+ }
+ }
+
+ kernfs_put(old_parent);
+ kernfs_put(current_parent);
+ kernfs_put(kn);
+}
+
+/**
+ * down_read_kernfs_rwsem_rename_ns() - take hashed rwsem during
+ * rename or similar operations.
+ *
+ * @kn: kernfs_node of interest
+ * @current_parent: current parent of kernfs_node of interest
+ * @new_parent: about to become new parent of kernfs_node
+ *
+ * During rename or similar operations the parent of a node changes,
+ * and this means we will see different parents of a kernfs_node at
+ * the time of taking and releasing its or its parent's hashed rwsem.
+ * This function separately takes locks corresponding to node, and
+ * corresponding to its current and future parents (if needed).
+ *
+ * Return: void
+ */
+static inline void down_read_kernfs_rwsem_rename_ns(struct kernfs_node *kn,
+ struct kernfs_node *current_parent,
+ struct kernfs_node *new_parent)
+{
+ struct rw_semaphore *array[3];
+
+ array[0] = kernfs_rwsem_ptr(kn);
+ array[1] = kernfs_rwsem_ptr(current_parent);
+ array[2] = kernfs_rwsem_ptr(new_parent);
+
+ if (array[0] == array[1] && array[0] == array[2]) {
+ /* All 3 nodes hash to same rwsem */
+ down_read_nested(array[0], 0);
+ } else {
+ /**
+ * All 3 nodes are not hashing to the same rwsem, so sort the
+ * array.
+ */
+ kernfs_sort_rwsems(array);
+
+ if (array[0] == array[1] || array[1] == array[2]) {
+ /**
+ * Two nodes hash to same rwsem, and these
+ * will occupy consecutive places in array after
+ * sorting.
+ */
+ down_read_nested(array[0], 0);
+ down_read_nested(array[2], 1);
+ } else {
+ /* All 3 nodes hashe to different rwsems */
+ down_read_nested(array[0], 0);
+ down_read_nested(array[1], 1);
+ down_read_nested(array[2], 2);
+ }
+ }
+
+ kernfs_get(kn);
+ kernfs_get(current_parent);
+ kernfs_get(new_parent);
+}
+
+/**
+ * up_read_kernfs_rwsem_rename_ns() - release hashed rwsem during
+ * rename or similar operations.
+ *
+ * @kn: kernfs_node of interest
+ * @current_parent: current parent of kernfs_node of interest
+ * @old_parent: old parent of kernfs_node
+ *
+ * During rename or similar operations the parent of a node changes,
+ * and this means we will see different parents of a kernfs_node at
+ * the time of taking and releasing its or its parent's hashed rwsem.
+ * This function separately releases locks corresponding to node, and
+ * corresponding to its current and old parents (if needed).
+ *
+ * Return: void
+ */
+static inline void up_read_kernfs_rwsem_rename_ns(struct kernfs_node *kn,
+ struct kernfs_node *current_parent,
+ struct kernfs_node *old_parent)
+{
+ struct rw_semaphore *array[3];
+
+ array[0] = kernfs_rwsem_ptr(kn);
+ array[1] = kernfs_rwsem_ptr(current_parent);
+ array[2] = kernfs_rwsem_ptr(old_parent);
+
+ if (array[0] == array[1] && array[0] == array[2]) {
+ /* All 3 nodes hash to same rwsem */
+ up_read(array[0]);
+ } else {
+ /**
+ * All 3 nodes are not hashing to the same rwsem, so sort the
+ * array.
+ */
+ kernfs_sort_rwsems(array);
+
+ if (array[0] == array[1] || array[1] == array[2]) {
+ /**
+ * Two nodes hash to same rwsem, and these
+ * will occupy consecutive places in array after
+ * sorting.
+ */
+ up_read(array[2]);
+ up_read(array[0]);
+ } else {
+ /* All 3 nodes hashe to different rwsems */
+ up_read(array[2]);
+ up_read(array[1]);
+ up_read(array[0]);
+ }
+ }
+
+ kernfs_put(old_parent);
+ kernfs_put(current_parent);
+ kernfs_put(kn);
+}
+
#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d35142226c340..d28f8a3eeb215 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -398,6 +398,7 @@ void __init kernfs_lock_init(void)
for (count = 0; count < NR_KERNFS_LOCKS; count++) {
mutex_init(&kernfs_locks->open_file_mutex[count].lock);
spin_lock_init(&kernfs_locks->open_node_locks[count].lock);
+ init_rwsem(&kernfs_locks->kernfs_rwsem[count]);
}
}
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 84653c609a5c0..8451f153b2291 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -96,6 +96,7 @@ struct kernfs_open_node_lock {
struct kernfs_global_locks {
struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
+ struct rw_semaphore kernfs_rwsem[NR_KERNFS_LOCKS];
};
enum kernfs_node_type {
@@ -206,6 +207,7 @@ struct kernfs_node {
*/
struct kernfs_node *parent;
const char *name;
+ u8 unlock_parent; /* release parent's rwsem */
struct rb_node rb;
--
2.30.2
Powered by blists - more mailing lists