The current percpu-rwsem read side is entirely free of serializing
instructions at the cost of having a synchronize_sched() in the write
path.

The latency of the synchronize_sched() is too high for some users
(cgroups), so provide a __percpu_init_rwsem(.bias) argument to forgo
this synchronize_sched() at the cost of forcing all readers into the
slow path, which has serializing instructions.

Cc: Tejun Heo
Cc: Oleg Nesterov
Cc: Paul McKenney
Reported-by: John Stultz
Reported-by: Dmitry Shmidt
Signed-off-by: Peter Zijlstra (Intel)
---
 fs/super.c                    |  3 ++-
 include/linux/percpu-rwsem.h  | 15 +++++++++++++--
 kernel/cgroup.c               |  2 +-
 kernel/locking/percpu-rwsem.c | 10 +++++++++-
 4 files changed, 25 insertions(+), 5 deletions(-)

--- a/fs/super.c
+++ b/fs/super.c
@@ -195,7 +195,8 @@ static struct super_block *alloc_super(s
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
 		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
 					sb_writers_name[i],
-					&type->s_writers_key[i]))
+					&type->s_writers_key[i],
+					PERCPU_RWSEM_READER))
 			goto fail;
 	}
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -90,15 +90,26 @@ static inline void percpu_up_read(struct
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
+enum percpu_rwsem_bias { PERCPU_RWSEM_READER, PERCPU_RWSEM_WRITER };
+
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
-				const char *, struct lock_class_key *);
+				const char *, struct lock_class_key *,
+				enum percpu_rwsem_bias bias);
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
 #define percpu_init_rwsem(sem)					\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key,		\
+			    PERCPU_RWSEM_READER);		\
+})
+
+#define percpu_init_rwsem_writer(sem)				\
+({								\
+	static struct lock_class_key rwsem_key;			\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key,		\
+			    PERCPU_RWSEM_WRITER);		\
 })
 
 #define percpu_rwsem_is_held(sem)	lockdep_is_held(&(sem)->rw_sem)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5605,7 +5605,7 @@ int __init cgroup_init(void)
 	int ssid;
 
 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
-	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
+	BUG_ON(percpu_init_rwsem_writer(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,8 @@ enum { readers_slow, readers_block };
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-			const char *name, struct lock_class_key *rwsem_key)
+			const char *name, struct lock_class_key *rwsem_key,
+			enum percpu_rwsem_bias bias)
 {
 	sem->read_count = alloc_percpu(int);
 	if (unlikely(!sem->read_count))
 		return -ENOMEM;
@@ -19,6 +20,13 @@ int __percpu_init_rwsem(struct percpu_rw
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	if (bias == PERCPU_RWSEM_WRITER) {
+		/*
+		 * Disable rcu_sync() and force slow path.
+		 */
+		sem->rss.gp_count++;
+		sem->rss.gp_state = !0;
+	}
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 	init_waitqueue_head(&sem->writer);
 	sem->state = readers_slow;
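
For illustration only, not part of the patch: a minimal sketch of how a
write-mostly user would pick the new bias. The lock my_sem and the
my_init/my_reader/my_writer functions are hypothetical; only the
percpu-rwsem calls themselves come from the API above.

#include <linux/init.h>
#include <linux/percpu-rwsem.h>

/* Hypothetical example lock, not from this patch. */
static struct percpu_rw_semaphore my_sem;

static int __init my_init(void)
{
	/*
	 * Write-biased: rcu_sync is disabled at init time, so
	 * percpu_down_write() never waits for a synchronize_sched();
	 * in exchange, every reader takes the serializing slow path.
	 */
	return percpu_init_rwsem_writer(&my_sem);
}

static void my_reader(void)
{
	percpu_down_read(&my_sem);	/* always the slow path with this bias */
	/* ... read-side critical section ... */
	percpu_up_read(&my_sem);
}

static void my_writer(void)
{
	percpu_down_write(&my_sem);	/* no synchronize_sched() latency */
	/* ... write-side critical section ... */
	percpu_up_write(&my_sem);
}

The default PERCPU_RWSEM_READER bias keeps the current behaviour; only
write-mostly locks such as cgroup_threadgroup_rwsem should opt into
PERCPU_RWSEM_WRITER.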