Message-ID: <20150722211541.GA20017@redhat.com>
Date: Wed, 22 Jul 2015 23:15:41 +0200
From: Oleg Nesterov <oleg@...hat.com>
To: Al Viro <viro@...iv.linux.org.uk>,
Dave Chinner <david@...morbit.com>,
Dave Hansen <dave.hansen@...ux.intel.com>,
Jan Kara <jack@...e.cz>
Cc: "Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
Peter Zijlstra <peterz@...radead.org>,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH 4/4] change sb_writers to use percpu_rw_semaphore
We can remove everything from struct sb_writers except ->frozen
and add an array of percpu_rw_semaphores instead.
This patch doesn't remove sb_writers->wait_unfrozen yet; we keep
it for get_super_thawed(). We will probably remove it later.
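For reference, after this change struct sb_writers (see the
include/linux/fs.h hunk below) is reduced to:

	struct sb_writers {
		int				frozen;		/* Is sb frozen? */
		wait_queue_head_t		wait_unfrozen;	/* for get_super_thawed() */
		struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
	};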
This change tries to address the following problems:
- Firstly, __sb_start_write() looks simply buggy. It does
__sb_end_write() if it sees ->frozen, but if the task migrates
to another CPU before percpu_counter_dec(), sb_wait_write()
can wrongly succeed if there is another task which holds
the same "semaphore": sb_wait_write() can miss the result
of the previous percpu_counter_inc() but see the result
of this percpu_counter_dec(). (See the sketch after this list.)
- As Dave Hansen reports, it is suboptimal. The trivial
microbenchmark that writes to a tmpfs file in a loop runs
12% faster if we change this code to rely on RCU and kill
the memory barriers.
- This code doesn't look simple. It would be better to rely
on the generic locking code.
According to Dave, this change yields the same performance
improvement.
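To make the first point more concrete, here is a rough interleaving
of the code removed by this patch (do_sb_start_write() racing with
sb_wait_write()); the CPU numbers are only for illustration:

	/*
	 * Writer A on CPU2, already holding write access at this level:
	 *	percpu_counter_inc(&counter);	// +1 in CPU2's part
	 *
	 * Writer B on CPU0, while the sb is being frozen:
	 *	percpu_counter_inc(&counter);	// +1 in CPU0's part
	 *	smp_mb();
	 *	// sees ->frozen >= level, backs off via __sb_end_write(),
	 *	// but the task migrates to CPU1 first:
	 *	percpu_counter_dec(&counter);	// -1 in CPU1's part
	 *
	 * freeze_super() -> sb_wait_write() -> percpu_counter_sum():
	 *	the sum can read CPU0's part before B's +1 is visible and
	 *	CPU1's part after B's -1 is, so B contributes -1 overall.
	 *	This cancels A's +1, the sum reads 0, and sb_wait_write()
	 *	returns while A is still in its write section.
	 */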
Perhaps we should also clean up the usage of ->frozen. It would be
better to set/clear (say) SB_FREEZE_WRITE with the corresponding
write-lock held. Currently freeze_super() has to set SB_FREEZE_WRITE
before sb_wait_write(SB_FREEZE_WRITE) to avoid racing with itself;
we could add another state for that. The "From now on, no new normal
writers can start" comment removed by this patch was not really correct.
Note: with this change both freeze_super() and thaw_super() will do
synchronize_sched_expedited() 3 times (see the rough mapping after
the list below). This is just ugly. But:
- This will be "fixed" by the rcu_sync changes we are going
to merge. After that freeze_super()->percpu_down_write()
will use synchronize_sched(), and thaw_super() won't need to
synchronize at all.
This doesn't need any changes in fs/super.c.
- Once we merge the rcu_sync changes, we can also change super.c
so that all sb_writers->rw_sem's share a single ->rss in
struct sb_writers; then freeze_super() will need only one
synchronize_sched().
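For reference, the three grace periods on each side come from the
three per-level write locks; roughly (the one synchronize_sched_expedited()
per call reflects the current percpu_rw_semaphore implementation):

	/*
	 * freeze_super():
	 *	sb_wait_write(sb, SB_FREEZE_WRITE);	// percpu_down_write()
	 *	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);	// percpu_down_write()
	 *	sb_wait_write(sb, SB_FREEZE_FS);	// percpu_down_write()
	 *
	 * thaw_super():
	 *	sb_freeze_unlock(sb);			// 3 x percpu_up_write()
	 *
	 * and each percpu_down_write() / percpu_up_write() currently does
	 * one synchronize_sched_expedited().
	 */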
Signed-off-by: Oleg Nesterov <oleg@...hat.com>
---
fs/super.c | 113 ++++++++++++++--------------------------------------
include/linux/fs.h | 19 +++------
2 files changed, 36 insertions(+), 96 deletions(-)
diff --git a/fs/super.c b/fs/super.c
index 886fddf..29b811b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work)
int i;
for (i = 0; i < SB_FREEZE_LEVELS; i++)
- percpu_counter_destroy(&s->s_writers.counter[i]);
+ percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s);
}
@@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- if (percpu_counter_init(&s->s_writers.counter[i], 0,
- GFP_KERNEL) < 0)
+ if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+ sb_writers_name[i],
+ &type->s_writers_key[i]))
goto fail;
- lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
- &type->s_writers_key[i], 0);
}
- init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
@@ -1161,47 +1159,10 @@ out:
*/
void __sb_end_write(struct super_block *sb, int level)
{
- percpu_counter_dec(&sb->s_writers.counter[level-1]);
- /*
- * Make sure s_writers are updated before we wake up waiters in
- * freeze_super().
- */
- smp_mb();
- if (waitqueue_active(&sb->s_writers.wait))
- wake_up(&sb->s_writers.wait);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+ percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);
-static int do_sb_start_write(struct super_block *sb, int level, bool wait,
- unsigned long ip)
-{
- if (wait)
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip);
-retry:
- if (unlikely(sb->s_writers.frozen >= level)) {
- if (!wait)
- return 0;
- wait_event(sb->s_writers.wait_unfrozen,
- sb->s_writers.frozen < level);
- }
-
- percpu_counter_inc(&sb->s_writers.counter[level-1]);
- /*
- * Make sure counter is updated before we check for frozen.
- * freeze_super() first sets frozen and then checks the counter.
- */
- smp_mb();
- if (unlikely(sb->s_writers.frozen >= level)) {
- __sb_end_write(sb, level);
- goto retry;
- }
-
- if (!wait)
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip);
- return 1;
-}
-
/*
* This is an internal function, please use sb_start_{write,pagefault,intwrite}
* instead.
@@ -1209,7 +1170,7 @@ retry:
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
bool force_trylock = false;
- int ret;
+ int ret = 1;
#ifdef CONFIG_LOCKDEP
/*
@@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
int i;
for (i = 0; i < level - 1; i++)
- if (lock_is_held(&sb->s_writers.lock_map[i])) {
+ if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
force_trylock = true;
break;
}
}
#endif
- ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_);
+ if (wait && !force_trylock)
+ percpu_down_read(sb->s_writers.rw_sem + level-1);
+ else
+ ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
WARN_ON(force_trylock & !ret);
return ret;
}
@@ -1249,25 +1214,7 @@ EXPORT_SYMBOL(__sb_start_write);
*/
static void sb_wait_write(struct super_block *sb, int level)
{
- s64 writers;
-
- rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
-
- do {
- DEFINE_WAIT(wait);
- /*
- * We use a barrier in prepare_to_wait() to separate setting
- * of frozen and checking of the counter
- */
- prepare_to_wait(&sb->s_writers.wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
- writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
- if (writers)
- schedule();
-
- finish_wait(&sb->s_writers.wait, &wait);
- } while (writers);
+ percpu_down_write(sb->s_writers.rw_sem + level-1);
}
static void sb_freeze_release(struct super_block *sb)
@@ -1275,7 +1222,7 @@ static void sb_freeze_release(struct super_block *sb)
int level;
/* Avoid the warning from lockdep_sys_exit() */
for (level = 0; level < SB_FREEZE_LEVELS; ++level)
- rwsem_release(sb->s_writers.lock_map + level, 1, _THIS_IP_);
+ percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}
static void sb_freeze_acquire(struct super_block *sb)
@@ -1283,7 +1230,15 @@ static void sb_freeze_acquire(struct super_block *sb)
int level;
for (level = 0; level < SB_FREEZE_LEVELS; ++level)
- rwsem_acquire(sb->s_writers.lock_map + level, 0, 1, _THIS_IP_);
+ percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+}
+
+static void sb_freeze_unlock(struct super_block *sb)
+{
+ int level;
+
+ for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+ percpu_up_write(sb->s_writers.rw_sem + level);
}
/**
@@ -1342,20 +1297,14 @@ int freeze_super(struct super_block *sb)
return 0;
}
- /* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);
-
sb_wait_write(sb, SB_FREEZE_WRITE);
+ down_write(&sb->s_umount);
/* Now we go and block page faults... */
- down_write(&sb->s_umount);
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
- smp_wmb();
-
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
@@ -1363,7 +1312,6 @@ int freeze_super(struct super_block *sb)
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
- smp_wmb();
sb_wait_write(sb, SB_FREEZE_FS);
if (sb->s_op->freeze_fs) {
@@ -1372,9 +1320,8 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
wake_up(&sb->s_writers.wait_unfrozen);
- sb_freeze_release(sb);
deactivate_locked_super(sb);
return ret;
}
@@ -1406,8 +1353,10 @@ int thaw_super(struct super_block *sb)
return -EINVAL;
}
- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & MS_RDONLY) {
+ sb->s_writers.frozen = SB_UNFROZEN;
goto out;
+ }
sb_freeze_acquire(sb);
@@ -1422,13 +1371,11 @@ int thaw_super(struct super_block *sb)
}
}
- sb_freeze_release(sb);
-out:
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
+out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
-
return 0;
}
EXPORT_SYMBOL(thaw_super);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6addccc..fb2cb4a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1,7 +1,6 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H
-
#include <linux/linkage.h>
#include <linux/wait.h>
#include <linux/kdev_t.h>
@@ -31,6 +30,7 @@
#include <linux/percpu-rwsem.h>
#include <linux/blk_types.h>
#include <linux/workqueue.h>
+#include <linux/percpu-rwsem.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -1247,16 +1247,9 @@ enum {
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
struct sb_writers {
- /* Counters for counting writers at each level */
- struct percpu_counter counter[SB_FREEZE_LEVELS];
- wait_queue_head_t wait; /* queue for waiting for
- writers / faults to finish */
- int frozen; /* Is sb frozen? */
- wait_queue_head_t wait_unfrozen; /* queue for waiting for
- sb to be thawed */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map lock_map[SB_FREEZE_LEVELS];
-#endif
+ int frozen; /* Is sb frozen? */
+ wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */
+ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
struct super_block {
@@ -1364,9 +1357,9 @@ void __sb_end_write(struct super_block *sb, int level);
int __sb_start_write(struct super_block *sb, int level, bool wait);
#define __sb_acquire_write(sb, lev) \
- rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_)
+ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_release_write(sb, lev) \
- rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_)
+ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
/**
* sb_end_write - drop write access to a superblock
--
1.5.5.1