[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4F0F6566.4050202@sandeen.net>
Date: Thu, 12 Jan 2012 16:57:42 -0600
From: Eric Sandeen <sandeen@...deen.net>
To: Jan Kara <jack@...e.cz>
CC: linux-fsdevel@...r.kernel.org, LKML <linux-kernel@...r.kernel.org>,
linux-ext4@...r.kernel.org, xfs@....sgi.com,
Dave Chinner <dchinner@...hat.com>,
Surbhi Palande <csurbhi@...il.com>,
Kamal Mostafa <kamal@...onical.com>,
Christoph Hellwig <hch@...radead.org>
Subject: Re: [PATCH 1/4] fs: Improve filesystem freezing handling
On 1/11/12 7:20 PM, Jan Kara wrote:
> Currently, exclusion between ->page_mkwrite() and filesystem freezing has been
> handled by setting page dirty and then verifying s_frozen. This guaranteed that
> either the freezing code sees the faulted page, writes it, and writeprotects it
> again or we see s_frozen set and bail out of page fault. This works to protect
> from page being marked writeable while filesystem freezing is running but has
> an unpleasant artefact of leaving dirty (although unmodified and
> writeprotected) pages on frozen filesystem. This artefact then requires
> workarounds in writeback code and other places.
>
> Also generally vfs_check_frozen() tests are racy since the filesystem can be
> frozen just after the test is performed. Thus in other write paths we can
> end up marking some pages or inodes dirty even though filesystem is already
> frozen. Again this creates problems with flusher thread hanging on frozen
> filesystem.
>
> This patch aims at providing exclusion between write paths which dirty data (we
> don't have to worry about metadata since that is handled by filesystems in
> ->freeze_fs) and filesystem freezing. We implement a writer-freeze read-write
> semaphore in the superblock. Write paths which dirty data such as
> ->block_page_mkwrite() implementations, or ->aio_write() implementations hold
> reader side of the semaphore. Filesystem freezing code holds the writer side.
> Only that we don't really want to bounce cachelines of the semaphore between
> CPUs for each write happening. So we implement the reader side of the semaphore
> as a per-cpu counter and the writer side is implemented using s_frozen
> superblock field.
>
> Signed-off-by: Jan Kara <jack@...e.cz>
> ---
> fs/super.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/fs.h | 14 ++++++
> 2 files changed, 134 insertions(+), 1 deletions(-)
>
> diff --git a/fs/super.c b/fs/super.c
> index afd0f1a..c85c64c 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -32,12 +32,15 @@
> #include <linux/backing-dev.h>
> #include <linux/rculist_bl.h>
> #include <linux/cleancache.h>
> +#include <linux/lockdep.h>
> #include "internal.h"
>
>
> LIST_HEAD(super_blocks);
> DEFINE_SPINLOCK(sb_lock);
>
> +static struct lock_class_key sb_writers_key;
> +
> /*
> * One thing we have to be careful of with a per-sb shrinker is that we don't
> * drop the last active reference to the superblock from within the shrinker.
> @@ -183,6 +186,13 @@ static struct super_block *alloc_super(struct file_system_type *type)
> s->s_shrink.seeks = DEFAULT_SEEKS;
> s->s_shrink.shrink = prune_super;
> s->s_shrink.batch = 1024;
> +
> + init_waitqueue_head(&s->s_writers_wait);
> +#ifdef CONFIG_SMP
> + s->s_page_faults = alloc_percpu(int);
isn't this s->s_writers? s->s_page_faults isn't defined anywhere.
> +#endif
> + lockdep_init_map(&s->s_writers_lock_map, "sb_writers",
> + &sb_writers_key, 0);
> }
> out:
> return s;
> @@ -1126,6 +1136,84 @@ out:
> }
>
> /**
> + * sb_start_write - drop write access to a superblock
^^^^^^^^^^^^^^
s/b sb_end_write
> + * @sb: the super we wrote to
> + *
> + * Decrement number of writers to the filesystem and wake up possible
> + * waiters wanting to freeze the filesystem.
> + */
> +void sb_end_write(struct super_block *sb)
> +{
> +#ifdef CONFIG_SMP
> + this_cpu_dec(sb->s_writers);
> +#else
> + preempt_disable();
> + sb->s_writers--;
> + preempt_enable();
> +#endif
> + /*
> + * Make sure s_writers are updated before we wake up waiters in
> + * freeze_super().
> + */
> + smp_mb();
> + if (waitqueue_active(&sb->s_writers_wait))
> + wake_up(&sb->s_writers_wait);
> + rwsem_release(&sb->s_writers_lock_map, 1, _RET_IP_);
> +}
> +
> +/**
> + * sb_start_write - get write access to a superblock
> + * @sb: the super we write to
> + *
> + * When a process wants to write data to a filesystem (i.e. dirty a page),
> + * it should embed the operation in a sb_start_write() - sb_end_write() pair
> + * to get exclusion against filesystem freezing. This function increments
> + * number of writers to the filesystem and waits if filesystem is frozen until
> + * it is thawed.
> + */
> +void sb_start_write(struct super_block *sb)
> +{
> +retry:
> + rwsem_acquire_read(&sb->s_writers_lock_map, 0, 0, _RET_IP_);
> + vfs_check_frozen(sb, SB_FREEZE_WRITE);
> +#ifdef CONFIG_SMP
> + this_cpu_inc(sb->s_writers);
> +#else
> + preempt_disable();
> + sb->s_writers++;
> + preempt_enable();
> +#endif
> + /*
> + * Make sure s_writers are updated before we check s_frozen.
> + * freeze_super() first sets s_frozen and then checks s_writers.
> + */
> + smp_mb();
> + if (sb->s_frozen != SB_UNFROZEN) {
> + sb_end_write(sb);
> + goto retry;
> + }
> +}
> +
> +/*
> + * Get number of writers to the superblock
> + */
> +static int get_writers_count(struct super_block *sb)
> +{
> + int writers;
> +#ifdef CONFIG_SMP
> + int cpu;
> +
> + writers = 0;
> + for_each_possible_cpu(cpu) {
> + writers += *per_cpu_ptr(sb->s_writers, cpu);
> + }
> +#else
> + writers = sb->s_writers;
> +#endif
> + return writers;
> +}
> +
> +/**
> * freeze_super - lock the filesystem and force it into a consistent state
> * @sb: the super to lock
> *
> @@ -1136,6 +1224,7 @@ out:
> int freeze_super(struct super_block *sb)
> {
> int ret;
> + int writers;
>
> atomic_inc(&sb->s_active);
> down_write(&sb->s_umount);
> @@ -1151,8 +1240,36 @@ int freeze_super(struct super_block *sb)
> return 0;
> }
>
> + rwsem_acquire(&sb->s_writers_lock_map, 0, 0, _THIS_IP_);
> sb->s_frozen = SB_FREEZE_WRITE;
> - smp_wmb();
> + /*
> + * Now wait for all page faults to finish. ->page_mkwrite()
> + * implementations must call vfs_check_frozen() before starting
> + * a fault so that we cannot livelock here. Because of that we
> + * are guaranteed that from this moment on new ->page_mkwrite()
> + * calls will block and we just have to wait for s_page_faults
wait for s_writers, right?
> + * to drop to zero (in a sum).
> + */
> + do {
> + DEFINE_WAIT(wait);
> +
> + /*
> + * We use a barrier in prepare_to_wait() to separate setting
> + * of s_frozen and checking of s_writers
> + */
> + prepare_to_wait(&sb->s_writers_wait, &wait,
> + TASK_UNINTERRUPTIBLE);
> + /*
> + * We must iterate over all (even offline) CPUs because of CPU
> + * hotplug their entries could still be non-zero. This is slow
> + * when lots of CPUs are configured but hey, filesystem freezing
> + * isn't exactly cheap anyway.
> + */
> + writers = get_writers_count(sb);
> + if (writers)
> + schedule();
> + finish_wait(&sb->s_writers_wait, &wait);
> + } while (writers);
>
> sync_filesystem(sb);
>
> @@ -1165,6 +1282,7 @@ int freeze_super(struct super_block *sb)
> if (ret) {
> printk(KERN_ERR
> "VFS:Filesystem freeze failed\n");
> + rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
> sb->s_frozen = SB_UNFROZEN;
> deactivate_locked_super(sb);
> return ret;
> @@ -1206,6 +1324,7 @@ int thaw_super(struct super_block *sb)
> }
>
> out:
> + rwsem_release(&sb->s_writers_lock_map, 1, _THIS_IP_);
> sb->s_frozen = SB_UNFROZEN;
> smp_wmb();
> wake_up(&sb->s_wait_unfrozen);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e313022..297b263 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -10,6 +10,7 @@
> #include <linux/ioctl.h>
> #include <linux/blk_types.h>
> #include <linux/types.h>
> +#include <linux/lockdep.h>
>
> /*
> * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
> @@ -1445,6 +1446,16 @@ struct super_block {
>
> int s_frozen;
> wait_queue_head_t s_wait_unfrozen;
> +#ifdef CONFIG_SMP
> + int __percpu *s_writers; /* counter of running writes */
> +#else
> + int s_writers; /* counter of running writes */
> +#endif
> + wait_queue_head_t s_writers_wait; /* queue for waiting for
> + writers to finish */
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lockdep_map s_writers_lock_map;
> +#endif
>
> char s_id[32]; /* Informational name */
> u8 s_uuid[16]; /* UUID */
> @@ -1501,6 +1512,9 @@ enum {
> #define vfs_check_frozen(sb, level) \
> wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
>
> +void sb_end_write(struct super_block *sb);
> +void sb_start_write(struct super_block *sb);
> +
> /*
> * until VFS tracks user namespaces for inodes, just make all files
> * belong to init_user_ns
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists