Message-ID: <20090527061705.GQ11363@kernel.dk>
Date: Wed, 27 May 2009 08:17:05 +0200
From: Jens Axboe <jens.axboe@...cle.com>
To: "Zhang, Yanmin" <yanmin_zhang@...ux.intel.com>
Cc: linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
chris.mason@...cle.com, david@...morbit.com, hch@...radead.org,
akpm@...ux-foundation.org, jack@...e.cz, richard@....demon.co.uk
Subject: Re: [PATCH 0/12] Per-bdi writeback flusher threads v7
On Wed, May 27 2009, Zhang, Yanmin wrote:
> On Tue, 2009-05-26 at 11:33 +0200, Jens Axboe wrote:
> > Hi,
> >
> > Here's the 7th version of the writeback patches. Changes since
> > v5/v6:
> >
> > - Move the sync_supers() to the global bdi_forker_task() thread, so we
> > don't writeback the supers from all the bdi kupdated() tasks.
> > - Make bdi_start_writeback() and bdi_writeback_all() be sync when called
> > with WB_SYNC_ALL only.
> > - Shuffle some more things around to make a cleaner series. The sync vs
> > async nature of bdi_writeback_all() and bdi_start_writeback() isn't
> > consistent through the series, but otherwise things should be sane.
> >
> > I'd appreciate it if Richard and Yanmin could re-run their testing with
> > this, just to make sure that things are sane. For ease of patching, I've
> > put the full diff here:
> >
> > http://kernel.dk/writeback-v7.patch
> I ported it to 2.6.30-rc6 with some changes in mm/page-writeback.c, so I
> could compare with the old data.
>
> See the attachment.
>
> The new test run hits the hang issue again. It seems there is still a race.
It's actually not a race, it's a deadlock on bdi_lock. If you look at
the bdi-default task, it should be stuck in the mutex slow path. I
posted this quick fix [1] yesterday but didn't test it; I'll test it
today and post a v8.
[1] http://lkml.org/lkml/2009/5/26/401
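
To spell out what I mean, here's a condensed sketch of bdi_writeback_all()
from the patch below (rough sketch only, not necessarily the exact path your
run hit; the point is that the inline wait for on-stack work happens with
bdi_lock still held, while the bdi-default/forker task needs that same lock
before it can create the flusher that would complete the work):

	void bdi_writeback_all(...)
	{
		mutex_lock(&bdi_lock);

		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
			...
			/* kmalloc failed -> on-stack work, waited for inline */
			if (work == &work_stack) {
				/* still holding bdi_lock here */
				bdi_wait_on_work_clear(work);
				...
			}
		}

		mutex_unlock(&bdi_lock);
	}

The on-stack work is only cleared once a wb thread has picked it up, but if
that thread doesn't exist yet, creating it goes through the bdi-default task,
which in turn blocks on bdi_lock, so neither side can make progress.
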
> INFO: task sync:30013 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> sync D ffffc20000011300 4736 30013 28019
> ffffffff8093e350 0000000000000086 0000000000000000 0000000000000000
> 0000000000021220 0000000000004000 0000000000011300 000000000000c868
> ffff880000016c48 ffffe20002934b30 ffff8800720b3780 ffff8800720b3b08
> Call Trace:
> [<ffffffff802c31d0>] ? bdi_sched_wait+0x0/0xd
> [<ffffffff8071e56f>] ? schedule+0x9/0x1e
> [<ffffffff802c31d9>] ? bdi_sched_wait+0x9/0xd
> [<ffffffff8071eb36>] ? __wait_on_bit+0x41/0x71
> [<ffffffff802c31d0>] ? bdi_sched_wait+0x0/0xd
> [<ffffffff8071ebd1>] ? out_of_line_wait_on_bit+0x6b/0x77
> [<ffffffff8024cc0c>] ? wake_bit_function+0x0/0x23
> [<ffffffff8022cfa1>] ? __wake_up+0x30/0x44
> [<ffffffff802c2e22>] ? bdi_writeback_all+0x20b/0x24c
> [<ffffffff802800ce>] ? pagevec_lookup_tag+0x1a/0x21
> [<ffffffff80279248>] ? wait_on_page_writeback_range+0xce/0x11b
> [<ffffffff802c2ff3>] ? generic_sync_sb_inodes+0x36/0xe1
> [<ffffffff802c3121>] ? sync_inodes_sb+0x83/0x88
> [<ffffffff802c316c>] ? __sync_inodes+0x46/0x8f
> [<ffffffff802c5d10>] ? do_sync+0x36/0x5a
> [<ffffffff802c5d56>] ? sys_sync+0xe/0x14
> [<ffffffff8020ba2b>] ? system_call_fastpath+0x16/0x1b
>
>
> diff -Nraup linux-2.6.30-rc6/block/blk-core.c linux-2.6.30-rc6_bdiflusherv7/block/blk-core.c
> --- linux-2.6.30-rc6/block/blk-core.c 2009-05-19 11:00:45.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/block/blk-core.c 2009-05-27 08:59:27.000000000 +0800
> @@ -517,6 +517,7 @@ struct request_queue *blk_alloc_queue_no
>
> q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
> q->backing_dev_info.unplug_io_data = q;
> + q->backing_dev_info.name = "block";
> err = bdi_init(&q->backing_dev_info);
> if (err) {
> kmem_cache_free(blk_requestq_cachep, q);
> diff -Nraup linux-2.6.30-rc6/drivers/block/aoe/aoeblk.c linux-2.6.30-rc6_bdiflusherv7/drivers/block/aoe/aoeblk.c
> --- linux-2.6.30-rc6/drivers/block/aoe/aoeblk.c 2009-05-19 11:00:27.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/drivers/block/aoe/aoeblk.c 2009-05-27 08:59:27.000000000 +0800
> @@ -265,6 +265,7 @@ aoeblk_gdalloc(void *vp)
> }
>
> blk_queue_make_request(&d->blkq, aoeblk_make_request);
> + d->blkq.backing_dev_info.name = "aoe";
> if (bdi_init(&d->blkq.backing_dev_info))
> goto err_mempool;
> spin_lock_irqsave(&d->lock, flags);
> diff -Nraup linux-2.6.30-rc6/drivers/char/mem.c linux-2.6.30-rc6_bdiflusherv7/drivers/char/mem.c
> --- linux-2.6.30-rc6/drivers/char/mem.c 2009-05-19 11:00:46.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/drivers/char/mem.c 2009-05-27 08:59:27.000000000 +0800
> @@ -820,6 +820,7 @@ static const struct file_operations zero
> * - permits private mappings, "copies" are taken of the source of zeros
> */
> static struct backing_dev_info zero_bdi = {
> + .name = "char/mem",
> .capabilities = BDI_CAP_MAP_COPY,
> };
>
> diff -Nraup linux-2.6.30-rc6/fs/btrfs/disk-io.c linux-2.6.30-rc6_bdiflusherv7/fs/btrfs/disk-io.c
> --- linux-2.6.30-rc6/fs/btrfs/disk-io.c 2009-05-19 11:00:56.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/btrfs/disk-io.c 2009-05-27 08:59:27.000000000 +0800
> @@ -1345,12 +1345,25 @@ static void btrfs_unplug_io_fn(struct ba
> free_extent_map(em);
> }
>
> +/*
> + * If this fails, caller must call bdi_destroy() to get rid of the
> + * bdi again.
> + */
> static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
> {
> - bdi_init(bdi);
> + int err;
> +
> + bdi->name = "btrfs";
> + bdi->capabilities = BDI_CAP_MAP_COPY;
> + err = bdi_init(bdi);
> + if (err)
> + return err;
> +
> + err = bdi_register(bdi, NULL, "btrfs");
> + if (err)
> + return err;
> +
> bdi->ra_pages = default_backing_dev_info.ra_pages;
> - bdi->state = 0;
> - bdi->capabilities = default_backing_dev_info.capabilities;
> bdi->unplug_io_fn = btrfs_unplug_io_fn;
> bdi->unplug_io_data = info;
> bdi->congested_fn = btrfs_congested_fn;
> @@ -1574,7 +1587,8 @@ struct btrfs_root *open_ctree(struct sup
> fs_info->sb = sb;
> fs_info->max_extent = (u64)-1;
> fs_info->max_inline = 8192 * 1024;
> - setup_bdi(fs_info, &fs_info->bdi);
> + if (setup_bdi(fs_info, &fs_info->bdi))
> + goto fail_bdi;
> fs_info->btree_inode = new_inode(sb);
> fs_info->btree_inode->i_ino = 1;
> fs_info->btree_inode->i_nlink = 1;
> @@ -1931,8 +1945,8 @@ fail_iput:
>
> btrfs_close_devices(fs_info->fs_devices);
> btrfs_mapping_tree_free(&fs_info->mapping_tree);
> +fail_bdi:
> bdi_destroy(&fs_info->bdi);
> -
> fail:
> kfree(extent_root);
> kfree(tree_root);
> diff -Nraup linux-2.6.30-rc6/fs/buffer.c linux-2.6.30-rc6_bdiflusherv7/fs/buffer.c
> --- linux-2.6.30-rc6/fs/buffer.c 2009-05-19 11:00:56.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/buffer.c 2009-05-27 08:59:27.000000000 +0800
> @@ -281,7 +281,7 @@ static void free_more_memory(void)
> struct zone *zone;
> int nid;
>
> - wakeup_pdflush(1024);
> + wakeup_flusher_threads(1024);
> yield();
>
> for_each_online_node(nid) {
> diff -Nraup linux-2.6.30-rc6/fs/char_dev.c linux-2.6.30-rc6_bdiflusherv7/fs/char_dev.c
> --- linux-2.6.30-rc6/fs/char_dev.c 2009-05-19 11:00:27.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/char_dev.c 2009-05-27 08:59:27.000000000 +0800
> @@ -32,6 +32,7 @@
> * - no readahead or I/O queue unplugging required
> */
> struct backing_dev_info directly_mappable_cdev_bdi = {
> + .name = "char",
> .capabilities = (
> #ifdef CONFIG_MMU
> /* permit private copies of the data to be taken */
> diff -Nraup linux-2.6.30-rc6/fs/configfs/inode.c linux-2.6.30-rc6_bdiflusherv7/fs/configfs/inode.c
> --- linux-2.6.30-rc6/fs/configfs/inode.c 2009-05-19 11:00:27.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/configfs/inode.c 2009-05-27 08:59:27.000000000 +0800
> @@ -46,6 +46,7 @@ static const struct address_space_operat
> };
>
> static struct backing_dev_info configfs_backing_dev_info = {
> + .name = "configfs",
> .ra_pages = 0, /* No readahead */
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
> };
> diff -Nraup linux-2.6.30-rc6/fs/fs-writeback.c linux-2.6.30-rc6_bdiflusherv7/fs/fs-writeback.c
> --- linux-2.6.30-rc6/fs/fs-writeback.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/fs-writeback.c 2009-05-27 08:59:27.000000000 +0800
> @@ -19,49 +19,563 @@
> #include <linux/sched.h>
> #include <linux/fs.h>
> #include <linux/mm.h>
> +#include <linux/kthread.h>
> +#include <linux/freezer.h>
> #include <linux/writeback.h>
> #include <linux/blkdev.h>
> #include <linux/backing-dev.h>
> #include <linux/buffer_head.h>
> #include "internal.h"
>
> +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
>
> -/**
> - * writeback_acquire - attempt to get exclusive writeback access to a device
> - * @bdi: the device's backing_dev_info structure
> - *
> - * It is a waste of resources to have more than one pdflush thread blocked on
> - * a single request queue. Exclusion at the request_queue level is obtained
> - * via a flag in the request_queue's backing_dev_info.state.
> - *
> - * Non-request_queue-backed address_spaces will share default_backing_dev_info,
> - * unless they implement their own. Which is somewhat inefficient, as this
> - * may prevent concurrent writeback against multiple devices.
> +/*
> + * We don't actually have pdflush, but this one is exported through /proc...
> + */
> +int nr_pdflush_threads;
> +
> +static void generic_sync_wb_inodes(struct bdi_writeback *wb,
> + struct super_block *sb,
> + struct writeback_control *wbc);
> +
> +/*
> + * Work items for the bdi_writeback threads
> */
> -static int writeback_acquire(struct backing_dev_info *bdi)
> +struct bdi_work {
> + struct list_head list;
> + struct list_head wait_list;
> + struct rcu_head rcu_head;
> +
> + unsigned long seen;
> + atomic_t pending;
> +
> + unsigned long sb_data;
> + unsigned long nr_pages;
> + enum writeback_sync_modes sync_mode;
> +
> + unsigned long state;
> +};
> +
> +static struct super_block *bdi_work_sb(struct bdi_work *work)
> +{
> + return (struct super_block *) (work->sb_data & ~1UL);
> +}
> +
> +static inline bool bdi_work_on_stack(struct bdi_work *work)
> +{
> + return work->sb_data & 1UL;
> +}
> +
> +static inline void bdi_work_init(struct bdi_work *work, struct super_block *sb,
> + unsigned long nr_pages,
> + enum writeback_sync_modes sync_mode)
> +{
> + INIT_RCU_HEAD(&work->rcu_head);
> + work->sb_data = (unsigned long) sb;
> + work->nr_pages = nr_pages;
> + work->sync_mode = sync_mode;
> + work->state = 1;
> +
> + /*
> + * state must not be reordered around the insert
> + */
> + smp_mb();
> +}
> +
> +static inline void bdi_work_init_on_stack(struct bdi_work *work,
> + struct super_block *sb,
> + unsigned long nr_pages,
> + enum writeback_sync_modes sync_mode)
> {
> - return !test_and_set_bit(BDI_pdflush, &bdi->state);
> + bdi_work_init(work, sb, nr_pages, sync_mode);
> + work->sb_data |= 1UL;
> }
>
> /**
> * writeback_in_progress - determine whether there is writeback in progress
> * @bdi: the device's backing_dev_info structure.
> *
> - * Determine whether there is writeback in progress against a backing device.
> + * Determine whether there is writeback waiting to be handled against a
> + * backing device.
> */
> int writeback_in_progress(struct backing_dev_info *bdi)
> {
> - return test_bit(BDI_pdflush, &bdi->state);
> + return !list_empty(&bdi->work_list);
> }
>
> -/**
> - * writeback_release - relinquish exclusive writeback access against a device.
> - * @bdi: the device's backing_dev_info structure
> +static void bdi_work_clear(struct bdi_work *work)
> +{
> + clear_bit(0, &work->state);
> + smp_mb__after_clear_bit();
> + wake_up_bit(&work->state, 0);
> +}
> +
> +static void bdi_work_free(struct rcu_head *head)
> +{
> + struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
> +
> + if (!bdi_work_on_stack(work))
> + kfree(work);
> + else
> + bdi_work_clear(work);
> +}
> +
> +static void wb_work_complete(struct bdi_work *work)
> +{
> + if (!bdi_work_on_stack(work)) {
> + bdi_work_clear(work);
> +
> + if (work->sync_mode == WB_SYNC_NONE)
> + call_rcu(&work->rcu_head, bdi_work_free);
> + } else
> + call_rcu(&work->rcu_head, bdi_work_free);
> +}
> +
> +static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
> +{
> + /*
> + * The caller has retrieved the work arguments from this work,
> + * drop our reference. If this is the last ref, delete and free it
> + */
> + if (atomic_dec_and_test(&work->pending)) {
> + struct backing_dev_info *bdi = wb->bdi;
> +
> + spin_lock(&bdi->wb_lock);
> + list_del_rcu(&work->list);
> + spin_unlock(&bdi->wb_lock);
> +
> + wb_work_complete(work);
> + }
> +}
> +
> +static void wb_start_writeback(struct bdi_writeback *wb, struct bdi_work *work)
> +{
> + /*
> + * If we failed allocating the bdi work item, wake up the wb thread
> + * always. As a safety precaution, it'll flush out everything
> + */
> + if (!wb_has_dirty_io(wb) && work)
> + wb_clear_pending(wb, work);
> + else
> + wake_up(&wb->wait);
> +}
> +
> +static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
> +{
> + if (work) {
> + work->seen = bdi->wb_mask;
> + atomic_set(&work->pending, bdi->wb_cnt);
> +
> + /*
> + * Make sure stores are seen before it appears on the list
> + */
> + smp_mb();
> +
> + spin_lock(&bdi->wb_lock);
> + list_add_tail_rcu(&work->list, &bdi->work_list);
> + spin_unlock(&bdi->wb_lock);
> + }
> +}
> +
> +static void bdi_sched_work(struct backing_dev_info *bdi, struct bdi_work *work)
> +{
> + if (!bdi_wblist_needs_lock(bdi))
> + wb_start_writeback(&bdi->wb, work);
> + else {
> + struct bdi_writeback *wb;
> + int idx;
> +
> + idx = srcu_read_lock(&bdi->srcu);
> +
> + list_for_each_entry_rcu(wb, &bdi->wb_list, list)
> + wb_start_writeback(wb, work);
> +
> + srcu_read_unlock(&bdi->srcu, idx);
> + }
> +}
> +
> +static void __bdi_start_work(struct backing_dev_info *bdi,
> + struct bdi_work *work)
> +{
> + /*
> + * If the default thread isn't there, make sure we add it. When
> + * it gets created and wakes up, we'll run this work.
> + */
> + if (unlikely(list_empty_careful(&bdi->wb_list)))
> + bdi_add_default_flusher_task(bdi);
> + else
> + bdi_sched_work(bdi, work);
> +}
> +
> +static void bdi_start_work(struct backing_dev_info *bdi, struct bdi_work *work)
> +{
> + /*
> + * If the default thread isn't there, make sure we add it. When
> + * it gets created and wakes up, we'll run this work.
> + */
> + if (unlikely(list_empty_careful(&bdi->wb_list))) {
> + mutex_lock(&bdi_lock);
> + bdi_add_default_flusher_task(bdi);
> + mutex_unlock(&bdi_lock);
> + } else
> + bdi_sched_work(bdi, work);
> +}
> +
> +/*
> + * Used for on-stack allocated work items. The caller needs to wait until
> + * the wb threads have acked the work before it's safe to continue.
> + */
> +static void bdi_wait_on_work_clear(struct bdi_work *work)
> +{
> + wait_on_bit(&work->state, 0, bdi_sched_wait, TASK_UNINTERRUPTIBLE);
> +}
> +
> +static struct bdi_work *bdi_alloc_work(struct super_block *sb, long nr_pages,
> + enum writeback_sync_modes sync_mode)
> +{
> + struct bdi_work *work;
> +
> + work = kmalloc(sizeof(*work), GFP_ATOMIC);
> + if (work)
> + bdi_work_init(work, sb, nr_pages, sync_mode);
> +
> + return work;
> +}
> +
> +void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
> + long nr_pages, enum writeback_sync_modes sync_mode)
> +{
> + const bool must_wait = sync_mode == WB_SYNC_ALL;
> + struct bdi_work work_stack, *work = NULL;
> +
> + if (!must_wait)
> + work = bdi_alloc_work(sb, nr_pages, sync_mode);
> +
> + if (!work) {
> + work = &work_stack;
> + bdi_work_init_on_stack(work, sb, nr_pages, sync_mode);
> + }
> +
> + bdi_queue_work(bdi, work);
> + bdi_start_work(bdi, work);
> +
> + /*
> + * If the sync mode is WB_SYNC_ALL, block waiting for the work to
> + * complete. If not, we only need to wait for the work to be started,
> + * if we allocated it on-stack. We use the same mechanism, if the
> + * wait bit is set in the bdi_work struct, then threads will not
> + * clear pending until after they are done.
> + *
> + * Note that work == &work_stack if must_wait is true, but that
> + * is implementation detail and we make it explicit here for
> + * ease of reading.
> + */
> + if (work == &work_stack || must_wait) {
> + bdi_wait_on_work_clear(work);
> + if (must_wait)
> + call_rcu(&work->rcu_head, bdi_work_free);
> + }
> +}
> +
> +/*
> + * The maximum number of pages to writeout in a single bdi flush/kupdate
> + * operation. We do this so we don't hold I_SYNC against an inode for
> + * enormous amounts of time, which would block a userspace task which has
> + * been forced to throttle against that inode. Also, the code reevaluates
> + * the dirty each time it has written this many pages.
> + */
> +#define MAX_WRITEBACK_PAGES 1024
> +
> +/*
> + * Periodic writeback of "old" data.
> + *
> + * Define "old": the first time one of an inode's pages is dirtied, we mark the
> + * dirtying-time in the inode's address_space. So this periodic writeback code
> + * just walks the superblock inode list, writing back any inodes which are
> + * older than a specific point in time.
> + *
> + * Try to run once per dirty_writeback_interval. But if a writeback event
> + * takes longer than a dirty_writeback_interval interval, then leave a
> + * one-second gap.
> + *
> + * older_than_this takes precedence over nr_to_write. So we'll only write back
> + * all dirty pages if they are all attached to "old" mappings.
> + */
> +static long wb_kupdated(struct bdi_writeback *wb)
> +{
> + unsigned long oldest_jif;
> + long nr_to_write, wrote = 0;
> + struct writeback_control wbc = {
> + .bdi = wb->bdi,
> + .sync_mode = WB_SYNC_NONE,
> + .older_than_this = &oldest_jif,
> + .nr_to_write = 0,
> + .for_kupdate = 1,
> + .range_cyclic = 1,
> + };
> +
> + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
> +
> + nr_to_write = global_page_state(NR_FILE_DIRTY) +
> + global_page_state(NR_UNSTABLE_NFS) +
> + (inodes_stat.nr_inodes - inodes_stat.nr_unused);
> +
> + while (nr_to_write > 0) {
> + wbc.more_io = 0;
> + wbc.encountered_congestion = 0;
> + wbc.nr_to_write = MAX_WRITEBACK_PAGES;
> + generic_sync_wb_inodes(wb, NULL, &wbc);
> + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> + if (wbc.nr_to_write > 0)
> + break; /* All the old data is written */
> + nr_to_write -= MAX_WRITEBACK_PAGES;
> + }
> +
> + return wrote;
> +}
> +
> +static inline bool over_bground_thresh(void)
> +{
> + unsigned long background_thresh, dirty_thresh;
> +
> + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> +
> + return (global_page_state(NR_FILE_DIRTY) +
> + global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
> +}
> +
> +static long __wb_writeback(struct bdi_writeback *wb, long nr_pages,
> + struct super_block *sb,
> + enum writeback_sync_modes sync_mode)
> +{
> + struct writeback_control wbc = {
> + .bdi = wb->bdi,
> + .sync_mode = sync_mode,
> + .older_than_this = NULL,
> + .range_cyclic = 1,
> + };
> + long wrote = 0;
> +
> + for (;;) {
> + if (sync_mode == WB_SYNC_NONE && nr_pages <= 0 &&
> + !over_bground_thresh())
> + break;
> +
> + wbc.more_io = 0;
> + wbc.encountered_congestion = 0;
> + wbc.nr_to_write = MAX_WRITEBACK_PAGES;
> + wbc.pages_skipped = 0;
> + generic_sync_wb_inodes(wb, sb, &wbc);
> + nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> + /*
> + * If we ran out of stuff to write, bail unless more_io got set
> + */
> + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> + if (wbc.more_io)
> + continue;
> + break;
> + }
> + }
> +
> + return wrote;
> +}
> +
> +/*
> + * Return the next bdi_work struct that hasn't been processed by this
> + * wb thread yet
> + */
> +static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
> + struct bdi_writeback *wb)
> +{
> + struct bdi_work *work, *ret = NULL;
> +
> + rcu_read_lock();
> +
> + list_for_each_entry_rcu(work, &bdi->work_list, list) {
> + if (!test_and_clear_bit(wb->nr, &work->seen))
> + continue;
> +
> + ret = work;
> + break;
> + }
> +
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +/*
> + * Retrieve work items and do the writeback they describe
> + */
> +static long wb_writeback(struct bdi_writeback *wb)
> +{
> + struct backing_dev_info *bdi = wb->bdi;
> + struct bdi_work *work;
> + long wrote = 0;
> +
> + while ((work = get_next_work_item(bdi, wb)) != NULL) {
> + struct super_block *sb = bdi_work_sb(work);
> + long nr_pages = work->nr_pages;
> + enum writeback_sync_modes sync_mode = work->sync_mode;
> +
> + /*
> + * If this isn't a data integrity operation, just notify
> + * that we have seen this work and we are now starting it.
> + */
> + if (sync_mode == WB_SYNC_NONE)
> + wb_clear_pending(wb, work);
> +
> + wrote += __wb_writeback(wb, nr_pages, sb, sync_mode);
> +
> + /*
> + * This is a data integrity writeback, so only do the
> + * notification when we have completed the work.
> + */
> + if (sync_mode == WB_SYNC_ALL)
> + wb_clear_pending(wb, work);
> + }
> +
> + return wrote;
> +}
> +
> +/*
> + * This will be inlined in bdi_writeback_task() once we get rid of any
> + * dirty inodes on the default_backing_dev_info
> + */
> +long wb_do_writeback(struct bdi_writeback *wb)
> +{
> + long wrote;
> +
> + /*
> + * We get here in two cases:
> + *
> + * schedule_timeout() returned because the dirty writeback
> + * interval has elapsed. If that happens, the work item list
> + * will be empty and we will proceed to do kupdated style writeout.
> + *
> + * Someone called bdi_start_writeback(), which put one/more work
> + * items on the work_list. Process those.
> + */
> + if (list_empty(&wb->bdi->work_list))
> + wrote = wb_kupdated(wb);
> + else
> + wrote = wb_writeback(wb);
> +
> + return wrote;
> +}
> +
> +/*
> + * Handle writeback of dirty data for the device backed by this bdi. Also
> + * wakes up periodically and does kupdated style flushing.
> */
> -static void writeback_release(struct backing_dev_info *bdi)
> +int bdi_writeback_task(struct bdi_writeback *wb)
> {
> - BUG_ON(!writeback_in_progress(bdi));
> - clear_bit(BDI_pdflush, &bdi->state);
> + unsigned long last_active = jiffies;
> + unsigned long wait_jiffies = -1UL;
> + long pages_written;
> + DEFINE_WAIT(wait);
> +
> + while (!kthread_should_stop()) {
> +
> + pages_written = wb_do_writeback(wb);
> +
> + if (pages_written)
> + last_active = jiffies;
> + else if (wait_jiffies != -1UL) {
> + unsigned long max_idle;
> +
> + /*
> + * Longest period of inactivity that we tolerate. If we
> + * see dirty data again later, the task will get
> + * recreated automatically.
> + */
> + max_idle = max(5UL * 60 * HZ, wait_jiffies);
> + if (time_after(jiffies, max_idle + last_active) &&
> + wb_is_default_task(wb))
> + break;
> + }
> +
> + prepare_to_wait(&wb->wait, &wait, TASK_INTERRUPTIBLE);
> + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
> + schedule_timeout(wait_jiffies);
> + try_to_freeze();
> + }
> +
> + finish_wait(&wb->wait, &wait);
> + return 0;
> +}
> +
> +/*
> + * Schedule writeback for all backing devices. Expensive! If this is a data
> + * integrity operation, writeback will be complete when this returns. If
> + * we are simply called for WB_SYNC_NONE, then writeback will merely be
> + * scheduled to run.
> + */
> +void bdi_writeback_all(struct super_block *sb, long nr_pages,
> + enum writeback_sync_modes sync_mode)
> +{
> + const bool must_wait = sync_mode == WB_SYNC_ALL;
> + struct backing_dev_info *bdi, *tmp;
> + struct bdi_work *work;
> + LIST_HEAD(list);
> +
> + mutex_lock(&bdi_lock);
> +
> + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
> + struct bdi_work *work, work_stack;
> +
> + if (!bdi_has_dirty_io(bdi))
> + continue;
> +
> + work = bdi_alloc_work(sb, nr_pages, sync_mode);
> + if (!work) {
> + work = &work_stack;
> + bdi_work_init_on_stack(work, sb, nr_pages, sync_mode);
> + } else if (must_wait)
> + list_add_tail(&work->wait_list, &list);
> +
> + bdi_queue_work(bdi, work);
> + __bdi_start_work(bdi, work);
> +
> + /*
> + * Do the wait inline if this came from the stack. This
> + * only happens if we ran out of memory, so should very
> + * rarely trigger.
> + */
> + if (work == &work_stack) {
> + bdi_wait_on_work_clear(work);
> + if (must_wait)
> + call_rcu(&work->rcu_head, bdi_work_free);
> + }
> + }
> +
> + mutex_unlock(&bdi_lock);
> +
> + /*
> + * If this is for WB_SYNC_ALL, wait for pending work to complete
> + * before returning.
> + */
> + while (!list_empty(&list)) {
> + work = list_entry(list.next, struct bdi_work, wait_list);
> + list_del(&work->wait_list);
> + bdi_wait_on_work_clear(work);
> + call_rcu(&work->rcu_head, bdi_work_free);
> + }
> +}
> +
> +/*
> + * If the filesystem didn't provide a way to map an inode to a dedicated
> + * flusher thread, it doesn't support more than 1 thread. So we know it's
> + * the default thread, return that.
> + */
> +static inline struct bdi_writeback *inode_get_wb(struct inode *inode)
> +{
> + const struct super_operations *sop = inode->i_sb->s_op;
> +
> + if (!sop->inode_get_wb)
> + return &inode_to_bdi(inode)->wb;
> +
> + return sop->inode_get_wb(inode);
> }
>
> /**
> @@ -158,12 +672,21 @@ void __mark_inode_dirty(struct inode *in
> goto out;
>
> /*
> - * If the inode was already on s_dirty/s_io/s_more_io, don't
> - * reposition it (that would break s_dirty time-ordering).
> + * If the inode was already on b_dirty/b_io/b_more_io, don't
> + * reposition it (that would break b_dirty time-ordering).
> */
> if (!was_dirty) {
> + struct bdi_writeback *wb = inode_get_wb(inode);
> + struct backing_dev_info *bdi = wb->bdi;
> +
> + if (bdi_cap_writeback_dirty(bdi) &&
> + !test_bit(BDI_registered, &bdi->state)) {
> + WARN_ON(1);
> + printk("bdi-%s not registered\n", bdi->name);
> + }
> +
> inode->dirtied_when = jiffies;
> - list_move(&inode->i_list, &sb->s_dirty);
> + list_move(&inode->i_list, &wb->b_dirty);
> }
> }
> out:
> @@ -184,31 +707,32 @@ static int write_inode(struct inode *ino
> * furthest end of its superblock's dirty-inode list.
> *
> * Before stamping the inode's ->dirtied_when, we check to see whether it is
> - * already the most-recently-dirtied inode on the s_dirty list. If that is
> + * already the most-recently-dirtied inode on the b_dirty list. If that is
> * the case then the inode must have been redirtied while it was being written
> * out and we don't reset its dirtied_when.
> */
> static void redirty_tail(struct inode *inode)
> {
> - struct super_block *sb = inode->i_sb;
> + struct bdi_writeback *wb = inode_get_wb(inode);
>
> - if (!list_empty(&sb->s_dirty)) {
> - struct inode *tail_inode;
> + if (!list_empty(&wb->b_dirty)) {
> + struct inode *tail;
>
> - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
> - if (time_before(inode->dirtied_when,
> - tail_inode->dirtied_when))
> + tail = list_entry(wb->b_dirty.next, struct inode, i_list);
> + if (time_before(inode->dirtied_when, tail->dirtied_when))
> inode->dirtied_when = jiffies;
> }
> - list_move(&inode->i_list, &sb->s_dirty);
> + list_move(&inode->i_list, &wb->b_dirty);
> }
>
> /*
> - * requeue inode for re-scanning after sb->s_io list is exhausted.
> + * requeue inode for re-scanning after bdi->b_io list is exhausted.
> */
> static void requeue_io(struct inode *inode)
> {
> - list_move(&inode->i_list, &inode->i_sb->s_more_io);
> + struct bdi_writeback *wb = inode_get_wb(inode);
> +
> + list_move(&inode->i_list, &wb->b_more_io);
> }
>
> static void inode_sync_complete(struct inode *inode)
> @@ -255,20 +779,11 @@ static void move_expired_inodes(struct l
> /*
> * Queue all expired dirty inodes for io, eldest first.
> */
> -static void queue_io(struct super_block *sb,
> - unsigned long *older_than_this)
> -{
> - list_splice_init(&sb->s_more_io, sb->s_io.prev);
> - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
> -}
> -
> -int sb_has_dirty_inodes(struct super_block *sb)
> +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
> {
> - return !list_empty(&sb->s_dirty) ||
> - !list_empty(&sb->s_io) ||
> - !list_empty(&sb->s_more_io);
> + list_splice_init(&wb->b_more_io, wb->b_io.prev);
> + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
> }
> -EXPORT_SYMBOL(sb_has_dirty_inodes);
>
> /*
> * Write a single inode's dirty pages and inode data out to disk.
> @@ -322,11 +837,11 @@ __sync_single_inode(struct inode *inode,
> /*
> * We didn't write back all the pages. nfs_writepages()
> * sometimes bales out without doing anything. Redirty
> - * the inode; Move it from s_io onto s_more_io/s_dirty.
> + * the inode; Move it from b_io onto b_more_io/b_dirty.
> */
> /*
> * akpm: if the caller was the kupdate function we put
> - * this inode at the head of s_dirty so it gets first
> + * this inode at the head of b_dirty so it gets first
> * consideration. Otherwise, move it to the tail, for
> * the reasons described there. I'm not really sure
> * how much sense this makes. Presumably I had a good
> @@ -336,7 +851,7 @@ __sync_single_inode(struct inode *inode,
> if (wbc->for_kupdate) {
> /*
> * For the kupdate function we move the inode
> - * to s_more_io so it will get more writeout as
> + * to b_more_io so it will get more writeout as
> * soon as the queue becomes uncongested.
> */
> inode->i_state |= I_DIRTY_PAGES;
> @@ -402,10 +917,10 @@ __writeback_single_inode(struct inode *i
> if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) {
> /*
> * We're skipping this inode because it's locked, and we're not
> - * doing writeback-for-data-integrity. Move it to s_more_io so
> - * that writeback can proceed with the other inodes on s_io.
> + * doing writeback-for-data-integrity. Move it to b_more_io so
> + * that writeback can proceed with the other inodes on b_io.
> * We'll have another go at writing back this inode when we
> - * completed a full scan of s_io.
> + * completed a full scan of b_io.
> */
> requeue_io(inode);
> return 0;
> @@ -428,51 +943,34 @@ __writeback_single_inode(struct inode *i
> return __sync_single_inode(inode, wbc);
> }
>
> -/*
> - * Write out a superblock's list of dirty inodes. A wait will be performed
> - * upon no inodes, all inodes or the final one, depending upon sync_mode.
> - *
> - * If older_than_this is non-NULL, then only write out inodes which
> - * had their first dirtying at a time earlier than *older_than_this.
> - *
> - * If we're a pdflush thread, then implement pdflush collision avoidance
> - * against the entire list.
> - *
> - * If `bdi' is non-zero then we're being asked to writeback a specific queue.
> - * This function assumes that the blockdev superblock's inodes are backed by
> - * a variety of queues, so all inodes are searched. For other superblocks,
> - * assume that all inodes are backed by the same queue.
> - *
> - * FIXME: this linear search could get expensive with many fileystems. But
> - * how to fix? We need to go from an address_space to all inodes which share
> - * a queue with that address_space. (Easy: have a global "dirty superblocks"
> - * list).
> - *
> - * The inodes to be written are parked on sb->s_io. They are moved back onto
> - * sb->s_dirty as they are selected for writing. This way, none can be missed
> - * on the writer throttling path, and we get decent balancing between many
> - * throttled threads: we don't want them all piling up on inode_sync_wait.
> - */
> -void generic_sync_sb_inodes(struct super_block *sb,
> - struct writeback_control *wbc)
> +static void generic_sync_wb_inodes(struct bdi_writeback *wb,
> + struct super_block *sb,
> + struct writeback_control *wbc)
> {
> + const int is_blkdev_sb = sb_is_blkdev_sb(sb);
> const unsigned long start = jiffies; /* livelock avoidance */
> - int sync = wbc->sync_mode == WB_SYNC_ALL;
>
> spin_lock(&inode_lock);
> - if (!wbc->for_kupdate || list_empty(&sb->s_io))
> - queue_io(sb, wbc->older_than_this);
>
> - while (!list_empty(&sb->s_io)) {
> - struct inode *inode = list_entry(sb->s_io.prev,
> + if (!wbc->for_kupdate || list_empty(&wb->b_io))
> + queue_io(wb, wbc->older_than_this);
> +
> + while (!list_empty(&wb->b_io)) {
> + struct inode *inode = list_entry(wb->b_io.prev,
> struct inode, i_list);
> - struct address_space *mapping = inode->i_mapping;
> - struct backing_dev_info *bdi = mapping->backing_dev_info;
> long pages_skipped;
>
> - if (!bdi_cap_writeback_dirty(bdi)) {
> + /*
> + * super block given and doesn't match, skip this inode
> + */
> + if (sb && sb != inode->i_sb) {
> + redirty_tail(inode);
> + continue;
> + }
> +
> + if (!bdi_cap_writeback_dirty(wb->bdi)) {
> redirty_tail(inode);
> - if (sb_is_blkdev_sb(sb)) {
> + if (is_blkdev_sb) {
> /*
> * Dirty memory-backed blockdev: the ramdisk
> * driver does this. Skip just this inode
> @@ -492,21 +990,14 @@ void generic_sync_sb_inodes(struct super
> continue;
> }
>
> - if (wbc->nonblocking && bdi_write_congested(bdi)) {
> + if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
> wbc->encountered_congestion = 1;
> - if (!sb_is_blkdev_sb(sb))
> + if (!is_blkdev_sb)
> break; /* Skip a congested fs */
> requeue_io(inode);
> continue; /* Skip a congested blockdev */
> }
>
> - if (wbc->bdi && bdi != wbc->bdi) {
> - if (!sb_is_blkdev_sb(sb))
> - break; /* fs has the wrong queue */
> - requeue_io(inode);
> - continue; /* blockdev has wrong queue */
> - }
> -
> /*
> * Was this inode dirtied after sync_sb_inodes was called?
> * This keeps sync from extra jobs and livelock.
> @@ -514,16 +1005,10 @@ void generic_sync_sb_inodes(struct super
> if (inode_dirtied_after(inode, start))
> break;
>
> - /* Is another pdflush already flushing this queue? */
> - if (current_is_pdflush() && !writeback_acquire(bdi))
> - break;
> -
> BUG_ON(inode->i_state & I_FREEING);
> __iget(inode);
> pages_skipped = wbc->pages_skipped;
> __writeback_single_inode(inode, wbc);
> - if (current_is_pdflush())
> - writeback_release(bdi);
> if (wbc->pages_skipped != pages_skipped) {
> /*
> * writeback is not making progress due to locked
> @@ -539,13 +1024,71 @@ void generic_sync_sb_inodes(struct super
> wbc->more_io = 1;
> break;
> }
> - if (!list_empty(&sb->s_more_io))
> + if (!list_empty(&wb->b_more_io))
> wbc->more_io = 1;
> }
>
> - if (sync) {
> + spin_unlock(&inode_lock);
> + /* Leave any unwritten inodes on b_io */
> +}
> +
> +void generic_sync_bdi_inodes(struct super_block *sb,
> + struct writeback_control *wbc)
> +{
> + struct backing_dev_info *bdi = wbc->bdi;
> + struct bdi_writeback *wb;
> +
> + /*
> + * Common case is just a single wb thread and that is embedded in
> + * the bdi, so it doesn't need locking
> + */
> + if (!bdi_wblist_needs_lock(bdi))
> + generic_sync_wb_inodes(&bdi->wb, sb, wbc);
> + else {
> + int idx;
> +
> + idx = srcu_read_lock(&bdi->srcu);
> +
> + list_for_each_entry_rcu(wb, &bdi->wb_list, list)
> + generic_sync_wb_inodes(wb, sb, wbc);
> +
> + srcu_read_unlock(&bdi->srcu, idx);
> + }
> +}
> +
> +/*
> + * Write out a superblock's list of dirty inodes. A wait will be performed
> + * upon no inodes, all inodes or the final one, depending upon sync_mode.
> + *
> + * If older_than_this is non-NULL, then only write out inodes which
> + * had their first dirtying at a time earlier than *older_than_this.
> + *
> + * If we're a pdflush thread, then implement pdflush collision avoidance
> + * against the entire list.
> + *
> + * If `bdi' is non-zero then we're being asked to writeback a specific queue.
> + * This function assumes that the blockdev superblock's inodes are backed by
> + * a variety of queues, so all inodes are searched. For other superblocks,
> + * assume that all inodes are backed by the same queue.
> + *
> + * The inodes to be written are parked on bdi->b_io. They are moved back onto
> + * bdi->b_dirty as they are selected for writing. This way, none can be missed
> + * on the writer throttling path, and we get decent balancing between many
> + * throttled threads: we don't want them all piling up on inode_sync_wait.
> + */
> +void generic_sync_sb_inodes(struct super_block *sb,
> + struct writeback_control *wbc)
> +{
> + if (wbc->bdi)
> + bdi_start_writeback(wbc->bdi, sb, wbc->nr_to_write, wbc->sync_mode);
> + else
> + bdi_writeback_all(sb, wbc->nr_to_write, wbc->sync_mode);
> +
> + if (wbc->sync_mode == WB_SYNC_ALL) {
> struct inode *inode, *old_inode = NULL;
>
> + spin_lock(&inode_lock);
> +
> /*
> * Data integrity sync. Must wait for all pages under writeback,
> * because there may have been pages dirtied before our sync
> @@ -583,10 +1126,8 @@ void generic_sync_sb_inodes(struct super
> }
> spin_unlock(&inode_lock);
> iput(old_inode);
> - } else
> - spin_unlock(&inode_lock);
> + }
>
> - return; /* Leave any unwritten inodes on s_io */
> }
> EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
>
> @@ -597,58 +1138,6 @@ static void sync_sb_inodes(struct super_
> }
>
> /*
> - * Start writeback of dirty pagecache data against all unlocked inodes.
> - *
> - * Note:
> - * We don't need to grab a reference to superblock here. If it has non-empty
> - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
> - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
> - * empty. Since __sync_single_inode() regains inode_lock before it finally moves
> - * inode from superblock lists we are OK.
> - *
> - * If `older_than_this' is non-zero then only flush inodes which have a
> - * flushtime older than *older_than_this.
> - *
> - * If `bdi' is non-zero then we will scan the first inode against each
> - * superblock until we find the matching ones. One group will be the dirty
> - * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
> - * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
> - * super-efficient but we're about to do a ton of I/O...
> - */
> -void
> -writeback_inodes(struct writeback_control *wbc)
> -{
> - struct super_block *sb;
> -
> - might_sleep();
> - spin_lock(&sb_lock);
> -restart:
> - list_for_each_entry_reverse(sb, &super_blocks, s_list) {
> - if (sb_has_dirty_inodes(sb)) {
> - /* we're making our own get_super here */
> - sb->s_count++;
> - spin_unlock(&sb_lock);
> - /*
> - * If we can't get the readlock, there's no sense in
> - * waiting around, most of the time the FS is going to
> - * be unmounted by the time it is released.
> - */
> - if (down_read_trylock(&sb->s_umount)) {
> - if (sb->s_root)
> - sync_sb_inodes(sb, wbc);
> - up_read(&sb->s_umount);
> - }
> - spin_lock(&sb_lock);
> - if (__put_super_and_need_restart(sb))
> - goto restart;
> - }
> - if (wbc->nr_to_write <= 0)
> - break;
> - }
> - spin_unlock(&sb_lock);
> -}
> -
> -/*
> * writeback and wait upon the filesystem's dirty inodes. The caller will
> * do this in two passes - one to write, and one to wait.
> *
> diff -Nraup linux-2.6.30-rc6/fs/fuse/inode.c linux-2.6.30-rc6_bdiflusherv7/fs/fuse/inode.c
> --- linux-2.6.30-rc6/fs/fuse/inode.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/fuse/inode.c 2009-05-27 08:59:27.000000000 +0800
> @@ -484,6 +484,7 @@ int fuse_conn_init(struct fuse_conn *fc,
> INIT_LIST_HEAD(&fc->bg_queue);
> INIT_LIST_HEAD(&fc->entry);
> atomic_set(&fc->num_waiting, 0);
> + fc->bdi.name = "fuse";
> fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
> fc->bdi.unplug_io_fn = default_unplug_io_fn;
> /* fuse does it's own writeback accounting */
> diff -Nraup linux-2.6.30-rc6/fs/hugetlbfs/inode.c linux-2.6.30-rc6_bdiflusherv7/fs/hugetlbfs/inode.c
> --- linux-2.6.30-rc6/fs/hugetlbfs/inode.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/hugetlbfs/inode.c 2009-05-27 08:59:27.000000000 +0800
> @@ -43,6 +43,7 @@ static const struct inode_operations hug
> static const struct inode_operations hugetlbfs_inode_operations;
>
> static struct backing_dev_info hugetlbfs_backing_dev_info = {
> + .name = "hugetlbfs",
> .ra_pages = 0, /* No readahead */
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
> };
> diff -Nraup linux-2.6.30-rc6/fs/nfs/client.c linux-2.6.30-rc6_bdiflusherv7/fs/nfs/client.c
> --- linux-2.6.30-rc6/fs/nfs/client.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/nfs/client.c 2009-05-27 08:59:27.000000000 +0800
> @@ -836,6 +836,7 @@ static void nfs_server_set_fsinfo(struct
> server->rsize = NFS_MAX_FILE_IO_SIZE;
> server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>
> + server->backing_dev_info.name = "nfs";
> server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
>
> if (server->wsize > max_rpc_payload)
> diff -Nraup linux-2.6.30-rc6/fs/ntfs/super.c linux-2.6.30-rc6_bdiflusherv7/fs/ntfs/super.c
> --- linux-2.6.30-rc6/fs/ntfs/super.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/ntfs/super.c 2009-05-27 08:59:27.000000000 +0800
> @@ -2373,39 +2373,12 @@ static void ntfs_put_super(struct super_
> vol->mftmirr_ino = NULL;
> }
> /*
> - * If any dirty inodes are left, throw away all mft data page cache
> - * pages to allow a clean umount. This should never happen any more
> - * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
> - * the underlying mft records are written out and cleaned. If it does,
> - * happen anyway, we want to know...
> + * We should have no dirty inodes left, due to
> + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
> + * the underlying mft records are written out and cleaned.
> */
> ntfs_commit_inode(vol->mft_ino);
> write_inode_now(vol->mft_ino, 1);
> - if (sb_has_dirty_inodes(sb)) {
> - const char *s1, *s2;
> -
> - mutex_lock(&vol->mft_ino->i_mutex);
> - truncate_inode_pages(vol->mft_ino->i_mapping, 0);
> - mutex_unlock(&vol->mft_ino->i_mutex);
> - write_inode_now(vol->mft_ino, 1);
> - if (sb_has_dirty_inodes(sb)) {
> - static const char *_s1 = "inodes";
> - static const char *_s2 = "";
> - s1 = _s1;
> - s2 = _s2;
> - } else {
> - static const char *_s1 = "mft pages";
> - static const char *_s2 = "They have been thrown "
> - "away. ";
> - s1 = _s1;
> - s2 = _s2;
> - }
> - ntfs_error(sb, "Dirty %s found at umount time. %sYou should "
> - "run chkdsk. Please email "
> - "linux-ntfs-dev@...ts.sourceforge.net and say "
> - "that you saw this message. Thank you.", s1,
> - s2);
> - }
> #endif /* NTFS_RW */
>
> iput(vol->mft_ino);
> diff -Nraup linux-2.6.30-rc6/fs/ocfs2/dlm/dlmfs.c linux-2.6.30-rc6_bdiflusherv7/fs/ocfs2/dlm/dlmfs.c
> --- linux-2.6.30-rc6/fs/ocfs2/dlm/dlmfs.c 2009-05-19 11:00:27.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/ocfs2/dlm/dlmfs.c 2009-05-27 08:59:27.000000000 +0800
> @@ -325,6 +325,7 @@ clear_fields:
> }
>
> static struct backing_dev_info dlmfs_backing_dev_info = {
> + .name = "ocfs2-dlmfs",
> .ra_pages = 0, /* No readahead */
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
> };
> diff -Nraup linux-2.6.30-rc6/fs/ramfs/inode.c linux-2.6.30-rc6_bdiflusherv7/fs/ramfs/inode.c
> --- linux-2.6.30-rc6/fs/ramfs/inode.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/ramfs/inode.c 2009-05-27 08:59:27.000000000 +0800
> @@ -46,6 +46,7 @@ static const struct super_operations ram
> static const struct inode_operations ramfs_dir_inode_operations;
>
> static struct backing_dev_info ramfs_backing_dev_info = {
> + .name = "ramfs",
> .ra_pages = 0, /* No readahead */
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
> BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
> diff -Nraup linux-2.6.30-rc6/fs/super.c linux-2.6.30-rc6_bdiflusherv7/fs/super.c
> --- linux-2.6.30-rc6/fs/super.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/super.c 2009-05-27 08:59:27.000000000 +0800
> @@ -64,9 +64,6 @@ static struct super_block *alloc_super(s
> s = NULL;
> goto out;
> }
> - INIT_LIST_HEAD(&s->s_dirty);
> - INIT_LIST_HEAD(&s->s_io);
> - INIT_LIST_HEAD(&s->s_more_io);
> INIT_LIST_HEAD(&s->s_files);
> INIT_LIST_HEAD(&s->s_instances);
> INIT_HLIST_HEAD(&s->s_anon);
> diff -Nraup linux-2.6.30-rc6/fs/sync.c linux-2.6.30-rc6_bdiflusherv7/fs/sync.c
> --- linux-2.6.30-rc6/fs/sync.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/sync.c 2009-05-27 08:59:27.000000000 +0800
> @@ -23,7 +23,7 @@
> */
> static void do_sync(unsigned long wait)
> {
> - wakeup_pdflush(0);
> + wakeup_flusher_threads(0);
> sync_inodes(0); /* All mappings, inodes and their blockdevs */
> vfs_dq_sync(NULL);
> sync_supers(); /* Write the superblocks */
> diff -Nraup linux-2.6.30-rc6/fs/sysfs/inode.c linux-2.6.30-rc6_bdiflusherv7/fs/sysfs/inode.c
> --- linux-2.6.30-rc6/fs/sysfs/inode.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/sysfs/inode.c 2009-05-27 08:59:27.000000000 +0800
> @@ -29,6 +29,7 @@ static const struct address_space_operat
> };
>
> static struct backing_dev_info sysfs_backing_dev_info = {
> + .name = "sysfs",
> .ra_pages = 0, /* No readahead */
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
> };
> diff -Nraup linux-2.6.30-rc6/fs/ubifs/super.c linux-2.6.30-rc6_bdiflusherv7/fs/ubifs/super.c
> --- linux-2.6.30-rc6/fs/ubifs/super.c 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/fs/ubifs/super.c 2009-05-27 08:59:27.000000000 +0800
> @@ -1923,6 +1923,7 @@ static int ubifs_fill_super(struct super
> *
> * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
> */
> + c->bdi.name = "ubifs",
> c->bdi.capabilities = BDI_CAP_MAP_COPY;
> c->bdi.unplug_io_fn = default_unplug_io_fn;
> err = bdi_init(&c->bdi);
> diff -Nraup linux-2.6.30-rc6/include/linux/backing-dev.h linux-2.6.30-rc6_bdiflusherv7/include/linux/backing-dev.h
> --- linux-2.6.30-rc6/include/linux/backing-dev.h 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/include/linux/backing-dev.h 2009-05-27 08:59:27.000000000 +0800
> @@ -13,6 +13,9 @@
> #include <linux/proportions.h>
> #include <linux/kernel.h>
> #include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/srcu.h>
> +#include <linux/writeback.h>
> #include <asm/atomic.h>
>
> struct page;
> @@ -23,9 +26,12 @@ struct dentry;
> * Bits in backing_dev_info.state
> */
> enum bdi_state {
> - BDI_pdflush, /* A pdflush thread is working this device */
> + BDI_pending, /* On its way to being activated */
> + BDI_wb_alloc, /* Default embedded wb allocated */
> + BDI_wblist_lock, /* bdi->wb_list now needs locking */
> BDI_async_congested, /* The async (write) queue is getting full */
> BDI_sync_congested, /* The sync queue is getting full */
> + BDI_registered, /* bdi_register() was done */
> BDI_unused, /* Available bits start here */
> };
>
> @@ -39,7 +45,24 @@ enum bdi_stat_item {
>
> #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
>
> +struct bdi_writeback {
> + struct list_head list; /* hangs off the bdi */
> +
> + struct backing_dev_info *bdi; /* our parent bdi */
> + unsigned int nr;
> +
> + struct task_struct *task; /* writeback task */
> + wait_queue_head_t wait;
> + struct list_head b_dirty; /* dirty inodes */
> + struct list_head b_io; /* parked for writeback */
> + struct list_head b_more_io; /* parked for more writeback */
> +};
> +
> +#define BDI_MAX_FLUSHERS 32
> +
> struct backing_dev_info {
> + struct srcu_struct srcu; /* for wb_list read side protection */
> + struct list_head bdi_list;
> unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
> unsigned long state; /* Always use atomic bitops on this */
> unsigned int capabilities; /* Device capabilities */
> @@ -48,6 +71,8 @@ struct backing_dev_info {
> void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
> void *unplug_io_data;
>
> + char *name;
> +
> struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
>
> struct prop_local_percpu completions;
> @@ -56,6 +81,14 @@ struct backing_dev_info {
> unsigned int min_ratio;
> unsigned int max_ratio, max_prop_frac;
>
> + struct bdi_writeback wb; /* default writeback info for this bdi */
> + spinlock_t wb_lock; /* protects update side of wb_list */
> + struct list_head wb_list; /* the flusher threads hanging off this bdi */
> + unsigned long wb_mask; /* bitmask of registered tasks */
> + unsigned int wb_cnt; /* number of registered tasks */
> +
> + struct list_head work_list;
> +
> struct device *dev;
>
> #ifdef CONFIG_DEBUG_FS
> @@ -71,6 +104,34 @@ int bdi_register(struct backing_dev_info
> const char *fmt, ...);
> int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
> void bdi_unregister(struct backing_dev_info *bdi);
> +void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
> + long nr_pages, enum writeback_sync_modes sync_mode);
> +int bdi_writeback_task(struct bdi_writeback *wb);
> +void bdi_writeback_all(struct super_block *sb, long nr_pages,
> + enum writeback_sync_modes sync_mode);
> +void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
> +void bdi_add_flusher_task(struct backing_dev_info *bdi);
> +int bdi_has_dirty_io(struct backing_dev_info *bdi);
> +
> +extern struct mutex bdi_lock;
> +extern struct list_head bdi_list;
> +
> +static inline int wb_is_default_task(struct bdi_writeback *wb)
> +{
> + return wb == &wb->bdi->wb;
> +}
> +
> +static inline int bdi_wblist_needs_lock(struct backing_dev_info *bdi)
> +{
> + return test_bit(BDI_wblist_lock, &bdi->state);
> +}
> +
> +static inline int wb_has_dirty_io(struct bdi_writeback *wb)
> +{
> + return !list_empty(&wb->b_dirty) ||
> + !list_empty(&wb->b_io) ||
> + !list_empty(&wb->b_more_io);
> +}
>
> static inline void __add_bdi_stat(struct backing_dev_info *bdi,
> enum bdi_stat_item item, s64 amount)
> @@ -187,6 +248,7 @@ int bdi_set_max_ratio(struct backing_dev
> #define BDI_CAP_EXEC_MAP 0x00000040
> #define BDI_CAP_NO_ACCT_WB 0x00000080
> #define BDI_CAP_SWAP_BACKED 0x00000100
> +#define BDI_CAP_FLUSH_FORKER 0x00000200
>
> #define BDI_CAP_VMFLAGS \
> (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
> @@ -256,6 +318,11 @@ static inline bool bdi_cap_swap_backed(s
> return bdi->capabilities & BDI_CAP_SWAP_BACKED;
> }
>
> +static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
> +{
> + return bdi->capabilities & BDI_CAP_FLUSH_FORKER;
> +}
> +
> static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
> {
> return bdi_cap_writeback_dirty(mapping->backing_dev_info);
> @@ -271,4 +338,10 @@ static inline bool mapping_cap_swap_back
> return bdi_cap_swap_backed(mapping->backing_dev_info);
> }
>
> +static inline int bdi_sched_wait(void *word)
> +{
> + schedule();
> + return 0;
> +}
> +
> #endif /* _LINUX_BACKING_DEV_H */
> diff -Nraup linux-2.6.30-rc6/include/linux/fs.h linux-2.6.30-rc6_bdiflusherv7/include/linux/fs.h
> --- linux-2.6.30-rc6/include/linux/fs.h 2009-05-19 11:00:57.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/include/linux/fs.h 2009-05-27 08:59:27.000000000 +0800
> @@ -712,7 +712,7 @@ static inline int mapping_writably_mappe
>
> struct inode {
> struct hlist_node i_hash;
> - struct list_head i_list;
> + struct list_head i_list; /* backing dev IO list */
> struct list_head i_sb_list;
> struct list_head i_dentry;
> unsigned long i_ino;
> @@ -1329,9 +1329,6 @@ struct super_block {
> struct xattr_handler **s_xattr;
>
> struct list_head s_inodes; /* all inodes */
> - struct list_head s_dirty; /* dirty inodes */
> - struct list_head s_io; /* parked for writeback */
> - struct list_head s_more_io; /* parked for more writeback */
> struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */
> struct list_head s_files;
> /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
> @@ -1553,11 +1550,14 @@ extern ssize_t vfs_readv(struct file *,
> extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
> unsigned long, loff_t *);
>
> +struct bdi_writeback;
> +
> struct super_operations {
> struct inode *(*alloc_inode)(struct super_block *sb);
> void (*destroy_inode)(struct inode *);
>
> void (*dirty_inode) (struct inode *);
> + struct bdi_writeback *(*inode_get_wb) (struct inode *);
> int (*write_inode) (struct inode *, int);
> void (*drop_inode) (struct inode *);
> void (*delete_inode) (struct inode *);
> @@ -2066,6 +2066,8 @@ extern int invalidate_inode_pages2_range
> pgoff_t start, pgoff_t end);
> extern void generic_sync_sb_inodes(struct super_block *sb,
> struct writeback_control *wbc);
> +extern void generic_sync_bdi_inodes(struct super_block *sb,
> + struct writeback_control *);
> extern int write_inode_now(struct inode *, int);
> extern int filemap_fdatawrite(struct address_space *);
> extern int filemap_flush(struct address_space *);
> @@ -2183,7 +2185,6 @@ extern int bdev_read_only(struct block_d
> extern int set_blocksize(struct block_device *, int);
> extern int sb_set_blocksize(struct super_block *, int);
> extern int sb_min_blocksize(struct super_block *, int);
> -extern int sb_has_dirty_inodes(struct super_block *);
>
> extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> diff -Nraup linux-2.6.30-rc6/include/linux/writeback.h linux-2.6.30-rc6_bdiflusherv7/include/linux/writeback.h
> --- linux-2.6.30-rc6/include/linux/writeback.h 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/include/linux/writeback.h 2009-05-27 08:59:27.000000000 +0800
> @@ -14,17 +14,6 @@ extern struct list_head inode_in_use;
> extern struct list_head inode_unused;
>
> /*
> - * Yes, writeback.h requires sched.h
> - * No, sched.h is not included from here.
> - */
> -static inline int task_is_pdflush(struct task_struct *task)
> -{
> - return task->flags & PF_FLUSHER;
> -}
> -
> -#define current_is_pdflush() task_is_pdflush(current)
> -
> -/*
> * fs/fs-writeback.c
> */
> enum writeback_sync_modes {
> @@ -80,6 +69,7 @@ void writeback_inodes(struct writeback_c
> int inode_wait(void *);
> void sync_inodes_sb(struct super_block *, int wait);
> void sync_inodes(int wait);
> +long wb_do_writeback(struct bdi_writeback *wb);
>
> /* writeback.h requires fs.h; it, too, is not included from here. */
> static inline void wait_on_inode(struct inode *inode)
> @@ -99,7 +89,7 @@ static inline void inode_sync_wait(struc
> /*
> * mm/page-writeback.c
> */
> -int wakeup_pdflush(long nr_pages);
> +void wakeup_flusher_threads(long nr_pages);
> void laptop_io_completion(void);
> void laptop_sync_completion(void);
> void throttle_vm_writeout(gfp_t gfp_mask);
> @@ -151,7 +141,6 @@ balance_dirty_pages_ratelimited(struct a
> typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
> void *data);
>
> -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
> int generic_writepages(struct address_space *mapping,
> struct writeback_control *wbc);
> int write_cache_pages(struct address_space *mapping,
> diff -Nraup linux-2.6.30-rc6/kernel/cgroup.c linux-2.6.30-rc6_bdiflusherv7/kernel/cgroup.c
> --- linux-2.6.30-rc6/kernel/cgroup.c 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/kernel/cgroup.c 2009-05-27 08:59:27.000000000 +0800
> @@ -598,6 +598,7 @@ static struct inode_operations cgroup_di
> static struct file_operations proc_cgroupstats_operations;
>
> static struct backing_dev_info cgroup_backing_dev_info = {
> + .name = "cgroup",
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
> };
>
> diff -Nraup linux-2.6.30-rc6/mm/backing-dev.c linux-2.6.30-rc6_bdiflusherv7/mm/backing-dev.c
> --- linux-2.6.30-rc6/mm/backing-dev.c 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/mm/backing-dev.c 2009-05-27 08:59:27.000000000 +0800
> @@ -1,8 +1,11 @@
>
> #include <linux/wait.h>
> #include <linux/backing-dev.h>
> +#include <linux/kthread.h>
> +#include <linux/freezer.h>
> #include <linux/fs.h>
> #include <linux/pagemap.h>
> +#include <linux/mm.h>
> #include <linux/sched.h>
> #include <linux/module.h>
> #include <linux/writeback.h>
> @@ -14,14 +17,18 @@ void default_unplug_io_fn(struct backing
> EXPORT_SYMBOL(default_unplug_io_fn);
>
> struct backing_dev_info default_backing_dev_info = {
> + .name = "default",
> .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
> .state = 0,
> - .capabilities = BDI_CAP_MAP_COPY,
> + .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_FLUSH_FORKER,
> .unplug_io_fn = default_unplug_io_fn,
> };
> EXPORT_SYMBOL_GPL(default_backing_dev_info);
>
> static struct class *bdi_class;
> +DEFINE_MUTEX(bdi_lock);
> +LIST_HEAD(bdi_list);
> +LIST_HEAD(bdi_pending_list);
>
> #ifdef CONFIG_DEBUG_FS
> #include <linux/debugfs.h>
> @@ -37,9 +44,29 @@ static void bdi_debug_init(void)
> static int bdi_debug_stats_show(struct seq_file *m, void *v)
> {
> struct backing_dev_info *bdi = m->private;
> + struct bdi_writeback *wb;
> unsigned long background_thresh;
> unsigned long dirty_thresh;
> unsigned long bdi_thresh;
> + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
> + struct inode *inode;
> +
> + /*
> + * inode lock is enough here, the bdi->wb_list is protected by
> + * RCU on the reader side
> + */
> + nr_wb = nr_dirty = nr_io = nr_more_io = 0;
> + spin_lock(&inode_lock);
> + list_for_each_entry(wb, &bdi->wb_list, list) {
> + nr_wb++;
> + list_for_each_entry(inode, &wb->b_dirty, i_list)
> + nr_dirty++;
> + list_for_each_entry(inode, &wb->b_io, i_list)
> + nr_io++;
> + list_for_each_entry(inode, &wb->b_more_io, i_list)
> + nr_more_io++;
> + }
> + spin_unlock(&inode_lock);
>
> get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
>
> @@ -49,12 +76,22 @@ static int bdi_debug_stats_show(struct s
> "BdiReclaimable: %8lu kB\n"
> "BdiDirtyThresh: %8lu kB\n"
> "DirtyThresh: %8lu kB\n"
> - "BackgroundThresh: %8lu kB\n",
> + "BackgroundThresh: %8lu kB\n"
> + "WriteBack threads:%8lu\n"
> + "b_dirty: %8lu\n"
> + "b_io: %8lu\n"
> + "b_more_io: %8lu\n"
> + "bdi_list: %8u\n"
> + "state: %8lx\n"
> + "wb_mask: %8lx\n"
> + "wb_list: %8u\n"
> + "wb_cnt: %8u\n",
> (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
> (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
> - K(bdi_thresh),
> - K(dirty_thresh),
> - K(background_thresh));
> + K(bdi_thresh), K(dirty_thresh),
> + K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
> + !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
> + !list_empty(&bdi->wb_list), bdi->wb_cnt);
> #undef K
>
> return 0;
> @@ -193,6 +230,343 @@ static int __init default_bdi_init(void)
> }
> subsys_initcall(default_bdi_init);
>
> +static int wb_assign_nr(struct backing_dev_info *bdi, struct bdi_writeback *wb)
> +{
> + unsigned long mask = BDI_MAX_FLUSHERS - 1;
> + unsigned int nr;
> +
> + do {
> + if ((bdi->wb_mask & mask) == mask)
> + return 1;
> +
> + nr = find_first_zero_bit(&bdi->wb_mask, BDI_MAX_FLUSHERS);
> + } while (test_and_set_bit(nr, &bdi->wb_mask));
> +
> + wb->nr = nr;
> +
> + spin_lock(&bdi->wb_lock);
> + bdi->wb_cnt++;
> + spin_unlock(&bdi->wb_lock);
> +
> + return 0;
> +}
> +
> +static void bdi_put_wb(struct backing_dev_info *bdi, struct bdi_writeback *wb)
> +{
> + clear_bit(wb->nr, &bdi->wb_mask);
> +
> + if (wb == &bdi->wb)
> + clear_bit(BDI_wb_alloc, &bdi->state);
> + else
> + kfree(wb);
> +
> + spin_lock(&bdi->wb_lock);
> + bdi->wb_cnt--;
> + spin_unlock(&bdi->wb_lock);
> +}
> +
> +static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
> +{
> + memset(wb, 0, sizeof(*wb));
> +
> + wb->bdi = bdi;
> + init_waitqueue_head(&wb->wait);
> + INIT_LIST_HEAD(&wb->b_dirty);
> + INIT_LIST_HEAD(&wb->b_io);
> + INIT_LIST_HEAD(&wb->b_more_io);
> +
> + return wb_assign_nr(bdi, wb);
> +}
> +
> +static struct bdi_writeback *bdi_new_wb(struct backing_dev_info *bdi)
> +{
> + struct bdi_writeback *wb;
> +
> + /*
> + * Default bdi->wb is already assigned, so just return it
> + */
> + if (!test_and_set_bit(BDI_wb_alloc, &bdi->state))
> + wb = &bdi->wb;
> + else {
> + wb = kmalloc(sizeof(struct bdi_writeback), GFP_KERNEL);
> + if (wb) {
> + if (bdi_wb_init(wb, bdi)) {
> + kfree(wb);
> + wb = NULL;
> + }
> + }
> + }
> +
> + return wb;
> +}
> +
> +static void bdi_task_init(struct backing_dev_info *bdi,
> + struct bdi_writeback *wb)
> +{
> + struct task_struct *tsk = current;
> + int was_empty;
> +
> + /*
> + * Add us to the active bdi_list. If we are adding threads beyond
> + * the default embedded bdi_writeback, then we need to start using
> + * proper locking. Check the list for empty first, then set the
> + * BDI_wblist_lock flag if there's > 1 entry on the list now
> + */
> + spin_lock(&bdi->wb_lock);
> +
> + was_empty = list_empty(&bdi->wb_list);
> + list_add_tail_rcu(&wb->list, &bdi->wb_list);
> + if (!was_empty)
> + set_bit(BDI_wblist_lock, &bdi->state);
> +
> + spin_unlock(&bdi->wb_lock);
> +
> + tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
> + set_freezable();
> +
> + /*
> + * Our parent may run at a different priority, just set us to normal
> + */
> + set_user_nice(tsk, 0);
> +}
> +
> +static int bdi_start_fn(void *ptr)
> +{
> + struct bdi_writeback *wb = ptr;
> + struct backing_dev_info *bdi = wb->bdi;
> + int ret;
> +
> + /*
> + * Add us to the active bdi_list
> + */
> + mutex_lock(&bdi_lock);
> + list_add(&bdi->bdi_list, &bdi_list);
> + mutex_unlock(&bdi_lock);
> +
> + bdi_task_init(bdi, wb);
> +
> + /*
> + * Clear pending bit and wakeup anybody waiting to tear us down
> + */
> + clear_bit(BDI_pending, &bdi->state);
> + smp_mb__after_clear_bit();
> + wake_up_bit(&bdi->state, BDI_pending);
> +
> + ret = bdi_writeback_task(wb);
> +
> + /*
> + * Remove us from the list
> + */
> + spin_lock(&bdi->wb_lock);
> + list_del_rcu(&wb->list);
> + spin_unlock(&bdi->wb_lock);
> +
> + /*
> + * wait for rcu grace period to end, so we can free wb
> + */
> + synchronize_srcu(&bdi->srcu);
> +
> + bdi_put_wb(bdi, wb);
> + return ret;
> +}
> +
> +int bdi_has_dirty_io(struct backing_dev_info *bdi)
> +{
> + struct bdi_writeback *wb;
> + int ret = 0;
> +
> + if (!bdi_wblist_needs_lock(bdi))
> + ret = wb_has_dirty_io(&bdi->wb);
> + else {
> + int idx;
> +
> + idx = srcu_read_lock(&bdi->srcu);
> +
> + list_for_each_entry_rcu(wb, &bdi->wb_list, list) {
> + ret = wb_has_dirty_io(wb);
> + if (ret)
> + break;
> + }
> +
> + srcu_read_unlock(&bdi->srcu, idx);
> + }
> +
> + return ret;
> +}
> +
> +static void bdi_flush_io(struct backing_dev_info *bdi)
> +{
> + struct writeback_control wbc = {
> + .bdi = bdi,
> + .sync_mode = WB_SYNC_NONE,
> + .older_than_this = NULL,
> + .range_cyclic = 1,
> + .nr_to_write = 1024,
> + };
> +
> + generic_sync_bdi_inodes(NULL, &wbc);
> +}
> +
> +static int bdi_forker_task(void *ptr)
> +{
> + struct bdi_writeback *me = ptr;
> + DEFINE_WAIT(wait);
> +
> + bdi_task_init(me->bdi, me);
> +
> + for (;;) {
> + struct backing_dev_info *bdi, *tmp;
> + struct bdi_writeback *wb;
> +
> + /*
> + * Do this periodically, like kupdated() did before.
> + */
> + sync_supers();
> +
> + /*
> + * Temporary measure, we want to make sure we don't see
> + * dirty data on the default backing_dev_info
> + */
> + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
> + wb_do_writeback(me);
> +
> + prepare_to_wait(&me->wait, &wait, TASK_INTERRUPTIBLE);
> +
> + mutex_lock(&bdi_lock);
> +
> + /*
> + * Check if any existing bdi's have dirty data without
> + * a thread registered. If so, set that up.
> + */
> + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
> + if (bdi->wb.task || !bdi_has_dirty_io(bdi))
> + continue;
> +
> + bdi_add_default_flusher_task(bdi);
> + }
> +
> + if (list_empty(&bdi_pending_list)) {
> + unsigned long wait;
> +
> + mutex_unlock(&bdi_lock);
> + wait = msecs_to_jiffies(dirty_writeback_interval * 10);
> + schedule_timeout(wait);
> + try_to_freeze();
> + continue;
> + }
> +
> + /*
> + * This is our real job - check for pending entries in
> + * bdi_pending_list, and create the tasks that got added
> + */
> + bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
> + bdi_list);
> + list_del_init(&bdi->bdi_list);
> + mutex_unlock(&bdi_lock);
> +
> + wb = bdi_new_wb(bdi);
> + if (!wb)
> + goto readd_flush;
> +
> + wb->task = kthread_run(bdi_start_fn, wb, "bdi-%s",
> + dev_name(bdi->dev));
> +
> + /*
> + * If task creation fails, then readd the bdi to
> + * the pending list and force writeout of the bdi
> + * from this forker thread. That will free some memory
> + * and we can try again.
> + */
> + if (!wb->task) {
> + bdi_put_wb(bdi, wb);
> +readd_flush:
> + /*
> + * Add this 'bdi' to the back, so we get
> + * a chance to flush other bdi's to free
> + * memory.
> + */
> + mutex_lock(&bdi_lock);
> + list_add_tail(&bdi->bdi_list, &bdi_pending_list);
> + mutex_unlock(&bdi_lock);
> +
> + bdi_flush_io(bdi);
> + }
> + }
> +
> + finish_wait(&me->wait, &wait);
> + return 0;
> +}
> +
> +/*
> + * bdi_lock held on entry
> + */
> +static void bdi_add_one_flusher_task(struct backing_dev_info *bdi,
> + int(*func)(struct backing_dev_info *))
> +{
> + if (!bdi_cap_writeback_dirty(bdi))
> + return;
> +
> + if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
> + printk("bdi %p/%s is not registered!\n", bdi, bdi->name);
> + return;
> + }
> +
> + /*
> + * Check with the helper whether to proceed adding a task. Will only
> +	 * abort if two or more simultaneous calls to
> +	 * bdi_add_default_flusher_task() occurred; further additions will block
> + * waiting for previous additions to finish.
> + */
> + if (!func(bdi)) {
> + list_move_tail(&bdi->bdi_list, &bdi_pending_list);
> +
> + /*
> + * We are now on the pending list, wake up bdi_forker_task()
> + * to finish the job and add us back to the active bdi_list
> + */
> + wake_up(&default_backing_dev_info.wb.wait);
> + }
> +}
> +
> +static int flusher_add_helper_block(struct backing_dev_info *bdi)
> +{
> + mutex_unlock(&bdi_lock);
> + wait_on_bit_lock(&bdi->state, BDI_pending, bdi_sched_wait,
> + TASK_UNINTERRUPTIBLE);
> + mutex_lock(&bdi_lock);
> + return 0;
> +}
> +
> +static int flusher_add_helper_test(struct backing_dev_info *bdi)
> +{
> + return test_and_set_bit(BDI_pending, &bdi->state);
> +}
> +
> +/*
> + * Add the default flusher task that gets created for any bdi
> + * that has dirty data pending writeout
> + */
> +void bdi_add_default_flusher_task(struct backing_dev_info *bdi)
> +{
> + bdi_add_one_flusher_task(bdi, flusher_add_helper_test);
> +}
> +
> +/**
> + * bdi_add_flusher_task - add one more flusher task to this @bdi
> + * @bdi: the bdi
> + *
> + * Add an additional flusher task to this @bdi. Will block waiting on
> + * previous additions, if any.
> + *
> + */
> +void bdi_add_flusher_task(struct backing_dev_info *bdi)
> +{
> + mutex_lock(&bdi_lock);
> + bdi_add_one_flusher_task(bdi, flusher_add_helper_block);
> + mutex_unlock(&bdi_lock);
> +}
> +EXPORT_SYMBOL(bdi_add_flusher_task);
> +
> int bdi_register(struct backing_dev_info *bdi, struct device *parent,
> const char *fmt, ...)
> {
> @@ -211,9 +585,41 @@ int bdi_register(struct backing_dev_info
> goto exit;
> }
>
> + mutex_lock(&bdi_lock);
> + list_add_tail(&bdi->bdi_list, &bdi_list);
> + mutex_unlock(&bdi_lock);
> +
> bdi->dev = dev;
> - bdi_debug_register(bdi, dev_name(dev));
>
> + /*
> + * Just start the forker thread for our default backing_dev_info,
> + * and add other bdi's to the list. They will get a thread created
> + * on-demand when they need it.
> + */
> + if (bdi_cap_flush_forker(bdi)) {
> + struct bdi_writeback *wb;
> +
> + wb = bdi_new_wb(bdi);
> + if (!wb) {
> + ret = -ENOMEM;
> + goto remove_err;
> + }
> +
> + wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
> + dev_name(dev));
> + if (!wb->task) {
> + bdi_put_wb(bdi, wb);
> + ret = -ENOMEM;
> +remove_err:
> + mutex_lock(&bdi_lock);
> + list_del(&bdi->bdi_list);
> + mutex_unlock(&bdi_lock);
> + goto exit;
> + }
> + }
> +
> + bdi_debug_register(bdi, dev_name(dev));
> + set_bit(BDI_registered, &bdi->state);
> exit:
> return ret;
> }
> @@ -225,9 +631,42 @@ int bdi_register_dev(struct backing_dev_
> }
> EXPORT_SYMBOL(bdi_register_dev);
>
> +/*
> + * Remove bdi from global list and shutdown any threads we have running
> + */
> +static void bdi_wb_shutdown(struct backing_dev_info *bdi)
> +{
> + struct bdi_writeback *wb;
> +
> + if (!bdi_cap_writeback_dirty(bdi))
> + return;
> +
> + /*
> + * If setup is pending, wait for that to complete first
> + */
> + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
> + TASK_UNINTERRUPTIBLE);
> +
> + /*
> + * Make sure nobody finds us on the bdi_list anymore
> + */
> + mutex_lock(&bdi_lock);
> + list_del(&bdi->bdi_list);
> + mutex_unlock(&bdi_lock);
> +
> + /*
> + * Finally, kill the kernel threads. We don't need to be RCU
> + * safe anymore, since the bdi is gone from visibility.
> + */
> + list_for_each_entry(wb, &bdi->wb_list, list)
> + kthread_stop(wb->task);
> +}
> +
> void bdi_unregister(struct backing_dev_info *bdi)
> {
> if (bdi->dev) {
> + if (!bdi_cap_flush_forker(bdi))
> + bdi_wb_shutdown(bdi);
> bdi_debug_unregister(bdi);
> device_unregister(bdi->dev);
> bdi->dev = NULL;
> @@ -237,14 +676,21 @@ EXPORT_SYMBOL(bdi_unregister);
>
> int bdi_init(struct backing_dev_info *bdi)
> {
> - int i;
> - int err;
> + int i, err;
>
> bdi->dev = NULL;
>
> bdi->min_ratio = 0;
> bdi->max_ratio = 100;
> bdi->max_prop_frac = PROP_FRAC_BASE;
> + spin_lock_init(&bdi->wb_lock);
> + bdi->wb_mask = 0;
> + bdi->wb_cnt = 0;
> + INIT_LIST_HEAD(&bdi->bdi_list);
> + INIT_LIST_HEAD(&bdi->wb_list);
> + INIT_LIST_HEAD(&bdi->work_list);
> +
> + bdi_wb_init(&bdi->wb, bdi);
>
> for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
> err = percpu_counter_init(&bdi->bdi_stat[i], 0);
> @@ -252,10 +698,15 @@ int bdi_init(struct backing_dev_info *bd
> goto err;
> }
>
> + err = init_srcu_struct(&bdi->srcu);
> + if (err)
> + goto err;
> +
> bdi->dirty_exceeded = 0;
> err = prop_local_init_percpu(&bdi->completions);
>
> if (err) {
> + cleanup_srcu_struct(&bdi->srcu);
> err:
> while (i--)
> percpu_counter_destroy(&bdi->bdi_stat[i]);
> @@ -269,8 +720,12 @@ void bdi_destroy(struct backing_dev_info
> {
> int i;
>
> + WARN_ON(bdi_has_dirty_io(bdi));
> +
> bdi_unregister(bdi);
>
> + cleanup_srcu_struct(&bdi->srcu);
> +
> for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
> percpu_counter_destroy(&bdi->bdi_stat[i]);
>
> diff -Nraup linux-2.6.30-rc6/mm/Makefile linux-2.6.30-rc6_bdiflusherv7/mm/Makefile
> --- linux-2.6.30-rc6/mm/Makefile 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/mm/Makefile 2009-05-27 08:59:27.000000000 +0800
> @@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
> vmalloc.o
>
> obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
> - maccess.o page_alloc.o page-writeback.o pdflush.o \
> + maccess.o page_alloc.o page-writeback.o \
> readahead.o swap.o truncate.o vmscan.o shmem.o \
> prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
> page_isolation.o mm_init.o $(mmu-y)
> diff -Nraup linux-2.6.30-rc6/mm/page-writeback.c linux-2.6.30-rc6_bdiflusherv7/mm/page-writeback.c
> --- linux-2.6.30-rc6/mm/page-writeback.c 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/mm/page-writeback.c 2009-05-27 09:05:08.000000000 +0800
> @@ -36,15 +36,6 @@
> #include <linux/pagevec.h>
>
> /*
> - * The maximum number of pages to writeout in a single bdflush/kupdate
> - * operation. We do this so we don't hold I_SYNC against an inode for
> - * enormous amounts of time, which would block a userspace task which has
> - * been forced to throttle against that inode. Also, the code reevaluates
> - * the dirty each time it has written this many pages.
> - */
> -#define MAX_WRITEBACK_PAGES 1024
> -
> -/*
> * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
> * will look to see if it needs to force writeback or throttling.
> */
> @@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
> /* End of sysctl-exported parameters */
>
>
> -static void background_writeout(unsigned long _min_pages);
> -
> /*
> * Scale the writeback cache size proportional to the relative writeout speeds.
> *
> @@ -319,15 +308,13 @@ static void task_dirty_limit(struct task
> /*
> *
> */
> -static DEFINE_SPINLOCK(bdi_lock);
> static unsigned int bdi_min_ratio;
>
> int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
> {
> int ret = 0;
> - unsigned long flags;
>
> - spin_lock_irqsave(&bdi_lock, flags);
> + mutex_lock(&bdi_lock);
> if (min_ratio > bdi->max_ratio) {
> ret = -EINVAL;
> } else {
> @@ -339,27 +326,26 @@ int bdi_set_min_ratio(struct backing_dev
> ret = -EINVAL;
> }
> }
> - spin_unlock_irqrestore(&bdi_lock, flags);
> + mutex_unlock(&bdi_lock);
>
> return ret;
> }
>
> int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
> {
> - unsigned long flags;
> int ret = 0;
>
> if (max_ratio > 100)
> return -EINVAL;
>
> - spin_lock_irqsave(&bdi_lock, flags);
> + mutex_lock(&bdi_lock);
> if (bdi->min_ratio > max_ratio) {
> ret = -EINVAL;
> } else {
> bdi->max_ratio = max_ratio;
> bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
> }
> - spin_unlock_irqrestore(&bdi_lock, flags);
> + mutex_unlock(&bdi_lock);
>
> return ret;
> }
> @@ -542,7 +528,7 @@ static void balance_dirty_pages(struct a
> * been flushed to permanent storage.
> */
> if (bdi_nr_reclaimable) {
> - writeback_inodes(&wbc);
> + generic_sync_bdi_inodes(NULL, &wbc);
> pages_written += write_chunk - wbc.nr_to_write;
> get_dirty_limits(&background_thresh, &dirty_thresh,
> &bdi_thresh, bdi);
> @@ -593,7 +579,7 @@ static void balance_dirty_pages(struct a
> (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
> + global_page_state(NR_UNSTABLE_NFS)
> > background_thresh)))
> - pdflush_operation(background_writeout, 0);
> + bdi_start_writeback(bdi, NULL, 0, WB_SYNC_NONE);
> }
>
> void set_page_dirty_balance(struct page *page, int page_mkwrite)
> @@ -678,152 +664,34 @@ void throttle_vm_writeout(gfp_t gfp_mask
> }
>
> /*
> - * writeback at least _min_pages, and keep writing until the amount of dirty
> - * memory is less than the background threshold, or until we're all clean.
> - */
> -static void background_writeout(unsigned long _min_pages)
> -{
> - long min_pages = _min_pages;
> - struct writeback_control wbc = {
> - .bdi = NULL,
> - .sync_mode = WB_SYNC_NONE,
> - .older_than_this = NULL,
> - .nr_to_write = 0,
> - .nonblocking = 1,
> - .range_cyclic = 1,
> - };
> -
> - for ( ; ; ) {
> - unsigned long background_thresh;
> - unsigned long dirty_thresh;
> -
> - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> - if (global_page_state(NR_FILE_DIRTY) +
> - global_page_state(NR_UNSTABLE_NFS) < background_thresh
> - && min_pages <= 0)
> - break;
> - wbc.more_io = 0;
> - wbc.encountered_congestion = 0;
> - wbc.nr_to_write = MAX_WRITEBACK_PAGES;
> - wbc.pages_skipped = 0;
> - writeback_inodes(&wbc);
> - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
> - /* Wrote less than expected */
> - if (wbc.encountered_congestion || wbc.more_io)
> - congestion_wait(WRITE, HZ/10);
> - else
> - break;
> - }
> - }
> -}
> -
> -/*
> * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
> - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
> - * -1 if all pdflush threads were busy.
> + * the whole world.
> */
> -int wakeup_pdflush(long nr_pages)
> +void wakeup_flusher_threads(long nr_pages)
> {
> if (nr_pages == 0)
> nr_pages = global_page_state(NR_FILE_DIRTY) +
> global_page_state(NR_UNSTABLE_NFS);
> - return pdflush_operation(background_writeout, nr_pages);
> + bdi_writeback_all(NULL, nr_pages, WB_SYNC_NONE);
> }
>
> -static void wb_timer_fn(unsigned long unused);
> static void laptop_timer_fn(unsigned long unused);
>
> -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
> static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
>
> /*
> - * Periodic writeback of "old" data.
> - *
> - * Define "old": the first time one of an inode's pages is dirtied, we mark the
> - * dirtying-time in the inode's address_space. So this periodic writeback code
> - * just walks the superblock inode list, writing back any inodes which are
> - * older than a specific point in time.
> - *
> - * Try to run once per dirty_writeback_interval. But if a writeback event
> - * takes longer than a dirty_writeback_interval interval, then leave a
> - * one-second gap.
> - *
> - * older_than_this takes precedence over nr_to_write. So we'll only write back
> - * all dirty pages if they are all attached to "old" mappings.
> - */
> -static void wb_kupdate(unsigned long arg)
> -{
> - unsigned long oldest_jif;
> - unsigned long start_jif;
> - unsigned long next_jif;
> - long nr_to_write;
> - struct writeback_control wbc = {
> - .bdi = NULL,
> - .sync_mode = WB_SYNC_NONE,
> - .older_than_this = &oldest_jif,
> - .nr_to_write = 0,
> - .nonblocking = 1,
> - .for_kupdate = 1,
> - .range_cyclic = 1,
> - };
> -
> - sync_supers();
> -
> - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
> - start_jif = jiffies;
> - next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
> - nr_to_write = global_page_state(NR_FILE_DIRTY) +
> - global_page_state(NR_UNSTABLE_NFS) +
> - (inodes_stat.nr_inodes - inodes_stat.nr_unused);
> - while (nr_to_write > 0) {
> - wbc.more_io = 0;
> - wbc.encountered_congestion = 0;
> - wbc.nr_to_write = MAX_WRITEBACK_PAGES;
> - writeback_inodes(&wbc);
> - if (wbc.nr_to_write > 0) {
> - if (wbc.encountered_congestion || wbc.more_io)
> - congestion_wait(WRITE, HZ/10);
> - else
> - break; /* All the old data is written */
> - }
> - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> - }
> - if (time_before(next_jif, jiffies + HZ))
> - next_jif = jiffies + HZ;
> - if (dirty_writeback_interval)
> - mod_timer(&wb_timer, next_jif);
> -}
> -
> -/*
> * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
> */
> int dirty_writeback_centisecs_handler(ctl_table *table, int write,
> struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
> {
> proc_dointvec(table, write, file, buffer, length, ppos);
> - if (dirty_writeback_interval)
> - mod_timer(&wb_timer, jiffies +
> - msecs_to_jiffies(dirty_writeback_interval * 10));
> - else
> - del_timer(&wb_timer);
> return 0;
> }
>
> -static void wb_timer_fn(unsigned long unused)
> -{
> - if (pdflush_operation(wb_kupdate, 0) < 0)
> - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
> -}
> -
> -static void laptop_flush(unsigned long unused)
> -{
> - sys_sync();
> -}
> -
> static void laptop_timer_fn(unsigned long unused)
> {
> - pdflush_operation(laptop_flush, 0);
> + wakeup_flusher_threads(0);
> }
>
> /*
> @@ -906,8 +774,6 @@ void __init page_writeback_init(void)
> {
> int shift;
>
> - mod_timer(&wb_timer,
> - jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
> writeback_set_ratelimit();
> register_cpu_notifier(&ratelimit_nb);
>
> diff -Nraup linux-2.6.30-rc6/mm/pdflush.c linux-2.6.30-rc6_bdiflusherv7/mm/pdflush.c
> --- linux-2.6.30-rc6/mm/pdflush.c 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/mm/pdflush.c 1970-01-01 08:00:00.000000000 +0800
> @@ -1,269 +0,0 @@
> -/*
> - * mm/pdflush.c - worker threads for writing back filesystem data
> - *
> - * Copyright (C) 2002, Linus Torvalds.
> - *
> - * 09Apr2002 Andrew Morton
> - * Initial version
> - * 29Feb2004 kaos@....com
> - * Move worker thread creation to kthread to avoid chewing
> - * up stack space with nested calls to kernel_thread.
> - */
> -
> -#include <linux/sched.h>
> -#include <linux/list.h>
> -#include <linux/signal.h>
> -#include <linux/spinlock.h>
> -#include <linux/gfp.h>
> -#include <linux/init.h>
> -#include <linux/module.h>
> -#include <linux/fs.h> /* Needed by writeback.h */
> -#include <linux/writeback.h> /* Prototypes pdflush_operation() */
> -#include <linux/kthread.h>
> -#include <linux/cpuset.h>
> -#include <linux/freezer.h>
> -
> -
> -/*
> - * Minimum and maximum number of pdflush instances
> - */
> -#define MIN_PDFLUSH_THREADS 2
> -#define MAX_PDFLUSH_THREADS 8
> -
> -static void start_one_pdflush_thread(void);
> -
> -
> -/*
> - * The pdflush threads are worker threads for writing back dirty data.
> - * Ideally, we'd like one thread per active disk spindle. But the disk
> - * topology is very hard to divine at this level. Instead, we take
> - * care in various places to prevent more than one pdflush thread from
> - * performing writeback against a single filesystem. pdflush threads
> - * have the PF_FLUSHER flag set in current->flags to aid in this.
> - */
> -
> -/*
> - * All the pdflush threads. Protected by pdflush_lock
> - */
> -static LIST_HEAD(pdflush_list);
> -static DEFINE_SPINLOCK(pdflush_lock);
> -
> -/*
> - * The count of currently-running pdflush threads. Protected
> - * by pdflush_lock.
> - *
> - * Readable by sysctl, but not writable. Published to userspace at
> - * /proc/sys/vm/nr_pdflush_threads.
> - */
> -int nr_pdflush_threads = 0;
> -
> -/*
> - * The time at which the pdflush thread pool last went empty
> - */
> -static unsigned long last_empty_jifs;
> -
> -/*
> - * The pdflush thread.
> - *
> - * Thread pool management algorithm:
> - *
> - * - The minimum and maximum number of pdflush instances are bound
> - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
> - *
> - * - If there have been no idle pdflush instances for 1 second, create
> - * a new one.
> - *
> - * - If the least-recently-went-to-sleep pdflush thread has been asleep
> - * for more than one second, terminate a thread.
> - */
> -
> -/*
> - * A structure for passing work to a pdflush thread. Also for passing
> - * state information between pdflush threads. Protected by pdflush_lock.
> - */
> -struct pdflush_work {
> - struct task_struct *who; /* The thread */
> - void (*fn)(unsigned long); /* A callback function */
> - unsigned long arg0; /* An argument to the callback */
> - struct list_head list; /* On pdflush_list, when idle */
> - unsigned long when_i_went_to_sleep;
> -};
> -
> -static int __pdflush(struct pdflush_work *my_work)
> -{
> - current->flags |= PF_FLUSHER | PF_SWAPWRITE;
> - set_freezable();
> - my_work->fn = NULL;
> - my_work->who = current;
> - INIT_LIST_HEAD(&my_work->list);
> -
> - spin_lock_irq(&pdflush_lock);
> - for ( ; ; ) {
> - struct pdflush_work *pdf;
> -
> - set_current_state(TASK_INTERRUPTIBLE);
> - list_move(&my_work->list, &pdflush_list);
> - my_work->when_i_went_to_sleep = jiffies;
> - spin_unlock_irq(&pdflush_lock);
> - schedule();
> - try_to_freeze();
> - spin_lock_irq(&pdflush_lock);
> - if (!list_empty(&my_work->list)) {
> - /*
> - * Someone woke us up, but without removing our control
> - * structure from the global list. swsusp will do this
> - * in try_to_freeze()->refrigerator(). Handle it.
> - */
> - my_work->fn = NULL;
> - continue;
> - }
> - if (my_work->fn == NULL) {
> - printk("pdflush: bogus wakeup\n");
> - continue;
> - }
> - spin_unlock_irq(&pdflush_lock);
> -
> - (*my_work->fn)(my_work->arg0);
> -
> - spin_lock_irq(&pdflush_lock);
> -
> - /*
> - * Thread creation: For how long have there been zero
> - * available threads?
> - *
> - * To throttle creation, we reset last_empty_jifs.
> - */
> - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
> - if (list_empty(&pdflush_list)) {
> - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
> - last_empty_jifs = jiffies;
> - nr_pdflush_threads++;
> - spin_unlock_irq(&pdflush_lock);
> - start_one_pdflush_thread();
> - spin_lock_irq(&pdflush_lock);
> - }
> - }
> - }
> -
> - my_work->fn = NULL;
> -
> - /*
> - * Thread destruction: For how long has the sleepiest
> - * thread slept?
> - */
> - if (list_empty(&pdflush_list))
> - continue;
> - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
> - continue;
> - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
> - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
> - /* Limit exit rate */
> - pdf->when_i_went_to_sleep = jiffies;
> - break; /* exeunt */
> - }
> - }
> - nr_pdflush_threads--;
> - spin_unlock_irq(&pdflush_lock);
> - return 0;
> -}
> -
> -/*
> - * Of course, my_work wants to be just a local in __pdflush(). It is
> - * separated out in this manner to hopefully prevent the compiler from
> - * performing unfortunate optimisations against the auto variables. Because
> - * these are visible to other tasks and CPUs. (No problem has actually
> - * been observed. This is just paranoia).
> - */
> -static int pdflush(void *dummy)
> -{
> - struct pdflush_work my_work;
> - cpumask_var_t cpus_allowed;
> -
> - /*
> - * Since the caller doesn't even check kthread_run() worked, let's not
> - * freak out too much if this fails.
> - */
> - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
> - printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
> - return 0;
> - }
> -
> - /*
> - * pdflush can spend a lot of time doing encryption via dm-crypt. We
> - * don't want to do that at keventd's priority.
> - */
> - set_user_nice(current, 0);
> -
> - /*
> - * Some configs put our parent kthread in a limited cpuset,
> - * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
> - * Our needs are more modest - cut back to our cpusets cpus_allowed.
> - * This is needed as pdflush's are dynamically created and destroyed.
> - * The boottime pdflush's are easily placed w/o these 2 lines.
> - */
> - cpuset_cpus_allowed(current, cpus_allowed);
> - set_cpus_allowed_ptr(current, cpus_allowed);
> - free_cpumask_var(cpus_allowed);
> -
> - return __pdflush(&my_work);
> -}
> -
> -/*
> - * Attempt to wake up a pdflush thread, and get it to do some work for you.
> - * Returns zero if it indeed managed to find a worker thread, and passed your
> - * payload to it.
> - */
> -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
> -{
> - unsigned long flags;
> - int ret = 0;
> -
> - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
> -
> - spin_lock_irqsave(&pdflush_lock, flags);
> - if (list_empty(&pdflush_list)) {
> - ret = -1;
> - } else {
> - struct pdflush_work *pdf;
> -
> - pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
> - list_del_init(&pdf->list);
> - if (list_empty(&pdflush_list))
> - last_empty_jifs = jiffies;
> - pdf->fn = fn;
> - pdf->arg0 = arg0;
> - wake_up_process(pdf->who);
> - }
> - spin_unlock_irqrestore(&pdflush_lock, flags);
> -
> - return ret;
> -}
> -
> -static void start_one_pdflush_thread(void)
> -{
> - struct task_struct *k;
> -
> - k = kthread_run(pdflush, NULL, "pdflush");
> - if (unlikely(IS_ERR(k))) {
> - spin_lock_irq(&pdflush_lock);
> - nr_pdflush_threads--;
> - spin_unlock_irq(&pdflush_lock);
> - }
> -}
> -
> -static int __init pdflush_init(void)
> -{
> - int i;
> -
> - /*
> - * Pre-set nr_pdflush_threads... If we fail to create,
> - * the count will be decremented.
> - */
> - nr_pdflush_threads = MIN_PDFLUSH_THREADS;
> -
> - for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
> - start_one_pdflush_thread();
> - return 0;
> -}
> -
> -module_init(pdflush_init);
> diff -Nraup linux-2.6.30-rc6/mm/swap_state.c linux-2.6.30-rc6_bdiflusherv7/mm/swap_state.c
> --- linux-2.6.30-rc6/mm/swap_state.c 2009-05-19 11:00:28.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/mm/swap_state.c 2009-05-27 08:59:27.000000000 +0800
> @@ -34,6 +34,7 @@ static const struct address_space_operat
> };
>
> static struct backing_dev_info swap_backing_dev_info = {
> + .name = "swap",
> .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
> .unplug_io_fn = swap_unplug_io_fn,
> };
> diff -Nraup linux-2.6.30-rc6/mm/vmscan.c linux-2.6.30-rc6_bdiflusherv7/mm/vmscan.c
> --- linux-2.6.30-rc6/mm/vmscan.c 2009-05-19 11:00:58.000000000 +0800
> +++ linux-2.6.30-rc6_bdiflusherv7/mm/vmscan.c 2009-05-27 08:59:27.000000000 +0800
> @@ -1654,7 +1654,7 @@ static unsigned long do_try_to_free_page
> */
> if (total_scanned > sc->swap_cluster_max +
> sc->swap_cluster_max / 2) {
> - wakeup_pdflush(laptop_mode ? 0 : total_scanned);
> + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
> sc->may_writepage = 1;
> }
>
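
For anyone wanting to wire an out-of-tree driver up against this: below is a
minimal sketch (not part of the patch; the example_bdi/example_setup_bdi names
and the "example" string are made up) of how a backing_dev_info would be set up
under the new scheme, using only the interfaces the series adds or keeps
exported (bdi_init(), bdi_register(), bdi_add_flusher_task(), bdi_destroy()).

#include <linux/backing-dev.h>
#include <linux/device.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical bdi for illustration; real users embed this in their queue */
static struct backing_dev_info example_bdi = {
	.name		= "example",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.unplug_io_fn	= default_unplug_io_fn,
};

static int example_setup_bdi(struct device *parent)
{
	int err;

	err = bdi_init(&example_bdi);
	if (err)
		return err;

	/*
	 * Registration puts the bdi on bdi_list; the forker thread then
	 * creates a bdi-<name> flusher task on demand once dirty data
	 * shows up on this bdi.
	 */
	err = bdi_register(&example_bdi, parent, "example");
	if (err) {
		bdi_destroy(&example_bdi);
		return err;
	}

	/* Optional: ask for an extra flusher task up front */
	bdi_add_flusher_task(&example_bdi);
	return 0;
}
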
--
Jens Axboe