Introduce super_block.s_more_io_wait and writeback_control.more_io_wait. They are for inodes that for some reason cannot be synced immediately. These inodes will be moved to s_more_io_wait and retried after a while (waiting up to 0.1s). The normal lots-of-dirty-pages inodes will be moved to s_more_io and be synced after other inodes in s_io have been serviced (no sleep). The new data flow is now simple and fair: - to fill s_io: s_more_io + s_dirty(expired) + s_more_io_wait ---> s_io - to drain s_io: s_io -+--> clean inodes in inode_in_use/inode_unused | +--> s_more_io | +--> s_more_io_wait - s_dirty is now a strict FIFO queue - inode.dirtied_when now really means the first dirty time - once expired, the dirty inode will stay in s_*io* queues until made clean - the dirty inodes in s_*io* queues will be revisited in order: no starvation Cc: Michael Rubin Cc: Peter Zijlstra Signed-off-by: Fengguang Wu --- fs/fs-writeback.c | 22 +++++++++++--- fs/super.c | 1 include/linux/fs.h | 1 include/linux/writeback.h | 1 mm/page-writeback.c | 56 ++++++++++++++++++++---------------- 5 files changed, 53 insertions(+), 28 deletions(-) --- linux-2.6.24-rc6-mm1.orig/fs/fs-writeback.c +++ linux-2.6.24-rc6-mm1/fs/fs-writeback.c @@ -170,10 +170,18 @@ static void redirty_tail(struct inode *i static void requeue_io(struct inode *inode) { list_move(&inode->i_list, &inode->i_sb->s_more_io); } +/* + * The inode should be retried after _sleeping_ for a while. 
+ */ +static void requeue_io_wait(struct inode *inode) +{ + list_move(&inode->i_list, &inode->i_sb->s_more_io_wait); +} + static void inode_sync_complete(struct inode *inode) { /* * Prevent speculative execution through spin_unlock(&inode_lock); */ @@ -204,17 +212,19 @@ static void move_expired_inodes(struct l static void queue_io(struct super_block *sb, unsigned long *older_than_this) { list_splice_init(&sb->s_more_io, sb->s_io.prev); move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); + list_splice_init(&sb->s_more_io_wait, sb->s_io.prev); } int sb_has_dirty_inodes(struct super_block *sb) { - return !list_empty(&sb->s_dirty) || - !list_empty(&sb->s_io) || - !list_empty(&sb->s_more_io); + return !list_empty(&sb->s_dirty) || + !list_empty(&sb->s_io) || + !list_empty(&sb->s_more_io) || + !list_empty(&sb->s_more_io_wait); } EXPORT_SYMBOL(sb_has_dirty_inodes); /* * Write a single inode's dirty pages and inode data out to disk. @@ -470,15 +480,19 @@ int generic_sync_sb_inodes(struct super_ } spin_unlock(&inode_lock); iput(inode); cond_resched(); spin_lock(&inode_lock); - if (wbc->nr_to_write <= 0) + if (wbc->nr_to_write <= 0) { + wbc->more_io = 1; break; + } } if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; + if (!list_empty(&sb->s_more_io_wait)) + wbc->more_io_wait = 1; spin_unlock(&inode_lock); return ret; /* Leave any unwritten inodes on s_io */ } EXPORT_SYMBOL(generic_sync_sb_inodes); --- linux-2.6.24-rc6-mm1.orig/mm/page-writeback.c +++ linux-2.6.24-rc6-mm1/mm/page-writeback.c @@ -541,21 +541,48 @@ void throttle_vm_writeout(gfp_t gfp_mask break; } } /* + * Write back up to nr pages. + * Return true if there's no more work. 
+ */ +static int writeback_some_pages(struct writeback_control *wbc, int nr) +{ + int all_done = 0; + + wbc->more_io = 0; + wbc->more_io_wait = 0; + wbc->encountered_congestion = 0; + wbc->nr_to_write = nr; + + writeback_inodes(wbc); + + if (wbc->encountered_congestion) + congestion_wait(WRITE, HZ/10); + + if (wbc->more_io) + ; + else if (wbc->more_io_wait) + congestion_wait(WRITE, HZ/10); + else + all_done = 1; + + return all_done; +} + +/* * writeback at least _min_pages, and keep writing until the amount of dirty * memory is less than the background threshold, or until we're all clean. */ static void background_writeout(unsigned long _min_pages) { long min_pages = _min_pages; struct writeback_control wbc = { .bdi = NULL, .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, - .nr_to_write = 0, .nonblocking = 1, .range_cyclic = 1, }; for ( ; ; ) { @@ -565,23 +592,13 @@ static void background_writeout(unsigned get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); if (global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) < background_thresh && min_pages <= 0) break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); + if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES)) + break; min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; - } } } /* * Start writeback of `nr_pages' pages. 
If `nr_pages' is zero, write back @@ -625,11 +642,10 @@ static void wb_kupdate(unsigned long arg long nr_to_write; struct writeback_control wbc = { .bdi = NULL, .sync_mode = WB_SYNC_NONE, .older_than_this = &oldest_jif, - .nr_to_write = 0, .nonblocking = 1, .for_kupdate = 1, .range_cyclic = 1, }; @@ -640,20 +656,12 @@ static void wb_kupdate(unsigned long arg next_jif = start_jif + dirty_writeback_interval; nr_to_write = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS) + (inodes_stat.nr_inodes - inodes_stat.nr_unused); while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; /* All the old data is written */ - } + if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES)) + break; nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; } if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; if (dirty_writeback_interval) --- linux-2.6.24-rc6-mm1.orig/fs/super.c +++ linux-2.6.24-rc6-mm1/fs/super.c @@ -62,10 +62,11 @@ static struct super_block *alloc_super(s goto out; } INIT_LIST_HEAD(&s->s_dirty); INIT_LIST_HEAD(&s->s_io); INIT_LIST_HEAD(&s->s_more_io); + INIT_LIST_HEAD(&s->s_more_io_wait); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); --- linux-2.6.24-rc6-mm1.orig/include/linux/fs.h +++ linux-2.6.24-rc6-mm1/include/linux/fs.h @@ -1009,10 +1009,11 @@ struct super_block { struct list_head s_inodes; /* all inodes */ struct list_head s_dirty; /* dirty inodes */ struct list_head s_io; /* parked for writeback */ struct list_head s_more_io; /* parked for more writeback */ + struct list_head s_more_io_wait; /* parked for sleep-then-retry */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; 
struct block_device *s_bdev; struct mtd_info *s_mtd; --- linux-2.6.24-rc6-mm1.orig/include/linux/writeback.h +++ linux-2.6.24-rc6-mm1/include/linux/writeback.h @@ -61,10 +61,11 @@ struct writeback_control { unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + unsigned more_io_wait:1; /* more io to be dispatched after a while */ }; /* * fs/fs-writeback.c */ -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/