wbc.nr_segments serves two major purposes: - fairness between two large files, one continuously dirtied, the other sparsely dirtied. Given the same amount of dirty pages, it could take vastly different amounts of time to sync them to the _same_ device. The nr_segments check helps to favor continuous data. - avoid seeks/fragmentation. To give each file a fair chance of writeback, we have to abort a file when some nr_to_write or timeout is reached. However, neither is a good abort condition. The best approach is for the filesystem to abort earlier, at seek boundaries, and treat nr_to_write/timeout as large enough fallback limits. However, a low nr_segments would be inefficient if all files are sparsely dirtied. For example, it may be inefficient for the block device inodes, which have lots of sparsely distributed metadata pages. The wbc.nr_segments here is determined purely by logical page index distance: if two pages are more than 1MB apart, a new segment starts. Filesystems could do this better with real extent knowledge. One possible scheme is to record the previous page index in wbc.writeback_index, and let ->writepage check whether the current and previous pages lie in the same extent, and decrease wbc.nr_segments accordingly. Care should be taken to avoid double decrements in writepage and write_cache_pages. CC: Theodore Ts'o CC: Chris Mason CC: Dave Chinner CC: Christoph Hellwig CC: Jan Kara CC: Peter Zijlstra CC: Jens Axboe Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 8 +++++++- fs/jbd2/commit.c | 1 + include/linux/writeback.h | 10 +++++++++- mm/filemap.c | 1 + mm/page-writeback.c | 7 +++++++ 5 files changed, 25 insertions(+), 2 deletions(-) --- linux.orig/fs/fs-writeback.c 2009-10-06 23:39:27.000000000 +0800 +++ linux/fs/fs-writeback.c 2009-10-06 23:39:28.000000000 +0800 @@ -542,6 +542,11 @@ writeback_single_inode(struct inode *ino spin_unlock(&inode_lock); + if (wbc->for_kupdate || wbc->for_background) + wbc->nr_segments = bdi_nonrot(wbc->bdi) ? 
100 : 10; + else + wbc->nr_segments = LONG_MAX; + ret = do_writepages(mapping, wbc); /* Don't write the inode if only I_DIRTY_PAGES was set */ @@ -566,7 +571,8 @@ writeback_single_inode(struct inode *ino * sometimes bales out without doing anything. */ inode->i_state |= I_DIRTY_PAGES; - if (wbc->nr_to_write <= 0) { + if (wbc->nr_to_write <= 0 || + wbc->nr_segments <= 0) { /* * slice used up: queue for next turn */ --- linux.orig/include/linux/writeback.h 2009-10-06 23:39:27.000000000 +0800 +++ linux/include/linux/writeback.h 2009-10-06 23:39:28.000000000 +0800 @@ -48,6 +48,9 @@ struct writeback_control { long nr_to_write; /* Max pages to write per file, and decrement this for each page written */ + long nr_segments; /* Max page segments to write per file, + this is a count down value, too + */ long pages_skipped; /* Pages which were not written */ /* @@ -77,8 +80,13 @@ struct writeback_control { }; /* + * if two page ranges are more than 1MB apart, they are taken as two segments. + */ +#define WB_SEGMENT_DIST (1024 >> (PAGE_CACHE_SHIFT - 10)) + +/* * fs/fs-writeback.c - */ + */ struct bdi_writeback; int inode_wait(void *); void writeback_inodes_sb(struct super_block *); --- linux.orig/mm/filemap.c 2009-10-06 23:37:43.000000000 +0800 +++ linux/mm/filemap.c 2009-10-06 23:39:28.000000000 +0800 @@ -216,6 +216,7 @@ int __filemap_fdatawrite_range(struct ad struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, + .nr_segments = LONG_MAX, .range_start = start, .range_end = end, }; --- linux.orig/mm/page-writeback.c 2009-10-06 23:38:43.000000000 +0800 +++ linux/mm/page-writeback.c 2009-10-06 23:39:28.000000000 +0800 @@ -805,6 +805,13 @@ int write_cache_pages(struct address_spa break; } + if (nr_to_write != wbc->nr_to_write && + done_index + WB_SEGMENT_DIST < page->index && + --wbc->nr_segments <= 0) { + done = 1; + break; + } + done_index = page->index + 1; lock_page(page); --- linux.orig/fs/jbd2/commit.c 2009-10-06 23:37:42.000000000 +0800 +++ 
linux/fs/jbd2/commit.c 2009-10-06 23:39:28.000000000 +0800 @@ -219,6 +219,7 @@ static int journal_submit_inode_data_buf struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = mapping->nrpages * 2, + .nr_segments = LONG_MAX, .range_start = 0, .range_end = i_size_read(mapping->host), }; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/