write_cache_pages has a number of problems, which appear to be real bugs:

* scanned == 1 is supposed to mean that cyclic writeback has circled through
  zero, thus we should not circle again. However it gets set to 1 after the
  first successful pagevec lookup. This leads to cases where not enough data
  gets written. Counterexample: file with first 10 pages dirty,
  writeback_index == 5, nr_to_write == 10. Then the 5 last pages will be found,
  and scanned will be set to 1, after writing those out, we will not cycle
  back to get the first 5. Rework this logic.

* If AOP_WRITEPAGE_ACTIVATE is returned, the filesystem is calling on us to
  drop the page lock and retry, however the existing code would just skip that
  page regardless of whether or not it was a data integrity operation. Change
  this to always retry such a result.

  This is a data integrity bug.

* If ret signals a real error, but we still have some pages left in the
  pagevec, done would be set to 1, but the remaining pages would continue to
  be processed and ret will be overwritten in the process. It could easily
  be overwritten with success, and thus success will be returned even if there
  is an error. Fix this by bailing immediately if there is an error, and
  retaining the error code.

  This is a data integrity bug.

* nr_to_write is heeded by data integrity operations, and the callers tend to
  set it to silly values that could break data integrity semantics. For
  example, nr_to_write can be set to mapping->nr_pages * 2, however if a file
  has a single, dirty page, then fsync is called, subsequent pages might be
  concurrently added and dirtied, then write_cache_pages might writeout two
  of these newly dirty pages, while not writing out the old page that should
  have been written out. Fix this by ignoring nr_to_write if it is a data
  integrity sync.

  This is a data integrity bug.

* In the range_cont case, range_start is set to index << PAGE_CACHE_SHIFT,
  but index is a pgoff_t and range_start is loff_t, so we can get truncation
  of the value on 32-bit platforms. Fix this by adding the standard loff_t
  cast.

  This is a data integrity bug (depending on how range_cont is used).


Other problems that are not strictly bugs:

o If we get stuck behind another process that is cleaning pages, we will be
  forced to wait for them to finish, then perform our own writeout (if it
  was redirtied during the long wait), then wait for that. If a page under
  writeout is still clean, we can skip waiting for it (if we're part of a
  data integrity sync, we'll be waiting for all writeout pages afterwards).

o Control structures containing non-idempotent expressions. Break these out
  and make flow control clearer from data control.


---
 mm/filemap.c        |    2 -
 mm/page-writeback.c |  101 +++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 70 insertions(+), 33 deletions(-)

Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -869,13 +869,13 @@ int write_cache_pages(struct address_spa
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
-	int done = 0;
 	struct pagevec pvec;
 	int nr_pages;
+	pgoff_t writeback_index;
 	pgoff_t index;
 	pgoff_t end;		/* Inclusive */
-	int scanned = 0;
 	int range_whole = 0;
+	int cycled;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -884,23 +884,28 @@ int write_cache_pages(struct address_spa
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
-		end = -1;
+		cycled = 0;
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		end = ULLONG_MAX;
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
-		scanned = 1;
 	}
 retry:
-	while (!done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		unsigned i;
+	do {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			      PAGECACHE_TAG_DIRTY,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+		if (!nr_pages)
+			break;
 
-		scanned = 1;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
@@ -911,58 +916,90 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
+again:
 			lock_page(page);
 
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data interity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
 			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
 				unlock_page(page);
 				continue;
 			}
 
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
+			if (page->index > end) {
+				/* Can't be cyclic: end == ULLONG_MAX */
 				unlock_page(page);
-				continue;
+done_release:
+				pagevec_release(&pvec);
+				goto done;
 			}
 
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
-
-			if (PageWriteback(page) ||
-			    !clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
+			if (PageWriteback(page)) {
+				/* someone else wrote it for us */
+				if (!PageDirty(page)) {
+					goto continue_unlock;
+				} else {
+					/* hmm, but it has been dirtied again */
+					if (wbc->sync_mode != WB_SYNC_NONE)
+						wait_on_page_writeback(page);
+					else
+						goto continue_unlock;
+				}
 			}
 
+			BUG_ON(PageWriteback(page));
+
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
 			ret = (*writepage)(page, wbc, data);
 
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
-				unlock_page(page);
-				ret = 0;
+			if (unlikely(ret)) {
+				/* Must retry the write, esp. for integrity */
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+					goto again;
+				}
+				goto done;
+			}
+			if (wbc->sync_mode == WB_SYNC_NONE) {
+				wbc->nr_to_write--;
+				if (wbc->nr_to_write <= 0)
+					goto done_release;
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
-				done = 1;
+				goto done_release;
 			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-	}
-	if (!scanned && !done) {
+	} while (index <= end);
+
+	if (wbc->range_cyclic && !cycled) {
 		/*
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
 		 */
-		scanned = 1;
+		cycled = 1;
 		index = 0;
+		end = writeback_index - 1; /* won't be -ve, see above */
 		goto retry;
 	}
+done:
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
-
 	if (wbc->range_cont)
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT;
+
 	return ret;
 }
 EXPORT_SYMBOL(write_cache_pages);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -209,7 +209,7 @@ int __filemap_fdatawrite_range(struct ad
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
+		.nr_to_write = LONG_MAX,
 		.range_start = start,
 		.range_end = end,
 	};