This basically does

-	task_bw = linear_function(task_weight, bdi_dirty, bdi->throttle_bandwidth)
+	task_bw = linear_function(task_weight, avg_dirty, bdi->throttle_bandwidth)

So that the fluctuations of bdi_dirty get filtered out by roughly half.
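
Concretely, avg_dirty simply replaces bdi_dirty in the two bandwidth
scaling factors computed by balance_dirty_pages(); for illustration,
here is a lightly annotated copy of the hunk further below:

	bw = bdi->throttle_bandwidth;
	/* drops off as the smoothed dirty count approaches bdi_thresh */
	bw = bw * (bdi_thresh - avg_dirty);
	do_div(bw, bdi_thresh / BDI_SOFT_DIRTY_LIMIT + 1);
	/* and drops off further as it approaches the per-task threshold */
	bw = bw * (task_thresh - avg_dirty);
	do_div(bw, bdi_thresh / TASK_SOFT_DIRTY_LIMIT + 1);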

The main problem is that bdi_dirty regularly drops by dozens of
megabytes all of a sudden on NFS when COMMIT requests complete.  The
same problem, though less severe, exists for btrfs, xfs and possibly
some other types of storage.  avg_dirty can help filter out such
downward spikes.
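
To get a feel for the damping, here is a small stand-alone user-space
sketch (illustration only, not part of the patch) that feeds a sudden
32MB drop into the same avg = (avg * 15 + dirty) / 16 rule used by
__bdi_update_dirty_smooth(); the first sample moves avg_dirty by only
2MB:

	#include <stdio.h>

	int main(void)
	{
		/* dirty size in MB: steady at 64MB, then suddenly drops to 32MB */
		unsigned long avg = 64, dirty = 32;
		int i;

		for (i = 1; i <= 8; i++) {
			avg = (avg * 15 + dirty) / 16;	/* same rule as avg_dirty */
			printf("sample %d: avg_dirty = %luMB\n", i, avg);
		}
		return 0;
	}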

Upward spikes are also possible, and when they do happen, they are
better fixed in the FS code.  To avoid exceeding the dirty limits, once
bdi_dirty exceeds avg_dirty, the higher value is instantly used as the
feedback to the control system.  So, for the sake of safety, the
control system does not filter out upward spikes.
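
The corresponding check added to balance_dirty_pages() (see the hunk
below) effectively never lets the feedback fall below the instantaneous
bdi_dirty:

	avg_dirty = bdi->avg_dirty;
	if (avg_dirty < bdi_dirty || avg_dirty > task_thresh)
		avg_dirty = bdi_dirty;	/* upward spikes pass through unfiltered */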

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/backing-dev.h |    2 +
 mm/page-writeback.c         |   44 ++++++++++++++++++++++++++++++----
 2 files changed, 42 insertions(+), 4 deletions(-)

--- linux-next.orig/include/linux/backing-dev.h	2010-12-09 12:08:16.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2010-12-09 12:08:18.000000000 +0800
@@ -79,6 +79,8 @@ struct backing_dev_info {
 	unsigned long written_stamp;
 	unsigned long write_bandwidth;
 	unsigned long throttle_bandwidth;
+	unsigned long avg_dirty;
+	unsigned long old_dirty;
 
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
--- linux-next.orig/mm/page-writeback.c	2010-12-09 12:08:16.000000000 +0800
+++ linux-next/mm/page-writeback.c	2010-12-09 12:08:18.000000000 +0800
@@ -528,6 +528,36 @@ out:
 	return 1 + int_sqrt(dirty_thresh - dirty_pages);
 }
 
+static void __bdi_update_dirty_smooth(struct backing_dev_info *bdi,
+				      unsigned long dirty,
+				      unsigned long thresh)
+{
+	unsigned long avg = bdi->avg_dirty;
+	unsigned long old = bdi->old_dirty;
+
+	/* skip call from the flusher */
+	if (!thresh)
+		return;
+
+	if (avg > thresh) {
+		avg = dirty;
+		goto update;
+	}
+
+	if (dirty <= avg && dirty >= old)
+		goto out;
+
+	if (dirty >= avg && dirty <= old)
+		goto out;
+
+	avg = (avg * 15 + dirty) / 16;
+
+update:
+	bdi->avg_dirty = avg;
+out:
+	bdi->old_dirty = dirty;
+}
+
 /*
  * The bdi throttle bandwidth is introduced for resisting bdi_dirty from
  * getting too close to task_thresh. It allows scaling up to 1000+ concurrent
@@ -608,8 +638,9 @@ void bdi_update_bandwidth(struct backing
 	if (elapsed <= HZ/10)
 		goto unlock;
 
+	__bdi_update_dirty_smooth(bdi, bdi_dirty, bdi_thresh);
 	__bdi_update_write_bandwidth(bdi, elapsed, written);
-	__bdi_update_throttle_bandwidth(bdi, bdi_dirty, bdi_thresh);
+	__bdi_update_throttle_bandwidth(bdi, bdi->avg_dirty, bdi_thresh);
 
 snapshot:
 	bdi->written_stamp = written;
@@ -631,6 +662,7 @@ static void balance_dirty_pages(struct a
 	long nr_reclaimable;
 	long nr_dirty;
 	long bdi_dirty;  /* = file_dirty + writeback + unstable_nfs */
+	long avg_dirty;  /* smoothed bdi_dirty */
 	long bdi_prev_dirty = 0;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
@@ -708,7 +740,11 @@ static void balance_dirty_pages(struct a
 
 		bdi_update_bandwidth(bdi, start_time, bdi_dirty, bdi_thresh);
 
-		if (bdi_dirty >= task_thresh || nr_dirty > dirty_thresh) {
+		avg_dirty = bdi->avg_dirty;
+		if (avg_dirty < bdi_dirty || avg_dirty > task_thresh)
+			avg_dirty = bdi_dirty;
+
+		if (avg_dirty >= task_thresh || nr_dirty > dirty_thresh) {
 			pause = MAX_PAUSE;
 			goto pause;
 		}
@@ -721,10 +757,10 @@ static void balance_dirty_pages(struct a
 		 * time when there are lots of dirtiers.
 		 */
 		bw = bdi->throttle_bandwidth;
-		bw = bw * (bdi_thresh - bdi_dirty);
+		bw = bw * (bdi_thresh - avg_dirty);
 		do_div(bw, bdi_thresh / BDI_SOFT_DIRTY_LIMIT + 1);
 
-		bw = bw * (task_thresh - bdi_dirty);
+		bw = bw * (task_thresh - avg_dirty);
 		do_div(bw, bdi_thresh / TASK_SOFT_DIRTY_LIMIT + 1);
 
 		period = HZ * pages_dirtied / ((unsigned long)bw + 1) + 1;

