It's all about bdi->dirty_ratelimit, which aims to be (write_bw / N)
when there are N dd tasks.

On write() syscall, use bdi->dirty_ratelimit
============================================

    balance_dirty_pages(pages_dirtied)
    {
        pos_bw = bdi->dirty_ratelimit * bdi_position_ratio();
        pause = pages_dirtied / pos_bw;
        sleep(pause);
    }

Every 200ms, update bdi->dirty_ratelimit
========================================

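    bdi_update_dirty_ratelimit()
    {
        bw = bdi->dirty_ratelimit;
        pos_bw = bw * bdi_position_ratio();
        ref_bw = pos_bw * write_bw / dirty_bw;
        if (pos_bw < bw && ref_bw < bw)
             bdi->dirty_ratelimit = max(ref_bw, pos_bw);
        else if (pos_bw >= bw && ref_bw > bw)
             bdi->dirty_ratelimit = min(ref_bw, pos_bw);
    }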

Estimation of balanced bdi->dirty_ratelimit
===========================================

When N dd tasks are started, each dd is throttled at

         task_ratelimit = pos_bw (any non-zero initial value is OK)

After 200ms, we get

         dirty_bw = # of pages dirtied by app / 200ms
         write_bw = # of pages written to disk / 200ms

For aggressive dirtiers, the following equality holds

         dirty_bw == N * task_ratelimit
                  == N * pos_bw                      	(1)

The balanced throttle bandwidth can be estimated by

         ref_bw = pos_bw * write_bw / dirty_bw       	(2)

From (1) and (2), we get the equality

         ref_bw == write_bw / N                      	(3)

If the N dd's are all throttled at ref_bw, the dirty/writeback rates
will match. So ref_bw is the balanced dirty rate.

In practice, the ref_bw calculated by (2) may fluctuate and have
estimation errors. So the bdi->dirty_ratelimit update policy is to
follow it only when both pos_bw and ref_bw point in the same direction
(indicating that not only has the dirty position deviated from the
global/bdi setpoints, but it is also still moving further away).

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/backing-dev.h |    7 +++
 mm/backing-dev.c            |    1 
 mm/page-writeback.c         |   69 +++++++++++++++++++++++++++++++++-
 3 files changed, 75 insertions(+), 2 deletions(-)

--- linux-next.orig/include/linux/backing-dev.h	2011-08-05 18:05:36.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2011-08-05 18:05:36.000000000 +0800
@@ -75,10 +75,17 @@ struct backing_dev_info {
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
 
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
 	unsigned long write_bandwidth;	/* the estimated write bandwidth */
 	unsigned long avg_write_bandwidth; /* further smoothed write bw */
 
+	/*
+	 * The base throttle bandwidth, re-calculated every 200ms.
+	 * The dirty rates of all the bdi's tasks will be curbed under it.
+	 */
+	unsigned long dirty_ratelimit;
+
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
 
--- linux-next.orig/mm/backing-dev.c	2011-08-05 18:05:36.000000000 +0800
+++ linux-next/mm/backing-dev.c	2011-08-05 18:05:36.000000000 +0800
@@ -674,6 +674,7 @@ int bdi_init(struct backing_dev_info *bd
 	bdi->bw_time_stamp = jiffies;
 	bdi->written_stamp = 0;
 
+	bdi->dirty_ratelimit = INIT_BW;
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
--- linux-next.orig/mm/page-writeback.c	2011-08-05 18:05:36.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-08-06 09:08:35.000000000 +0800
@@ -736,6 +736,66 @@ static void global_update_bandwidth(unsi
 	spin_unlock(&dirty_lock);
 }
 
+/*
+ * Maintain bdi->dirty_ratelimit, the base throttle bandwidth.
+ *
+ * Normal bdi tasks will be curbed at or below it in the long term.
+ * Obviously it should be around (write_bw / N) when there are N dd tasks.
+ */
+static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
+				       unsigned long thresh,
+				       unsigned long dirty,
+				       unsigned long bdi_thresh,
+				       unsigned long bdi_dirty,
+				       unsigned long dirtied,
+				       unsigned long elapsed)
+{
+	unsigned long bw = bdi->dirty_ratelimit;
+	unsigned long dirty_bw;
+	unsigned long pos_bw;
+	unsigned long ref_bw;
+	unsigned long long pos_ratio;
+
+	/*
+	 * Over the long term the dirty rate matches the writeback rate, except
+	 * when dirty pages are truncated by userspace or re-dirtied by the FS.
+	 */
+	dirty_bw = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+
+	pos_ratio = bdi_position_ratio(bdi, thresh, dirty,
+				       bdi_thresh, bdi_dirty);
+	/*
+	 * pos_bw reflects each dd's dirty rate enforced for the past 200ms.
+	 */
+	pos_bw = bw * pos_ratio >> BANDWIDTH_CALC_SHIFT;
+	pos_bw++;  /* this avoids bdi->dirty_ratelimit getting stuck at 0 */
+
+	/*
+	 * ref_bw = pos_bw * write_bw / dirty_bw
+	 *
+	 * It's a linear estimate of the "balanced" throttle bandwidth.
+	 */
+	pos_ratio *= bdi->avg_write_bandwidth;
+	do_div(pos_ratio, dirty_bw | 1);	/* "| 1" keeps the divisor non-zero */
+	ref_bw = bw * pos_ratio >> BANDWIDTH_CALC_SHIFT;
+
+	/*
+	 * dirty_ratelimit will follow ref_bw/pos_bw conservatively iff they
+	 * are on the same side of dirty_ratelimit. This not only makes it
+	 * more stable, but is also essential for preventing it from being
+	 * driven away by possible systematic errors in ref_bw.
+	 */
+	if (pos_bw < bw) {
+		if (ref_bw < bw)
+			bw = max(ref_bw, pos_bw);
+	} else {
+		if (ref_bw > bw)
+			bw = min(ref_bw, pos_bw);
+	}
+
+	bdi->dirty_ratelimit = bw;
+}
+
 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 			    unsigned long thresh,
 			    unsigned long dirty,
@@ -745,6 +805,7 @@ void __bdi_update_bandwidth(struct backi
 {
 	unsigned long now = jiffies;
 	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long dirtied;
 	unsigned long written;
 
 	/*
@@ -753,6 +814,7 @@ void __bdi_update_bandwidth(struct backi
 	if (elapsed < BANDWIDTH_INTERVAL)
 		return;
 
+	dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
 	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
 
 	/*
@@ -762,12 +824,15 @@ void __bdi_update_bandwidth(struct backi
 	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
 		goto snapshot;
 
-	if (thresh)
+	if (thresh) {
 		global_update_bandwidth(thresh, dirty, now);
-
+		bdi_update_dirty_ratelimit(bdi, thresh, dirty, bdi_thresh,
+					   bdi_dirty, dirtied, elapsed);
+	}
 	bdi_update_write_bandwidth(bdi, elapsed, written);
 
 snapshot:
+	bdi->dirtied_stamp = dirtied;
 	bdi->written_stamp = written;
 	bdi->bw_time_stamp = now;
 }


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/