[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20111207161440.GL4622@quack.suse.cz>
Date: Wed, 7 Dec 2011 17:14:40 +0100
From: Jan Kara <jack@...e.cz>
To: Wu Fengguang <fengguang.wu@...el.com>
Cc: linux-fsdevel@...r.kernel.org, Jan Kara <jack@...e.cz>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Christoph Hellwig <hch@....de>,
Andrew Morton <akpm@...ux-foundation.org>,
LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 6/7] writeback: dirty ratelimit - think time
compensation
On Mon 28-11-11 21:53:44, Wu Fengguang wrote:
> Compensate the task's think time when computing the final pause time,
> so that ->dirty_ratelimit can be executed accurately.
>
> think time := time spend outside of balance_dirty_pages()
>
> In the rare case that the task slept longer than the 200ms period time
> (result in negative pause time), the sleep time will be compensated in
> the following periods, too, if it's less than 1 second.
>
> Accumulated errors are carefully avoided as long as the max pause area
> is not hitted.
>
> Pseudo code:
>
> period = pages_dirtied / task_ratelimit;
> think = jiffies - dirty_paused_when;
> pause = period - think;
>
> 1) normal case: period > think
>
> pause = period - think
> dirty_paused_when = jiffies + pause
> nr_dirtied = 0
>
> period time
> |===============================>|
> think time pause time
> |===============>|==============>|
> ------|----------------|---------------|------------------------
> dirty_paused_when jiffies
>
>
> 2) no pause case: period <= think
>
> don't pause; reduce future pause time by:
> dirty_paused_when += period
> nr_dirtied = 0
>
> period time
> |===============================>|
> think time
> |===================================================>|
> ------|--------------------------------+-------------------|----
> dirty_paused_when jiffies
>
> Acked-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> Signed-off-by: Wu Fengguang <fengguang.wu@...el.com>
Looks good. You can add:
Acked-by: Jan Kara <jack@...e.cz>
Honza
> ---
> include/linux/sched.h | 1
> include/trace/events/writeback.h | 14 ++++++++---
> kernel/fork.c | 1
> mm/page-writeback.c | 36 +++++++++++++++++++++++++----
> 4 files changed, 45 insertions(+), 7 deletions(-)
>
> --- linux-next.orig/include/linux/sched.h 2011-11-17 20:12:22.000000000 +0800
> +++ linux-next/include/linux/sched.h 2011-11-17 20:12:35.000000000 +0800
> @@ -1527,6 +1527,7 @@ struct task_struct {
> */
> int nr_dirtied;
> int nr_dirtied_pause;
> + unsigned long dirty_paused_when; /* start of a write-and-pause period */
>
> #ifdef CONFIG_LATENCYTOP
> int latency_record_count;
> --- linux-next.orig/mm/page-writeback.c 2011-11-17 20:12:22.000000000 +0800
> +++ linux-next/mm/page-writeback.c 2011-11-17 20:12:38.000000000 +0800
> @@ -1010,6 +1010,7 @@ static void balance_dirty_pages(struct a
> unsigned long background_thresh;
> unsigned long dirty_thresh;
> unsigned long bdi_thresh;
> + long period;
> long pause = 0;
> long uninitialized_var(max_pause);
> bool dirty_exceeded = false;
> @@ -1020,6 +1021,8 @@ static void balance_dirty_pages(struct a
> unsigned long start_time = jiffies;
>
> for (;;) {
> + unsigned long now = jiffies;
> +
> /*
> * Unstable writes are a feature of certain networked
> * filesystems (i.e. NFS) in which data may have been
> @@ -1039,8 +1042,11 @@ static void balance_dirty_pages(struct a
> */
> freerun = dirty_freerun_ceiling(dirty_thresh,
> background_thresh);
> - if (nr_dirty <= freerun)
> + if (nr_dirty <= freerun) {
> + current->dirty_paused_when = now;
> + current->nr_dirtied = 0;
> break;
> + }
>
> if (unlikely(!writeback_in_progress(bdi)))
> bdi_start_background_writeback(bdi);
> @@ -1098,10 +1104,21 @@ static void balance_dirty_pages(struct a
> task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
> RATELIMIT_CALC_SHIFT;
> if (unlikely(task_ratelimit == 0)) {
> + period = max_pause;
> pause = max_pause;
> goto pause;
> }
> - pause = HZ * pages_dirtied / task_ratelimit;
> + period = HZ * pages_dirtied / task_ratelimit;
> + pause = period;
> + if (current->dirty_paused_when)
> + pause -= now - current->dirty_paused_when;
> + /*
> + * For less than 1s think time (ext3/4 may block the dirtier
> + * for up to 800ms from time to time on 1-HDD; so does xfs,
> + * however at much less frequency), try to compensate it in
> + * future periods by updating the virtual time; otherwise just
> + * do a reset, as it may be a light dirtier.
> + */
> if (unlikely(pause <= 0)) {
> trace_balance_dirty_pages(bdi,
> dirty_thresh,
> @@ -1112,8 +1129,16 @@ static void balance_dirty_pages(struct a
> dirty_ratelimit,
> task_ratelimit,
> pages_dirtied,
> + period,
> pause,
> start_time);
> + if (pause < -HZ) {
> + current->dirty_paused_when = now;
> + current->nr_dirtied = 0;
> + } else if (period) {
> + current->dirty_paused_when += period;
> + current->nr_dirtied = 0;
> + }
> pause = 1; /* avoid resetting nr_dirtied_pause below */
> break;
> }
> @@ -1129,11 +1154,15 @@ pause:
> dirty_ratelimit,
> task_ratelimit,
> pages_dirtied,
> + period,
> pause,
> start_time);
> __set_current_state(TASK_KILLABLE);
> io_schedule_timeout(pause);
>
> + current->dirty_paused_when = now + pause;
> + current->nr_dirtied = 0;
> +
> /*
> * This is typically equal to (nr_dirty < dirty_thresh) and can
> * also keep "1000+ dd on a slow USB stick" under control.
> @@ -1148,11 +1177,10 @@ pause:
> if (!dirty_exceeded && bdi->dirty_exceeded)
> bdi->dirty_exceeded = 0;
>
> - current->nr_dirtied = 0;
> if (pause == 0) { /* in freerun area */
> current->nr_dirtied_pause =
> dirty_poll_interval(nr_dirty, dirty_thresh);
> - } else if (pause <= max_pause / 4 &&
> + } else if (period <= max_pause / 4 &&
> pages_dirtied >= current->nr_dirtied_pause) {
> current->nr_dirtied_pause = clamp_val(
> dirty_ratelimit * (max_pause / 2) / HZ,
> --- linux-next.orig/kernel/fork.c 2011-11-17 20:12:23.000000000 +0800
> +++ linux-next/kernel/fork.c 2011-11-17 20:12:35.000000000 +0800
> @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(
>
> p->nr_dirtied = 0;
> p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
> + p->dirty_paused_when = 0;
>
> /*
> * Ok, make it visible to the rest of the system.
> --- linux-next.orig/include/trace/events/writeback.h 2011-11-17 19:13:41.000000000 +0800
> +++ linux-next/include/trace/events/writeback.h 2011-11-17 20:12:35.000000000 +0800
> @@ -289,12 +289,13 @@ TRACE_EVENT(balance_dirty_pages,
> unsigned long dirty_ratelimit,
> unsigned long task_ratelimit,
> unsigned long dirtied,
> + unsigned long period,
> long pause,
> unsigned long start_time),
>
> TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
> dirty_ratelimit, task_ratelimit,
> - dirtied, pause, start_time),
> + dirtied, period, pause, start_time),
>
> TP_STRUCT__entry(
> __array( char, bdi, 32)
> @@ -309,6 +310,8 @@ TRACE_EVENT(balance_dirty_pages,
> __field(unsigned int, dirtied_pause)
> __field(unsigned long, paused)
> __field( long, pause)
> + __field(unsigned long, period)
> + __field( long, think)
> ),
>
> TP_fast_assign(
> @@ -325,6 +328,9 @@ TRACE_EVENT(balance_dirty_pages,
> __entry->task_ratelimit = KBps(task_ratelimit);
> __entry->dirtied = dirtied;
> __entry->dirtied_pause = current->nr_dirtied_pause;
> + __entry->think = current->dirty_paused_when == 0 ? 0 :
> + (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
> + __entry->period = period * 1000 / HZ;
> __entry->pause = pause * 1000 / HZ;
> __entry->paused = (jiffies - start_time) * 1000 / HZ;
> ),
> @@ -335,7 +341,7 @@ TRACE_EVENT(balance_dirty_pages,
> "bdi_setpoint=%lu bdi_dirty=%lu "
> "dirty_ratelimit=%lu task_ratelimit=%lu "
> "dirtied=%u dirtied_pause=%u "
> - "paused=%lu pause=%ld",
> + "paused=%lu pause=%ld period=%lu think=%ld",
> __entry->bdi,
> __entry->limit,
> __entry->setpoint,
> @@ -347,7 +353,9 @@ TRACE_EVENT(balance_dirty_pages,
> __entry->dirtied,
> __entry->dirtied_pause,
> __entry->paused, /* ms */
> - __entry->pause /* ms */
> + __entry->pause, /* ms */
> + __entry->period, /* ms */
> + __entry->think /* ms */
> )
> );
>
>
>
--
Jan Kara <jack@...e.cz>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists