lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 7 Dec 2011 17:14:40 +0100
From:	Jan Kara <jack@...e.cz>
To:	Wu Fengguang <fengguang.wu@...el.com>
Cc:	linux-fsdevel@...r.kernel.org, Jan Kara <jack@...e.cz>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Christoph Hellwig <hch@....de>,
	Andrew Morton <akpm@...ux-foundation.org>,
	LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 6/7] writeback: dirty ratelimit - think time
 compensation

On Mon 28-11-11 21:53:44, Wu Fengguang wrote:
> Compensate the task's think time when computing the final pause time,
> so that ->dirty_ratelimit can be executed accurately.
> 
>         think time := time spend outside of balance_dirty_pages()
> 
> In the rare case that the task slept longer than the 200ms period time
> (result in negative pause time), the sleep time will be compensated in
> the following periods, too, if it's less than 1 second.
> 
> Accumulated errors are carefully avoided as long as the max pause area
> is not hitted.
> 
> Pseudo code:
> 
>         period = pages_dirtied / task_ratelimit;
>         think = jiffies - dirty_paused_when;
>         pause = period - think;
> 
> 1) normal case: period > think
> 
>         pause = period - think
>         dirty_paused_when = jiffies + pause
>         nr_dirtied = 0
> 
>                              period time
>               |===============================>|
>                   think time      pause time
>               |===============>|==============>|
>         ------|----------------|---------------|------------------------
>         dirty_paused_when   jiffies
> 
> 
> 2) no pause case: period <= think
> 
>         don't pause; reduce future pause time by:
>         dirty_paused_when += period
>         nr_dirtied = 0
> 
>                            period time
>               |===============================>|
>                                   think time
>               |===================================================>|
>         ------|--------------------------------+-------------------|----
>         dirty_paused_when                                       jiffies
> 
> Acked-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
> Signed-off-by: Wu Fengguang <fengguang.wu@...el.com>
  Looks good. You can add:

  Acked-by: Jan Kara <jack@...e.cz>

								Honza

> ---
>  include/linux/sched.h            |    1 
>  include/trace/events/writeback.h |   14 ++++++++---
>  kernel/fork.c                    |    1 
>  mm/page-writeback.c              |   36 +++++++++++++++++++++++++----
>  4 files changed, 45 insertions(+), 7 deletions(-)
> 
> --- linux-next.orig/include/linux/sched.h	2011-11-17 20:12:22.000000000 +0800
> +++ linux-next/include/linux/sched.h	2011-11-17 20:12:35.000000000 +0800
> @@ -1527,6 +1527,7 @@ struct task_struct {
>  	 */
>  	int nr_dirtied;
>  	int nr_dirtied_pause;
> +	unsigned long dirty_paused_when; /* start of a write-and-pause period */
>  
>  #ifdef CONFIG_LATENCYTOP
>  	int latency_record_count;
> --- linux-next.orig/mm/page-writeback.c	2011-11-17 20:12:22.000000000 +0800
> +++ linux-next/mm/page-writeback.c	2011-11-17 20:12:38.000000000 +0800
> @@ -1010,6 +1010,7 @@ static void balance_dirty_pages(struct a
>  	unsigned long background_thresh;
>  	unsigned long dirty_thresh;
>  	unsigned long bdi_thresh;
> +	long period;
>  	long pause = 0;
>  	long uninitialized_var(max_pause);
>  	bool dirty_exceeded = false;
> @@ -1020,6 +1021,8 @@ static void balance_dirty_pages(struct a
>  	unsigned long start_time = jiffies;
>  
>  	for (;;) {
> +		unsigned long now = jiffies;
> +
>  		/*
>  		 * Unstable writes are a feature of certain networked
>  		 * filesystems (i.e. NFS) in which data may have been
> @@ -1039,8 +1042,11 @@ static void balance_dirty_pages(struct a
>  		 */
>  		freerun = dirty_freerun_ceiling(dirty_thresh,
>  						background_thresh);
> -		if (nr_dirty <= freerun)
> +		if (nr_dirty <= freerun) {
> +			current->dirty_paused_when = now;
> +			current->nr_dirtied = 0;
>  			break;
> +		}
>  
>  		if (unlikely(!writeback_in_progress(bdi)))
>  			bdi_start_background_writeback(bdi);
> @@ -1098,10 +1104,21 @@ static void balance_dirty_pages(struct a
>  		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
>  							RATELIMIT_CALC_SHIFT;
>  		if (unlikely(task_ratelimit == 0)) {
> +			period = max_pause;
>  			pause = max_pause;
>  			goto pause;
>  		}
> -		pause = HZ * pages_dirtied / task_ratelimit;
> +		period = HZ * pages_dirtied / task_ratelimit;
> +		pause = period;
> +		if (current->dirty_paused_when)
> +			pause -= now - current->dirty_paused_when;
> +		/*
> +		 * For less than 1s think time (ext3/4 may block the dirtier
> +		 * for up to 800ms from time to time on 1-HDD; so does xfs,
> +		 * however at much less frequency), try to compensate it in
> +		 * future periods by updating the virtual time; otherwise just
> +		 * do a reset, as it may be a light dirtier.
> +		 */
>  		if (unlikely(pause <= 0)) {
>  			trace_balance_dirty_pages(bdi,
>  						  dirty_thresh,
> @@ -1112,8 +1129,16 @@ static void balance_dirty_pages(struct a
>  						  dirty_ratelimit,
>  						  task_ratelimit,
>  						  pages_dirtied,
> +						  period,
>  						  pause,
>  						  start_time);
> +			if (pause < -HZ) {
> +				current->dirty_paused_when = now;
> +				current->nr_dirtied = 0;
> +			} else if (period) {
> +				current->dirty_paused_when += period;
> +				current->nr_dirtied = 0;
> +			}
>  			pause = 1; /* avoid resetting nr_dirtied_pause below */
>  			break;
>  		}
> @@ -1129,11 +1154,15 @@ pause:
>  					  dirty_ratelimit,
>  					  task_ratelimit,
>  					  pages_dirtied,
> +					  period,
>  					  pause,
>  					  start_time);
>  		__set_current_state(TASK_KILLABLE);
>  		io_schedule_timeout(pause);
>  
> +		current->dirty_paused_when = now + pause;
> +		current->nr_dirtied = 0;
> +
>  		/*
>  		 * This is typically equal to (nr_dirty < dirty_thresh) and can
>  		 * also keep "1000+ dd on a slow USB stick" under control.
> @@ -1148,11 +1177,10 @@ pause:
>  	if (!dirty_exceeded && bdi->dirty_exceeded)
>  		bdi->dirty_exceeded = 0;
>  
> -	current->nr_dirtied = 0;
>  	if (pause == 0) { /* in freerun area */
>  		current->nr_dirtied_pause =
>  				dirty_poll_interval(nr_dirty, dirty_thresh);
> -	} else if (pause <= max_pause / 4 &&
> +	} else if (period <= max_pause / 4 &&
>  		   pages_dirtied >= current->nr_dirtied_pause) {
>  		current->nr_dirtied_pause = clamp_val(
>  					dirty_ratelimit * (max_pause / 2) / HZ,
> --- linux-next.orig/kernel/fork.c	2011-11-17 20:12:23.000000000 +0800
> +++ linux-next/kernel/fork.c	2011-11-17 20:12:35.000000000 +0800
> @@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(
>  
>  	p->nr_dirtied = 0;
>  	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
> +	p->dirty_paused_when = 0;
>  
>  	/*
>  	 * Ok, make it visible to the rest of the system.
> --- linux-next.orig/include/trace/events/writeback.h	2011-11-17 19:13:41.000000000 +0800
> +++ linux-next/include/trace/events/writeback.h	2011-11-17 20:12:35.000000000 +0800
> @@ -289,12 +289,13 @@ TRACE_EVENT(balance_dirty_pages,
>  		 unsigned long dirty_ratelimit,
>  		 unsigned long task_ratelimit,
>  		 unsigned long dirtied,
> +		 unsigned long period,
>  		 long pause,
>  		 unsigned long start_time),
>  
>  	TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
>  		dirty_ratelimit, task_ratelimit,
> -		dirtied, pause, start_time),
> +		dirtied, period, pause, start_time),
>  
>  	TP_STRUCT__entry(
>  		__array(	 char,	bdi, 32)
> @@ -309,6 +310,8 @@ TRACE_EVENT(balance_dirty_pages,
>  		__field(unsigned int,	dirtied_pause)
>  		__field(unsigned long,	paused)
>  		__field(	 long,	pause)
> +		__field(unsigned long,	period)
> +		__field(	 long,	think)
>  	),
>  
>  	TP_fast_assign(
> @@ -325,6 +328,9 @@ TRACE_EVENT(balance_dirty_pages,
>  		__entry->task_ratelimit	= KBps(task_ratelimit);
>  		__entry->dirtied	= dirtied;
>  		__entry->dirtied_pause	= current->nr_dirtied_pause;
> +		__entry->think		= current->dirty_paused_when == 0 ? 0 :
> +			 (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
> +		__entry->period		= period * 1000 / HZ;
>  		__entry->pause		= pause * 1000 / HZ;
>  		__entry->paused		= (jiffies - start_time) * 1000 / HZ;
>  	),
> @@ -335,7 +341,7 @@ TRACE_EVENT(balance_dirty_pages,
>  		  "bdi_setpoint=%lu bdi_dirty=%lu "
>  		  "dirty_ratelimit=%lu task_ratelimit=%lu "
>  		  "dirtied=%u dirtied_pause=%u "
> -		  "paused=%lu pause=%ld",
> +		  "paused=%lu pause=%ld period=%lu think=%ld",
>  		  __entry->bdi,
>  		  __entry->limit,
>  		  __entry->setpoint,
> @@ -347,7 +353,9 @@ TRACE_EVENT(balance_dirty_pages,
>  		  __entry->dirtied,
>  		  __entry->dirtied_pause,
>  		  __entry->paused,	/* ms */
> -		  __entry->pause	/* ms */
> +		  __entry->pause,	/* ms */
> +		  __entry->period,	/* ms */
> +		  __entry->think	/* ms */
>  	  )
>  );
>  
> 
> 
-- 
Jan Kara <jack@...e.cz>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ