Message-ID: <20090906035537.GA16063@localhost>
Date: Sun, 6 Sep 2009 11:55:37 +0800
From: Wu Fengguang <fengguang.wu@...el.com>
To: Richard Kennedy <richard@....demon.co.uk>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
"chris.mason" <chris.mason@...cle.com>,
linux-mm <linux-mm@...ck.org>,
lkml <linux-kernel@...r.kernel.org>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Jens Axboe <jens.axboe@...cle.com>,
linux-fsdevel <linux-fsdevel@...r.kernel.org>
Subject: Re: [RFC PATCH] v2 mm: balance_dirty_pages. reduce calls to
global_page_state to reduce cache references
On Fri, Sep 04, 2009 at 07:05:30PM +0800, Richard Kennedy wrote:
> Reducing the number of times balance_dirty_pages calls global_page_state
> reduces the cache references and so improves write performance on a
> variety of workloads.
>
> 'perf stat' output from simple fio write tests shows the reduction in cache
> references.
> The test is fio 'write,mmap,600Mb,pre_read' on an AMD AthlonX2 with
> 3GB of memory (dirty_threshold approx 600MB), running each test 10 times,
> dropping the fastest & slowest values, then taking the average & standard
> deviation.
>
> average (s.d.) in millions (10^6)
> 2.6.31-rc8 648.6 (14.6)
> +patch 620.1 (16.5)
>
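Just for illustration, not part of the patch: a minimal sketch of the
aggregation described above, assuming the fastest and slowest runs have
already been dropped and using made-up per-run counts:

	/*
	 * Compute "average (s.d.)" over the remaining 8 runs; the counts are
	 * in millions of cache references and are hypothetical.
	 */
	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double counts[8] = { 648, 651, 630, 655, 660, 645, 642, 650 };
		double sum = 0, sumsq = 0, mean, sd;
		int i, n = 8;

		for (i = 0; i < n; i++)
			sum += counts[i];
		mean = sum / n;
		for (i = 0; i < n; i++)
			sumsq += (counts[i] - mean) * (counts[i] - mean);
		sd = sqrt(sumsq / (n - 1));	/* sample standard deviation */
		printf("average %.1f (s.d. %.1f)\n", mean, sd);
		return 0;
	}
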
> This reduction is achieved by dropping clip_bdi_dirty_limit, which rereads
> the counters to apply the dirty_threshold, and moving that check up into
> balance_dirty_pages, where the counters have already been read.
>
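For illustration (this mirrors the corresponding hunk in the patch below),
the threshold check is now expressed directly on the counters the loop has
already read:

	dirty_exceeded =
		(bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
		|| (nr_reclaimable + nr_writeback >= dirty_thresh);

	if (!dirty_exceeded)
		break;
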
> Also, rearranging the for loop so that it contains only one copy of the
> limit tests allows the pdflush test after the loop to use the local copies
> of the counters rather than rereading them.
>
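That is, the check after the loop can reuse nr_reclaimable from the last
iteration instead of calling global_page_state again; in the patch below it
becomes roughly:

	if ((laptop_mode && pages_written) || (!laptop_mode &&
	    (nr_reclaimable > background_thresh)))
		pdflush_operation(background_writeout, 0);
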
> In the common case with no throttling it now calls global_page_state 5
> fewer times and bdi_stat 2 fewer.
>
> This version includes the changes suggested by
> Wu Fengguang <fengguang.wu@...el.com>
It seems that a redundant pages_written test can be removed with the change below:
--- linux.orig/mm/page-writeback.c 2009-09-06 11:44:39.000000000 +0800
+++ linux/mm/page-writeback.c 2009-09-06 11:44:42.000000000 +0800
@@ -526,10 +526,6 @@ static void balance_dirty_pages(struct a
(background_thresh + dirty_thresh) / 2)
break;
- /* done enough? */
- if (pages_written >= write_chunk)
- break;
-
if (!bdi->dirty_exceeded)
bdi->dirty_exceeded = 1;
@@ -547,7 +543,7 @@ static void balance_dirty_pages(struct a
pages_written += write_chunk - wbc.nr_to_write;
/* don't wait if we've done enough */
if (pages_written >= write_chunk)
- continue;
+ break;
}
congestion_wait(BLK_RW_ASYNC, HZ/10);
}
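With that change the tail of the loop would look roughly like this, so a task
that has written its chunk leaves right away instead of taking one more pass
over the counters:

	if (bdi_nr_reclaimable > bdi_thresh) {
		writeback_inodes(&wbc);
		pages_written += write_chunk - wbc.nr_to_write;
		/* done enough, no need to re-read the counters */
		if (pages_written >= write_chunk)
			break;
	}
	congestion_wait(BLK_RW_ASYNC, HZ/10);
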
Otherwise the patch looks good to me. Thank you for the nice work!
Reviewed-by: Wu Fengguang <fengguang.wu@...el.com>
> Signed-off-by: Richard Kennedy <richard@....demon.co.uk>
> ----
> Thanks to everybody for the feedback & suggestions.
> This patch is against 2.6.31-rc8
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 81627eb..9581359 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -260,32 +260,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
> }
> }
>
> -/*
> - * Clip the earned share of dirty pages to that which is actually available.
> - * This avoids exceeding the total dirty_limit when the floating averages
> - * fluctuate too quickly.
> - */
> -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
> - unsigned long dirty, unsigned long *pbdi_dirty)
> -{
> - unsigned long avail_dirty;
> -
> - avail_dirty = global_page_state(NR_FILE_DIRTY) +
> - global_page_state(NR_WRITEBACK) +
> - global_page_state(NR_UNSTABLE_NFS) +
> - global_page_state(NR_WRITEBACK_TEMP);
> -
> - if (avail_dirty < dirty)
> - avail_dirty = dirty - avail_dirty;
> - else
> - avail_dirty = 0;
> -
> - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
> - bdi_stat(bdi, BDI_WRITEBACK);
> -
> - *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
> -}
> -
> static inline void task_dirties_fraction(struct task_struct *tsk,
> long *numerator, long *denominator)
> {
> @@ -478,7 +452,6 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
> bdi_dirty = dirty * bdi->max_ratio / 100;
>
> *pbdi_dirty = bdi_dirty;
> - clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
> task_dirty_limit(current, pbdi_dirty);
> }
> }
> @@ -499,7 +472,7 @@ static void balance_dirty_pages(struct address_space *mapping)
> unsigned long bdi_thresh;
> unsigned long pages_written = 0;
> unsigned long write_chunk = sync_writeback_pages();
> -
> + int dirty_exceeded;
> struct backing_dev_info *bdi = mapping->backing_dev_info;
>
> for (;;) {
> @@ -512,16 +485,36 @@ static void balance_dirty_pages(struct address_space *mapping)
> };
>
> get_dirty_limits(&background_thresh, &dirty_thresh,
> - &bdi_thresh, bdi);
> + &bdi_thresh, bdi);
>
> nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> - global_page_state(NR_UNSTABLE_NFS);
> - nr_writeback = global_page_state(NR_WRITEBACK);
> + global_page_state(NR_UNSTABLE_NFS);
> + nr_writeback = global_page_state(NR_WRITEBACK) +
> + global_page_state(NR_WRITEBACK_TEMP);
>
> - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> + /*
> + * In order to avoid the stacked BDI deadlock we need
> + * to ensure we accurately count the 'dirty' pages when
> + * the threshold is low.
> + *
> + * Otherwise it would be possible to get thresh+n pages
> + * reported dirty, even though there are thresh-m pages
> + * actually dirty; with m+n sitting in the percpu
> + * deltas.
> + */
> + if (bdi_thresh < 2*bdi_stat_error(bdi)) {
> + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> + bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
> + } else if (bdi_nr_reclaimable) {
> + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> + }
>
> - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> + dirty_exceeded =
> + (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
> + || (nr_reclaimable + nr_writeback >= dirty_thresh);
> +
> + if (!dirty_exceeded)
> break;
>
> /*
> @@ -530,7 +523,11 @@ static void balance_dirty_pages(struct address_space *mapping)
> * when the bdi limits are ramping up.
> */
> if (nr_reclaimable + nr_writeback <
> - (background_thresh + dirty_thresh) / 2)
> + (background_thresh + dirty_thresh) / 2)
> + break;
> +
> + /* done enough? */
> + if (pages_written >= write_chunk)
> break;
>
> if (!bdi->dirty_exceeded)
> @@ -548,38 +545,14 @@ static void balance_dirty_pages(struct address_space *mapping)
> if (bdi_nr_reclaimable > bdi_thresh) {
> writeback_inodes(&wbc);
> pages_written += write_chunk - wbc.nr_to_write;
> - get_dirty_limits(&background_thresh, &dirty_thresh,
> - &bdi_thresh, bdi);
> - }
> -
> - /*
> - * In order to avoid the stacked BDI deadlock we need
> - * to ensure we accurately count the 'dirty' pages when
> - * the threshold is low.
> - *
> - * Otherwise it would be possible to get thresh+n pages
> - * reported dirty, even though there are thresh-m pages
> - * actually dirty; with m+n sitting in the percpu
> - * deltas.
> - */
> - if (bdi_thresh < 2*bdi_stat_error(bdi)) {
> - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> - bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
> - } else if (bdi_nr_reclaimable) {
> - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> + /* don't wait if we've done enough */
> + if (pages_written >= write_chunk)
> + continue;
> }
> -
> - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> - break;
> - if (pages_written >= write_chunk)
> - break; /* We've done our duty */
> -
> congestion_wait(BLK_RW_ASYNC, HZ/10);
> }
>
> - if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
> - bdi->dirty_exceeded)
> + if (!dirty_exceeded && bdi->dirty_exceeded)
> bdi->dirty_exceeded = 0;
>
> if (writeback_in_progress(bdi))
> @@ -593,10 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping)
> * In normal mode, we start background writeout at the lower
> * background_thresh, to keep the amount of dirty memory low.
> */
> - if ((laptop_mode && pages_written) ||
> - (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
> - + global_page_state(NR_UNSTABLE_NFS)
> - > background_thresh)))
> + if ((laptop_mode && pages_written) || (!laptop_mode &&
> + (nr_reclaimable > background_thresh)))
> pdflush_operation(background_writeout, 0);
> }
>
>