lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <E1J8RZS-0005kk-TI@localhost>
Date:	Sat, 29 Dec 2007 10:37:06 +0800
From:	WU Fengguang <wfg@...l.ustc.edu.cn>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	Sascha Warner <prx@...atas.ath.cx>, linux-kernel@...r.kernel.org,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>
Subject: Re: [PATCH 00/11] writeback bug fixes and simplifications

On Fri, Dec 28, 2007 at 03:01:11PM -0800, Andrew Morton wrote:
> On Thu, 27 Dec 2007 23:08:40 +0100 Sascha Warner <prx@...atas.ath.cx> wrote:
> 
> > Hi,
> > 
> > I applied your patches to 2.6.24-rc6-mm1, but now I am faced with one
> > pdflush often using 100% CPU for a long time. There seem to be some rare
> > pauses from its 100% usage, however.
> > 
> > On ~23 minutes uptime i have ~19 minutes pdflush runtime.
> > 
> > This is on E6600, x86_64, 2 Gig RAM, SATA HDD, running on gentoo ~x64_64
> > 
> > Let me know if you need more info.

Sascha, Thank you for the testing.

Replace the first patch with this one should help:

---
Subject: writeback: introduce s_more_io_wait for inodes to be re-synced after a while

Introduce super_block.s_more_io_wait and writeback_control.more_io_wait.
They are for inodes that for some reason cannot be synced immediately.

These inodes will be moved to s_more_io_wait and retried after a while(waiting
up to 0.1s).  The normal lots-of-dirty-pages inodes will be moved to more_io
and be synced after other inodes in s_io have been serviced(no sleep).

The data flow is made more simple:

s_dirty --> s_io --> s_more_io/s_more_io_wait --+
             ^                                  |
	     |                                  |
	     +----------------------------------+

- to fill s_io:
		s_more_io +
		s_dirty(expired) +
		s_more_io_wait
				---> s_io
- to drain s_io:
		s_io -+--> clean inodes goto inode_in_use/inode_unused
		      |
		      +--> s_more_io
		      |
		      +--> s_more_io_wait

Obviously there's no ordering or starvation problems in the queues:
- s_dirty is now a strict FIFO queue
- inode.dirtied_when now really means the first dirty time
- once exipired, the dirty inode will stay in s_*io* queues until made clean
- the dirty inodes in s_*io* queues will be revisted in order: no starvation

Cc: Michael Rubin <mrubin@...gle.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Signed-off-by: Fengguang Wu <wfg@...l.ustc.edu.cn>
---
 fs/fs-writeback.c         |   26 ++++++++++++----
 fs/super.c                |    1 
 include/linux/fs.h        |    1 
 include/linux/writeback.h |    1 
 mm/page-writeback.c       |   56 ++++++++++++++++++++----------------
 5 files changed, 55 insertions(+), 30 deletions(-)

--- linux-2.6.24-rc6-mm1.orig/fs/fs-writeback.c
+++ linux-2.6.24-rc6-mm1/fs/fs-writeback.c
@@ -172,6 +172,14 @@ static void requeue_io(struct inode *ino
 	list_move(&inode->i_list, &inode->i_sb->s_more_io);
 }
 
+/*
+ * The inode should be retried after _sleeping_ for a while.
+ */
+static void requeue_io_wait(struct inode *inode)
+{
+	list_move(&inode->i_list, &inode->i_sb->s_more_io_wait);
+}
+
 static void inode_sync_complete(struct inode *inode)
 {
 	/*
@@ -206,13 +214,15 @@ static void queue_io(struct super_block 
 {
 	list_splice_init(&sb->s_more_io, sb->s_io.prev);
 	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+	list_splice_init(&sb->s_more_io_wait, sb->s_io.prev);
 }
 
 int sb_has_dirty_inodes(struct super_block *sb)
 {
-	return !list_empty(&sb->s_dirty) ||
-	       !list_empty(&sb->s_io) ||
-	       !list_empty(&sb->s_more_io);
+	return !list_empty(&sb->s_dirty)   ||
+	       !list_empty(&sb->s_io)      ||
+	       !list_empty(&sb->s_more_io) ||
+	       !list_empty(&sb->s_more_io_wait);
 }
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 
@@ -472,11 +482,15 @@ int generic_sync_sb_inodes(struct super_
 		iput(inode);
 		cond_resched();
 		spin_lock(&inode_lock);
-		if (wbc->nr_to_write <= 0)
+		if (wbc->nr_to_write <= 0) {
+			wbc->more_io = 1;
 			break;
+		}
+		if (!list_empty(&sb->s_more_io))
+			wbc->more_io = 1;
+		if (!list_empty(&sb->s_more_io_wait))
+			wbc->more_io_wait = 1;
 	}
-	if (!list_empty(&sb->s_more_io))
-		wbc->more_io = 1;
 	spin_unlock(&inode_lock);
 	return ret;		/* Leave any unwritten inodes on s_io */
 }
--- linux-2.6.24-rc6-mm1.orig/mm/page-writeback.c
+++ linux-2.6.24-rc6-mm1/mm/page-writeback.c
@@ -543,6 +543,34 @@ void throttle_vm_writeout(gfp_t gfp_mask
 }
 
 /*
+ * Write back up to MAX_WRITEBACK_PAGES.
+ * Return true if there's no more work.
+ */
+static int writeback_some_pages(struct writeback_control *wbc, int nr)
+{
+	int all_done = 0;
+
+	wbc->more_io = 0;
+	wbc->more_io_wait = 0;
+	wbc->encountered_congestion = 0;
+	wbc->nr_to_write = nr;
+
+	writeback_inodes(wbc);
+
+	if (wbc->encountered_congestion)
+		congestion_wait(WRITE, HZ/10);
+
+	if (wbc->more_io)
+		;
+	else if (wbc->more_io_wait)
+		congestion_wait(WRITE, HZ/10);
+	else
+		all_done = 1;
+
+	return all_done;
+}
+
+/*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
  */
@@ -553,7 +581,6 @@ static void background_writeout(unsigned
 		.bdi		= NULL,
 		.sync_mode	= WB_SYNC_NONE,
 		.older_than_this = NULL,
-		.nr_to_write	= 0,
 		.nonblocking	= 1,
 		.range_cyclic	= 1,
 	};
@@ -567,19 +594,9 @@ static void background_writeout(unsigned
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
 			break;
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		wbc.pages_skipped = 0;
-		writeback_inodes(&wbc);
+		if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES))
+			break;
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			/* Wrote less than expected */
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(WRITE, HZ/10);
-			else
-				break;
-		}
 	}
 }
 
@@ -627,7 +644,6 @@ static void wb_kupdate(unsigned long arg
 		.bdi		= NULL,
 		.sync_mode	= WB_SYNC_NONE,
 		.older_than_this = &oldest_jif,
-		.nr_to_write	= 0,
 		.nonblocking	= 1,
 		.for_kupdate	= 1,
 		.range_cyclic	= 1,
@@ -642,16 +658,8 @@ static void wb_kupdate(unsigned long arg
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		writeback_inodes(&wbc);
-		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(WRITE, HZ/10);
-			else
-				break;	/* All the old data is written */
-		}
+		if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES))
+			break;
 		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 	}
 	if (time_before(next_jif, jiffies + HZ))
--- linux-2.6.24-rc6-mm1.orig/fs/super.c
+++ linux-2.6.24-rc6-mm1/fs/super.c
@@ -64,6 +64,7 @@ static struct super_block *alloc_super(s
 		INIT_LIST_HEAD(&s->s_dirty);
 		INIT_LIST_HEAD(&s->s_io);
 		INIT_LIST_HEAD(&s->s_more_io);
+		INIT_LIST_HEAD(&s->s_more_io_wait);
 		INIT_LIST_HEAD(&s->s_files);
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
--- linux-2.6.24-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.24-rc6-mm1/include/linux/fs.h
@@ -1011,6 +1011,7 @@ struct super_block {
 	struct list_head	s_dirty;	/* dirty inodes */
 	struct list_head	s_io;		/* parked for writeback */
 	struct list_head	s_more_io;	/* parked for more writeback */
+	struct list_head	s_more_io_wait;	/* parked for sleep-then-retry */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_files;
 
--- linux-2.6.24-rc6-mm1.orig/include/linux/writeback.h
+++ linux-2.6.24-rc6-mm1/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
 	unsigned for_writepages:1;	/* This is a writepages() call */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
+	unsigned more_io_wait:1;	/* more io to be dispatched after a while */
 };
 
 /*

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ