linux-kernel - Re: do_sync() and XFSQA test 182 failures....

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20081031004814.GN4985@disturbed>
Date:	Fri, 31 Oct 2008 11:48:14 +1100
From:	Dave Chinner <david@...morbit.com>
To:	Christoph Hellwig <hch@...radead.org>, xfs@....sgi.com,
	linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: do_sync() and XFSQA test 182 failures....

On Fri, Oct 31, 2008 at 11:12:49AM +1100, Dave Chinner wrote:
> On Thu, Oct 30, 2008 at 06:46:25PM -0400, Christoph Hellwig wrote:
> > On Thu, Oct 30, 2008 at 07:50:20PM +1100, Dave Chinner wrote:
> > > Are there any other ways that we can get a custom ->do_sync
> > > method for XFS? I'd prefer a custom method so we don't have to
> > > revalidate every linux filesystem, especially as XFS already has
> > > everything it needs to provide it's own sync method (used for
> > > freezing) and a test suite to validate it is working correctly.....
> > 
> > I think having a method for this would be useful.  And given that
> > a proper sync should be exactly the same as a filesysytem freeze
> > we should maybe use one method for both of those and use the chance
> > to give filesystem better control over the freeze process?
> 
> Right - that's exactly where we should be going with this, I think.
> I'd suggest two callouts, perhaps: ->sync_data and ->sync_metadata.
> The freeze code can then still operate in two stages, and we can
> also use then for separating data and inode writeback in pdflush....
> 
> FWIW, I mentioned doing this sort of thing here:
> 
> http://xfs.org/index.php/Improving_inode_Caching#Avoiding_the_Generic_pdflush_Code
> 
> I think I'll look at redoing do_sync() to provide a custom sync
> method before trying to fix XFS....

As it is, here's the somewhat cleaned up patch that demonstrates
the changes needed to make xfsqa test 182 pass....

Cheers,

Dave.
-- 
Dave Chinner
david@...morbit.com

XFS, RFC: demonstration fix of test 182

This patch shows the various changes needed to ensure sync(1)
leave the filesystem in a consistent state. It shows the different
format of do_sync(), the changes to mark the inode dirty before
data I/O and the changes to xfs_fs_write_super() and the
functions it calls to ensure the log gets covered at the
end of the sync.

This is in no way a real fix for the problem, merely a demonstration
of the problem. The real fix is to make the XFS sync code use
the same flush algorithm as freeze, but that first requires VFS
level changes to provide a method for doing so.
---
 fs/sync.c                    |    7 +++----
 fs/xfs/linux-2.6/xfs_aops.c  |   38 ++++++++++++++++++++++++++++----------
 fs/xfs/linux-2.6/xfs_super.c |   29 +++++++++++------------------
 fs/xfs/linux-2.6/xfs_sync.c  |   41 ++++++++++++++++++++++++++---------------
 fs/xfs/linux-2.6/xfs_sync.h  |    2 +-
 5 files changed, 69 insertions(+), 48 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index 137b550..69f40b7 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -23,13 +23,12 @@
  */
 static void do_sync(unsigned long wait)
 {
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	DQUOT_SYNC(NULL);
-	sync_supers();		/* Write the superblocks */
 	sync_filesystems(0);	/* Start syncing the filesystems */
 	sync_filesystems(wait);	/* Waitingly sync the filesystems */
+	DQUOT_SYNC(NULL);
+	sync_supers();		/* Write the superblocks */
 	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	sync_supers();		/* Write the superblocks, again */
 	if (!wait)
 		printk("Emergency Sync complete\n");
 	if (unlikely(laptop_mode))
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index f8fa620..cb79a21 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -143,19 +143,37 @@ xfs_destroy_ioend(
 }
 
 /*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+	xfs_ioend_t		*ioend)
+{
+	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
+	xfs_fsize_t		isize;
+	xfs_fsize_t		bsize;
+
+	bsize = ioend->io_offset + ioend->io_size;
+	isize = MAX(ip->i_size, ip->i_new_size);
+	isize = MIN(isize, bsize);
+	return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
  * Update on-disk file size now that data has been written to disk.
  * The current in-memory file size is i_size.  If a write is beyond
  * eof i_new_size will be the intended file size until i_size is
  * updated.  If this write does not extend all the way to the valid
  * file size then restrict this update to the end of the write.
  */
+
 STATIC void
 xfs_setfilesize(
 	xfs_ioend_t		*ioend)
 {
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
-	xfs_fsize_t		bsize;
 
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 	ASSERT(ioend->io_type != IOMAP_READ);
@@ -163,19 +181,13 @@ xfs_setfilesize(
 	if (unlikely(ioend->io_error))
 		return;
 
-	bsize = ioend->io_offset + ioend->io_size;
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	isize = MAX(ip->i_size, ip->i_new_size);
-	isize = MIN(isize, bsize);
-
-	if (ip->i_d.di_size < isize) {
+	isize = xfs_ioend_new_eof(ioend);
+	if (isize) {
 		ip->i_d.di_size = isize;
 		ip->i_update_size = 1;
 		xfs_mark_inode_dirty_sync(ip);
 	}
-
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 }
 
@@ -366,10 +378,16 @@ xfs_submit_ioend_bio(
 	struct bio	*bio)
 {
 	atomic_inc(&ioend->io_remaining);
-
 	bio->bi_private = ioend;
 	bio->bi_end_io = xfs_end_bio;
 
+	/*
+	 * if the I/O is beyond EOF we mark the inode dirty immediately
+	 * but don't update the inode size until I/O completion.
+	 */
+	if (xfs_ioend_new_eof(ioend))
+		xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+
 	submit_bio(WRITE, bio);
 	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 	bio_put(bio);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c5cfc1e..bbac9e3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1118,8 +1118,7 @@ xfs_fs_write_super(
 	struct super_block	*sb)
 {
 	if (!(sb->s_flags & MS_RDONLY))
-		xfs_sync_fsdata(XFS_M(sb), 0);
-	sb->s_dirt = 0;
+		xfs_sync_fsdata(XFS_M(sb), SYNC_WAIT);
 }
 
 STATIC int
@@ -1131,22 +1130,16 @@ xfs_fs_sync_super(
 	int			error;
 
 	/*
-	 * Treat a sync operation like a freeze.  This is to work
-	 * around a race in sync_inodes() which works in two phases
-	 * - an asynchronous flush, which can write out an inode
-	 * without waiting for file size updates to complete, and a
-	 * synchronous flush, which wont do anything because the
-	 * async flush removed the inode's dirty flag.  Also
-	 * sync_inodes() will not see any files that just have
-	 * outstanding transactions to be flushed because we don't
-	 * dirty the Linux inode until after the transaction I/O
-	 * completes.
+	 * Treat a blocking sync operation like a data freeze.
+	 * The are effectively the same thing - both need to
+	 * get all the data on disk and wait for it to complete.
+	 *
+	 * Also, no point marking the superblock clean - we'll
+	 * dirty metadata flushing data out....
 	 */
-	if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
-		error = xfs_quiesce_data(mp);
-	else
-		error = xfs_sync_fsdata(mp, 0);
-	sb->s_dirt = 0;
+	if (unlikely(sb->s_frozen == SB_FREEZE_WRITE))
+		wait = 1;
+	error = xfs_quiesce_data(mp, wait);
 
 	if (unlikely(laptop_mode)) {
 		int	prev_sync_seq = mp->m_sync_seq;
@@ -1283,7 +1276,7 @@ xfs_fs_remount(
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
-		xfs_quiesce_data(mp);
+		xfs_quiesce_data(mp, 1);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index d12d31b..975534b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -64,8 +64,6 @@ xfs_sync_inodes_ag(
 	int		last_error = 0;
 	int		fflag = XFS_B_ASYNC;
 
-	if (flags & SYNC_DELWRI)
-		fflag = XFS_B_DELWRI;
 	if (flags & SYNC_WAIT)
 		fflag = 0;		/* synchronous overrides all */
 
@@ -127,6 +125,8 @@ xfs_sync_inodes_ag(
 		/*
 		 * If we have to flush data or wait for I/O completion
 		 * we need to hold the iolock.
+		 *
+		 * XXX: this doesn't catch I/O in progress
 		 */
 		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
 			xfs_ilock(ip, XFS_IOLOCK_SHARED);
@@ -202,11 +202,15 @@ xfs_sync_inodes(
 STATIC int
 xfs_commit_dummy_trans(
 	struct xfs_mount	*mp,
-	uint			log_flags)
+	uint			flags)
 {
 	struct xfs_inode	*ip = mp->m_rootip;
 	struct xfs_trans	*tp;
 	int			error;
+	int			log_flags = XFS_LOG_FORCE;
+
+	if (flags & SYNC_WAIT)
+		log_flags |= XFS_LOG_SYNC;
 
 	/*
 	 * Put a dummy transaction in the log to tell recovery
@@ -224,13 +228,12 @@ xfs_commit_dummy_trans(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_ihold(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	/* XXX(hch): ignoring the error here.. */
 	error = xfs_trans_commit(tp, 0);
-
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
+	/* the log force ensures this transaction is pushed to disk */
 	xfs_log_force(mp, 0, log_flags);
-	return 0;
+	return error;
 }
 
 int
@@ -270,6 +273,7 @@ xfs_sync_fsdata(
 		 */
 		if (XFS_BUF_ISPINNED(bp))
 			xfs_log_force(mp, 0, XFS_LOG_FORCE);
+		xfs_flush_buftarg(mp->m_ddev_targp, 1);
 	}
 
 
@@ -278,7 +282,13 @@ xfs_sync_fsdata(
 	else
 		XFS_BUF_ASYNC(bp);
 
-	return xfs_bwrite(mp, bp);
+	error = xfs_bwrite(mp, bp);
+	if (!error && xfs_log_need_covered(mp)) {
+		error = xfs_commit_dummy_trans(mp, (flags & SYNC_WAIT));
+		if (flags & SYNC_WAIT)
+			mp->m_super->s_dirt = 0;
+	}
+	return error;
 
  out_brelse:
 	xfs_buf_relse(bp);
@@ -305,18 +315,21 @@ xfs_sync_fsdata(
  */
 int
 xfs_quiesce_data(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	int			wait)
 {
 	int error;
 
 	/* push non-blocking */
-	xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+	xfs_sync_inodes(mp, SYNC_DELWRI);
 	XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
-	xfs_filestream_flush(mp);
 
-	/* push and block */
-	xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
-	XFS_QM_DQSYNC(mp, SYNC_WAIT);
+	/* push and block till complete */
+	if (wait) {
+		xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+		XFS_QM_DQSYNC(mp, SYNC_WAIT);
+		xfs_filestream_flush(mp);
+	}
 
 	/* write superblock and hoover up shutdown errors */
 	error = xfs_sync_fsdata(mp, 0);
@@ -480,8 +493,6 @@ xfs_sync_worker(
 		/* dgc: errors ignored here */
 		error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
 		error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
-		if (xfs_log_need_covered(mp))
-			error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
 	}
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 5f6de1e..5586f7a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -39,7 +39,7 @@ void xfs_syncd_stop(struct xfs_mount *mp);
 int xfs_sync_inodes(struct xfs_mount *mp, int flags);
 int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 
-int xfs_quiesce_data(struct xfs_mount *mp);
+int xfs_quiesce_data(struct xfs_mount *mp, int wait);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
 void xfs_flush_inode(struct xfs_inode *ip);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/