lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1356335742-11793-9-git-send-email-wenqing.lz@taobao.com>
Date:	Mon, 24 Dec 2012 15:55:41 +0800
From:	Zheng Liu <gnehzuil.liu@...il.com>
To:	linux-ext4@...r.kernel.org
Cc:	Jan Kara <jack@...e.cz>,
	"Darrick J. Wong" <darrick.wong@...cle.com>,
	Christoph Hellwig <hch@...radead.org>,
	Zheng Liu <wenqing.lz@...bao.com>
Subject: [RFC][PATCH 8/9 v1] ext4: refine unwritten extent conversion

From: Zheng Liu <wenqing.lz@...bao.com>

Currently all unwritten extent conversion work is pushed into a workqueue to be
done because DIO end_io is in a irq context and this conversion needs to take
i_data_sem locking.  But we couldn't take a semaphore in a irq context.  After
tracking all extent status, we can first convert this unwritten extent in extent
status tree, and call aio_complete() and inode_dio_done() to notify upper level
that this dio has done.  We don't need to be worried about exposing a stale data
because we first try to lookup in extent status tree.  So it makes us to see the
latest extent status.  Meanwhile we queue a work to convert this unwritten
extent in extent tree.  After this improvement, reader also needn't wait this
conversion to be done when dioread_nolock is enabled.

CC: Jan Kara <jack@...e.cz>
CC: "Darrick J. Wong" <darrick.wong@...cle.com>
CC: Christoph Hellwig <hch@...radead.org>
Signed-off-by: Zheng Liu <wenqing.lz@...bao.com>
---
[Cc' to Jan, Darrick, and Christoph because Christoph is trying to handle
O_(D)SYNC for AIO]

Hi Jan, Darrick, and Christoph,

This patch refines the unwritten extent conversion in ext4.  Now we can call
aio_complete() and inode_dio_done() in end_io function.  I believe Christoph's
patch also can work well after applied this patch.  Could you please review
this patch?

Regards,
					- Zheng

 fs/ext4/ext4.h     |  2 +-
 fs/ext4/indirect.c | 11 ++++-------
 fs/ext4/inode.c    | 11 ++++-------
 fs/ext4/page-io.c  | 26 ++++++++++++++++++++++----
 4 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8462eb3..b76dc49 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -195,7 +195,6 @@ struct mpage_da_data {
 #define	EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
-#define EXT4_IO_END_DIRECT	0x0008
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -2538,6 +2537,7 @@ extern void ext4_ioend_wait(struct inode *);
 extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
 extern void ext4_io_submit(struct ext4_io_submit *io);
+extern void ext4_end_io_bh(ext4_io_end_t *io_end, int is_dio);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
 			       int len,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 20862f9..c6d7f7f 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -807,11 +807,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -831,8 +826,10 @@ retry:
 		inode_dio_done(inode);
 	} else {
 locked:
-		ret = blockdev_direct_IO(rw, iocb, inode, iov,
-				 offset, nr_segs, ext4_get_block);
+		ret = __blockdev_direct_IO(rw, iocb, inode,
+				 inode->i_sb->s_bdev, iov,
+				 offset, nr_segs,
+				 ext4_get_block, NULL, NULL, DIO_LOCKING);
 
 		if (unlikely((rw & WRITE) && ret < 0)) {
 			loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6610dc7..4549103 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3024,9 +3024,8 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
 			       EXT4_GET_BLOCKS_NO_LOCK);
 }
 
-static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-			    ssize_t size, void *private, int ret,
-			    bool is_async)
+static void ext4_end_dio(struct kiocb *iocb, loff_t offset, ssize_t size,
+			 void *private, int ret, bool is_async)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
         ext4_io_end_t *io_end = iocb->private;
@@ -3058,8 +3057,7 @@ out:
 		io_end->iocb = iocb;
 		io_end->result = ret;
 	}
-
-	ext4_add_complete_io(io_end);
+	ext4_end_io_bh(io_end, 1);
 }
 
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -3195,7 +3193,6 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 			ret = -ENOMEM;
 			goto retake_lock;
 		}
-		io_end->flag |= EXT4_IO_END_DIRECT;
 		iocb->private = io_end;
 		/*
 		 * we save the io structure for current async direct
@@ -3216,7 +3213,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 				   inode->i_sb->s_bdev, iov,
 				   offset, nr_segs,
 				   get_block_func,
-				   ext4_end_io_dio,
+				   ext4_end_dio,
 				   NULL,
 				   dio_flags);
 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbc..6b2d88d 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -103,17 +103,35 @@ static int ext4_end_io(ext4_io_end_t *io)
 			 "(inode %lu, offset %llu, size %zd, error %d)",
 			 inode->i_ino, offset, size, ret);
 	}
-	if (io->iocb)
-		aio_complete(io->iocb, io->result, 0);
 
-	if (io->flag & EXT4_IO_END_DIRECT)
-		inode_dio_done(inode);
 	/* Wake up anyone waiting on unwritten extent conversion */
 	if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
 		wake_up_all(ext4_ioend_wq(inode));
+
 	return ret;
 }
 
+void ext4_end_io_bh(ext4_io_end_t *io_end, int is_dio)
+{
+	struct inode *inode;
+
+	inode = io_end->inode;
+	(void)ext4_es_convert_unwritten_extents(inode, io_end->offset,
+						io_end->size);
+	ext4_add_complete_io(io_end);
+
+	/*
+	 * Here we can safely notify upper level that aio has done because
+	 * unwritten extent in extent status tree has been converted.  Thus,
+	 * others won't get a stale data because we always lookup extent status
+	 * tree firstly in get_block_t.
+	 */
+	if (io_end->iocb)
+		aio_complete(io_end->iocb, io_end->result, 0);
+	if (is_dio)
+		inode_dio_done(inode);
+}
+
 static void dump_completed_IO(struct inode *inode)
 {
 #ifdef	EXT4FS_DEBUG
-- 
1.7.12.rc2.18.g61b472e

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ