linux-kernel - [PATCH 42/52] ext4: Add basic delayed allocation support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1215279378-30504-43-git-send-email-tytso@mit.edu>
Date:	Sat,  5 Jul 2008 13:36:08 -0400
From:	Theodore Ts'o <tytso@....edu>
To:	Ext4 Developers List <linux-ext4@...r.kernel.org>,
	Linux Kernel Developers List <linux-kernel@...r.kernel.org>
Cc:	Alex Tomas <alex@...sterfs.com>, Mingming Cao <cmm@...ibm.com>,
	Dave Kleikamp <shaggy@...ux.vnet.ibm.com>,
	"Theodore Ts'o" <tytso@....edu>,
	" Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>
Subject: [PATCH 42/52] ext4: Add basic delayed allocation support

From: Alex Tomas <alex@...sterfs.com>

Two special ->get_block() methods are introduced:

 * ext4_da_get_block_prep()
   to be used with ->write_begin(), defers allocation till flush
 * ext4_da_get_block_write()
   to be used with mpage_da_writepages(), allocate blocks and correct
   on-disk size

This patch supports data=writeback mode, to enable delalloc, need to
mount filesystem with delalloc,data=writeback options.

Updated fixes from Mingming cao <cmm@...ibm.com> to unlock and release
the page from page cache if the delalloc write_begin failed, and properly
handle preallocated blocks.

Updated fixes from  Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
to update i_disksize properly with delayed allocation,
and add bmap support for delalloc.

Signed-off-by: Alex Tomas <alex@...sterfs.com>
Signed-off-by: Mingming Cao <cmm@...ibm.com>
Signed-off-by: Dave Kleikamp <shaggy@...ux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@....edu>
Signed-off-by:  Aneesh Kumar K.V <aneesh.kumar@...ux.vnet.ibm.com>
---
 fs/ext4/ext4.h  |    1 +
 fs/ext4/inode.c |  296 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/super.c |    6 +-
 3 files changed, 297 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2c0519a..0307d53 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -536,6 +536,7 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3af9cd7..2857ef3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -46,6 +46,8 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 						   new_size);
 }
 
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -1408,6 +1410,267 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
+ * this is a special callback for ->write_begin() only
+ * it's intention is to return mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+	/*
+	 * first, we need to know whether the block is allocated already
+	 * preallocated blocks are unmapped but should treated
+	 * the same as allocated blocks.
+	 */
+	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0);
+	if (ret == 0) {
+		/* the block isn't allocated yet, let's reserve space */
+		/* XXX: call reservation here */
+		/*
+		 * XXX: __block_prepare_write() unmaps passed block,
+		 * is it OK?
+		 */
+		map_bh(bh_result, inode->i_sb, 0);
+		set_buffer_new(bh_result);
+		set_buffer_delay(bh_result);
+	} else if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				   struct buffer_head *bh_result, int create)
+{
+	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	if (create) {
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+				   bh_result, create, 0);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+
+		/*
+		 * Update on-disk size along with block allocation
+		 * we don't use 'extend_disksize' as size may change
+		 * within already allocated block -bzzz
+		 */
+		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize) {
+			/*
+			 * XXX: replace with spinlock if seen contended -bzzz
+			 */
+			down_write(&EXT4_I(inode)->i_data_sem);
+			if (disksize > EXT4_I(inode)->i_disksize)
+				EXT4_I(inode)->i_disksize = disksize;
+			up_write(&EXT4_I(inode)->i_data_sem);
+
+			if (EXT4_I(inode)->i_disksize == disksize) {
+				if (handle == NULL)
+					handle = ext4_journal_start(inode, 1);
+				if (!IS_ERR(handle))
+					ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (handle && !IS_ERR(handle))
+		ext4_journal_stop(handle);
+
+	return ret;
+}
+/* FIXME!! only support data=writeback mode */
+static int ext4_da_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	if (ext4_journal_current_handle())
+		goto out_fail;
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_fail;
+	}
+
+	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+		ret = nobh_writepage(page, ext4_get_block, wbc);
+	else
+		ret = block_write_full_page(page, ext4_get_block, wbc);
+
+	if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
+		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_mark_inode_dirty(handle, inode);
+	}
+
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+
+out_fail:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return ret;
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	int ret;
+	struct page *page;
+	pgoff_t index;
+	unsigned from, to;
+	struct inode *inode = mapping->host;
+	handle_t *handle;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	/*
+	 * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journalling the i_disksize update if writes to the end
+	 * of file which has an already mapped buffer.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+							ext4_da_get_block_prep);
+	if (ret < 0) {
+		unlock_page(page);
+		ext4_journal_stop(handle);
+		page_cache_release(page);
+	}
+
+out:
+	return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
+static int ext4_da_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	int ret = 0, ret2;
+	handle_t *handle = ext4_journal_current_handle();
+	loff_t new_i_size;
+
+	/*
+	 * generic_write_end() will run mark_inode_dirty() if i_size
+	 * changes.  So let's piggyback the i_disksize mark_inode_dirty
+	 * into that.
+	 */
+
+	new_i_size = pos + copied;
+	if (new_i_size > EXT4_I(inode)->i_disksize)
+		if (!walk_page_buffers(NULL, page_buffers(page),
+				       0, len, NULL, ext4_bh_unmapped_or_delay)){
+			/*
+			 * Updating i_disksize when extending file without
+			 * needing block allocation
+			 */
+			if (ext4_should_order_data(inode))
+				ret = ext4_jbd2_file_inode(handle, inode);
+
+			EXT4_I(inode)->i_disksize = new_i_size;
+		}
+	ret2 = generic_write_end(file, mapping, pos, len, copied,
+							page, fsdata);
+	copied = ret2;
+	if (ret2 < 0)
+		ret = ret2;
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+
+	return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	/*
+	 * Drop reserved blocks
+	 */
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off && buffer_delay(bh)) {
+			clear_buffer_delay(bh);
+			/* XXX: add real stuff here */
+		}
+		curr_off = next_off;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+out:
+	ext4_invalidatepage(page, offset);
+
+	return;
+}
+
+
+/*
  * bmap() is special.  It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
@@ -1427,6 +1690,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 	journal_t *journal;
 	int err;
 
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+			test_opt(inode->i_sb, DELALLOC)) {
+		/*
+		 * With delalloc we want to sync the file
+		 * so that we can make sure we allocate
+		 * blocks for file
+		 */
+		filemap_write_and_wait(mapping);
+	}
+
 	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
@@ -1471,11 +1744,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-	return !buffer_mapped(bh) || buffer_delay(bh);
-}
-
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -1832,10 +2100,28 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.releasepage	= ext4_releasepage,
 };
 
+static const struct address_space_operations ext4_da_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_da_writepage,
+	.writepages	= ext4_da_writepages,
+	.sync_page	= block_sync_page,
+	.write_begin	= ext4_da_write_begin,
+	.write_end	= ext4_da_write_end,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_da_invalidatepage,
+	.releasepage	= ext4_releasepage,
+	.direct_IO	= ext4_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+		 test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
 	else
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 629d0fa..de9d3d0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -898,7 +898,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe,
+	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc,
 };
 
 static match_table_t tokens = {
@@ -957,6 +957,7 @@ static match_table_t tokens = {
 	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
+	{Opt_delalloc, "delalloc"},
 	{Opt_err, NULL},
 };
 
@@ -1335,6 +1336,9 @@ set_qf_format:
 				return 0;
 			sbi->s_stripe = option;
 			break;
+		case Opt_delalloc:
+			set_opt(sbi->s_mount_opt, DELALLOC);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
-- 
1.5.6.rc3.1.g36b7.dirty

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/