Message-Id: <1243429268-3028-12-git-send-email-jack@suse.cz>
Date:	Wed, 27 May 2009 15:01:08 +0200
From:	Jan Kara <jack@...e.cz>
To:	LKML <linux-kernel@...r.kernel.org>
Cc:	npiggin@...e.de, linux-ext4@...r.kernel.org,
	Jan Kara <jack@...e.cz>
Subject: [PATCH 11/11] ext3: Implement delayed allocation at page_mkwrite time

We don't want to really allocate blocks at page_mkwrite() time because, for
random writes via mmap, that results in much more fragmented files. So just
reserve enough free blocks in page_mkwrite() and do the real allocation from
writepage().
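
As a rough illustration (not part of the patch itself), below is a minimal
userspace sketch of the reserve-then-claim bookkeeping described above. The
struct and function names are illustrative assumptions; the patch itself uses
the delalloc_counter API (dac_reserve(), dac_alloc_reserved(),
dac_cancel_reserved()), presumably added earlier in this series.

#include <stdio.h>

/* Toy model of the per-filesystem allocation counter; fields and semantics
   are simplifying assumptions, not the real delalloc_counter. */
struct dac_model {
        long free;      /* blocks not yet allocated on disk */
        long reserved;  /* blocks promised to delayed (mmap) writes */
};

/* page_mkwrite() side: promise up to 'n' blocks without touching bitmaps */
static int reserve_blocks(struct dac_model *c, long n, long limit)
{
        if (c->free - c->reserved - n < limit)
                return -1;      /* would eat into the root-reserved space */
        c->reserved += n;
        return 0;
}

/* writepage() side: 'used' blocks are really allocated, the rest of the
   promise is dropped */
static void claim_blocks(struct dac_model *c, long promised, long used)
{
        c->free -= used;
        c->reserved -= promised;
}

int main(void)
{
        struct dac_model c = { .free = 100, .reserved = 0 };

        /* reserve several blocks for one faulted block (data plus possible
           indirect blocks, cf. EXT3_DA_BLOCK_RESERVE) */
        if (!reserve_blocks(&c, 4, 1))
                printf("reserved: free=%ld reserved=%ld\n", c.free, c.reserved);
        /* writeback later allocates a single real block for this page */
        claim_blocks(&c, 4, 1);
        printf("written:  free=%ld reserved=%ld\n", c.free, c.reserved);
        return 0;
}

The point is that page_mkwrite() only touches the counter (and the quota
reservation), so ENOSPC can be reported at fault time, while the actual,
better-clustered block allocation is deferred to writepage().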

Signed-off-by: Jan Kara <jack@...e.cz>
---
 fs/ext3/balloc.c           |   96 ++++++++++++++--------
 fs/ext3/ialloc.c           |    2 +-
 fs/ext3/inode.c            |  192 +++++++++++++++++++++++++++-----------------
 fs/ext3/resize.c           |    2 +-
 fs/ext3/super.c            |   42 ++++++----
 include/linux/ext3_fs.h    |    9 ++-
 include/linux/ext3_fs_i.h  |    1 +
 include/linux/ext3_fs_sb.h |    3 +-
 8 files changed, 217 insertions(+), 130 deletions(-)

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202d..ecde4ca 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -19,6 +19,8 @@
 #include <linux/ext3_jbd.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/delalloc_counter.h>
+#include <linux/writeback.h>
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -632,7 +634,7 @@ do_more:
 	spin_lock(sb_bgl_lock(sbi, block_group));
 	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_add(&sbi->s_freeblocks_counter, count);
+	dac_free(&sbi->s_alloc_counter, count);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -1410,23 +1412,19 @@ out:
 }
 
 /**
- * ext3_has_free_blocks()
- * @sbi:		in-core super block structure.
+ * ext3_free_blocks_limit()
+ * @sb:			super block
  *
  * Check if filesystem has at least 1 free block available for allocation.
  */
-static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
+ext3_fsblk_t ext3_free_blocks_limit(struct super_block *sb)
 {
-	ext3_fsblk_t free_blocks, root_blocks;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current_fsuid() &&
-		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
+	if (!capable(CAP_SYS_RESOURCE) && sbi->s_resuid != current_fsuid() &&
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		return le32_to_cpu(sbi->s_es->s_r_blocks_count) + 1;
+	return 0;
 }
 
 /**
@@ -1443,12 +1441,23 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
  */
 int ext3_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	ext3_fsblk_t free_blocks, limit, reserved_blocks;
+
+	free_blocks = dac_get_avail(&sbi->s_alloc_counter);
+	limit = ext3_free_blocks_limit(sb);
+	reserved_blocks = dac_get_reserved(&sbi->s_alloc_counter);
+	if (free_blocks + reserved_blocks < limit || (*retries)++ > 3)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
-
-	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
+	/*
+	 * There's a chance the commit will free some blocks, and writeback can
+	 * write out delayed blocks so that excess reservation gets released.
+	 */
+	if (reserved_blocks)
+		wakeup_pdflush(0);
+	return journal_force_commit_nested(sbi->s_journal);
 }
 
 /**
@@ -1466,7 +1475,8 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
  *
  */
 ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp)
+			ext3_fsblk_t goal, unsigned long *count,
+			unsigned int *reserved, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
@@ -1477,7 +1487,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
 	int bgi;			/* blockgroup iteration index */
 	int fatal = 0, err;
-	int performed_allocation = 0;
+	int got_quota = 0, got_space = 0;
 	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
@@ -1498,16 +1508,27 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 		printk("ext3_new_block: nonexistent device");
 		return 0;
 	}
+	sbi = EXT3_SB(sb);
 
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (vfs_dq_alloc_block(inode, num)) {
-		*errp = -EDQUOT;
-		return 0;
+	if (!*reserved) {
+		/*
+		 * Check quota for allocation of this block.
+		 */
+		if (vfs_dq_alloc_block(inode, num)) {
+			*errp = -EDQUOT;
+			goto out;
+		}
+		got_quota = 1;
+		if (dac_alloc(&sbi->s_alloc_counter, num,
+			      ext3_free_blocks_limit(sb)) < 0) {
+			*errp = -ENOSPC;
+			goto out;
+		}
+		got_space = 1;
+	} else {
+		WARN_ON(*reserved < num);
 	}
 
-	sbi = EXT3_SB(sb);
 	es = EXT3_SB(sb)->s_es;
 	ext3_debug("goal=%lu.\n", goal);
 	/*
@@ -1522,11 +1543,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
 		my_rsv = &block_i->rsv_window_node;
 
-	if (!ext3_has_free_blocks(sbi)) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
 	/*
 	 * First, test whether the goal block is free.
 	 */
@@ -1650,8 +1666,6 @@ allocated:
 		goto retry_alloc;
 	}
 
-	performed_allocation = 1;
-
 #ifdef CONFIG_JBD_DEBUG
 	{
 		struct buffer_head *debug_bh;
@@ -1701,7 +1715,6 @@ allocated:
 	spin_lock(sb_bgl_lock(sbi, group_no));
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext3_journal_dirty_metadata(handle, gdp_bh);
@@ -1714,7 +1727,15 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	vfs_dq_free_block(inode, *count-num);
+	if (*reserved) {
+		dac_alloc_reserved(&sbi->s_alloc_counter, num);
+		vfs_dq_claim_block(inode, num);
+		atomic_sub(num, &EXT3_I(inode)->i_reserved_blocks);
+		*reserved -= num;
+	} else {
+		dac_free(&sbi->s_alloc_counter, *count - num);
+		vfs_dq_free_block(inode, *count - num);
+	}
 	*count = num;
 	return ret_block;
 
@@ -1728,8 +1749,10 @@ out:
 	/*
 	 * Undo the block allocation
 	 */
-	if (!performed_allocation)
+	if (got_quota)
 		vfs_dq_free_block(inode, *count);
+	if (got_space)
+		dac_free(&sbi->s_alloc_counter, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
@@ -1738,8 +1761,9 @@ ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int *errp)
 {
 	unsigned long count = 1;
+	unsigned int reserved = 0;
 
-	return ext3_new_blocks(handle, inode, goal, &count, errp);
+	return ext3_new_blocks(handle, inode, goal, &count, &reserved, errp);
 }
 
 /**
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60..97b1b70 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -269,7 +269,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
-	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	freeb = dac_get_avail(&sbi->s_alloc_counter);
 	avefreeb = freeb / ngroups;
 	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index cc012fe..d396006 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -38,6 +38,7 @@
 #include <linux/bio.h>
 #include <linux/fiemap.h>
 #include <linux/namei.h>
+#include <linux/mount.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -516,7 +517,8 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int indirect_blks, int blks,
-			ext3_fsblk_t new_blocks[4], int *err)
+			unsigned int *reserved, ext3_fsblk_t new_blocks[4],
+			int *err)
 {
 	int target, i;
 	unsigned long count = 0;
@@ -537,7 +539,8 @@ static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
 	while (1) {
 		count = target;
 		/* allocating blocks for indirect blocks and direct blocks */
-		current_block = ext3_new_blocks(handle,inode,goal,&count,err);
+		current_block = ext3_new_blocks(handle, inode, goal, &count,
+						reserved, err);
 		if (*err)
 			goto failed_out;
 
@@ -591,8 +594,8 @@ failed_out:
  *	as described above and return 0.
  */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, ext3_fsblk_t goal,
-			int *offsets, Indirect *branch)
+			int indirect_blks, int *blks, unsigned int *reserved,
+			ext3_fsblk_t goal, int *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int i, n = 0;
@@ -603,7 +606,7 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 	ext3_fsblk_t current_block;
 
 	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
-				*blks, new_blocks, &err);
+				*blks, reserved, new_blocks, &err);
 	if (err)
 		return err;
 
@@ -800,6 +803,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int count = 0;
+	unsigned int reserved = 0;
 	ext3_fsblk_t first_block = 0;
 
 
@@ -898,8 +902,22 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	/*
 	 * Block out ext3_truncate while we alter the tree
 	 */
-	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
-				offsets + (partial - chain), partial);
+	if (buffer_delay(bh_result)) {
+		WARN_ON(maxblocks != 1);
+		WARN_ON(!bh_result->b_page || !PageLocked(bh_result->b_page));
+		reserved = EXT3_DA_BLOCK_RESERVE;
+	}
+	err = ext3_alloc_branch(handle, inode, indirect_blks, &count,
+				&reserved, goal, offsets + (partial - chain),
+				partial);
+	/* Release additional reservation we had for this block */
+	if (!err && buffer_delay(bh_result)) {
+		dac_cancel_reserved(&EXT3_SB(inode->i_sb)->s_alloc_counter,
+				    reserved);
+		vfs_dq_release_reservation_block(inode, reserved);
+		atomic_sub(reserved, &ei->i_reserved_blocks);
+		clear_buffer_delay(bh_result);
+	}
 
 	/*
 	 * The ext3_splice_branch call will free and forget any buffers
@@ -1432,9 +1450,9 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,ext3_get_block);
 }
 
-static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
+static int ext3_bh_delay(handle_t *handle, struct buffer_head *bh)
 {
-	return !buffer_mapped(bh);
+	return buffer_delay(bh);
 }
 
 /*
@@ -1496,12 +1514,31 @@ static int ext3_common_writepage(struct page *page,
 				struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	int ret = 0;
+	int ret = 0, delay = 1, err;
+	handle_t *handle = NULL;
 
+	/* Start a transaction only if the page has delayed buffers */
+	if (page_has_buffers(page))
+		delay = walk_page_buffers(NULL, page_buffers(page), 0,
+				PAGE_CACHE_SIZE, NULL, ext3_bh_delay);
+	if (delay) {
+		handle = ext3_journal_start(inode,
+				ext3_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
 	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
 		ret = nobh_writepage(page, ext3_get_block, wbc);
 	else
 		ret = block_write_full_page(page, ext3_get_block, wbc);
+	if (delay) {
+		err = ext3_journal_stop(handle);
+		if (!ret)
+			ret = err;
+	}
+out:
 	return ret;
 }
 
@@ -1576,15 +1613,37 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
+
+static int truncate_delayed_bh(handle_t *handle, struct buffer_head *bh)
+{
+	if (buffer_delay(bh)) {
+		struct inode *inode = bh->b_page->mapping->host;
+		struct ext3_inode_info *ei = EXT3_I(inode);
+
+		atomic_sub(EXT3_DA_BLOCK_RESERVE, &ei->i_reserved_blocks);
+		vfs_dq_release_reservation_block(inode, EXT3_DA_BLOCK_RESERVE);
+		dac_cancel_reserved(&EXT3_SB(inode->i_sb)->s_alloc_counter,
+				    EXT3_DA_BLOCK_RESERVE);
+		clear_buffer_delay(bh);
+	}
+	return 0;
+}
+
 static void ext3_invalidatepage(struct page *page, unsigned long offset)
 {
-	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+	struct inode *inode = page->mapping->host;
+	journal_t *journal = EXT3_JOURNAL(inode);
+	int bsize = 1 << inode->i_blkbits;
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
 	if (offset == 0)
 		ClearPageChecked(page);
+	if (page_has_buffers(page)) {
+		walk_page_buffers(NULL, page_buffers(page), offset + bsize - 1,
+				  PAGE_CACHE_SIZE, NULL, truncate_delayed_bh);
+	}
 
 	journal_invalidatepage(journal, page, offset);
 }
@@ -1685,75 +1744,58 @@ out:
 	return ret;
 }
 
-int ext3_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+/*
+ * Reserve blocks for a write instead of allocating them. Called only on
+ * buffer heads attached to a page (and thus for 1 block).
+ */
+static int ext3_da_get_block(struct inode *inode, sector_t iblock,
+			     struct buffer_head *bh, int create)
 {
-	struct page *page = vmf->page;
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret = VM_FAULT_NOPAGE;
-	loff_t size;
-	int len;
-	void *fsdata;
-
-	block_wait_on_hole_extend(inode, page_offset(page));
-	/*
-	 * Get i_alloc_sem to stop truncates messing with the inode. We cannot
-	 * get i_mutex because we are already holding mmap_sem.
-	 */
-	down_read(&inode->i_alloc_sem);
-	size = i_size_read(inode);
-	if ((page->mapping != inode->i_mapping) ||
-	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		goto out_unlock;
-	}
+	int ret, rsv;
+	struct ext3_sb_info *sbi;
 
-	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
-		len = size & ~PAGE_CACHE_MASK;
-	else
-		len = PAGE_CACHE_SIZE;
-
-	/*
-	 * Check for the common case that everything is already mapped. We
-	 * have to get the page lock so that buffers cannot be released
-	 * under us.
-	 */
-	lock_page(page);
-	if (page_has_buffers(page)) {
-		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-				       buffer_unmapped)) {
-			unlock_page(page);
-			ret = 0;
-			goto out_unlock;
-		}
-	}
-	unlock_page(page);
+	/* Does the buffer already have blocks reserved? */
+	if (buffer_delay(bh))
+		return 0;
 
-	/*
-	 * OK, we may need to fill the hole... Do write_begin write_end to do
-	 * block allocation/reservation. We are not holding inode.i_mutex
-	 * here. That allows parallel write_begin, write_end call. lock_page
-	 * prevent this from happening on the same page though.
-	 */
-	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
-			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
-	if (ret < 0)
-		goto out_unlock;
-	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
-			len, len, page, fsdata);
+	ret = ext3_get_blocks_handle(NULL, inode, iblock, 1, bh, 0);
 	if (ret < 0)
-		goto out_unlock;
-	ret = 0;
-out_unlock:
-	if (unlikely(ret)) {
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
+		goto out;
+	if (ret > 0 || !create) {
+		ret = 0;
+		goto out;
+	}
+	/* Upper bound on the number of blocks needed */
+	rsv = EXT3_DA_BLOCK_RESERVE;
+	/* Delayed allocation needed */
+	if (vfs_dq_reserve_block(inode, rsv)) {
+		ret = -EDQUOT;
+		goto out;
+	}
+	sbi = EXT3_SB(inode->i_sb);
+	ret = dac_reserve(&sbi->s_alloc_counter, rsv,
+			  ext3_free_blocks_limit(inode->i_sb));
+	if (ret < 0) {
+		vfs_dq_release_reservation_block(inode, rsv);
+		goto out;
 	}
-	up_read(&inode->i_alloc_sem);
+	set_buffer_delay(bh);
+	set_buffer_new(bh);
+	atomic_add(rsv, &EXT3_I(inode)->i_reserved_blocks);
+out:
+	return ret;
+}
+
+int ext3_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int retry = 0;
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.mnt->mnt_sb;
+
+	do {
+		ret = block_page_mkwrite(vma, vmf, ext3_da_get_block);
+	} while (ret == VM_FAULT_SIGBUS &&
+		 ext3_should_retry_alloc(sb, &retry));
 	return ret;
 }
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf38..5a35314 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -928,7 +928,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	le32_add_cpu(&es->s_r_blocks_count, input->reserved_blocks);
 
 	/* Update the free space counts */
-	percpu_counter_add(&sbi->s_freeblocks_counter,
+	percpu_counter_add(&sbi->s_alloc_counter.free,
 			   input->free_blocks_count);
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT3_INODES_PER_GROUP(sb));
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe..73263eb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -416,7 +416,7 @@ static void ext3_put_super (struct super_block * sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	dac_destroy(&sbi->s_alloc_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	brelse(sbi->s_sbh);
@@ -492,6 +492,7 @@ static void init_once(void *foo)
 #ifdef CONFIG_EXT3_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
+	atomic_set(&ei->i_reserved_blocks, 0);
 	mutex_init(&ei->truncate_mutex);
 	inode_init_once(&ei->vfs_inode);
 }
@@ -515,23 +516,26 @@ static void destroy_inodecache(void)
 
 static void ext3_clear_inode(struct inode *inode)
 {
-	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_block_alloc_info *rsv = ei->i_block_alloc_info;
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-	if (EXT3_I(inode)->i_acl &&
-			EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_acl);
-		EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
+	if (ei->i_acl && ei->i_acl != EXT3_ACL_NOT_CACHED) {
+		posix_acl_release(ei->i_acl);
+		ei->i_acl = EXT3_ACL_NOT_CACHED;
 	}
-	if (EXT3_I(inode)->i_default_acl &&
-			EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-		posix_acl_release(EXT3_I(inode)->i_default_acl);
-		EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
+	if (ei->i_default_acl && ei->i_default_acl != EXT3_ACL_NOT_CACHED) {
+		posix_acl_release(ei->i_default_acl);
+		ei->i_default_acl = EXT3_ACL_NOT_CACHED;
 	}
 #endif
 	ext3_discard_reservation(inode);
-	EXT3_I(inode)->i_block_alloc_info = NULL;
+	ei->i_block_alloc_info = NULL;
 	if (unlikely(rsv))
 		kfree(rsv);
+	if (atomic_read(&ei->i_reserved_blocks))
+		ext3_warning(inode->i_sb, __func__, "Releasing inode %lu with "
+			"%lu reserved blocks.", inode->i_ino,
+			(unsigned long)atomic_read(&ei->i_reserved_blocks));
 }
 
 static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -726,10 +730,19 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
 static ssize_t ext3_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
+static qsize_t ext3_get_reserved_space(struct inode *inode)
+{
+	return atomic_read(&EXT3_I(inode)->i_reserved_blocks);
+}
+
 static struct dquot_operations ext3_quota_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
+	.reserve_space	= dquot_reserve_space,
+	.claim_space	= dquot_claim_space,
+	.release_rsv	= dquot_release_reserved_space,
+	.get_reserved_space = ext3_get_reserved_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
 	.free_inode	= dquot_free_inode,
@@ -1851,8 +1864,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext3_count_free_blocks(sb));
+	err = dac_init(&sbi->s_alloc_counter, ext3_count_free_blocks(sb));
 	if (!err) {
 		err = percpu_counter_init(&sbi->s_freeinodes_counter,
 				ext3_count_free_inodes(sb));
@@ -2005,7 +2017,7 @@ cantfind_ext3:
 failed_mount4:
 	journal_destroy(sbi->s_journal);
 failed_mount3:
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	dac_destroy(&sbi->s_alloc_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
@@ -2685,7 +2697,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_type = EXT3_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+	buf->f_bfree = dac_get_avail_sum(&sbi->s_alloc_counter);
 	es->s_free_blocks_count = cpu_to_le32(buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 5051874..7f28907 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -809,6 +809,11 @@ ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
 #define ERR_BAD_DX_DIR	-75000
 
 /*
+ * Number of blocks we reserve for the delayed allocation of one data block
+ */
+#define EXT3_DA_BLOCK_RESERVE 4
+
+/*
  * Function prototypes
  */
 
@@ -821,12 +826,14 @@ ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
 # define NORET_AND     noreturn,
 
 /* balloc.c */
+extern ext3_fsblk_t ext3_free_blocks_limit(struct super_block *sb);
 extern int ext3_bg_has_super(struct super_block *sb, int group);
 extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
 extern ext3_fsblk_t ext3_new_block (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, int *errp);
 extern ext3_fsblk_t ext3_new_blocks (handle_t *handle, struct inode *inode,
-			ext3_fsblk_t goal, unsigned long *count, int *errp);
+			ext3_fsblk_t goal, unsigned long *count,
+			unsigned int *reserved, int *errp);
 extern void ext3_free_blocks (handle_t *handle, struct inode *inode,
 			ext3_fsblk_t block, unsigned long count);
 extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb,
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 7894dd0..ef288bd 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -129,6 +129,7 @@ struct ext3_inode_info {
 
 	/* on-disk additional length */
 	__u16 i_extra_isize;
+	atomic_t i_reserved_blocks;
 
 	/*
 	 * truncate_mutex is for serialising ext3_truncate() against
diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h
index f07f34d..f92e162 100644
--- a/include/linux/ext3_fs_sb.h
+++ b/include/linux/ext3_fs_sb.h
@@ -21,6 +21,7 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#include <linux/delalloc_counter.h>
 #endif
 #include <linux/rbtree.h>
 
@@ -58,7 +59,7 @@ struct ext3_sb_info {
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
 	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
-	struct percpu_counter s_freeblocks_counter;
+	struct delalloc_counter s_alloc_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
 	struct blockgroup_lock *s_blockgroup_lock;
-- 
1.6.0.2

