Index: linux-stage/fs/ext3/dir.c =================================================================== --- linux-stage.orig/fs/ext3/dir.c +++ linux-stage/fs/ext3/dir.c @@ -131,8 +131,7 @@ static int ext3_readdir(struct file * fi struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); if (err > 0) { page_cache_readahead(sb->s_bdev->bd_inode->i_mapping, &filp->f_ra, Index: linux-stage/fs/ext3/extents.c =================================================================== --- /dev/null +++ linux-stage/fs/ext3/extents.c @@ -0,0 +1,2278 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +/* + * Extents support for EXT3 + * + * TODO: + * - ext3*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) +{ + int err; + + if (handle->h_buffer_credits > needed) + return handle; + if (!ext3_journal_extend(handle, needed)) + return handle; + err = ext3_journal_restart(handle, needed); + + return handle; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext3_ext_get_access(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext3_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - EIO + */ +static int ext3_ext_dirty(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + int err; + if (path->p_bh) { + /* path points to block */ + err = ext3_journal_dirty_metadata(handle, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext3_mark_inode_dirty(handle, inode); + } + return err; +} + +static int ext3_ext_find_goal(struct inode *inode, + struct ext3_ext_path *path, + unsigned long block) +{ + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long bg_start; + unsigned long colour; + int depth; + + if (path) { + struct ext3_extent *ex; + depth = path->p_depth; + + /* try to predict block placement */ + if ((ex = path[depth].p_ext)) + return le32_to_cpu(ex->ee_start) + + (block - le32_to_cpu(ex->ee_block)); + + /* it looks index is empty + * try to find starting from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. use inode's group */ + bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); + colour = (current->pid % 16) * + (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); + return bg_start + colour + block; +} + +static int +ext3_ext_new_block(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *ex, int *err) +{ + int goal, newblock; + + goal = ext3_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext3_new_block(handle, inode, goal, err); + return newblock; +} + +static inline int ext3_ext_space_block(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + if (size > 6) + size = 6; +#endif + return size; +} + +static inline int ext3_ext_space_block_idx(struct inode *inode) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext3_extent_header)) + / sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + if (size > 5) + size = 5; +#endif + return size; +} + +static inline int ext3_ext_space_root(struct inode *inode) +{ + int size; + + size = sizeof(EXT3_I(inode)->i_data); + size -= sizeof(struct ext3_extent_header); + size /= sizeof(struct ext3_extent); +#ifdef AGRESSIVE_TEST + if (size > 3) + size = 3; +#endif + return size; +} + +static inline int ext3_ext_space_root_idx(struct inode *inode) +{ + int size; + + size = sizeof(EXT3_I(inode)->i_data); + size -= sizeof(struct ext3_extent_header); + size /= sizeof(struct ext3_extent_idx); +#ifdef AGRESSIVE_TEST + if (size > 4) + size = 4; +#endif + return size; +} + +static inline int +ext3_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext3_ext_space_root(inode); + else + max = ext3_ext_space_root_idx(inode); + } else { + if (depth == 0) + max = ext3_ext_space_block(inode); + else + max = ext3_ext_space_block_idx(inode); + } + + return max; +} + +static int __ext3_ext_check_header(const char *function, int line, struct inode *inode, + struct ext3_extent_header *eh, + int depth) +{ + const char *error_msg = NULL; + int max = 0; + + if (unlikely(eh->eh_magic != cpu_to_le16(EXT3_EXT_MAGIC))) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext3_ext_max_entries(inode, depth); +#ifdef AGRESSIVE_TEST + if (eh->eh_max > 3) { + /* inode probably got extent without defining AGRESSIVE_TEST */ + max = eh->eh_max; + } +#endif + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + if (unlikely((eh->eh_entries == 0) && (eh->eh_depth != 0))) { + error_msg = "invalid index, eh_entries=0 && eh_depth != 0"; + goto corrupted; + } + return 0; + +corrupted: + ext3_error(inode->i_sb, function, + ":%d: bad header in inode #%lu: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", line, + inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + + return -EIO; +} + +#define ext3_ext_check_header(inode,eh,depth) \ + __ext3_ext_check_header(__FUNCTION__,__LINE__,inode,eh,depth) + +#ifdef EXT_DEBUG +static void ext3_ext_show_path(struct inode *inode, struct ext3_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug(inode, "path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(inode, " %d->%d", le32_to_cpu(path->p_idx->ei_block), + le32_to_cpu(path->p_idx->ei_leaf)); + } else if (path->p_ext) { + ext_debug(inode, " %d:%d:%d", + le32_to_cpu(path->p_ext->ee_block), + le16_to_cpu(path->p_ext->ee_len), + le32_to_cpu(path->p_ext->ee_start)); + } else + ext_debug(inode, " []"); + } + ext_debug(inode, "\n"); +} + +static void ext3_ext_show_leaf(struct inode *inode, struct ext3_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext3_extent_header *eh; + struct ext3_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug(inode, "%d:%d:%d ", le32_to_cpu(ex->ee_block), + le16_to_cpu(ex->ee_len), + le32_to_cpu(ex->ee_start)); + } + ext_debug(inode, "\n"); +} +#else +#define ext3_ext_show_path(inode,path) +#define ext3_ext_show_leaf(inode,path) +#endif + +static void ext3_ext_drop_refs(struct ext3_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * binary search for closest index by given block + * the header must be checked before calling this + */ +static void +ext3_ext_binsearch_idx(struct inode *inode, struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent_idx *r, *l, *m; + + ext_debug(inode, "binsearch for %d(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1; + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, l->ei_block, + m, m->ei_block, r, r->ei_block); + } + + path->p_idx = l - 1; + ext_debug(inode, " -> %d->%d ", le32_to_cpu(path->p_idx->ei_block), + le32_to_cpu(path->p_idx->ei_leaf)); + +#ifdef CHECK_BINSEARCH + { + struct ext3_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk("k=%d, ix=0x%p, first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk("%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) + <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * binary search for closest extent by given block + * the header must be checked before calling this + */ +static void +ext3_ext_binsearch(struct inode *inode, struct ext3_ext_path *path, int block) +{ + struct ext3_extent_header *eh = path->p_hdr; + struct ext3_extent *r, *l, *m; + + if (eh->eh_entries == 0) { + /* + * this leaf is empty yet: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug(inode, "binsearch for %d: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1; + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, l->ee_block, + m, m->ee_block, r, r->ee_block); + } + + path->p_ext = l - 1; + ext_debug(inode, " -> %d:%d:%d ", + le32_to_cpu(path->p_ext->ee_block), + le32_to_cpu(path->p_ext->ee_start), + le16_to_cpu(path->p_ext->ee_len)); + +#ifdef CHECK_BINSEARCH + { + struct ext3_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext3_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext3_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = cpu_to_le16(EXT3_EXT_MAGIC); + eh->eh_max = cpu_to_le16(ext3_ext_space_root(inode)); + ext3_mark_inode_dirty(handle, inode); + ext3_ext_invalidate_cache(inode); + return 0; +} + +struct ext3_ext_path * +ext3_ext_find_extent(struct inode *inode, int block, struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0, alloc = 0; + + eh = ext_inode_hdr(inode); + i = depth = ext_depth(inode); + if (ext3_ext_check_header(inode, eh, depth)) + return ERR_PTR(-EIO); + + /* account possible depth increase */ + if (!path) { + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + alloc = 1; + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + path[0].p_hdr = eh; + + /* walk through the tree */ + while (i) { + ext_debug(inode, "depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext3_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = le32_to_cpu(path[ppos].p_idx->ei_leaf); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_bread(inode->i_sb, path[ppos].p_block); + if (!bh) + goto err; + + eh = ext_block_hdr(bh); + ppos++; + BUG_ON(ppos > depth); + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (ext3_ext_check_header(inode, eh, i)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_hdr = eh; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext3_ext_binsearch(inode, path + ppos, block); + + ext3_ext_show_path(inode, path); + + return path; + +err: + ext3_ext_drop_refs(path); + if (alloc) + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * insert new index [logical;ptr] into the block at cupr + * it check where to insert: before curp or after curp + */ +static int ext3_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext3_ext_path *curp, + int logical, int ptr) +{ + struct ext3_extent_idx *ix; + int len, err; + + if ((err = ext3_ext_get_access(handle, inode, curp))) + return err; + + BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); + len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { + len = (len - 1) * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert new index %d after: %d. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + (curp->p_idx + 1), (curp->p_idx + 2)); + memmove(curp->p_idx + 2, curp->p_idx + 1, len); + } + ix = curp->p_idx + 1; + } else { + /* insert before */ + len = len * sizeof(struct ext3_extent_idx); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert new index %d before: %d. " + "move %d from 0x%p to 0x%p\n", + logical, ptr, len, + curp->p_idx, (curp->p_idx + 1)); + memmove(curp->p_idx + 1, curp->p_idx, len); + ix = curp->p_idx; + } + + ix->ei_block = cpu_to_le32(logical); + ix->ei_leaf = cpu_to_le32(ptr); + ix->ei_leaf_hi = ix->ei_unused = 0; + curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1); + + BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) + > le16_to_cpu(curp->p_hdr->eh_max)); + BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + + err = ext3_ext_dirty(handle, inode, curp); + ext3_std_error(inode->i_sb, err); + + return err; +} + +/* + * routine inserts new subtree into the path, using free index entry + * at depth 'at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extens and index entries (right to the split point) + * into the newly allocated blocks + * - initialize subtree + */ +static int ext3_ext_split(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct ext3_extent *ex; + int i = at, k, m, a; + unsigned long newblock, oldblock; + __le32 border; + int *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where to split? */ + /* FIXME: now desicion is simplest: at current extent */ + + /* if current leaf will be splitted, then we should use + * border from split point */ + BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug(inode, "leaf will be splitted." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug(inode, "leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * if error occurs, then we break processing + * and turn filesystem read-only. so, index won't + * be inserted and tree will be in consistent + * state. next mount will repair buffers too + */ + + /* + * get array to track all allocated blocks + * we need this to handle errors and free blocks + * upon them + */ + ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + memset(ablocks, 0, sizeof(unsigned long) * depth); + + /* allocate all needed blocks */ + ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext3_ext_new_block(handle, inode, path, newext, &err); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + BUG_ON(newblock == 0); + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext3_ext_space_block(inode)); + neh->eh_magic = cpu_to_le16(EXT3_EXT_MAGIC); + neh->eh_depth = 0; + ex = EXT_FIRST_EXTENT(neh); + + /* move remain of path[depth] to the new leaf */ + BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); + /* start copy from next extent */ + /* TODO: we could do it by single memmove */ + m = 0; + path[depth].p_ext++; + while (path[depth].p_ext <= + EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug(inode, "move %d:%d:%d in new leaf %lu\n", + le32_to_cpu(path[depth].p_ext->ee_block), + le32_to_cpu(path[depth].p_ext->ee_start), + le16_to_cpu(path[depth].p_ext->ee_len), + newblock); + /*memmove(ex++, path[depth].p_ext++, + sizeof(struct ext3_extent)); + neh->eh_entries++;*/ + path[depth].p_ext++; + m++; + } + if (m) { + memmove(ex, path[depth].p_ext-m, sizeof(struct ext3_extent)*m); + neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m); + } + + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + if ((err = ext3_ext_get_access(handle, inode, path + depth))) + goto cleanup; + path[depth].p_hdr->eh_entries = + cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m); + if ((err = ext3_ext_dirty(handle, inode, path + depth))) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + BUG_ON(k < 0); + if (k) + ext_debug(inode, "create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = cpu_to_le16(EXT3_EXT_MAGIC); + neh->eh_max = cpu_to_le16(ext3_ext_space_block_idx(inode)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + fidx->ei_leaf = cpu_to_le32(oldblock); + fidx->ei_leaf_hi = fidx->ei_unused = 0; + + ext_debug(inode, "int.index at %d (block %lu): %lu -> %lu\n", i, + newblock, (unsigned long) le32_to_cpu(border), + oldblock); + /* copy indexes */ + m = 0; + path[i].p_idx++; + + ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr)); + while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { + ext_debug(inode, "%d: move %d:%d in new index %lu\n", i, + le32_to_cpu(path[i].p_idx->ei_block), + le32_to_cpu(path[i].p_idx->ei_leaf), + newblock); + /*memmove(++fidx, path[i].p_idx++, + sizeof(struct ext3_extent_idx)); + neh->eh_entries++; + BUG_ON(neh->eh_entries > neh->eh_max);*/ + path[i].p_idx++; + m++; + } + if (m) { + memmove(++fidx, path[i].p_idx - m, + sizeof(struct ext3_extent_idx) * m); + neh->eh_entries = + cpu_to_le16(le16_to_cpu(neh->eh_entries) + m); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext3_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m); + err = ext3_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + if (err) + goto cleanup; + + err = ext3_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext3_free_blocks(handle, inode, ablocks[i], 1); + } + } + kfree(ablocks); + + return err; +} + +/* + * routine implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initialize new top-level, creating index that points to the + * just created block + */ +static int ext3_ext_grow_indepth(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_ext_path *curp = path; + struct ext3_extent_header *neh; + struct ext3_extent_idx *fidx; + struct buffer_head *bh; + unsigned long newblock; + int err = 0; + + newblock = ext3_ext_new_block(handle, inode, path, newext, &err); + if (newblock == 0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext3_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + if ((err = ext3_journal_get_create_access(handle, bh))) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, curp->p_hdr, sizeof(EXT3_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext3_ext_space_block_idx(inode)); + else + neh->eh_max = cpu_to_le16(ext3_ext_space_block(inode)); + neh->eh_magic = cpu_to_le16(EXT3_EXT_MAGIC); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + if ((err = ext3_journal_dirty_metadata(handle, bh))) + goto out; + + /* create index in new top-level index: num,max,pointer */ + if ((err = ext3_ext_get_access(handle, inode, curp))) + goto out; + + curp->p_hdr->eh_magic = cpu_to_le16(EXT3_EXT_MAGIC); + curp->p_hdr->eh_max = cpu_to_le16(ext3_ext_space_root_idx(inode)); + curp->p_hdr->eh_entries = cpu_to_le16(1); + curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); + /* FIXME: it works, but actually path[0] can be index */ + curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; + curp->p_idx->ei_leaf = cpu_to_le32(newblock); + curp->p_idx->ei_leaf_hi = curp->p_idx->ei_unused = 0; + + neh = ext_inode_hdr(inode); + fidx = EXT_FIRST_INDEX(neh); + ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %d\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(fidx->ei_block), le32_to_cpu(fidx->ei_leaf)); + + neh->eh_depth = cpu_to_le16(path->p_depth + 1); + err = ext3_ext_dirty(handle, inode, curp); +out: + brelse(bh); + + return err; +} + +/* + * routine finds empty index and adds new leaf. if no free index found + * then it requests in-depth growing + */ +static int ext3_ext_create_new_leaf(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block + * so, subsequent data blocks should be contigoues */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext3_ext_split(handle, inode, path, newext, i); + if (err) + goto out; + + /* refill path */ + ext3_ext_drop_refs(path); + path = ext3_ext_find_extent(inode, + le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext3_ext_grow_indepth(handle, inode, path, newext); + if (err) + goto out; + + /* refill path */ + ext3_ext_drop_refs(path); + path = ext3_ext_find_extent(inode, + le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space + * in all other cases we have to split growed tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +int +ext3_ext_search_left(struct inode *inode, struct ext3_ext_path *path, + unsigned long *logical, unsigned long *phys) +{ + struct ext3_extent_idx *ix; + struct ext3_extent *ex; + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + if (*logical < le32_to_cpu(ex->ee_block)) { + BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + while (--depth >= 0) { + ix = path[depth].p_idx; + BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + } + return 0; + } + + BUG_ON(*logical < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)); + + *logical = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1; + *phys = le32_to_cpu(ex->ee_start) + le16_to_cpu(ex->ee_len) - 1; + return 0; +} +EXPORT_SYMBOL(ext3_ext_search_left); + +/* + * search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +int +ext3_ext_search_right(struct inode *inode, struct ext3_ext_path *path, + unsigned long *logical, unsigned long *phys) +{ + struct buffer_head *bh = NULL; + struct ext3_extent_header *eh; + struct ext3_extent_idx *ix; + struct ext3_extent *ex; + unsigned long block; + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + if (*logical < le32_to_cpu(ex->ee_block)) { + BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + while (--depth >= 0) { + ix = path[depth].p_idx; + BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + } + *logical = le32_to_cpu(ex->ee_block); + *phys = le32_to_cpu(ex->ee_start); + return 0; + } + + BUG_ON(*logical < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)); + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + *logical = le32_to_cpu(ex->ee_block); + *phys = le32_to_cpu(ex->ee_start); + return 0; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + break; + } + + if (depth < 0) { + /* we've gone up to the root and + * found no index to the right */ + return 0; + } + + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + block = le32_to_cpu(ix->ei_leaf); + while (++depth < path->p_depth) { + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext3_ext_check_header(inode, eh, path->p_depth - depth)) { + brelse(bh); + return -EIO; + } + ix = EXT_FIRST_INDEX(eh); + block = le32_to_cpu(ix->ei_leaf); + brelse(bh); + } + + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext3_ext_check_header(inode, eh, 0)) { + brelse(bh); + return -EIO; + } + ex = EXT_FIRST_EXTENT(eh); + *logical = le32_to_cpu(ex->ee_block); + *phys = le32_to_cpu(ex->ee_start); + brelse(bh); + return 0; + +} +EXPORT_SYMBOL(ext3_ext_search_right); + + + +/* + * returns allocated block in subsequent extent or EXT_UNSET_BLOCK + * NOTE: it consider block number from index entry as + * allocated block. thus, index entries have to be consistent + * with leafs + */ +static unsigned long +ext3_ext_next_allocated_block(struct ext3_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_UNSET_BLOCK; + + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_ext[1].ee_block); + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + } + depth--; + } + + return EXT_UNSET_BLOCK; +} + +/* + * returns first allocated block from next leaf or EXT_UNSET_BLOCK + */ +static unsigned ext3_ext_next_leaf_block(struct inode *inode, + struct ext3_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_UNSET_BLOCK; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_UNSET_BLOCK; +} + +/* + * if leaf gets modified and modified extent is first in the leaf + * then we have to correct all indexes above + * TODO: do we need to correct tree in all cases? + */ +int ext3_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + struct ext3_extent_header *eh; + int depth = ext_depth(inode); + struct ext3_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + BUG_ON(ex == NULL); + BUG_ON(eh == NULL); + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller then current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + if ((err = ext3_ext_get_access(handle, inode, path + k))) + return err; + path[k].p_idx->ei_block = border; + if ((err = ext3_ext_dirty(handle, inode, path + k))) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + if ((err = ext3_ext_get_access(handle, inode, path + k))) + break; + path[k].p_idx->ei_block = border; + if ((err = ext3_ext_dirty(handle, inode, path + k))) + break; + } + + return err; +} + +static int inline +ext3_can_extents_be_merged(struct inode *inode, struct ext3_extent *ex1, + struct ext3_extent *ex2) +{ + /* FIXME: 48bit support */ + if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) + != le32_to_cpu(ex2->ee_block)) + return 0; + +#ifdef AGRESSIVE_TEST + if (le16_to_cpu(ex1->ee_len) >= 4) + return 0; +#endif + + if (le32_to_cpu(ex1->ee_start) + le16_to_cpu(ex1->ee_len) + == le32_to_cpu(ex2->ee_start)) + return 1; + return 0; +} + +/* + * this routine tries to merge requsted extent into the existing + * extent or inserts requested extent as new one into the tree, + * creating new leaf in no-space case + */ +int ext3_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, + struct ext3_extent *newext) +{ + struct ext3_extent_header * eh; + struct ext3_extent *ex, *fex; + struct ext3_extent *nearex; /* nearest extent */ + struct ext3_ext_path *npath = NULL; + int depth, len, err, next; + + BUG_ON(newext->ee_len == 0); + depth = ext_depth(inode); + ex = path[depth].p_ext; + BUG_ON(path[depth].p_hdr == NULL); + + /* try to insert block into found extent and return */ + if (ex && ext3_can_extents_be_merged(inode, ex, newext)) { + ext_debug(inode, "append %d block to %d:%d (from %d)\n", + le16_to_cpu(newext->ee_len), + le32_to_cpu(ex->ee_block), + le16_to_cpu(ex->ee_len), + le32_to_cpu(ex->ee_start)); + if ((err = ext3_ext_get_access(handle, inode, path + depth))) + return err; + ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len) + + le16_to_cpu(newext->ee_len)); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +repeat: + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? */ + fex = EXT_LAST_EXTENT(eh); + next = ext3_ext_next_leaf_block(inode, path); + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) + && next != EXT_UNSET_BLOCK) { + ext_debug(inode, "next leaf block - %d\n", next); + BUG_ON(npath != NULL); + npath = ext3_ext_find_extent(inode, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug(inode, "next leaf isnt full(%d)\n", + le16_to_cpu(eh->eh_entries)); + path = npath; + goto repeat; + } + ext_debug(inode, "next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + } + + /* + * there is no free space in found leaf + * we're gonna add new leaf in the tree + */ + err = ext3_ext_create_new_leaf(handle, inode, path, newext); + if (err) + goto cleanup; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + if ((err = ext3_ext_get_access(handle, inode, path + depth))) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug(inode, "first extent in the leaf: %d:%d:%d\n", + le32_to_cpu(newext->ee_block), + le32_to_cpu(newext->ee_start), + le16_to_cpu(newext->ee_len)); + path[depth].p_ext = EXT_FIRST_EXTENT(eh); + } else if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { + /* BUG_ON(newext->ee_block == nearex->ee_block); */ + if (nearex != EXT_LAST_EXTENT(eh)) { + len = EXT_MAX_EXTENT(eh) - nearex; + len = (len - 1) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert %d:%d:%d after: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + le32_to_cpu(newext->ee_start), + le16_to_cpu(newext->ee_len), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 2, nearex + 1, len); + } + path[depth].p_ext = nearex + 1; + } else { + BUG_ON(newext->ee_block == nearex->ee_block); + len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); + len = len < 0 ? 0 : len; + ext_debug(inode, "insert %d:%d:%d before: nearest 0x%p, " + "move %d from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + le32_to_cpu(newext->ee_start), + le16_to_cpu(newext->ee_len), + nearex, len, nearex + 1, nearex + 2); + memmove(nearex + 1, nearex, len); + path[depth].p_ext = nearex; + } + + eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1); + nearex = path[depth].p_ext; + nearex->ee_block = newext->ee_block; + nearex->ee_start = newext->ee_start; + nearex->ee_len = newext->ee_len; + /* FIXME: support for large fs */ + nearex->ee_start_hi = 0; + +merge: + /* try to merge extents to the right */ + while (nearex < EXT_LAST_EXTENT(eh)) { + if (!ext3_can_extents_be_merged(inode, nearex, nearex + 1)) + break; + /* merge with next extent! */ + nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len) + + le16_to_cpu(nearex[1].ee_len)); + if (nearex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - nearex - 1) + * sizeof(struct ext3_extent); + memmove(nearex + 1, nearex + 2, len); + } + eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); + BUG_ON(eh->eh_entries == 0); + } + + /* try to merge extents to the left */ + + /* time to correct all indexes above */ + err = ext3_ext_correct_indexes(handle, inode, path); + if (err) + goto cleanup; + + err = ext3_ext_dirty(handle, inode, path + depth); + +cleanup: + if (npath) { + ext3_ext_drop_refs(npath); + kfree(npath); + } + ext3_ext_tree_changed(inode); + ext3_ext_invalidate_cache(inode); + return err; +} + +int ext3_ext_walk_space(struct inode *inode, unsigned long block, + unsigned long num, ext_prepare_callback func, + void *cbdata) +{ + struct ext3_ext_path *path = NULL; + struct ext3_ext_cache cbex; + struct ext3_extent *ex; + unsigned long next, start = 0, end = 0; + unsigned long last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_UNSET_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext3_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext3_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= + le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (next == EXT_UNSET_BLOCK) + end = EXT_MAX_BLOCK; + else if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT3_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = le16_to_cpu(ex->ee_len); + cbex.ec_start = le32_to_cpu(ex->ee_start); + cbex.ec_type = EXT3_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, cbdata); + ext3_ext_drop_refs(path); + + if (err < 0) + break; + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext3_ext_drop_refs(path); + kfree(path); + } + + return err; +} + +static inline void +ext3_ext_put_in_cache(struct inode *inode, __u32 block, + __u32 len, __u32 start, int type) +{ + struct ext3_ext_cache *cex; + BUG_ON(len == 0); + cex = &EXT3_I(inode)->i_cached_extent; + cex->ec_type = type; + cex->ec_block = block; + cex->ec_len = len; + cex->ec_start = start; +} + +/* + * this routine calculate boundaries of the gap requested block fits into + * and cache this gap + */ +static inline void +ext3_ext_put_gap_in_cache(struct inode *inode, struct ext3_ext_path *path, + unsigned long block) +{ + int depth = ext_depth(inode); + unsigned long lblock, len; + struct ext3_extent *ex; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCK; + ext_debug(inode, "cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; + len = le32_to_cpu(ex->ee_block) - block; + ext_debug(inode, "cache gap(before): %lu [%lu:%lu]", + (unsigned long) block, + (unsigned long) le32_to_cpu(ex->ee_block), + (unsigned long) le16_to_cpu(ex->ee_len)); + } else if (block >= le32_to_cpu(ex->ee_block) + + le16_to_cpu(ex->ee_len)) { + lblock = le32_to_cpu(ex->ee_block) + + le16_to_cpu(ex->ee_len); + len = ext3_ext_next_allocated_block(path); + ext_debug(inode, "cache gap(after): [%lu:%lu] %lu", + (unsigned long) le32_to_cpu(ex->ee_block), + (unsigned long) le16_to_cpu(ex->ee_len), + (unsigned long) block); + BUG_ON(len == lblock); + len = len - lblock; + } else { + lblock = len = 0; + BUG(); + } + + ext_debug(inode, " -> %lu:%lu\n", (unsigned long) lblock, len); + ext3_ext_put_in_cache(inode, lblock, len, 0, EXT3_EXT_CACHE_GAP); +} + +static inline int +ext3_ext_in_cache(struct inode *inode, unsigned long block, + struct ext3_extent *ex) +{ + struct ext3_ext_cache *cex; + + cex = &EXT3_I(inode)->i_cached_extent; + + /* has cache valid data? */ + if (cex->ec_type == EXT3_EXT_CACHE_NO) + return EXT3_EXT_CACHE_NO; + + BUG_ON(cex->ec_type != EXT3_EXT_CACHE_GAP && + cex->ec_type != EXT3_EXT_CACHE_EXTENT); + if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { + ex->ee_block = cpu_to_le32(cex->ec_block); + ex->ee_start = cpu_to_le32(cex->ec_start); + ex->ee_start_hi = 0; + ex->ee_len = cpu_to_le16(cex->ec_len); + ext_debug(inode, "%lu cached by %lu:%lu:%lu\n", + (unsigned long) block, + (unsigned long) cex->ec_block, + (unsigned long) cex->ec_len, + (unsigned long) cex->ec_start); + return cex->ec_type; + } + + /* not in cache */ + return EXT3_EXT_CACHE_NO; +} + +/* + * routine removes index from the index block + * it's used in truncate case only. thus all requests are for + * last index in the block only + */ +int ext3_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path) +{ + struct buffer_head *bh; + int err; + unsigned long leaf; + + /* free index block */ + path--; + leaf = le32_to_cpu(path->p_idx->ei_leaf); + BUG_ON(path->p_hdr->eh_entries == 0); + if ((err = ext3_ext_get_access(handle, inode, path))) + return err; + path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1); + if ((err = ext3_ext_dirty(handle, inode, path))) + return err; + ext_debug(inode, "index is empty, remove it, free block %lu\n", leaf); + bh = sb_find_get_block(inode->i_sb, leaf); + ext3_forget(handle, 1, inode, bh, leaf); + ext3_free_blocks(handle, inode, leaf, 1); + return err; +} + +/* + * This routine returns max. credits extent tree can consume. + * It should be OK for low-performance paths like ->writepage() + * To allow many writing process to fit a single transaction, + * caller should calculate credits under truncate_mutex and + * pass actual path. + */ +int inline ext3_ext_calc_credits_for_insert(struct inode *inode, + struct ext3_ext_path *path) +{ + int depth, needed; + + if (path) { + /* probably there is space in leaf? */ + depth = ext_depth(inode); + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) + return 1; + } + + /* + * given 32bit logical block (4294967296 blocks), max. tree + * can be 4 levels in depth -- 4 * 340^4 == 53453440000. + * let's also add one more level for imbalance. + */ + depth = 5; + + /* allocation of new data block(s) */ + needed = 2; + + /* + * tree can be full, so it'd need to grow in depth: + * we need one credit to modify old root, credits for + * new root will be added in split accounting + */ + needed += 1; + + /* + * Index split can happen, we'd need: + * allocate intermediate indexes (bitmap + group) + * + change two blocks at each level, but root (already included) + */ + needed += (depth * 2) + (depth * 2); + + /* any allocation modifies superblock */ + needed += 1; + + return needed; +} + +static int ext3_remove_blocks(handle_t *handle, struct inode *inode, + struct ext3_extent *ex, + unsigned long from, unsigned long to) +{ + struct buffer_head *bh; + int i; + +#ifdef EXTENTS_STATS + { + struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + unsigned short ee_len = le16_to_cpu(ex->ee_len); + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); + } +#endif + if (from >= le32_to_cpu(ex->ee_block) + && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + /* tail removal */ + unsigned long num, start; + num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from; + start = le32_to_cpu(ex->ee_start) + le16_to_cpu(ex->ee_len) - num; + ext_debug(inode, "free last %lu blocks starting %lu\n", num, start); + for (i = 0; i < num; i++) { + bh = sb_find_get_block(inode->i_sb, start + i); + ext3_forget(handle, 0, inode, bh, start + i); + } + ext3_free_blocks(handle, inode, start, num); + } else if (from == le32_to_cpu(ex->ee_block) + && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { + printk("strange request: removal %lu-%lu from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + } else { + printk("strange request: removal(2) %lu-%lu from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len)); + } + return 0; +} + +static int +ext3_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext3_ext_path *path, unsigned long start) +{ + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits; + struct ext3_extent_header *eh; + unsigned a, b, block, num; + unsigned long ex_ee_block; + unsigned short ex_ee_len; + struct ext3_extent *ex; + + /* the header must be checked already in ext3_ext_remove_space() */ + ext_debug(inode, "truncate since %lu in leaf\n", start); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + BUG_ON(eh == NULL); + + /* find where to start removing */ + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = le16_to_cpu(ex->ee_len); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + ext_debug(inode, "remove ext %lu:%u\n", ex_ee_block, ex_ee_len); + path[depth].p_ext = ex; + + a = ex_ee_block > start ? ex_ee_block : start; + b = (unsigned long long)ex_ee_block + ex_ee_len - 1 < + EXT_MAX_BLOCK ? ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; + + ext_debug(inode, " border %u:%u\n", a, b); + + if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { + block = 0; + num = 0; + BUG(); + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + block = ex_ee_block; + num = a - block; + } else if (b != ex_ee_block + ex_ee_len - 1) { + /* remove head of the extent */ + block = a; + num = b - a; + /* there is no "make a hole" API yet */ + BUG(); + } else { + /* remove whole extent: excellent! */ + block = ex_ee_block; + num = 0; + BUG_ON(a != ex_ee_block); + BUG_ON(b != ex_ee_block + ex_ee_len - 1); + } + + /* at present, extent can't cross block group */ + /* leaf + bitmap + group desc + sb + inode */ + credits = 5; + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } +#ifdef CONFIG_QUOTA + credits += 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); +#endif + + handle = ext3_ext_journal_restart(handle, credits); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + + err = ext3_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext3_remove_blocks(handle, inode, ex, a, b); + if (err) + goto out; + + if (num == 0) { + /* this extent is removed entirely mark slot unused */ + ex->ee_start = ex->ee_start_hi = 0; + eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); + } + + ex->ee_block = cpu_to_le32(block); + ex->ee_len = cpu_to_le16(num); + + err = ext3_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug(inode, "new extent: %u:%u:%u\n", block, num, + le32_to_cpu(ex->ee_start)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = le16_to_cpu(ex->ee_len); + } + + if (correct_index && eh->eh_entries) + err = ext3_ext_correct_indexes(handle, inode, path); + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext3_ext_rm_idx(handle, inode, path + depth); + +out: + return err; +} + +/* + * returns 1 if current index have to be freed (even partial) + */ +static int inline +ext3_ext_more_to_rm(struct ext3_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened it it wasn't partial + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +int ext3_ext_remove_space(struct inode *inode, unsigned long start) +{ + struct super_block *sb = inode->i_sb; + int depth = ext_depth(inode); + struct ext3_ext_path *path; + handle_t *handle; + int i = 0, err = 0; + + ext_debug(inode, "truncate since %lu\n", start); + + /* probably first extent we're gonna free will be last in block */ + handle = ext3_journal_start(inode, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext3_ext_invalidate_cache(inode); + + /* + * we start scanning from right side freeing all the blocks + * after i_size and walking into the deep + */ + path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); + if (path == NULL) { + ext3_journal_stop(handle); + return -ENOMEM; + } + memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); + path[0].p_hdr = ext_inode_hdr(inode); + if (ext3_ext_check_header(inode, path[0].p_hdr, depth)) { + err = -EIO; + goto out; + } + path[0].p_depth = depth; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext3_ext_rm_leaf(handle, inode, path, start); + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug(inode, "initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we've already was here, see at next index */ + path[i].p_idx--; + } + + ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext3_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug(inode, "move to level %d (block %d)\n", + i + 1, le32_to_cpu(path[i].p_idx->ei_leaf)); + memset(path + i + 1, 0, sizeof(*path)); + bh = sb_bread(sb, le32_to_cpu(path[i].p_idx->ei_leaf)); + if (!bh) { + /* should we reset i_size? */ + err = -EIO; + break; + } + BUG_ON(i + 1 > depth); + if (ext3_ext_check_header(inode, ext_block_hdr(bh), + depth - i - 1)) { + err = -EIO; + break; + } + path[i+1].p_bh = bh; + + /* put actual number of indexes to know is this + * number got changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finish processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext3_ext_rm_idx(handle, inode, path + i); + } + /* root level have p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + ext_debug(inode, "return to level %d\n", i); + } + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree + * so, we need to correct eh_depth + */ + err = ext3_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext3_ext_space_root(inode)); + err = ext3_ext_dirty(handle, inode, path); + } + } +out: + ext3_ext_tree_changed(inode); + ext3_ext_drop_refs(path); + kfree(path); + ext3_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext3_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (test_opt(sb, EXTENTS)) { + printk("EXT3-fs: file extents enabled"); +#ifdef AGRESSIVE_TEST + printk(", agressive tests"); +#endif +#ifdef CHECK_BINSEARCH + printk(", check binsearch"); +#endif +#ifdef EXTENTS_STATS + printk(", stats"); +#endif + printk("\n"); +#ifdef EXTENTS_STATS + spin_lock_init(&EXT3_SB(sb)->s_ext_stats_lock); + EXT3_SB(sb)->s_ext_min = 1 << 30; + EXT3_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext3_ext_release(struct super_block *sb) +{ + if (!test_opt(sb, EXTENTS)) + return; + +#ifdef EXTENTS_STATS + if (EXT3_SB(sb)->s_ext_blocks && EXT3_SB(sb)->s_ext_extents) { + struct ext3_sb_info *sbi = EXT3_SB(sb); + printk(KERN_ERR "EXT3-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / sbi->s_ext_extents); + printk(KERN_ERR "EXT3-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +int ext3_ext_get_blocks(handle_t *handle, struct inode *inode, sector_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create, int extend_disksize) +{ + struct ext3_ext_path *path = NULL; + struct ext3_extent newex, *ex; + int goal, newblock, err = 0, depth; + unsigned long allocated = 0; + unsigned long next; + + __clear_bit(BH_New, &bh_result->b_state); + ext_debug(inode, "blocks %d/%lu requested for inode %u\n", (int) iblock, + max_blocks, (unsigned) inode->i_ino); + mutex_lock(&EXT3_I(inode)->truncate_mutex); + + /* check in cache */ + if ((goal = ext3_ext_in_cache(inode, iblock, &newex))) { + if (goal == EXT3_EXT_CACHE_GAP) { + if (!create) { + /* block isn't allocated yet and + * user don't want to allocate it */ + goto out2; + } + /* we should allocate requested block */ + } else if (goal == EXT3_EXT_CACHE_EXTENT) { + /* block is already allocated */ + newblock = iblock + - le32_to_cpu(newex.ee_block) + + le32_to_cpu(newex.ee_start); + /* number of remain blocks in the extent */ + BUG_ON(iblock < le32_to_cpu(newex.ee_block)); + allocated = le16_to_cpu(newex.ee_len) - + (iblock - le32_to_cpu(newex.ee_block)); + goto out; + } else { + BUG(); + } + } + + /* find extent for this block */ + path = ext3_ext_find_extent(inode, iblock, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty + * this situations is possible, though, _during_ tree modification + * this is why assert can't be put in ext3_ext_find_extent() + */ + BUG_ON(path[depth].p_ext == NULL && depth != 0); + + if ((ex = path[depth].p_ext)) { + unsigned long ee_block = le32_to_cpu(ex->ee_block); + unsigned long ee_start = le32_to_cpu(ex->ee_start); + unsigned short ee_len = le16_to_cpu(ex->ee_len); + /* if found exent covers block, simple return it */ + if (iblock >= ee_block && iblock < ee_block + ee_len) { + newblock = iblock - ee_block + ee_start; + /* number of remain blocks in the extent */ + allocated = ee_len - (iblock - ee_block); + ext_debug(inode, "%d fit into %lu:%d -> %d\n", (int) iblock, + ee_block, ee_len, newblock); + ext3_ext_put_in_cache(inode, ee_block, ee_len, + ee_start, EXT3_EXT_CACHE_EXTENT); + goto out; + } + } + + /* + * requested block isn't allocated yet + * we couldn't try to create block if create flag is zero + */ + if (!create) { + /* put just found gap into cache to speedup subsequest reqs */ + ext3_ext_put_gap_in_cache(inode, path, iblock); + goto out2; + } + + /* + * Okay, we need to do block allocation. Lazily initialize the block + * allocation info here if necessary + */ + if (S_ISREG(inode->i_mode) && (!EXT3_I(inode)->i_block_alloc_info)) + ext3_init_block_alloc_info(inode); + + /* find next allocated block so that we know how many + * blocks we can allocate without ovelapping next extent */ + BUG_ON(iblock < le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)); + next = ext3_ext_next_allocated_block(path); + BUG_ON(next <= iblock); + allocated = next - iblock; + if (allocated > max_blocks) + allocated = max_blocks; + + /* allocate new block */ + goal = ext3_ext_find_goal(inode, path, iblock); + newblock = ext3_new_blocks(handle, inode, goal, &allocated, &err); + if (!newblock) + goto out2; + ext_debug(inode, "allocate new block: goal %d, found %d/%lu\n", + goal, newblock, allocated); + + /* try to insert new extent into found leaf and return */ + newex.ee_block = cpu_to_le32(iblock); + newex.ee_start = cpu_to_le32(newblock); + newex.ee_start_hi = 0; + newex.ee_len = cpu_to_le16(allocated); + err = ext3_ext_insert_extent(handle, inode, path, &newex); + if (err) { + /* free data blocks we just allocated */ + ext3_free_blocks(handle, inode, le32_to_cpu(newex.ee_start), + le16_to_cpu(newex.ee_len)); + goto out2; + } + + if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize) + EXT3_I(inode)->i_disksize = inode->i_size; + + /* previous routine could use block we allocated */ + newblock = le32_to_cpu(newex.ee_start); + __set_bit(BH_New, &bh_result->b_state); + + ext3_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT3_EXT_CACHE_EXTENT); +out: + if (allocated > max_blocks) + allocated = max_blocks; + ext3_ext_show_leaf(inode, path); + __set_bit(BH_Mapped, &bh_result->b_state); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; + bh_result->b_size = (allocated << inode->i_blkbits); +out2: + if (path) { + ext3_ext_drop_refs(path); + kfree(path); + } + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + + return err ? err : allocated; +} + +void ext3_ext_truncate(struct inode * inode, struct page *page) +{ + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + unsigned long last_block; + handle_t *handle; + int err = 0; + + /* + * probably first extent we're gonna free will be last in block + */ + err = ext3_writepage_trans_blocks(inode) + 3; + handle = ext3_journal_start(inode, err); + if (IS_ERR(handle)) { + if (page) { + clear_highpage(page); + flush_dcache_page(page); + unlock_page(page); + page_cache_release(page); + } + return; + } + + if (page) + ext3_block_truncate_page(handle, page, mapping, inode->i_size); + + mutex_lock(&EXT3_I(inode)->truncate_mutex); + ext3_ext_invalidate_cache(inode); + + /* + * TODO: optimization is possible here + * probably we need not scaning at all, + * because page truncation is enough + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* we have to know where to truncate from in crash case */ + EXT3_I(inode)->i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT3_BLOCK_SIZE_BITS(sb); + err = ext3_ext_remove_space(inode, last_block); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; + +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + mutex_unlock(&EXT3_I(inode)->truncate_mutex); + ext3_journal_stop(handle); +} + +/* + * this routine calculate max number of blocks we could modify + * in order to allocate new block for an inode + */ +int ext3_ext_writepage_trans_blocks(struct inode *inode, int num) +{ + int needed; + + needed = ext3_ext_calc_credits_for_insert(inode, NULL); + + /* caller want to allocate num blocks, but note it includes sb */ + needed = needed * num - (num - 1); + +#ifdef CONFIG_QUOTA + needed += 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); +#endif + + return needed; +} + +EXPORT_SYMBOL(ext3_mark_inode_dirty); +EXPORT_SYMBOL(ext3_ext_invalidate_cache); +EXPORT_SYMBOL(ext3_ext_insert_extent); +EXPORT_SYMBOL(ext3_ext_walk_space); +EXPORT_SYMBOL(ext3_ext_find_goal); +EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert); Index: linux-stage/fs/ext3/ialloc.c =================================================================== --- linux-stage.orig/fs/ext3/ialloc.c +++ linux-stage/fs/ext3/ialloc.c @@ -651,6 +651,17 @@ got: ext3_std_error(sb, err); goto fail_free_drop; } + if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) { + EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; + ext3_ext_tree_init(handle, inode); + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) { + err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); + if (err) goto fail; + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS); + BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } + } ext3_debug("allocating inode %lu\n", inode->i_ino); goto really_out; Index: linux-stage/fs/ext3/inode.c =================================================================== --- linux-stage.orig/fs/ext3/inode.c +++ linux-stage/fs/ext3/inode.c @@ -40,8 +40,6 @@ #include "iopen.h" #include "acl.h" -static int ext3_writepage_trans_blocks(struct inode *inode); - /* * Test whether an inode is a fast symlink. */ @@ -804,6 +802,7 @@ int ext3_get_blocks_handle(handle_t *han ext3_fsblk_t first_block = 0; + J_ASSERT(!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)); J_ASSERT(handle != NULL || create == 0); depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary); @@ -984,12 +983,10 @@ static int ext3_get_block(struct inode * get_block: if (ret == 0) { - ret = ext3_get_blocks_handle(handle, inode, iblock, + ret = ext3_get_blocks_wrap(handle, inode, iblock, max_blocks, bh_result, create, 0); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); + if (ret > 0) ret = 0; - } } return ret; } @@ -1008,7 +1005,7 @@ struct buffer_head *ext3_getblk(handle_t dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext3_get_blocks_handle(handle, inode, block, 1, + err = ext3_get_blocks_wrap(handle, inode, block, 1, &dummy, create, 1); /* * ext3_get_blocks_handle() returns number of blocks @@ -1759,7 +1756,7 @@ void ext3_set_aops(struct inode *inode) * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ -static int ext3_block_truncate_page(handle_t *handle, struct page *page, +int ext3_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from) { ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -2263,6 +2260,9 @@ void ext3_truncate(struct inode *inode) return; } + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_truncate(inode, page); + handle = start_transaction(inode); if (IS_ERR(handle)) { if (page) { @@ -3005,12 +3005,15 @@ err_out: * block and work out the exact number of indirects which are touched. Pah. */ -static int ext3_writepage_trans_blocks(struct inode *inode) +int ext3_writepage_trans_blocks(struct inode *inode) { int bpp = ext3_journal_blocks_per_page(inode); int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_writepage_trans_blocks(inode, bpp); + if (ext3_should_journal_data(inode)) ret = 3 * (bpp + indirects) + 2; else @@ -3257,7 +3260,7 @@ int ext3_map_inode_page(struct inode *in if (blocks[i] != 0) continue; - rc = ext3_get_blocks_handle(handle, inode, iblock, 1, &dummy, 1, 1); + rc = ext3_get_blocks_wrap(handle, inode, iblock, 1, &dummy, 1, 1); if (rc < 0) { printk(KERN_INFO "ext3_map_inode_page: error reading " "block %ld\n", iblock); Index: linux-stage/fs/ext3/Makefile =================================================================== --- linux-stage.orig/fs/ext3/Makefile +++ linux-stage/fs/ext3/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-stage/fs/ext3/super.c =================================================================== --- linux-stage.orig/fs/ext3/super.c +++ linux-stage/fs/ext3/super.c @@ -391,6 +391,7 @@ static void ext3_put_super (struct super struct ext3_super_block *es = sbi->s_es; int i; + ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { @@ -455,6 +456,8 @@ static struct inode *ext3_alloc_inode(st #endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); return &ei->vfs_inode; } @@ -680,7 +683,8 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, - Opt_grpquota + Opt_grpquota, + Opt_extents, Opt_noextents, Opt_extdebug, }; static match_table_t tokens = { @@ -733,6 +737,9 @@ static match_table_t tokens = { {Opt_noiopen, "noiopen"}, {Opt_iopen_nopriv, "iopen_nopriv"}, {Opt_barrier, "barrier=%u"}, + {Opt_extents, "extents"}, + {Opt_noextents, "noextents"}, + {Opt_extdebug, "extdebug"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -1077,6 +1084,15 @@ clear_qf_name: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; + case Opt_extents: + set_opt (sbi->s_mount_opt, EXTENTS); + break; + case Opt_noextents: + clear_opt (sbi->s_mount_opt, EXTENTS); + break; + case Opt_extdebug: + set_opt (sbi->s_mount_opt, EXTDEBUG); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " @@ -1812,6 +1828,8 @@ static int ext3_fill_super (struct super test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + ext3_ext_init(sb); + lock_kernel(); return 0; Index: linux-stage/include/linux/ext3_extents.h =================================================================== --- /dev/null +++ linux-stage/include/linux/ext3_extents.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _LINUX_EXT3_EXTENTS +#define _LINUX_EXT3_EXTENTS + +#include + +/* + * with AGRESSIVE_TEST defined capacity of index/leaf blocks + * become very little, so index split, in-depth growing and + * other hard changes happens much more often + * this is for debug purposes only + */ +#define AGRESSIVE_TEST_ + +/* + * with EXTENTS_STATS defined number of blocks and extents + * are collected in truncate path. they'll be showed at + * umount time + */ +#define EXTENTS_STATS__ + +/* + * if CHECK_BINSEARCH defined, then results of binary search + * will be checked by linear search + */ +#define CHECK_BINSEARCH__ + +/* + * if EXT_DEBUG is defined you can use 'extdebug' mount option + * to get lots of info what's going on + */ +#define EXT_DEBUG_ +#ifdef EXT_DEBUG +#define ext_debug(inode,fmt,a...) \ +do { \ + if (test_opt(inode->i_sb, EXTDEBUG)) \ + printk(fmt, ##a); \ +} while (0); +#else +#define ext_debug(inode,fmt,a...) +#endif + + +/* + * if EXT_STATS is defined then stats numbers are collected + * these number will be displayed at umount time + */ +#define EXT_STATS_ + +/* + * define EXT3_ALLOC_NEEDED to 0 since block bitmap, group desc. and sb + * are now accounted in ext3_ext_calc_credits_for_insert() + */ +#define EXT3_ALLOC_NEEDED 0 + +/* + * ext3_inode has i_block array (60 bytes total) + * first 12 bytes store ext3_extent_header + * the remain stores array of ext3_extent + */ + +/* + * this is extent on-disk structure + * it's used at the bottom of the tree + */ +struct ext3_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start; /* low 32 bigs of physical block */ +}; + +/* + * this is index on-disk structure + * it's used at all the levels, but the bottom + */ +struct ext3_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf; /* pointer to the physical block of the next * + * level. leaf or next index could bet here */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * each block (leaves and indexes), even inode-stored has header + */ +struct ext3_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlaying blocks? */ + __le32 eh_generation; /* flags(8 bits) | generation of the tree */ +}; + +#define EXT3_EXT_MAGIC 0xf30a + +/* + * array of ext3_ext_path contains path to some extent + * creation/lookup routines use it for traversal/splitting/etc + * truncate uses it to simulate recursive walking + */ +struct ext3_ext_path { + __u32 p_block; + __u16 p_depth; + struct ext3_extent *p_ext; + struct ext3_extent_idx *p_idx; + struct ext3_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +#define EXT3_EXT_CACHE_NO 0 +#define EXT3_EXT_CACHE_GAP 1 +#define EXT3_EXT_CACHE_EXTENT 2 +#define EXT3_EXT_HAS_NO_TREE /* ext3_extents_tree struct is not used*/ + +/* + * to be called by ext3_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext3_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext3_ext_path *, + struct ext3_ext_cache *, + void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + + +#define EXT_MAX_BLOCK 0xffffffff +#define EXT_UNSET_BLOCK 1 + +#define EXT_FLAGS_CLR_UNKNOWN 0x7 /* Flags cleared on modification */ +#define EXT_HDR_GEN_BITS 24 +#define EXT_HDR_GEN_MASK ((1 << EXT_HDR_GEN_BITS) - 1) + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext3_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + + +static inline struct ext3_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext3_extent_header *) EXT3_I(inode)->i_data; +} + +static inline struct ext3_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext3_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline unsigned short ext_flags(struct ext3_extent_header *neh) +{ + return le16_to_cpu(neh->eh_generation) >> EXT_HDR_GEN_BITS; +} + +static inline unsigned short ext_hdr_gen(struct ext3_extent_header *neh) +{ + return le16_to_cpu(neh->eh_generation) & EXT_HDR_GEN_MASK; +} + +static inline unsigned short ext_generation(struct inode *inode) +{ + return ext_hdr_gen(ext_inode_hdr(inode)); +} + +static inline void ext3_ext_tree_changed(struct inode *inode) +{ + struct ext3_extent_header *neh = ext_inode_hdr(inode); + neh->eh_generation = cpu_to_le32( + ((ext_flags(neh) & ~EXT_FLAGS_CLR_UNKNOWN) << EXT_HDR_GEN_BITS) + | ((ext_hdr_gen(neh) + 1) & EXT_HDR_GEN_MASK)); +} + +static inline void +ext3_ext_invalidate_cache(struct inode *inode) +{ + EXT3_I(inode)->i_cached_extent.ec_type = EXT3_EXT_CACHE_NO; +} + +extern int ext3_ext_search_left(struct inode *, struct ext3_ext_path *, unsigned long *, unsigned long *); +extern int ext3_ext_search_right(struct inode *, struct ext3_ext_path *, unsigned long *, unsigned long *); +extern int ext3_extent_tree_init(handle_t *, struct inode *); +extern int ext3_ext_calc_credits_for_insert(struct inode *, struct ext3_ext_path *); +extern int ext3_ext_insert_extent(handle_t *, struct inode *, struct ext3_ext_path *, struct ext3_extent *); +extern int ext3_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *); +extern struct ext3_ext_path * ext3_ext_find_extent(struct inode *, int, struct ext3_ext_path *); + +#endif /* _LINUX_EXT3_EXTENTS */ + Index: linux-stage/include/linux/ext3_fs.h =================================================================== --- linux-stage.orig/include/linux/ext3_fs.h +++ linux-stage/include/linux/ext3_fs.h @@ -182,8 +182,9 @@ struct ext3_group_desc #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ +#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ -#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ /* @@ -373,6 +374,8 @@ struct ext3_inode { #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ #define EXT3_MOUNT_IOPEN 0x400000 /* Allow access via iopen */ #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ +#define EXT3_MOUNT_EXTENTS 0x2000000/* Extents support */ +#define EXT3_MOUNT_EXTDEBUG 0x4000000/* Extents debug */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt @@ -572,11 +575,13 @@ static inline int ext3_valid_inum(struct #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ EXT3_FEATURE_INCOMPAT_RECOVER| \ - EXT3_FEATURE_INCOMPAT_META_BG) + EXT3_FEATURE_INCOMPAT_META_BG| \ + EXT3_FEATURE_INCOMPAT_EXTENTS) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) @@ -816,6 +821,9 @@ extern int ext3_get_inode_loc(struct ino extern void ext3_truncate (struct inode *); extern void ext3_set_inode_flags(struct inode *); extern void ext3_set_aops(struct inode *inode); +extern int ext3_writepage_trans_blocks(struct inode *); +extern int ext3_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from); /* ioctl.c */ extern int ext3_ioctl (struct inode *, struct file *, unsigned int, @@ -869,6 +877,30 @@ extern struct inode_operations ext3_spec extern struct inode_operations ext3_symlink_inode_operations; extern struct inode_operations ext3_fast_symlink_inode_operations; +/* extents.c */ +extern int ext3_ext_tree_init(handle_t *handle, struct inode *); +extern int ext3_ext_writepage_trans_blocks(struct inode *, int); +extern int ext3_ext_get_blocks(handle_t *, struct inode *, sector_t, + unsigned long, struct buffer_head *, int, int); +extern void ext3_ext_truncate(struct inode *, struct page *); +extern void ext3_ext_init(struct super_block *); +extern void ext3_ext_release(struct super_block *); +static inline int +ext3_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, + unsigned long max_blocks, struct buffer_head *bh, + int create, int extend_disksize) +{ + int ret; + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) + return ext3_ext_get_blocks(handle, inode, block, max_blocks, + bh, create, extend_disksize); + ret = ext3_get_blocks_handle(handle, inode, block, max_blocks, bh, create, + extend_disksize); + if (ret > 0) + bh->b_size = (ret << inode->i_blkbits); + return ret; +} + #endif /* __KERNEL__ */ Index: linux-stage/include/linux/ext3_fs_i.h =================================================================== --- linux-stage.orig/include/linux/ext3_fs_i.h +++ linux-stage/include/linux/ext3_fs_i.h @@ -65,6 +65,16 @@ struct ext3_block_alloc_info { #define rsv_end rsv_window._rsv_end /* + * storage for cached extent + */ +struct ext3_ext_cache { + __u32 ec_start; + __u32 ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* * third extended file system inode data in memory */ struct ext3_inode_info { @@ -142,6 +152,8 @@ struct ext3_inode_info { */ struct mutex truncate_mutex; struct inode vfs_inode; + + struct ext3_ext_cache i_cached_extent; }; #endif /* _LINUX_EXT3_FS_I */ Index: linux-stage/include/linux/ext3_fs_sb.h =================================================================== --- linux-stage.orig/include/linux/ext3_fs_sb.h +++ linux-stage/include/linux/ext3_fs_sb.h @@ -78,6 +78,16 @@ struct ext3_sb_info { char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif + +#ifdef EXTENTS_STATS + /* ext3 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif }; #endif /* _LINUX_EXT3_FS_SB */ Index: linux-stage/include/linux/ext3_jbd.h =================================================================== --- linux-stage.orig/include/linux/ext3_jbd.h +++ linux-stage/include/linux/ext3_jbd.h @@ -23,12 +23,20 @@ /* Define the number of blocks we need to account to a transaction to * modify one block of data. - * + * * We may have to touch one inode, one bitmap buffer, up to three * indirection blocks, the group and superblock summaries, and the data - * block to complete the transaction. */ + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify upto + * 5 levels of tree + root which is stored in inode. */ + +#define EXT3_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 27U : 8U) -#define EXT3_SINGLEDATA_TRANS_BLOCKS 8U +/* Indicate that EXT3_SINGLEDATA_TRANS_BLOCKS takes the sb as argument */ +#define EXT3_SINGLEDATA_TRANS_BLOCKS_HAS_SB /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -42,7 +50,7 @@ * superblock only gets updated once, of course, so don't bother * counting that again for the quota updates. */ -#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS + \ +#define EXT3_DATA_TRANS_BLOCKS(sb) (EXT3_SINGLEDATA_TRANS_BLOCKS(sb) + \ EXT3_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT3_QUOTA_TRANS_BLOCKS(sb)) @@ -78,9 +86,9 @@ /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0) + (EXT3_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) #define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ - (EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0) + (EXT3_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) #else #define EXT3_QUOTA_TRANS_BLOCKS(sb) 0 #define EXT3_QUOTA_INIT_BLOCKS(sb) 0