>From b179b114b7ef6f4df1520e0013b8efe631ff577d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 15 Jan 2008 10:04:43 +0100 Subject: [PATCH] fs: add extent_map library Signed-off-by: Jens Axboe --- fs/Makefile | 2 +- fs/dcache.c | 2 + fs/extent_map.c | 402 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/extent_map.h | 58 +++++++ 4 files changed, 463 insertions(+), 1 deletions(-) create mode 100644 fs/extent_map.c create mode 100644 include/linux/extent_map.h diff --git a/fs/Makefile b/fs/Makefile index 500cf15..01b37aa 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o extent_map.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/dcache.c b/fs/dcache.c index d9ca1e5..edd9faa 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "internal.h" @@ -2170,6 +2171,7 @@ void __init vfs_caches_init(unsigned long mempages) dcache_init(); inode_init(); + extent_map_init(); files_init(mempages); mnt_init(); bdev_cache_init(); diff --git a/fs/extent_map.c b/fs/extent_map.c new file mode 100644 index 0000000..1e9ff6e --- /dev/null +++ b/fs/extent_map.c @@ -0,0 +1,402 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_MEM_SPREAD, NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void __exit extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +void extent_map_tree_init(struct extent_map_tree *tree) +{ + tree->map.rb_node = NULL; + tree->last = NULL; + rwlock_init(&tree->lock); +} +EXPORT_SYMBOL(extent_map_tree_init); + +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + atomic_set(&em->refs, 1); + em->flags = 0; + return em; +} +EXPORT_SYMBOL(alloc_extent_map); + +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + if (atomic_dec_and_test(&em->refs)) + kmem_cache_free(extent_map_cache, em); +} +EXPORT_SYMBOL(free_extent_map); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node ** p = &root->rb_node; + struct rb_node * parent = NULL; + struct extent_map *entry; + + while(*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= entry->start + entry->len) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret) +{ + struct rb_node * n = root->rb_node; + struct rb_node *prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while(n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= entry->start + entry->len) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + while(prev && (offset >= prev_entry->start + prev_entry->len)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + return NULL; +} + +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev); + if (!ret) + return prev; + return ret; +} + +static int tree_delete(struct rb_root *root, u64 offset) +{ + struct rb_node *node; + struct extent_map *entry; + + node = __tree_search(root, offset, NULL); + if (!node) + return -ENOENT; + entry = rb_entry(node, struct extent_map, rb_node); + rb_erase(node, root); + return 0; +} + +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +/* + * add_extent_mapping tries a simple forward/backward merge with existing + * mappings. The extent_map struct passed in will be inserted into + * the tree directly (no copies made, just a reference taken). + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + unsigned long flags; + + write_lock_irqsave(&tree->lock, flags); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_start = merge->block_start; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + tree->last = em; +out: + write_unlock_irqrestore(&tree->lock, flags); + return ret; +} +EXPORT_SYMBOL(add_extent_mapping); + +/* + * lookup_extent_mapping returns the first extent_map struct in the + * tree that intersects the [start, len] range. There may + * be additional objects in the tree that intersect, so check the object + * returned carefully to make sure you don't need additional lookups. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct extent_map *last; + struct rb_node *rb_node; + unsigned long flags; + + read_lock_irqsave(&tree->lock, flags); + last = tree->last; + if (last && start >= last->start && + start + len <= extent_map_end(last)) { + em = last; + atomic_inc(&em->refs); + goto out; + } + rb_node = tree_search(&tree->map, start); + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (extent_map_end(em) <= start || em->start >= start + len) { + em = NULL; + goto out; + } + atomic_inc(&em->refs); + tree->last = em; +out: + read_unlock_irqrestore(&tree->lock, flags); + return em; +} +EXPORT_SYMBOL(lookup_extent_mapping); + +/* + * removes an extent_map struct from the tree. No reference counts are + * dropped, and no checks are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret; + unsigned long flags; + + write_lock_irqsave(&tree->lock, flags); + ret = tree_delete(&tree->map, em->start); + tree->last = NULL; + write_unlock_irqrestore(&tree->lock, flags); + return ret; +} +EXPORT_SYMBOL(remove_extent_mapping); + +static struct extent_map *__map_extent(struct extent_map_tree *tree, + struct address_space *mapping, + u64 start, u64 len, int create, + gfp_t gfp_mask, get_block_t get_block) +{ + struct inode *inode = mapping->host; + struct extent_map *em; + struct buffer_head result; + sector_t start_block; + u64 cur_len; + int ret; + +again: + em = lookup_extent_mapping(tree, start, len); + if (em) { + /* + * we may have found an extent that starts after the + * requested range. Double check and alter the length + * appropriately + */ + if (em->start > start) { + len = em->start - start; + } else if (!create || em->block_start != EXTENT_MAP_HOLE) { + return em; + } + free_extent_map(em); + + } + if (gfp_mask & GFP_ATOMIC) + return NULL; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return ERR_PTR(-ENOMEM); + + len = min_t(u64, len, (size_t)-1); + result.b_state = 0; + result.b_size = len; + start_block = start >> inode->i_blkbits; + + if (len < inode->i_sb->s_blocksize) { + printk("warning2: mapping length %Lu\n", len); + } + + /* + * FIXME if there are errors later on, we end up exposing stale + * data on disk while filling holes. + */ + ret = get_block(inode, start_block, + &result, create); + if (ret < 0) { + free_extent_map(em); + return ERR_PTR(ret); + } + + cur_len = result.b_size; + em->start = start; + em->len = cur_len; + em->bdev = result.b_bdev; + + if (create && buffer_new(&result)) { + remove_extent_mappings(tree, em->start, em->len); + em->flags = (1 << EXTENT_MAP_HOLE_FILLED); + } + + if (buffer_mapped(&result)) + em->block_start = (u64)result.b_blocknr << inode->i_blkbits; + else { + em->block_start = EXTENT_MAP_HOLE; + if (create) { + free_extent_map(em); + return ERR_PTR(-EIO); + } + } + ret = add_extent_mapping(tree, em); + if (ret == -EEXIST) { + free_extent_map(em); + goto again; + } + return em; +} + +struct extent_map *map_extent_get_block(struct extent_map_tree *tree, + struct address_space *mapping, + u64 start, u64 len, int create, + gfp_t gfp_mask, get_block_t get_block) +{ + struct extent_map *em; + u64 last; + u64 map_ahead_len = 0; + + em = __map_extent(tree, mapping, start, len, create, + gfp_mask, get_block); + + /* + * if we're doing a write or we found a large extent, return it + */ + if (IS_ERR(em) || !em || create || start + len < extent_map_end(em)) { + return em; + } + + /* + * otherwise, try to walk forward a bit and see if we can build + * something bigger. + */ + do { + last = extent_map_end(em); + free_extent_map(em); + em = __map_extent(tree, mapping, last, len, create, + gfp_mask, get_block); + if (IS_ERR(em) || !em) + break; + map_ahead_len += extent_map_end(em) - last; + } while(em->start <= start && start + len <= extent_map_end(em) && + em->block_start < EXTENT_MAP_LAST_BYTE && + map_ahead_len < (512 * 1024)); + + /* make sure we return the extent for this range */ + if (!em || IS_ERR(em) || em->start > start || + start + len > extent_map_end(em)) { + free_extent_map(em); + em = __map_extent(tree, mapping, start, len, create, + gfp_mask, get_block); + } + return em; +} +EXPORT_SYMBOL(map_extent_get_block); + +int remove_extent_mappings(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + + while((em = lookup_extent_mapping(tree, start, len))) { + remove_extent_mapping(tree, em); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } + return 0; +} +EXPORT_SYMBOL(remove_extent_mappings); diff --git a/include/linux/extent_map.h b/include/linux/extent_map.h new file mode 100644 index 0000000..7cdc71c --- /dev/null +++ b/include/linux/extent_map.h @@ -0,0 +1,58 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include + +/* special values for struct extent_map->block_start */ +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bit flags for struct extent_map->flags */ +#define EXTENT_MAP_COMMIT_REQUIRED 1 +#define EXTENT_MAP_HOLE_FILLED 2 + +struct extent_map_tree { + struct rb_root map; + rwlock_t lock; + struct extent_map *last; +}; + +struct extent_map { + struct rb_node rb_node; + u64 start; + u64 len; + u64 block_start; + struct block_device *bdev; + atomic_t refs; + unsigned long flags; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + return em->block_start + em->len; +} + +void extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 end); +struct extent_map *map_extent_get_block(struct extent_map_tree *tree, + struct address_space *mapping, + u64 start, u64 len, int create, + gfp_t gfp_mask, get_block_t get_block); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mappings(struct extent_map_tree *tree, + u64 start, u64 len); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void __exit extent_map_exit(void); +#endif -- 1.5.4.rc2.84.gf85fd