[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Fri, 9 Sep 2011 11:32:36 +0200 (CEST)
From: Lukas Czerner <lczerner@...hat.com>
To: Yongqiang Yang <xiaoqiangnk@...il.com>
cc: linux-ext4@...r.kernel.org, achender@...ux.vnet.ibm.com,
adityakali@...gle.com, tytso@....edu
Subject: Re: [RFC PATCH 3/5] ext4: add operations on delayed extent tree
On Tue, 30 Aug 2011, Yongqiang Yang wrote:
> This patch adds operations on a delayed extent tree.
>
> Signed-off-by; Yongqiang Yang <xiaoqiangnk@...il.com>
>
> Signed-off-by: Yongqiang Yang <xiaoqiangnk@...il.com>
> ---
> fs/ext4/Makefile | 2 +-
> fs/ext4/delayed_extents.c | 390 +++++++++++++++++++++++++++++++++++++++++++++
> fs/ext4/delayed_extents.h | 25 +++
> 3 files changed, 416 insertions(+), 1 deletions(-)
> create mode 100644 fs/ext4/delayed_extents.c
>
> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
> index 56fd8f86..ee16ad3 100644
> --- a/fs/ext4/Makefile
> +++ b/fs/ext4/Makefile
> @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
> ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
> ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
> ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
> - mmp.o indirect.o
> + mmp.o indirect.o delayed_extents.o
>
> ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
> ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
> diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c
> new file mode 100644
> index 0000000..6275b19
> --- /dev/null
> +++ b/fs/ext4/delayed_extents.c
> @@ -0,0 +1,390 @@
> +/*
> + * fs/ext4/delayed_extents.c
> + *
> + * Written by Yongqiang Yang <xiaoqiangnk@...il.com>
> + *
> + * Ext4 delayed-extents-list core functions.
> + */
> +
> +#include "ext4.h"
> +#include "delayed_extents.h"
> +#include "ext4_extents.h"
> +
> +/*
> + * delayed extent implementation for ext4.
> + *
> + *
> + * ==========================================================================
> + * 1. Why delayed extent implementation ?
> + *
> + * Without delayed extent, ext4 identifies a delayed extent by looking up
> + * page cache, this has several deficiencies - complicated, buggy, and
> + * inefficient code.
> + *
> + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need to know if
> + * a block or a range of blocks are belonged to a delayed extent.
> + *
> + * Let us have a look at how they do without delayed extents implementation.
> + * -- FIEMAP
> + * FIEMAP looks up page cache to identify delayed allocations from holes.
> + *
> + * -- SEEK_HOLE/DATA
> + * SEEK_HOLE/DATA has the same problem as FIEMAP.
> + *
> + * -- bigalloc
> + * bigalloc looks up page cache to figure out if a block is already
> + * under delayed allocation or not to determine whether quota reserving
> + * is needed for the cluster.
> + *
> + * -- punch hole
> + * punch hole looks up page cache to identify a delayed extent.
> + *
> + * -- writeout
> + * Writeout looks up whole page cache to see if a buffer is mapped, If
> + * there are not very many delayed buffers, then it is time comsuming.
> + *
> + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, bigalloc and
> + * writeout can figure out if a block or a range of blocks is under delayed
> + * allocation(belonged to a delayed extent) or not by searching the delayed
> + * extent list.
> + *
> + *
> + * ==========================================================================
> + * 2. ext4 delayed extents impelmentation
> + *
> + * -- delayed extent
> + * A delayed extent is a range of blocks which are contiguous logically and
> + * under delayed allocation. Unlike extent in ext4, delayed extent in ext4
> + * is a in-memory struct, there is no corresponding on-disk data. There is
> + * no limit on length of delayed extent, so a delayed extent can contain as
> + * many blocks as they are contiguous logically.
> + *
> + * -- delayed extent list
> + * Every inode has a delayed extent list and all under delayed allocation
> + * blocks are added to the list as dealyed extents. Delayed extents in
> + * the list are sorted by logical block no.
> + *
> + * -- operations on a delayed extent list
> + * There are three operations on a delayed extent list: find next delayed
> + * adding a space(a range of blocks) to a list and removing a space from
> + * a list.
> + *
> + * -- race on a delayed extent list
> + * Delayed extent list is protected inode->i_data_sem like extent tree.
> + *
> + *
> + * ==========================================================================
> + * 3. performance analysis
> + * -- overhead
> + * 1. Apart from operations on a delayed extent list, we need to
> + * down_write(inode->i_data_sem) in delayed write path to maintain delayed
> + * extent list, this can have impact on parallel read-write and write-write
> + *
> + * 2. There is a cache extent for write access, so if writes are not very
> + * random, adding space operaions are in O(1) time.
> + *
> + * -- gain
> + * 3. Code is much simpler, more readable, more maintainable and
> + * more efficient.
> + *
> + * 4. TODO
> + * -- cache_de in de_find_extent
> + * de_find_extent is used heavily by bgalloc, so de_find_extent should use
> + * cache_de.
> + *
> + * ==========================================================================
> + * 4. More sophiscated data structure like tree ?
> + * Does we need rb tree instead?
> + *
> + */
> +static struct kmem_cache *ext4_de_cachep;
> +
> +int __init ext4_init_de(void)
> +{
> + ext4_de_cachep = KMEM_CACHE(ext4_delayed_extent,
> + SLAB_RECLAIM_ACCOUNT);
> + if (ext4_de_cachep == NULL)
> + return -ENOMEM;
> + return 0;
> +}
> +
> +void ext4_exit_de(void)
> +{
> + kmem_cache_destroy(ext4_de_cachep);
> +}
> +
> +#ifdef DE_DEBUG
> +void ext4_de_print_list(struct inode *inode)
> +{
> + struct ext4_delayed_info *delayed_info;
> + struct list_head *cur;
> +
> + printk(KERN_DEBUG "delayed extents for inode %lu:", inode->i_ino);
> + delayed_info = &EXT4_I(inode)->i_delayed_info;
> + list_for_each(cur, &delayed_info->de_list) {
> + struct ext4_delayed_extent *de;
> + de = list_entry(cur, struct ext4_delayed_extent, de_list);
> + printk(KERN_DEBUG " [%u, %u)", de->de_start, de->de_end);
> + }
> + printk(KERN_DEBUG "\n");
> +}
> +#else
> +#define ext4_de_print_list(inode)
> +#endif
> +
> +#define list_tail(list, type, member) \
> + list_entry((list)->prev, type, member)
> +
> +#define list_next(head, elm, member) \
> + ((elm)->member.next != head ? \
> + list_entry((elm)->member.next, typeof(*elm), member) : NULL)
> +
> +#define list_prev(head, elm, member) \
> + ((elm)->member.prev != head ? \
> + list_entry((elm)->member.prev, typeof(*elm), member) : NULL)
> +
> +/*
> + * ext4_de_first_extent_since: find the 1st delayed extent covering @start
> + * if it exists, otherwise, the next extent after @start.
> + *
> + * @inode: the inode which owns delayed extents
> + * @start: logic block
> + *
> + * Return next delayed block after the found extent. Delayed extent is returned
> + * the extent is returned via @de.
> + */
> +ext4_lblk_t ext4_de_find_extent(struct inode *inode,
> + struct ext4_delayed_extent *de)
> +{
> + struct ext4_delayed_info *delayed_info;
> + struct list_head *cur;
> + struct ext4_delayed_extent *de1;
> +
> + de->de_end = 0;
> + delayed_info = &EXT4_I(inode)->i_delayed_info;
> + if (list_empty(&delayed_info->de_list))
> + return EXT_MAX_BLOCKS;
> +
> + de1 = list_tail(&delayed_info->de_list, struct ext4_delayed_extent,
> + de_list);
> + if (de->de_start >= de1->de_end)
> + return EXT_MAX_BLOCKS;
> +
> + list_for_each(cur, &delayed_info->de_list) {
> + de1 = list_entry(cur, struct ext4_delayed_extent, de_list);
> + if (de->de_end != 0)
> + return de1->de_start;
> +
> + if (de1->de_end <= de->de_start)
> + /* @de is followed by @*start and they are
> + * not contiguous.
> + * |------------|.........|
> + * de *start
> + */
> + continue;
> + de->de_start = de1->de_start;
> + de->de_end = de1->de_end;
> + }
> +
> + BUG_ON(de->de_end == 0);
> + return EXT_MAX_BLOCKS;
> +}
> +
> +static struct ext4_delayed_extent *
> +ext4_de_add_entry(struct list_head *head, ext4_lblk_t start, ext4_lblk_t end)
> +{
> + struct ext4_delayed_extent *de;
> + de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS);
> + if (de == NULL)
> + return NULL;
> + de->de_start = start;
> + de->de_end = end;
> + INIT_LIST_HEAD(&de->de_list);
> + list_add(&de->de_list, head);
> + return de;
> +}
> +
> +static void ext4_de_remove_entry(struct inode *inode,
> + struct ext4_delayed_extent *de)
> +{
> + struct ext4_delayed_info *delayed_info = &EXT4_I(inode)->i_delayed_info;
> +
> + list_del(&de->de_list);
> + kmem_cache_free(ext4_de_cachep, de);
> + if (delayed_info->cache_de == de)
> + delayed_info->cache_de = NULL;
> +}
> +
> +static void ext4_de_try_to_merge_left(struct inode *inode,
> + struct ext4_delayed_extent *de)
> +{
> + struct ext4_delayed_extent *de1;
> +
> + de1 = list_prev(&EXT4_I(inode)->i_delayed_info.de_list, de, de_list);
> + if (de1 && de1->de_end == de->de_start) {
> + de->de_start = de1->de_start;
> + ext4_de_remove_entry(inode, de1);
> + }
> +}
> +
> +static void ext4_de_try_to_merge_right(struct inode *inode,
> + struct ext4_delayed_extent *de)
> +{
> + struct ext4_delayed_extent *de1;
> +
> + de1 = list_next(&EXT4_I(inode)->i_delayed_info.de_list, de, de_list);
> + if (de1 && de1->de_start == de->de_end) {
> + de->de_end = de1->de_end;
> + ext4_de_remove_entry(inode, de1);
> + }
> +}
> +
> +/*
> + * ext4_de_add_space() adds a space to a delayed extent list.
> + * Caller hold inode->i_data_sem.
> + *
> + * Return 0 on success, error code on failure.
> + */
> +int ext4_de_add_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len)
> +{
> + ext4_lblk_t end = start + len;
In the first patch you are saying that:
ext4_lblk_t de_end; /* block at which extent ends */
which implies to me that the de_end is the last block of the extent. But
start+len certainly is not the last block, but rather last+1.
I am mentioning this because we have had similar problems in the past
where the documentation said that EXT_MAX_BLOCK was a maximum logical
block, but it was in fact used as a maximum count of blocks and that
caused problems. So I would like to be clear on this, what the "end"
actually really means to avoid confusion.
Also in ext4_extent we are using "start" and "len" approach, I can not
say which is better to use, but it might be better to be consistent in
ext4 code.
Thanks!
-Lukas
> + struct ext4_delayed_info *delayed_info;
> + struct ext4_delayed_extent *de;
> + struct list_head *cur;
> +
> + /* ext4_de_add_space is called with len = 1. */
> + BUG_ON(len != 1);
> + delayed_info = &EXT4_I(inode)->i_delayed_info;
> + de = delayed_info->cache_de;
> + de_debug("add [%u, %u) to delayed extent list of inode %lu\n",
> + start, end, inode->i_ino);
> +
> + if (de && de->de_end == start) {
> + de->de_end = end;
> + ext4_de_try_to_merge_right(inode, de);
> + goto out;
> + } else if (de && de->de_start == end) {
> + de->de_start = start;
> + ext4_de_try_to_merge_left(inode, de);
> + goto out;
> + } else if (de && de->de_start <= start && de->de_end >= end)
> + goto out;
> +
> + /* We traverse the list backwards. */
> + list_for_each_prev(cur, &delayed_info->de_list) {
> + de = list_entry(cur, struct ext4_delayed_extent, de_list);
> + if (de->de_start > end)
> + /* de folllows [start, end) and they are
> + * not contiguous.
> + * |------------|----|-----------|
> + * [start, end) de
> + */
> + continue;
> +
> + if (de->de_end == start) {
> + /* de is followed by [start, end) and they are
> + * contiguous.
> + * |-----------|---------------|
> + * de [start, end)
> + */
> + de->de_end = end;
> + goto out;
> + } else if (de->de_start == end) {
> + /* de follows [start, end) and they are contiguous.
> + * |-------------|------------|
> + * [start, end) de
> + */
> + de->de_start = start;
> + ext4_de_try_to_merge_left(inode, de);
> + goto out;
> + } else if (start >= de->de_start && end <= de->de_end)
> + goto out;
> +
> + break;
> + }
> +
> + de = ext4_de_add_entry(cur, start, end);
> + if (!de)
> + return -ENOMEM;
> +out:
> + delayed_info->cache_de = de;
> + ext4_de_print_list(inode);
> +
> + return 0;
> +}
> +
> +/*
> + * ext4_de_remove_space() removes a space from a delayed extent list.
> + * Caller hold inode->i_data_sem.
> + *
> + * Return 0 on success, error code on failure.
> + */
> +int ext4_de_remove_space(struct inode *inode, ext4_lblk_t start,
> + ext4_lblk_t len)
> +{
> + ext4_lblk_t end = start + len;
> + struct ext4_delayed_info *delayed_info;
> + struct list_head *cur;
> +
> + delayed_info = &EXT4_I(inode)->i_delayed_info;
> +
> + de_debug("remove [%u, %u) from delayed extent list of inode %lu\n",
> + start, end, inode->i_ino);
> +
> + list_for_each(cur, &delayed_info->de_list) {
> + struct ext4_delayed_extent *de, *new_de;
> + new_de = NULL;
> + de = list_entry(cur, struct ext4_delayed_extent, de_list);
> + if (de->de_end <= start)
> + /* de resides before [start, end)
> + * |---------|...|-----------|
> + * de [start, end)
> + */
> + continue;
> +
> + if (de->de_start >= end)
> + /* de resides after [start, end)
> + * |---------|...|-----------|
> + * [start,end) de
> + */
> + break;
> +
> + /* de and [start, end) intersect */
> + if (de->de_start < start && de->de_end <= end) {
> + /*
> + * |-----------------|--------|.........|
> + * de->de_start start de->de_end end
> + */
> + de->de_end = start;
> + start = de->de_end;
> + } else if ((de->de_start >= start && de->de_end <= end)) {
> + /*
> + * |............|--------|.........|
> + * start de->start de->end end
> + */
> + start = de->de_end;
> + cur = de->de_list.prev;
> + ext4_de_remove_entry(inode, de);
> + } else if (de->de_start < start && de->de_end > end) {
> + /*
> + * |------------|--------|---------|
> + * de->start start end de->de_end
> + */
> + if (!ext4_de_add_entry(&de->de_list, end, de->de_end))
> + return -ENOMEM;
> + de->de_end = start;
> + break;
> + } else if (de->de_start >= start && de->de_end > end) {
> + /*
> + * |.............|---------|--------|
> + * start de->de_start end de->de_end
> + */
> + de->de_start = end;
> + break;
> + } else
> + BUG();
> + }
> +
> + ext4_de_print_list(inode);
> + return 0;
> +}
> diff --git a/fs/ext4/delayed_extents.h b/fs/ext4/delayed_extents.h
> index b7baaef..dd3bdb5 100644
> --- a/fs/ext4/delayed_extents.h
> +++ b/fs/ext4/delayed_extents.h
> @@ -8,6 +8,21 @@
> #ifndef _EXT4_DELAYED_EXTENTS_H
> #define _EXT4_DELAYED_EXTENTS_H
>
> +/*
> + * Turn on DE_DEBUG to get lots of info about delayed extents operations.
> + */
> +#define DE_DEBUG__
> +#ifdef DE_DEBUG
> +#define de_debug(f, a...) \
> + do { \
> + printk(KERN_DEBUG "EXT4-fs delayed-extents (%s, %d): " \
> + "%s:", __FILE__, __LINE__, __func__); \
> + printk(KERN_DEBUG f, ## a); \
> + } while (0)
> +#else
> +#define de_debug(f, a...)
> +#endif
> +
> struct ext4_delayed_extent {
> struct list_head de_list;
> ext4_lblk_t de_start; /* first block extent covers */
> @@ -19,4 +34,14 @@ struct ext4_delayed_info {
> struct ext4_delayed_extent *cache_de; /* recently accessed extent */
> };
>
> +extern int __init ext4_init_de(void);
> +extern void ext4_exit_de(void);
> +
> +extern int ext4_de_add_space(struct inode *inode, ext4_lblk_t start,
> + ext4_lblk_t len);
> +extern int ext4_de_remove_space(struct inode *inode, ext4_lblk_t start,
> + ext4_lblk_t len);
> +extern ext4_lblk_t ext4_de_find_extent(struct inode *inode,
> + struct ext4_delayed_extent *de);
> +
> #endif /* _EXT4_DELAYED_EXTENTS_H */
>
--
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists