[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1360313046-9876-10-git-send-email-wenqing.lz@taobao.com>
Date: Fri, 8 Feb 2013 16:44:05 +0800
From: Zheng Liu <gnehzuil.liu@...il.com>
To: linux-ext4@...r.kernel.org
Cc: Zheng Liu <wenqing.lz@...bao.com>, "Theodore Ts'o" <tytso@....edu>,
Jan kara <jack@...e.cz>
Subject: [PATCH 09/10 v5] ext4: convert unwritten extents from extent status tree in end_io
From: Zheng Liu <wenqing.lz@...bao.com>
This commit tries to convert unwritten extents from extent status tree
in end_io callback functions and ext4_ext_direct_IO.
Signed-off-by: Zheng Liu <wenqing.lz@...bao.com>
Cc: "Theodore Ts'o" <tytso@....edu>
Cc: Jan kara <jack@...e.cz>
---
fs/ext4/extents.c | 6 +-
fs/ext4/extents_status.c | 180 ++++++++++++++++++++++++++++++++++++++++----
fs/ext4/extents_status.h | 2 +
fs/ext4/inode.c | 5 ++
fs/ext4/page-io.c | 8 +-
include/trace/events/ext4.h | 25 ++++++
6 files changed, 208 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9f21430..a03cabf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4443,8 +4443,10 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ret = PTR_ERR(handle);
break;
}
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ up_write(&EXT4_I(inode)->i_data_sem);
if (ret <= 0) {
WARN_ON(ret <= 0);
ext4_msg(inode->i_sb, KERN_ERR,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index bac5286..eab8893 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -248,10 +248,11 @@ ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
struct extent_status *es1 = NULL;
struct rb_node *node;
ext4_lblk_t ret = EXT_MAX_BLOCKS;
+ unsigned long flags;
trace_ext4_es_find_delayed_extent_enter(inode, es->es_lblk);
- read_lock(&EXT4_I(inode)->i_es_lock);
+ read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
tree = &EXT4_I(inode)->i_es_tree;
/* find extent in cache firstly */
@@ -291,7 +292,7 @@ out:
}
}
- read_unlock(&EXT4_I(inode)->i_es_lock);
+ read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
ext4_es_lru_add(inode);
trace_ext4_es_find_delayed_extent_exit(inode, es, ret);
@@ -458,6 +459,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
+ unsigned long flags;
int err = 0;
es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
@@ -471,14 +473,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_store_status(&newes, status);
trace_ext4_es_insert_extent(inode, &newes);
- write_lock(&EXT4_I(inode)->i_es_lock);
+ write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
err = __es_remove_extent(inode, lblk, end);
if (err != 0)
goto error;
err = __es_insert_extent(inode, &newes);
error:
- write_unlock(&EXT4_I(inode)->i_es_lock);
+ write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
@@ -498,13 +500,14 @@ int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
struct ext4_es_tree *tree;
struct extent_status *es1 = NULL;
struct rb_node *node;
+ unsigned long flags;
int found = 0;
trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
es_debug("lookup extent in block %u\n", es->es_lblk);
tree = &EXT4_I(inode)->i_es_tree;
- read_lock(&EXT4_I(inode)->i_es_lock);
+ read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
/* find extent in cache firstly */
es->es_len = es->es_pblk = 0;
@@ -539,7 +542,7 @@ out:
es->es_pblk = es1->es_pblk;
}
- read_unlock(&EXT4_I(inode)->i_es_lock);
+ read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
ext4_es_lru_add(inode);
trace_ext4_es_lookup_extent_exit(inode, es, found);
@@ -649,6 +652,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
ext4_lblk_t end;
+ unsigned long flags;
int err = 0;
trace_ext4_es_remove_extent(inode, lblk, len);
@@ -658,9 +662,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
end = lblk + len - 1;
BUG_ON(end < lblk);
- write_lock(&EXT4_I(inode)->i_es_lock);
+ write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
err = __es_remove_extent(inode, lblk, end);
- write_unlock(&EXT4_I(inode)->i_es_lock);
+ write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
ext4_es_print_tree(inode);
return err;
}
@@ -671,6 +675,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
struct ext4_sb_info, s_es_shrinker);
struct ext4_inode_info *ei;
struct list_head *cur, *tmp, scanned;
+ unsigned long flags;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk = 0;
@@ -687,16 +692,16 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
+ read_lock_irqsave(&ei->i_es_lock, flags);
if (ei->i_es_lru_nr == 0) {
- read_unlock(&ei->i_es_lock);
+ read_unlock_irqrestore(&ei->i_es_lock, flags);
continue;
}
- read_unlock(&ei->i_es_lock);
+ read_unlock_irqrestore(&ei->i_es_lock, flags);
- write_lock(&ei->i_es_lock);
+ write_lock_irqsave(&ei->i_es_lock, flags);
ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
- write_unlock(&ei->i_es_lock);
+ write_unlock_irqrestore(&ei->i_es_lock, flags);
nr_shrunk += ret;
nr_to_scan -= ret;
@@ -756,14 +761,15 @@ static int ext4_es_reclaim_extents_count(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
struct list_head *cur;
+ unsigned long flags;
int nr_cached = 0;
spin_lock(&sbi->s_es_lru_lock);
list_for_each(cur, &sbi->s_es_lru) {
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
- read_lock(&ei->i_es_lock);
+ read_lock_irqsave(&ei->i_es_lock, flags);
nr_cached += ei->i_es_lru_nr;
- read_unlock(&ei->i_es_lock);
+ read_unlock_irqrestore(&ei->i_es_lock, flags);
}
spin_unlock(&sbi->s_es_lru_lock);
trace_ext4_es_reclaim_extents_count(sb, nr_cached);
@@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
tree->cache_es = NULL;
return nr_shrunk;
}
+
+int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ size_t size)
+{
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+ struct extent_status *es, orig_es, conv_es;
+ ext4_lblk_t end, len1, len2;
+ ext4_lblk_t lblk = 0, len = 0;
+ ext4_fsblk_t block;
+ unsigned long flags;
+ unsigned int blkbits;
+ int err = 0;
+
+ trace_ext4_es_convert_unwritten_extents(inode, offset, size);
+ blkbits = inode->i_blkbits;
+ lblk = offset >> blkbits;
+ len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+
+ write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
+ es = __es_tree_search(&tree->root, lblk);
+ if (!es)
+ goto out;
+ if (es->es_lblk > end)
+ goto out;
+
+ tree->cache_es = NULL;
+
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ?
+ ext4_es_end(es) - end : 0;
+ if (len1 > 0)
+ es->es_len = len1;
+ if (len2 > 0) {
+ if (len1 > 0) {
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock(&newes, block);
+ ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+ err = __es_insert_extent(inode, &newes);
+ if (err) {
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
+ es->es_pblk = orig_es.es_pblk;
+ goto out;
+ }
+
+ conv_es.es_lblk = orig_es.es_lblk + len1;
+ conv_es.es_len = orig_es.es_len - len1 - len2;
+ block = ext4_es_pblock(&orig_es) + len1;
+ ext4_es_store_pblock(&conv_es, block);
+ ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+ err = __es_insert_extent(inode, &conv_es);
+ if (err) {
+ int err2 = __es_remove_extent(inode,
+ conv_es.es_lblk,
+ ext4_es_end(&newes));
+ if (err2)
+ goto out;
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
+ es->es_pblk = orig_es.es_pblk;
+ goto out;
+ }
+ } else {
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ block = ext4_es_pblock(&orig_es) +
+ orig_es.es_len - len2;
+ ext4_es_store_pblock(es, block);
+
+ conv_es.es_lblk = orig_es.es_lblk;
+ conv_es.es_len = orig_es.es_len - len2;
+ ext4_es_store_pblock(&conv_es,
+ ext4_es_pblock(&orig_es));
+ ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+ err = __es_insert_extent(inode, &conv_es);
+ if (err) {
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
+ es->es_pblk = orig_es.es_pblk;
+ }
+ }
+ goto out;
+ }
+
+ if (len1 > 0) {
+ node = rb_next(&es->rb_node);
+ if (node)
+ es = rb_entry(node, struct extent_status, rb_node);
+ else
+ es = NULL;
+ }
+
+ while (es && ext4_es_end(es) <= end) {
+ node = rb_next(&es->rb_node);
+ ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
+ if (!inode) {
+ es = NULL;
+ break;
+ }
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ /*
+ * Here we first set conv_es just because of avoiding copy the
+ * value of es to a temporary variable.
+ */
+ len1 = ext4_es_end(es) - end;
+ conv_es.es_lblk = es->es_lblk;
+ conv_es.es_len = es->es_len - len1;
+ ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
+ ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ block = ext4_es_pblock(es) + orig_len - len1;
+ ext4_es_store_pblock(es, block);
+
+ err = __es_insert_extent(inode, &conv_es);
+ if (err)
+ goto out;
+ }
+
+out:
+ write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
+ return err;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 938ad2b..2849d74 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -54,6 +54,8 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
+extern int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ size_t size);
static inline int ext4_es_is_written(struct extent_status *es)
{
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 670779a..08cf720 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3063,6 +3063,7 @@ out:
io_end->result = ret;
}
+ ext4_es_convert_unwritten_extents(inode, offset, size);
ext4_add_complete_io(io_end);
}
@@ -3088,6 +3089,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
+ ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
@@ -3246,6 +3248,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
+ err = ext4_es_convert_unwritten_extents(inode, offset, ret);
+ if (err)
+ ret = err;
/*
* for non AIO case, since the IO is already
* completed, we could do the conversion right here
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbc..66ea30e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -276,6 +276,13 @@ static void ext4_end_bio(struct bio *bio, int error)
error = 0;
bio_put(bio);
+ /*
+ * We need to convert unwrittne extents in extent status tree before
+ * end_page_writeback() is called. Otherwise, when dioread_nolock is
+ * enabled, we will be likely to read stale data.
+ */
+ inode = io_end->inode;
+ ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head;
@@ -305,7 +312,6 @@ static void ext4_end_bio(struct bio *bio, int error)
put_io_page(io_end->pages[i]);
}
io_end->num_io_pages = 0;
- inode = io_end->inode;
if (error) {
io_end->flag |= EXT4_IO_END_ERROR;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f0734b3..d32e3d5 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2233,6 +2233,31 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
__entry->found ? __entry->status : 0)
);
+TRACE_EVENT(ext4_es_convert_unwritten_extents,
+ TP_PROTO(struct inode *inode, loff_t offset, loff_t size),
+
+ TP_ARGS(inode, offset, size),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( loff_t, offset )
+ __field( loff_t, size )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->offset = offset;
+ __entry->size = size;
+ ),
+
+ TP_printk("dev %d,%d ino %lu convert unwritten extents [%llu/%llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->offset, __entry->size)
+);
+
TRACE_EVENT(ext4_es_reclaim_extents_count,
TP_PROTO(struct super_block *sb, int nr_cached),
--
1.7.12.rc2.18.g61b472e
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists