[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1525702692-65658-3-git-send-email-bo.liu@linux.alibaba.com>
Date: Mon, 7 May 2018 22:18:12 +0800
From: Liu Bo <bo.liu@...ux.alibaba.com>
To: <linux-ext4@...r.kernel.org>
Subject: [PATCH 2/2] Ext4: bigalloc: do not reserve space for delalloc extents if there is an available cluster
Here is another overreservation which results in premature ENOSPC when
using bigalloc and delalloc,
case 1:
$ xfs_io -f -c "pwrite 0 2" -c "fsync" -c "pwrite 16K 2" -c "fsync" /mnt/dir/foo
or
case 2:
$ xfs_io -f -c "falloc 0 2" -c "pwrite 16K 2" -c "fsync" /mnt/dir/foo
Currently 'du' reports 128K instead of 64K because of overreservation.
What was happending is, take case 1 as an example,
a) in ext4_da_write_begin(), extent [0,2) will reserve one cluster for
itself,
b) forced by fsync, extent [0,2) allocates a 64K cluster and uses the
reserved cluster,
c) in ext4_da_write_begin(), extent [16k,16k+2) will reserve one cluster
for itself,
d) forced by fsync, extent [16k, 16k+2) finds out that it can use the new
'allocated' block, i.e. @map_from_cluster is true, such that it doesn't
update the reservation accounting, and no one will.
Currently ext4_da_write_begin() only checks if there is a delalloc extent
which has reserved a cluster, but a written extent also implies an
available cluster.
With this, ext4_da_write_begin() now checks delalloc, written and
unwritten extents.
Signed-off-by: Liu Bo <bo.liu@...ux.alibaba.com>
---
fs/ext4/ext4.h | 3 ++-
fs/ext4/extents.c | 24 ++++++++++++++++++++++--
fs/ext4/extents_status.c | 49 ++++++++++++++++++++++++++++++++----------------
fs/ext4/extents_status.h | 12 ++++++++++++
fs/ext4/inode.c | 5 +++--
5 files changed, 72 insertions(+), 21 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a42e71203e53..b07d30b1c635 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3121,7 +3121,8 @@ extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
extern int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t lblk_start,
ext4_lblk_t lblk_end);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_find_delalloc_or_allocated_cluster(struct inode *inode,
+ ext4_lblk_t lblk);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1b3b24a53e1f..acc8b5c213b1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3836,14 +3836,34 @@ int ext4_find_delalloc_range(struct inode *inode,
return 0;
}
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
+static int ext4_find_own_rsv_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end)
+{
+ struct extent_status es;
+
+ ext4_es_find_own_rsv_extent_range(inode, lblk_start,
+ lblk_end, &es);
+ if (es.es_len == 0)
+ return 0; /* there is no delay extent in this tree */
+ else if (es.es_lblk <= lblk_start &&
+ lblk_start < es.es_lblk + es.es_len)
+ return 1;
+ else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
+ return 1;
+ else
+ return 0;
+}
+
+int ext4_find_delalloc_or_allocated_cluster(struct inode *inode,
+ ext4_lblk_t lblk)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t lblk_start, lblk_end;
lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
- return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
+ return ext4_find_own_rsv_range(inode, lblk_start, lblk_end);
}
/**
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 763ef185dd17..c828c04c947d 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -233,18 +233,10 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
return NULL;
}
-/*
- * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
- * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
- *
- * @inode: the inode which owns delayed extents
- * @lblk: the offset where we start to search
- * @end: the offset where we stop to search
- * @es: delayed extent that we found
- */
-void ext4_es_find_delayed_extent_range(struct inode *inode,
- ext4_lblk_t lblk, ext4_lblk_t end,
- struct extent_status *es)
+void __ext4_es_find_range_type(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es,
+ ext4_es_istype_t *ext4_es_istype)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
@@ -252,7 +244,6 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
BUG_ON(es == NULL);
BUG_ON(end < lblk);
- trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
@@ -272,19 +263,19 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
es1 = __es_tree_search(&tree->root, lblk);
out:
- if (es1 && !ext4_es_is_delayed(es1)) {
+ if (es1 && !ext4_es_istype(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
if (es1->es_lblk > end) {
es1 = NULL;
break;
}
- if (ext4_es_is_delayed(es1))
+ if (ext4_es_istype(es1))
break;
}
}
- if (es1 && ext4_es_is_delayed(es1)) {
+ if (es1 && ext4_es_istype(es1)) {
tree->cache_es = es1;
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
@@ -292,10 +283,36 @@ void ext4_es_find_delayed_extent_range(struct inode *inode,
}
read_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
+ * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
+ * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ *
+ * @inode: the inode which owns delayed extents
+ * @lblk: the offset where we start to search
+ * @end: the offset where we stop to search
+ * @es: delayed extent that we found
+ */
+void ext4_es_find_delayed_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
+{
+ trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+
+ __ext4_es_find_range_type(inode, lblk, end, es, ext4_es_is_delayed);
trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
+void ext4_es_find_own_rsv_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
+{
+ __ext4_es_find_range_type(inode, lblk, end, es,
+ ext4_es_is_own_rsv);
+}
+
static void ext4_es_list_add(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb903d6b..90ce7fd1cfe7 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -93,9 +93,14 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
extern void ext4_es_find_delayed_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es);
+extern void ext4_es_find_own_rsv_extent_range(struct inode *inode,
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es);
+typedef int (ext4_es_istype_t)(struct extent_status *es);
+
static inline unsigned int ext4_es_status(struct extent_status *es)
{
return es->es_pblk >> ES_SHIFT;
@@ -121,6 +126,13 @@ static inline int ext4_es_is_delayed(struct extent_status *es)
return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0;
}
+static inline int ext4_es_is_own_rsv(struct extent_status *es)
+{
+ return ext4_es_is_delayed(es) ||
+ ext4_es_is_written(es) ||
+ ext4_es_is_unwritten(es);
+}
+
static inline int ext4_es_is_hole(struct extent_status *es)
{
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e50c5efae67..86f9545fd941 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1679,7 +1679,7 @@ static void ext4_da_page_release_reservation(struct page *page,
lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk))
+ !ext4_find_delalloc_or_allocated_cluster(inode, lblk))
ext4_da_release_space(inode, 1);
num_clusters--;
@@ -1867,7 +1867,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* to reserve metadata for every block we're going to write.
*/
if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
+ !ext4_find_delalloc_or_allocated_cluster(inode,
+ map->m_lblk)) {
ret = ext4_da_reserve_space(inode);
if (ret) {
/* not enough space to reserve */
--
1.8.3.1
Powered by blists - more mailing lists