Message-Id: <20180907204529.1662-5-enwlinux@gmail.com>
Date:   Fri,  7 Sep 2018 16:45:27 -0400
From:   Eric Whitney <enwlinux@...il.com>
To:     linux-ext4@...r.kernel.org
Cc:     tytso@....edu, Eric Whitney <enwlinux@...il.com>
Subject: [PATCH 4/6] ext4: reduce reserved cluster count by number of allocated clusters

Ext4 does not always reduce the reserved cluster count by the number
of clusters allocated when mapping a delayed extent.  It sometimes
adds back one or more clusters after allocation if delalloc blocks
adjacent to the range allocated by ext4_ext_map_blocks() share the
clusters newly allocated for that range.  However, this overcounts
the number of clusters needed to satisfy future mapping requests,
since reservations are then held for clusters that have already been
allocated, and premature ENOSPC and quota failures result.
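
As a concrete illustration, here is a minimal userspace sketch of the
bigalloc cluster arithmetic involved (clusters_spanned() is a
hypothetical stand-in for the EXT4_B2C()-based calculation in the
kernel, which is essentially a right shift by s_cluster_bits):

  #include <stdio.h>

  /* Userspace sketch only, not kernel code. */
  static unsigned int clusters_spanned(unsigned long lblk_start,
                                       unsigned int num_blks,
                                       unsigned int cluster_bits)
  {
          unsigned long first = lblk_start >> cluster_bits;
          unsigned long last = (lblk_start + num_blks - 1) >> cluster_bits;

          return last - first + 1;
  }

  int main(void)
  {
          /* cluster ratio 4 (cluster_bits == 2): blocks 10-11 lie
           * entirely within cluster 2 */
          printf("%u\n", clusters_spanned(10, 2, 2));  /* prints 1 */

          /* blocks 3-8 span clusters 0, 1, and 2 */
          printf("%u\n", clusters_spanned(3, 6, 2));   /* prints 3 */
          return 0;
  }

When two mappings share a cluster, only one reservation backs it, so
adding a reservation back for a shared cluster that has already been
allocated leaves the reserved count too high.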

Ext4 also does not reduce the reserved cluster count when it allocates
clusters for non-delayed writes covering a range previously reserved
for delayed writes.  This also results in overcounting.
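
The patch addresses this by counting, at allocation time, how many of
the newly allocated clusters contain previously delayed allocated
blocks, using the ext4_es_delayed_clu() helper added below.  A rough
userspace sketch of that counting walk, including the trick used to
avoid double counting a cluster shared by two adjacent extents
(count_clusters() and struct extent are simplified, hypothetical
analogues of the extent status tree walk in __es_delayed_clu()):

  #include <stdio.h>

  /* sorted, non-overlapping extents of delayed blocks */
  struct extent { unsigned long lblk; unsigned long len; };

  static unsigned int count_clusters(const struct extent *es, int nr,
                                     unsigned long start,
                                     unsigned long end,
                                     unsigned int cluster_bits)
  {
          /* can never equal a valid cluster number, as with
           * last_counted_lclu in __es_delayed_clu() */
          unsigned long long last_counted = ~0ULL;
          unsigned int n = 0;

          for (int i = 0; i < nr; i++) {
                  unsigned long es_end = es[i].lblk + es[i].len - 1;
                  unsigned long first, last;

                  if (es_end < start || es[i].lblk > end)
                          continue;
                  /* clip the extent to [start, end], then convert
                   * blocks to clusters */
                  first = (es[i].lblk < start ? start : es[i].lblk)
                          >> cluster_bits;
                  last = (es_end > end ? end : es_end) >> cluster_bits;
                  /* don't recount a cluster shared with the
                   * previous extent */
                  n += last - first + (first == last_counted ? 0 : 1);
                  last_counted = last;
          }
          return n;
  }

  int main(void)
  {
          /* delayed blocks 3-5 and 6-9 with cluster ratio 4:
           * clusters 0, 1, and 2 are each counted once */
          struct extent es[] = { { 3, 3 }, { 6, 4 } };

          printf("%u\n", count_clusters(es, 2, 0, 11, 2)); /* prints 3 */
          return 0;
  }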

To make it possible to handle reserved cluster accounting for
fallocated regions in the same manner as for other non-delayed
writes, do the reserved cluster accounting for them at the time of
allocation.  In the current code, this is done only later, when a
delayed extent sharing the fallocated region is finally mapped.

Address a comment from Jan Kara's review of the RFC version of this
patch correcting the handling of an unsigned long long constant.
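
The pending reservation helpers added below, ext4_cancel_pending() and
ext4_make_pending(), both need to scan the partial clusters on either
edge of a mapped range.  A small sketch of that edge arithmetic
(edge_ranges() is hypothetical; EXT4_LBLK_CMASK() rounds a logical
block down to the start of its cluster):

  #include <stdio.h>

  /* Userspace sketch only.  Prints the blocks that share a cluster
   * with [lblk, lblk + len - 1] but fall outside it; the range is
   * assumed to fit within one cluster, as ext4_cancel_pending()
   * requires. */
  static void edge_ranges(unsigned long lblk, unsigned long len,
                          unsigned int cluster_bits)
  {
          unsigned long ratio = 1UL << cluster_bits;
          unsigned long first = lblk & ~(ratio - 1); /* EXT4_LBLK_CMASK */
          unsigned long last = first + ratio - 1;

          if (first != lblk)
                  printf("left edge:  blocks %lu-%lu\n",
                         first, lblk - 1);
          if (last != lblk + len - 1)
                  printf("right edge: blocks %lu-%lu\n",
                         lblk + len, last);
  }

  int main(void)
  {
          /* ratio 4: mapping blocks 10-11 leaves blocks 8-9 of
           * cluster 2 on the left and no right edge */
          edge_ranges(10, 2, 2);
          return 0;
  }

Whether a pending reservation is then inserted or removed depends on
whether delayed blocks are found in those edge ranges.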

Signed-off-by: Eric Whitney <enwlinux@...il.com>
---
 fs/ext4/extents.c        | 195 +++++++++--------------------------------------
 fs/ext4/extents_status.c | 183 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/extents_status.h |  11 +++
 3 files changed, 229 insertions(+), 160 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 26481e543312..c83b62d09ce4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3819,83 +3819,6 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	return ext4_mark_inode_dirty(handle, inode);
 }
 
-/**
- * Determines how many complete clusters (out of those specified by the 'map')
- * are under delalloc and were reserved quota for.
- * This function is called when we are writing out the blocks that were
- * originally written with their allocation delayed, but then the space was
- * allocated using fallocate() before the delayed allocation could be resolved.
- * The cases to look for are:
- * ('=' indicated delayed allocated blocks
- *  '-' indicates non-delayed allocated blocks)
- * (a) partial clusters towards beginning and/or end outside of allocated range
- *     are not delalloc'ed.
- *	Ex:
- *	|----c---=|====c====|====c====|===-c----|
- *	         |++++++ allocated ++++++|
- *	==> 4 complete clusters in above example
- *
- * (b) partial cluster (outside of allocated range) towards either end is
- *     marked for delayed allocation. In this case, we will exclude that
- *     cluster.
- *	Ex:
- *	|----====c========|========c========|
- *	     |++++++ allocated ++++++|
- *	==> 1 complete clusters in above example
- *
- *	Ex:
- *	|================c================|
- *            |++++++ allocated ++++++|
- *	==> 0 complete clusters in above example
- *
- * The ext4_da_update_reserve_space will be called only if we
- * determine here that there were some "entire" clusters that span
- * this 'allocated' range.
- * In the non-bigalloc case, this function will just end up returning num_blks
- * without ever calling ext4_find_delalloc_range.
- */
-static unsigned int
-get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
-			   unsigned int num_blks)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
-	ext4_lblk_t lblk_from, lblk_to, c_offset;
-	unsigned int allocated_clusters = 0;
-
-	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
-	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
-
-	/* max possible clusters for this allocation */
-	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
-
-	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
-
-	/* Check towards left side */
-	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
-	if (c_offset) {
-		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
-		lblk_to = lblk_from + c_offset - 1;
-
-		if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from,
-				       lblk_to))
-			allocated_clusters--;
-	}
-
-	/* Now check towards right. */
-	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
-	if (allocated_clusters && c_offset) {
-		lblk_from = lblk_start + num_blks;
-		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
-
-		if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from,
-				       lblk_to))
-			allocated_clusters--;
-	}
-
-	return allocated_clusters;
-}
-
 static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map,
@@ -4077,23 +4000,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 	}
 	map->m_len = allocated;
 
-	/*
-	 * If we have done fallocate with the offset that is already
-	 * delayed allocated, we would have block reservation
-	 * and quota reservation done in the delayed write path.
-	 * But fallocate would have already updated quota and block
-	 * count for this offset. So cancel these reservation
-	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-		unsigned int reserved_clusters;
-		reserved_clusters = get_reserved_cluster_alloc(inode,
-				map->m_lblk, map->m_len);
-		if (reserved_clusters)
-			ext4_da_update_reserve_space(inode,
-						     reserved_clusters,
-						     0);
-	}
-
 map_out:
 	map->m_flags |= EXT4_MAP_MAPPED;
 	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4482,80 +4388,49 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	map->m_flags |= EXT4_MAP_NEW;
 
 	/*
-	 * Update reserved blocks/metadata blocks after successful
-	 * block allocation which had been deferred till now.
+	 * Reduce the reserved cluster count to reflect successful deferred
+	 * allocation of delayed allocated clusters or direct allocation of
+	 * clusters discovered to be delayed allocated.  Once allocated, a
+	 * cluster is not included in the reserved count.
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-		unsigned int reserved_clusters;
-		/*
-		 * Check how many clusters we had reserved this allocated range
-		 */
-		reserved_clusters = get_reserved_cluster_alloc(inode,
-						map->m_lblk, allocated);
-		if (!map_from_cluster) {
-			BUG_ON(allocated_clusters < reserved_clusters);
-			if (reserved_clusters < allocated_clusters) {
-				struct ext4_inode_info *ei = EXT4_I(inode);
-				int reservation = allocated_clusters -
-						  reserved_clusters;
-				/*
-				 * It seems we claimed few clusters outside of
-				 * the range of this allocation. We should give
-				 * it back to the reservation pool. This can
-				 * happen in the following case:
-				 *
-				 * * Suppose s_cluster_ratio is 4 (i.e., each
-				 *   cluster has 4 blocks. Thus, the clusters
-				 *   are [0-3],[4-7],[8-11]...
-				 * * First comes delayed allocation write for
-				 *   logical blocks 10 & 11. Since there were no
-				 *   previous delayed allocated blocks in the
-				 *   range [8-11], we would reserve 1 cluster
-				 *   for this write.
-				 * * Next comes write for logical blocks 3 to 8.
-				 *   In this case, we will reserve 2 clusters
-				 *   (for [0-3] and [4-7]; and not for [8-11] as
-				 *   that range has a delayed allocated blocks.
-				 *   Thus total reserved clusters now becomes 3.
-				 * * Now, during the delayed allocation writeout
-				 *   time, we will first write blocks [3-8] and
-				 *   allocate 3 clusters for writing these
-				 *   blocks. Also, we would claim all these
-				 *   three clusters above.
-				 * * Now when we come here to writeout the
-				 *   blocks [10-11], we would expect to claim
-				 *   the reservation of 1 cluster we had made
-				 *   (and we would claim it since there are no
-				 *   more delayed allocated blocks in the range
-				 *   [8-11]. But our reserved cluster count had
-				 *   already gone to 0.
-				 *
-				 *   Thus, at the step 4 above when we determine
-				 *   that there are still some unwritten delayed
-				 *   allocated blocks outside of our current
-				 *   block range, we should increment the
-				 *   reserved clusters count so that when the
-				 *   remaining blocks finally gets written, we
-				 *   could claim them.
-				 */
-				dquot_reserve_block(inode,
-						EXT4_C2B(sbi, reservation));
-				spin_lock(&ei->i_block_reservation_lock);
-				ei->i_reserved_data_blocks += reservation;
-				spin_unlock(&ei->i_block_reservation_lock);
-			}
+	if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 			/*
-			 * We will claim quota for all newly allocated blocks.
-			 * We're updating the reserved space *after* the
-			 * correction above so we do not accidentally free
-			 * all the metadata reservation because we might
-			 * actually need it later on.
+			 * When allocating delayed allocated clusters, simply
+			 * reduce the reserved cluster count and claim quota
 			 */
 			ext4_da_update_reserve_space(inode, allocated_clusters,
 							1);
+		} else {
+			ext4_lblk_t lblk, len;
+			unsigned int n;
+
+			/*
+			 * When allocating non-delayed allocated clusters
+			 * (from fallocate, filemap, DIO, or clusters
+			 * allocated when delalloc has been disabled by
+			 * ext4_nonda_switch), reduce the reserved cluster
+			 * count by the number of allocated clusters that
+			 * have previously been delayed allocated.  Quota
+			 * has been claimed by ext4_mb_new_blocks() above,
+			 * so release the quota reservations made for any
+			 * previously delayed allocated clusters.
+			 */
+			lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+			len = allocated_clusters << sbi->s_cluster_bits;
+			n = ext4_es_delayed_clu(inode, lblk, len);
+			if (n > 0)
+				ext4_da_update_reserve_space(inode, (int) n, 0);
 		}
 	}
 
+	if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC)) {
+		if (map_from_cluster)
+			ext4_cancel_pending(inode, map->m_lblk, map->m_len);
+		else
+			ext4_make_pending(inode, map->m_lblk, map->m_len);
+	}
+
 	/*
 	 * Cache the extent and update transaction to commit on fdatasync only
 	 * when it is _not_ an unwritten extent.
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c5d456e12062..a2eb68f866bb 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1605,3 +1605,186 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 
 	return err;
 }
+
+/*
+ * __es_delayed_clu - count number of clusters containing blocks that
+ *                    are both delayed and unwritten
+ *
+ * @inode - file containing block range
+ * @start - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns the number of clusters containing delayed and unwritten blocks
+ * in the range specified by @start and @end.  Any cluster or part of a
+ * cluster within the range and containing a delayed and unwritten block
+ * within the range is counted as a whole cluster.
+ */
+static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
+				     ext4_lblk_t end)
+{
+	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+	struct extent_status *es;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct rb_node *node;
+	ext4_lblk_t first_lclu, last_lclu;
+	unsigned long long last_counted_lclu;
+	unsigned int n = 0;
+
+	/* guaranteed to be unequal to any ext4_lblk_t value */
+	last_counted_lclu = ~0ULL;
+
+	es = __es_tree_search(&tree->root, start);
+
+	while (es && (es->es_lblk <= end)) {
+		if (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)) {
+			if (es->es_lblk <= start)
+				first_lclu = EXT4_B2C(sbi, start);
+			else
+				first_lclu = EXT4_B2C(sbi, es->es_lblk);
+
+			if (ext4_es_end(es) >= end)
+				last_lclu = EXT4_B2C(sbi, end);
+			else
+				last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
+
+			if (first_lclu == last_counted_lclu)
+				n += last_lclu - first_lclu;
+			else
+				n += last_lclu - first_lclu + 1;
+			last_counted_lclu = last_lclu;
+		}
+		node = rb_next(&es->rb_node);
+		if (!node)
+			break;
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	return n;
+}
+
+/*
+ * ext4_es_delayed_clu - count number of clusters containing blocks that
+ *                       are both delayed and unwritten
+ *
+ * @inode - file containing block range
+ * @lblk - logical block defining start of range
+ * @len - number of blocks in range
+ *
+ * Locking for external use of __es_delayed_clu().
+ */
+unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+				 ext4_lblk_t len)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_lblk_t end;
+	unsigned int n;
+
+	if (len == 0)
+		return 0;
+
+	end = lblk + len - 1;
+	WARN_ON(end < lblk);
+
+	read_lock(&ei->i_es_lock);
+
+	n = __es_delayed_clu(inode, lblk, end);
+
+	read_unlock(&ei->i_es_lock);
+
+	return n;
+}
+
+/*
+ * ext4_cancel_pending - cancels a pending cluster reservation if the cluster
+ *                       does not contain delayed and unwritten blocks outside
+ *                       a specified range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len  - length of range in blocks
+ *
+ * Meant for use when a mapping request for delayed allocated blocks in the
+ * specified range is satisfied by mapping a previously allocated cluster.
+ * Requires the block range be completely contained within a single cluster.
+ */
+void ext4_cancel_pending(struct inode *inode, ext4_lblk_t lblk,
+			 ext4_lblk_t len)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t first, last;
+	bool du_left = false, du_right = false;
+
+	WARN_ON(EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, lblk + len - 1));
+
+	if (len == 0)
+		return;
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+
+	first = EXT4_LBLK_CMASK(sbi, lblk);
+	if (first != lblk) {
+		last = lblk - 1;
+		du_left = __es_scan_range(inode, &ext4_es_is_delunwrit,
+					  first, last);
+	}
+	if (!du_left) {
+		last = EXT4_LBLK_CMASK(sbi, lblk) + sbi->s_cluster_ratio - 1;
+		if (last != lblk + len - 1) {
+			first = lblk + len;
+			du_right = __es_scan_range(inode, &ext4_es_is_delunwrit,
+						   first, last);
+		}
+		if (!du_right)
+			__remove_pending(inode, lblk);
+	}
+
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+}
+
+/*
+ * ext4_make_pending - makes pending cluster reservations if the clusters at
+ *                     the end of a specified range contain delayed and
+ *                     unwritten blocks outside that range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len  - length of range in blocks
+ *
+ * Meant for use when a mapping request for delayed allocated blocks in the
+ * specified range is satisfied by mapping one or more newly allocated clusters
+ * that might be shared with delayed and unwritten blocks outside that range.
+ * Since these clusters are newly allocated, there can't be unwritten blocks
+ * mapped to them.  This allows the use of ext4_es_is_delayed alone to search
+ * for delayed and unwritten blocks on the edges of the allocated range.
+ */
+void ext4_make_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t first, last;
+	bool done = false;
+
+	write_lock(&EXT4_I(inode)->i_es_lock);
+
+	first = EXT4_LBLK_CMASK(sbi, lblk);
+	if (first != lblk) {
+		last = lblk - 1;
+		if (__es_scan_range(inode, &ext4_es_is_delayed, first, last)) {
+			__insert_pending(inode, first);
+			if (EXT4_B2C(sbi, lblk) ==
+			    EXT4_B2C(sbi, lblk + len - 1))
+				done = true;
+		}
+	}
+	if (!done) {
+		last = EXT4_LBLK_CMASK(sbi, lblk + len - 1) +
+		       sbi->s_cluster_ratio - 1;
+		if (last != lblk + len - 1) {
+			first = lblk + len;
+			if (__es_scan_range(inode, &ext4_es_is_delayed,
+					    first, last))
+				__insert_pending(inode, first);
+		}
+	}
+
+	write_unlock(&EXT4_I(inode)->i_es_lock);
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 3c8e8697edb6..46e41ef3be0a 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -183,6 +183,11 @@ static inline int ext4_es_is_mapped(struct extent_status *es)
 	return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
 }
 
+static inline int ext4_es_is_delunwrit(struct extent_status *es)
+{
+	return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
+}
+
 static inline void ext4_es_set_referenced(struct extent_status *es)
 {
 	es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -239,5 +244,11 @@ extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
 extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
 					bool allocated);
+extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+					ext4_lblk_t len);
+extern void ext4_cancel_pending(struct inode *inode, ext4_lblk_t lblk,
+				ext4_lblk_t len);
+extern void ext4_make_pending(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_lblk_t len);
 
 #endif /* _EXT4_EXTENTS_STATUS_H */
-- 
2.11.0
