linux-kernel - Re: [PATCH v8 05/11] btrfs: lookup physical address from stripe extent

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6de87230-f981-411c-b173-55871e4d4720@gmx.com>
Date:   Thu, 14 Sep 2023 18:48:17 +0930
From:   Qu Wenruo <quwenruo.btrfs@....com>
To:     Johannes Thumshirn <johannes.thumshirn@....com>,
        Chris Mason <clm@...com>, Josef Bacik <josef@...icpanda.com>,
        David Sterba <dsterba@...e.com>
Cc:     Christoph Hellwig <hch@....de>,
        Naohiro Aota <naohiro.aota@....com>, Qu Wenruo <wqu@...e.com>,
        Damien Le Moal <dlemoal@...nel.org>,
        linux-btrfs@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v8 05/11] btrfs: lookup physical address from stripe
 extent



On 2023/9/11 22:22, Johannes Thumshirn wrote:
> Lookup the physical address from the raid stripe tree when a read on an
> RAID volume formatted with the raid stripe tree was attempted.
>
> If the requested logical address was not found in the stripe tree, it may
> still be in the in-memory ordered stripe tree, so fallback to searching
> the ordered stripe tree in this case.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@....com>
> ---
>   fs/btrfs/raid-stripe-tree.c | 159 ++++++++++++++++++++++++++++++++++++++++++++
>   fs/btrfs/raid-stripe-tree.h |  11 +++
>   fs/btrfs/volumes.c          |  37 ++++++++---
>   3 files changed, 198 insertions(+), 9 deletions(-)
>
> diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
> index 5b12f40877b5..7ed02e4b79ec 100644
> --- a/fs/btrfs/raid-stripe-tree.c
> +++ b/fs/btrfs/raid-stripe-tree.c
> @@ -324,3 +324,162 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
>
>   	return ret;
>   }
> +
> +static bool btrfs_check_for_extent(struct btrfs_fs_info *fs_info, u64 logical,
> +				   u64 length, struct btrfs_path *path)
> +{
> +	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
> +	struct btrfs_key key;
> +	int ret;
> +
> +	btrfs_release_path(path);
> +
> +	key.objectid = logical;
> +	key.type = BTRFS_EXTENT_ITEM_KEY;
> +	key.offset = length;
> +
> +	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
> +
> +	return ret;
> +}
> +
> +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
> +				 u64 logical, u64 *length, u64 map_type,
> +				 u32 stripe_index,
> +				 struct btrfs_io_stripe *stripe)
> +{
> +	struct btrfs_root *stripe_root = btrfs_stripe_tree_root(fs_info);
> +	struct btrfs_stripe_extent *stripe_extent;
> +	struct btrfs_key stripe_key;
> +	struct btrfs_key found_key;
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	int num_stripes;
> +	u8 encoding;
> +	u64 offset;
> +	u64 found_logical;
> +	u64 found_length;
> +	u64 end;
> +	u64 found_end;
> +	int slot;
> +	int ret;
> +	int i;
> +
> +	stripe_key.objectid = logical;
> +	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
> +	stripe_key.offset = 0;
> +
> +	path = btrfs_alloc_path();
> +	if (!path)
> +		return -ENOMEM;
> +
> +	ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
> +	if (ret < 0)
> +		goto free_path;
> +	if (ret) {
> +		if (path->slots[0] != 0)
> +			path->slots[0]--;

IIRC we have btrfs_previous_item() to do the forward search.

> +	}
> +
> +	end = logical + *length;

IMHO we can make it const and initialize it at the definition part.

> +
> +	while (1) {

Here we only can hit at most one RST item, thus I'd recommend to remove
the while().

Although this would mean we will need a if () to handle (ret > 0) case,
but it may still be a little easier to read than a loop.

You may want to refer to btrfs_lookup_csum() for the non-loop
implementation.

> +		leaf = path->nodes[0];
> +		slot = path->slots[0];
> +
> +		btrfs_item_key_to_cpu(leaf, &found_key, slot);
> +		found_logical = found_key.objectid;
> +		found_length = found_key.offset;
> +		found_end = found_logical + found_length;
> +
> +		if (found_logical > end) {
> +			ret = -ENOENT;
> +			goto out;
> +		}
> +
> +		if (in_range(logical, found_logical, found_length))
> +			break;
> +
> +		ret = btrfs_next_item(stripe_root, path);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	offset = logical - found_logical;
> +
> +	/*
> +	 * If we have a logically contiguous, but physically noncontinuous
> +	 * range, we need to split the bio. Record the length after which we
> +	 * must split the bio.
> +	 */
> +	if (end > found_end)
> +		*length -= end - found_end;
> +
> +	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
> +	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
> +	encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent);
> +
> +	if (encoding != btrfs_bg_type_to_raid_encoding(map_type)) {
> +		ret = -ENOENT;
> +		goto out;

This looks like a very weird situation, we have a bg with a different type.
Should we do some warning or is there some valid situation for this?

> +	}
> +
> +	for (i = 0; i < num_stripes; i++) {
> +		struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
> +		u64 devid = btrfs_raid_stride_devid(leaf, stride);
> +		u64 len = btrfs_raid_stride_length(leaf, stride);
> +		u64 physical = btrfs_raid_stride_physical(leaf, stride);
> +
> +		if (offset >= len) {
> +			offset -= len;
> +
> +			if (offset >= BTRFS_STRIPE_LEN)
> +				continue;
> +		}
> +
> +		if (devid != stripe->dev->devid)
> +			continue;
> +
> +		if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
> +			continue;
> +
> +		stripe->physical = physical + offset;
> +
> +		ret = 0;
> +		goto free_path;
> +	}
> +
> +	/*
> +	 * If we're here, we haven't found the requested devid in the stripe.
> +	 */
> +	ret = -ENOENT;
> +out:
> +	if (ret > 0)
> +		ret = -ENOENT;
> +	if (ret && ret != -EIO) {
> +		/*
> +		 * Check if the range we're looking for is actually backed by
> +		 * an extent. This can happen, e.g. when scrub is running on a
> +		 * block-group and the extent it is trying to scrub get's
> +		 * deleted in the meantime. Although scrub is setting the
> +		 * block-group to read-only, deletion of extents are still
> +		 * allowed. If the extent is gone, simply return ENOENT and be
> +		 * good.
> +		 */

As mentioned in the next patch (sorry for the reversed order), this
should be handled in a different way (by only searching commit root for
scrub usage).
> +		if (btrfs_check_for_extent(fs_info, logical, *length, path)) {
> +			ret = -ENOENT;
> +			goto free_path;
> +		}
> +
> +		if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
> +			btrfs_print_tree(leaf, 1);
> +		btrfs_err(fs_info,
> +			  "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
> +			  logical, logical + *length, stripe->dev->devid,
> +			  btrfs_bg_type_to_raid_name(map_type));
> +	}
> +free_path:
> +	btrfs_free_path(path);
> +
> +	return ret;
> +}
> diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
> index 7560dc501a65..40aa553ae8aa 100644
> --- a/fs/btrfs/raid-stripe-tree.h
> +++ b/fs/btrfs/raid-stripe-tree.h
> @@ -13,6 +13,10 @@ struct btrfs_io_stripe;
>
>   int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start,
>   			     u64 length);
> +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
> +				 u64 logical, u64 *length, u64 map_type,
> +				 u32 stripe_index,
> +				 struct btrfs_io_stripe *stripe);
>   int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
>   			     struct btrfs_ordered_extent *ordered_extent);
>
> @@ -33,4 +37,11 @@ static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
>
>   	return false;
>   }
> +
> +static inline int btrfs_num_raid_stripes(u32 item_size)
> +{
> +	return (item_size - offsetof(struct btrfs_stripe_extent, strides)) /
> +		sizeof(struct btrfs_raid_stride);
> +}
> +
>   #endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 0c0fd4eb4848..7c25f5c77788 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -35,6 +35,7 @@
>   #include "relocation.h"
>   #include "scrub.h"
>   #include "super.h"
> +#include "raid-stripe-tree.h"
>
>   #define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
>   					 BTRFS_BLOCK_GROUP_RAID10 | \
> @@ -6206,12 +6207,22 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
>   	return U64_MAX;
>   }
>
> -static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
> -			  u32 stripe_index, u64 stripe_offset, u32 stripe_nr)
> +static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
> +		      u64 logical, u64 *length, struct btrfs_io_stripe *dst,
> +		      struct map_lookup *map, u32 stripe_index,
> +		      u64 stripe_offset, u64 stripe_nr)
Do we need @length to be a pointer?
IIRC we can return the number of bytes we mapped, or <0 for errors.
Thus at least @length doesn't need to be a pointer.

Thanks,
Qu

>   {
>   	dst->dev = map->stripes[stripe_index].dev;
> +
> +	if (op == BTRFS_MAP_READ &&
> +	    btrfs_need_stripe_tree_update(fs_info, map->type))
> +		return btrfs_get_raid_extent_offset(fs_info, logical, length,
> +						    map->type, stripe_index,
> +						    dst);
> +
>   	dst->physical = map->stripes[stripe_index].physical +
>   			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
> +	return 0;
>   }
>
>   /*
> @@ -6428,11 +6439,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   	 */
>   	if (smap && num_alloc_stripes == 1 &&
>   	    !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
> -		set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr);
> +		ret = set_io_stripe(fs_info, op, logical, length, smap, map,
> +				    stripe_index, stripe_offset, stripe_nr);
>   		if (mirror_num_ret)
>   			*mirror_num_ret = mirror_num;
>   		*bioc_ret = NULL;
> -		ret = 0;
>   		goto out;
>   	}
>
> @@ -6463,21 +6474,29 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
>   		bioc->full_stripe_logical = em->start +
>   			btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
>   		for (i = 0; i < num_stripes; i++)
> -			set_io_stripe(&bioc->stripes[i], map,
> -				      (i + stripe_nr) % num_stripes,
> -				      stripe_offset, stripe_nr);
> +			ret = set_io_stripe(fs_info, op, logical, length,
> +					    &bioc->stripes[i], map,
> +					    (i + stripe_nr) % num_stripes,
> +					    stripe_offset, stripe_nr);
>   	} else {
>   		/*
>   		 * For all other non-RAID56 profiles, just copy the target
>   		 * stripe into the bioc.
>   		 */
>   		for (i = 0; i < num_stripes; i++) {
> -			set_io_stripe(&bioc->stripes[i], map, stripe_index,
> -				      stripe_offset, stripe_nr);
> +			ret = set_io_stripe(fs_info, op, logical, length,
> +					    &bioc->stripes[i], map, stripe_index,
> +					    stripe_offset, stripe_nr);
>   			stripe_index++;
>   		}
>   	}
>
> +	if (ret) {
> +		*bioc_ret = NULL;
> +		btrfs_put_bioc(bioc);
> +		goto out;
> +	}
> +
>   	if (op != BTRFS_MAP_READ)
>   		max_errors = btrfs_chunk_max_errors(map);
>
>