linux-kernel - Re: [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb cache miss for btree search

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAL3q7H4n+7ZN228X9VGvgLX9S7cw5J27cJSGSiSCHjFbY9htLQ@mail.gmail.com>
Date: Mon, 25 Nov 2024 11:23:17 +0000
From: Filipe Manana <fdmanana@...nel.org>
To: Sasha Levin <sashal@...nel.org>
Cc: linux-kernel@...r.kernel.org, stable@...r.kernel.org, 
	Robbie Ko <robbieko@...ology.com>, Filipe Manana <fdmanana@...e.com>, 
	David Sterba <dsterba@...e.com>, clm@...com, josef@...icpanda.com, 
	linux-btrfs@...r.kernel.org
Subject: Re: [PATCH AUTOSEL 6.12 13/19] btrfs: reduce lock contention when eb
 cache miss for btree search

On Sun, Nov 24, 2024 at 12:46 PM Sasha Levin <sashal@...nel.org> wrote:
>
> From: Robbie Ko <robbieko@...ology.com>
>
> [ Upstream commit 99785998ed1cea142e20f4904ced26537a37bf74 ]

Why is this being picked for stable?

It's not a bug fix or anything critical.
It's just a performance optimization, and it's not even one where we
know (AFAIK) of any workload where it would give very significant
gains to justify backporting to stable.

Thanks.

>
> When crawling btree, if an eb cache miss occurs, we change to use the eb
> read lock and release all previous locks (including the parent lock) to
> reduce lock contention.
>
> If an eb cache miss occurs in a leaf and needs to execute IO, before this
> change we released locks only from level 2 and up and we read a leaf's
> content from disk while holding a lock on its parent (level 1), causing
> the unnecessary lock contention on the parent, after this change we
> release locks from level 1 and up, but we lock level 0, and read leaf's
> content from disk.
>
> Because we have prepared the check parameters and the read lock of eb we
> hold, we can ensure that no race will occur during the check and cause
> unexpected errors.
>
> Reviewed-by: Filipe Manana <fdmanana@...e.com>
> Signed-off-by: Robbie Ko <robbieko@...ology.com>
> Signed-off-by: David Sterba <dsterba@...e.com>
> Signed-off-by: Sasha Levin <sashal@...nel.org>
> ---
>  fs/btrfs/ctree.c | 101 ++++++++++++++++++++++++++++++++---------------
>  1 file changed, 70 insertions(+), 31 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 0cc919d15b144..dd92acd66624f 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1515,12 +1515,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>         struct btrfs_tree_parent_check check = { 0 };
>         u64 blocknr;
>         u64 gen;
> -       struct extent_buffer *tmp;
> -       int ret;
> +       struct extent_buffer *tmp = NULL;
> +       int ret = 0;
>         int parent_level;
> -       bool unlock_up;
> +       int err;
> +       bool read_tmp = false;
> +       bool tmp_locked = false;
> +       bool path_released = false;
>
> -       unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
>         blocknr = btrfs_node_blockptr(*eb_ret, slot);
>         gen = btrfs_node_ptr_generation(*eb_ret, slot);
>         parent_level = btrfs_header_level(*eb_ret);
> @@ -1551,68 +1553,105 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
>                          */
>                         if (btrfs_verify_level_key(tmp,
>                                         parent_level - 1, &check.first_key, gen)) {
> -                               free_extent_buffer(tmp);
> -                               return -EUCLEAN;
> +                               ret = -EUCLEAN;
> +                               goto out;
>                         }
>                         *eb_ret = tmp;
> -                       return 0;
> +                       tmp = NULL;
> +                       ret = 0;
> +                       goto out;
>                 }
>
>                 if (p->nowait) {
> -                       free_extent_buffer(tmp);
> -                       return -EAGAIN;
> +                       ret = -EAGAIN;
> +                       goto out;
>                 }
>
> -               if (unlock_up)
> +               if (!p->skip_locking) {
>                         btrfs_unlock_up_safe(p, level + 1);
> -
> -               /* now we're allowed to do a blocking uptodate check */
> -               ret = btrfs_read_extent_buffer(tmp, &check);
> -               if (ret) {
> -                       free_extent_buffer(tmp);
> +                       tmp_locked = true;
> +                       btrfs_tree_read_lock(tmp);
>                         btrfs_release_path(p);
> -                       return ret;
> +                       ret = -EAGAIN;
> +                       path_released = true;
>                 }
>
> -               if (unlock_up)
> -                       ret = -EAGAIN;
> +               /* Now we're allowed to do a blocking uptodate check. */
> +               err = btrfs_read_extent_buffer(tmp, &check);
> +               if (err) {
> +                       ret = err;
> +                       goto out;
> +               }
>
> +               if (ret == 0) {
> +                       ASSERT(!tmp_locked);
> +                       *eb_ret = tmp;
> +                       tmp = NULL;
> +               }
>                 goto out;
>         } else if (p->nowait) {
> -               return -EAGAIN;
> +               ret = -EAGAIN;
> +               goto out;
>         }
>
> -       if (unlock_up) {
> +       if (!p->skip_locking) {
>                 btrfs_unlock_up_safe(p, level + 1);
>                 ret = -EAGAIN;
> -       } else {
> -               ret = 0;
>         }
>
>         if (p->reada != READA_NONE)
>                 reada_for_search(fs_info, p, level, slot, key->objectid);
>
> -       tmp = read_tree_block(fs_info, blocknr, &check);
> +       tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
>         if (IS_ERR(tmp)) {
> +               ret = PTR_ERR(tmp);
> +               tmp = NULL;
> +               goto out;
> +       }
> +       read_tmp = true;
> +
> +       if (!p->skip_locking) {
> +               ASSERT(ret == -EAGAIN);
> +               tmp_locked = true;
> +               btrfs_tree_read_lock(tmp);
>                 btrfs_release_path(p);
> -               return PTR_ERR(tmp);
> +               path_released = true;
> +       }
> +
> +       /* Now we're allowed to do a blocking uptodate check. */
> +       err = btrfs_read_extent_buffer(tmp, &check);
> +       if (err) {
> +               ret = err;
> +               goto out;
>         }
> +
>         /*
>          * If the read above didn't mark this buffer up to date,
>          * it will never end up being up to date.  Set ret to EIO now
>          * and give up so that our caller doesn't loop forever
>          * on our EAGAINs.
>          */
> -       if (!extent_buffer_uptodate(tmp))
> +       if (!extent_buffer_uptodate(tmp)) {
>                 ret = -EIO;
> +               goto out;
> +       }
>
> -out:
>         if (ret == 0) {
> +               ASSERT(!tmp_locked);
>                 *eb_ret = tmp;
> -       } else {
> -               free_extent_buffer(tmp);
> -               btrfs_release_path(p);
> +               tmp = NULL;
> +       }
> +out:
> +       if (tmp) {
> +               if (tmp_locked)
> +                       btrfs_tree_read_unlock(tmp);
> +               if (read_tmp && ret && ret != -EAGAIN)
> +                       free_extent_buffer_stale(tmp);
> +               else
> +                       free_extent_buffer(tmp);
>         }
> +       if (ret && !path_released)
> +               btrfs_release_path(p);
>
>         return ret;
>  }
> @@ -2198,7 +2237,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> @@ -2325,7 +2364,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
>                 }
>
>                 err = read_block_for_search(root, p, &b, level, slot, key);
> -               if (err == -EAGAIN)
> +               if (err == -EAGAIN && !p->nowait)
>                         goto again;
>                 if (err) {
>                         ret = err;
> --
> 2.43.0
>
>