lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAGsJ_4wCkCqCteG_1RW3Lh214zQ_tD7AtguvBEG6viLthwbPVw@mail.gmail.com>
Date: Wed, 8 Jan 2025 12:01:09 +1300
From: Barry Song <baohua@...nel.org>
To: Yosry Ahmed <yosryahmed@...gle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, Johannes Weiner <hannes@...xchg.org>, 
	Nhat Pham <nphamcs@...il.com>, Chengming Zhou <chengming.zhou@...ux.dev>, 
	Vitaly Wool <vitalywool@...il.com>, Sam Sun <samsun1006219@...il.com>, 
	Kanchana P Sridhar <kanchana.p.sridhar@...el.com>, linux-mm@...ck.org, 
	linux-kernel@...r.kernel.org, syzbot <syzkaller@...glegroups.com>
Subject: Re: [PATCH v2 1/2] Revert "mm: zswap: fix race between
 [de]compression and CPU hotunplug"

On Wed, Jan 8, 2025 at 11:22 AM Yosry Ahmed <yosryahmed@...gle.com> wrote:
>
> This reverts commit eaebeb93922ca6ab0dd92027b73d0112701706ef.
>
> Commit eaebeb93922c ("mm: zswap: fix race between [de]compression and
> CPU hotunplug") used the CPU hotplug lock in zswap compress/decompress
> operations to protect against a race with CPU hotunplug making some
> per-CPU resources go away.
>
> However, zswap compress/decompress can be reached through reclaim while
> the lock is held, resulting in a potential deadlock as reported by
> syzbot:
> ======================================================
> WARNING: possible circular locking dependency detected
> 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0 Not tainted
> ------------------------------------------------------
> kswapd0/89 is trying to acquire lock:
>  ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: acomp_ctx_get_cpu mm/zswap.c:886 [inline]
>  ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_compress mm/zswap.c:908 [inline]
>  ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store_page mm/zswap.c:1439 [inline]
>  ffffffff8e7d2ed0 (cpu_hotplug_lock){++++}-{0:0}, at: zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
>
> but task is already holding lock:
>  ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
>  ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253
>
> which lock already depends on the new lock.

We have functions like percpu_is_write_locked(),
percpu_is_read_locked(), and cpus_read_trylock().
Could performing such a check before acquiring the lock help
prevent this circular locking dependency?

>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (fs_reclaim){+.+.}-{0:0}:
>         lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>         __fs_reclaim_acquire mm/page_alloc.c:3853 [inline]
>         fs_reclaim_acquire+0x88/0x130 mm/page_alloc.c:3867
>         might_alloc include/linux/sched/mm.h:318 [inline]
>         slab_pre_alloc_hook mm/slub.c:4070 [inline]
>         slab_alloc_node mm/slub.c:4148 [inline]
>         __kmalloc_cache_node_noprof+0x40/0x3a0 mm/slub.c:4337
>         kmalloc_node_noprof include/linux/slab.h:924 [inline]
>         alloc_worker kernel/workqueue.c:2638 [inline]
>         create_worker+0x11b/0x720 kernel/workqueue.c:2781
>         workqueue_prepare_cpu+0xe3/0x170 kernel/workqueue.c:6628
>         cpuhp_invoke_callback+0x48d/0x830 kernel/cpu.c:194
>         __cpuhp_invoke_callback_range kernel/cpu.c:965 [inline]
>         cpuhp_invoke_callback_range kernel/cpu.c:989 [inline]
>         cpuhp_up_callbacks kernel/cpu.c:1020 [inline]
>         _cpu_up+0x2b3/0x580 kernel/cpu.c:1690
>         cpu_up+0x184/0x230 kernel/cpu.c:1722
>         cpuhp_bringup_mask+0xdf/0x260 kernel/cpu.c:1788
>         cpuhp_bringup_cpus_parallel+0xf9/0x160 kernel/cpu.c:1878
>         bringup_nonboot_cpus+0x2b/0x50 kernel/cpu.c:1892
>         smp_init+0x34/0x150 kernel/smp.c:1009
>         kernel_init_freeable+0x417/0x5d0 init/main.c:1569
>         kernel_init+0x1d/0x2b0 init/main.c:1466
>         ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
>         ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
>
> -> #0 (cpu_hotplug_lock){++++}-{0:0}:
>         check_prev_add kernel/locking/lockdep.c:3161 [inline]
>         check_prevs_add kernel/locking/lockdep.c:3280 [inline]
>         validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
>         __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
>         lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>         percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
>         cpus_read_lock+0x42/0x150 kernel/cpu.c:490
>         acomp_ctx_get_cpu mm/zswap.c:886 [inline]
>         zswap_compress mm/zswap.c:908 [inline]
>         zswap_store_page mm/zswap.c:1439 [inline]
>         zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
>         swap_writepage+0x647/0xce0 mm/page_io.c:279
>         shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
>         pageout mm/vmscan.c:696 [inline]
>         shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
>         shrink_inactive_list mm/vmscan.c:1967 [inline]
>         shrink_list mm/vmscan.c:2205 [inline]
>         shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
>         mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
>         mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
>         memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
>         balance_pgdat mm/vmscan.c:6975 [inline]
>         kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
>         kthread+0x2f0/0x390 kernel/kthread.c:389
>         ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
>         ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
>
> other info that might help us debug this:
>
>  Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>        ----                    ----
>   lock(fs_reclaim);
>                                lock(cpu_hotplug_lock);
>                                lock(fs_reclaim);
>   rlock(cpu_hotplug_lock);
>
>  *** DEADLOCK ***
>
> 1 lock held by kswapd0/89:
>   #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:6871 [inline]
>   #0: ffffffff8ea355a0 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0xb58/0x2f30 mm/vmscan.c:7253
>
> stack backtrace:
> CPU: 0 UID: 0 PID: 89 Comm: kswapd0 Not tainted 6.13.0-rc6-syzkaller-00006-g5428dc1906dd #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024
> Call Trace:
>  <TASK>
>   __dump_stack lib/dump_stack.c:94 [inline]
>   dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120
>   print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074
>   check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206
>   check_prev_add kernel/locking/lockdep.c:3161 [inline]
>   check_prevs_add kernel/locking/lockdep.c:3280 [inline]
>   validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904
>   __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226
>   lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849
>   percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
>   cpus_read_lock+0x42/0x150 kernel/cpu.c:490
>   acomp_ctx_get_cpu mm/zswap.c:886 [inline]
>   zswap_compress mm/zswap.c:908 [inline]
>   zswap_store_page mm/zswap.c:1439 [inline]
>   zswap_store+0xa74/0x1ba0 mm/zswap.c:1546
>   swap_writepage+0x647/0xce0 mm/page_io.c:279
>   shmem_writepage+0x1248/0x1610 mm/shmem.c:1579
>   pageout mm/vmscan.c:696 [inline]
>   shrink_folio_list+0x35ee/0x57e0 mm/vmscan.c:1374
>   shrink_inactive_list mm/vmscan.c:1967 [inline]
>   shrink_list mm/vmscan.c:2205 [inline]
>   shrink_lruvec+0x16db/0x2f30 mm/vmscan.c:5734
>   mem_cgroup_shrink_node+0x385/0x8e0 mm/vmscan.c:6575
>   mem_cgroup_soft_reclaim mm/memcontrol-v1.c:312 [inline]
>   memcg1_soft_limit_reclaim+0x346/0x810 mm/memcontrol-v1.c:362
>   balance_pgdat mm/vmscan.c:6975 [inline]
>   kswapd+0x17b3/0x2f30 mm/vmscan.c:7253
>   kthread+0x2f0/0x390 kernel/kthread.c:389
>   ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
>   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
>  </TASK>
>
> Revert the change. A different fix for the race with CPU hotunplug will
> follow.
>
> Reported-by: syzbot <syzkaller@...glegroups.com>
> Signed-off-by: Yosry Ahmed <yosryahmed@...gle.com>
> ---
>
> The patches apply on top of mm-hotfixes-unstable and are meant for
> v6.13.
>
> Andrew, I am not sure what's the best way to handle this. This fix is
> already merged into Linus's tree and had CC:stable, so I thought it's
> best to revert it and replace it with a separate fix that would be easy
> to backport instead of the revert patch, especially that functionally
> the new fix is different anyway.
>
> v1 -> v2:
> - Disable migration as an alternative fix instead of SRCU, and explain
>   why SRCU and cpus_read_lock() cannot be used in the commit log of
>   patch 2.
>
> ---
>  mm/zswap.c | 19 +++----------------
>  1 file changed, 3 insertions(+), 16 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 5a27af8d86ea9..f6316b66fb236 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -880,18 +880,6 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
>         return 0;
>  }
>
> -/* Prevent CPU hotplug from freeing up the per-CPU acomp_ctx resources */
> -static struct crypto_acomp_ctx *acomp_ctx_get_cpu(struct crypto_acomp_ctx __percpu *acomp_ctx)
> -{
> -       cpus_read_lock();
> -       return raw_cpu_ptr(acomp_ctx);
> -}
> -
> -static void acomp_ctx_put_cpu(void)
> -{
> -       cpus_read_unlock();
> -}
> -
>  static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>                            struct zswap_pool *pool)
>  {
> @@ -905,7 +893,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>         gfp_t gfp;
>         u8 *dst;
>
> -       acomp_ctx = acomp_ctx_get_cpu(pool->acomp_ctx);
> +       acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> +
>         mutex_lock(&acomp_ctx->mutex);
>
>         dst = acomp_ctx->buffer;
> @@ -961,7 +950,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
>                 zswap_reject_alloc_fail++;
>
>         mutex_unlock(&acomp_ctx->mutex);
> -       acomp_ctx_put_cpu();
>         return comp_ret == 0 && alloc_ret == 0;
>  }
>
> @@ -972,7 +960,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>         struct crypto_acomp_ctx *acomp_ctx;
>         u8 *src;
>
> -       acomp_ctx = acomp_ctx_get_cpu(entry->pool->acomp_ctx);
> +       acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>         mutex_lock(&acomp_ctx->mutex);
>
>         src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> @@ -1002,7 +990,6 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
>         if (src != acomp_ctx->buffer)
>                 zpool_unmap_handle(zpool, entry->handle);
> -       acomp_ctx_put_cpu();
>  }
>
>  /*********************************
> --
> 2.47.1.613.gc27f4b7a9f-goog
>

Thanks
barry

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ