Message-ID: <CAMgjq7AomHkGAtpvEt_ZrGK6fLUkWgg0vDGZ0B570QU_oNwRGA@mail.gmail.com>
Date: Thu, 13 Nov 2025 14:07:59 +0800
From: Kairui Song <ryncsn@...il.com>
To: Youngjun Park <youngjun.park@....com>
Cc: akpm@...ux-foundation.org, linux-mm@...ck.org, cgroups@...r.kernel.org, 
	linux-kernel@...r.kernel.org, chrisl@...nel.org, hannes@...xchg.org, 
	mhocko@...nel.org, roman.gushchin@...ux.dev, shakeel.butt@...ux.dev, 
	muchun.song@...ux.dev, shikemeng@...weicloud.com, nphamcs@...il.com, 
	bhe@...hat.com, baohua@...nel.org, gunho.lee@....com, taejoon.song@....com
Subject: Re: [PATCH 1/3] mm, swap: change back to use each swap device's
 percpu cluster

On Sun, Nov 9, 2025 at 8:54 PM Youngjun Park <youngjun.park@....com> wrote:
>
> This reverts commit 1b7e90020eb7 ("mm, swap: use percpu cluster as
> allocation fast path").
>
> With the newly introduced swap tiers, the global percpu cluster
> causes two issues:
> 1) It causes caching oscillation in the same order between different
>    si's when two memcgs are each allowed to access only a different
>    si and both of them are swapping out.
> 2) It can cause priority inversion between swap devices. Imagine a
>    case with two memcgs, memcg1 and memcg2. memcg1 can access si A
>    and B, where A is the higher priority device, while memcg2 can
>    only access si B. memcg2 could then write si B into the global
>    percpu cluster, and memcg1 would take si B in the fast path even
>    though si A is not exhausted.
>
> Hence, in order to support swap tiers, revert commit 1b7e90020eb7
> and go back to using each swap device's percpu cluster.
>
> Co-developed-by: Baoquan He <bhe@...hat.com>
> Suggested-by: Kairui Song <kasong@...cent.com>
> Signed-off-by: Baoquan He <bhe@...hat.com>
> Signed-off-by: Youngjun Park <youngjun.park@....com>

Hi Youngjun, Baoquan,

Thanks for working on the percpu cluster change.

It would be better if you could provide some benchmark results, since
the whole point of the global percpu cluster was to improve
performance and get rid of the swap slot cache.
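
Even a trivial test like timing MADV_PAGEOUT on a large anonymous
mapping would help. A minimal user-space sketch (my own, not from any
existing suite; needs a kernel with MADV_PAGEOUT, i.e. 5.4+):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>

#define SIZE (1UL << 30) /* 1 GiB of anonymous memory */

int main(void)
{
        struct timespec t0, t1;
        char *buf = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(buf, 1, SIZE); /* fault in every page */

        clock_gettime(CLOCK_MONOTONIC, &t0);
        /* Push the whole range out to swap and time it. */
        if (madvise(buf, SIZE, MADV_PAGEOUT)) {
                perror("madvise");
                return 1;
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("swapout: %.3f s\n", (t1.tv_sec - t0.tv_sec) +
               (t1.tv_nsec - t0.tv_nsec) / 1e9);
        return 0;
}

Running it before and after the series on the same swap setup should
show whether removing the fast path is visible at all.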

I'm fine with a small regression, but we'd better be aware of it, and
we can still figure out other ways to optimize. E.g. I remember Chris
once mentioned the idea of a per-device slot cache, different from
the original slot cache (swap_slot.c): the allocator itself would be
aware of it, so it would be much cleaner.
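
Roughly something like the sketch below; all names are made up and it
only illustrates the shape, not a working implementation. The cache
would live in the swap_info_struct itself, so the allocator can
refill it in batch and drain it on swapoff:

/* Hypothetical per-device, per-CPU slot cache. Unlike the old
 * swap_slot.c layer, the allocator itself would be aware of it,
 * refilling slots[] straight from the device's clusters.
 */
#define SWAP_SLOT_CACHE_SIZE    64

struct swap_slot_cache {
        local_lock_t    lock;           /* protects nr and slots[] */
        unsigned int    nr;             /* cached slots left */
        unsigned long   slots[SWAP_SLOT_CACHE_SIZE]; /* offsets in this si */
};

/* In struct swap_info_struct:
 *      struct swap_slot_cache __percpu *slot_cache;
 */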

> ---
>  include/linux/swap.h |  13 +++-
>  mm/swapfile.c        | 151 +++++++++++++------------------------------
>  2 files changed, 56 insertions(+), 108 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 38ca3df68716..90fa27bb7796 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -250,10 +250,17 @@ enum {
>  #endif
>
>  /*
> - * We keep using same cluster for rotational device so IO will be sequential.
> - * The purpose is to optimize SWAP throughput on these device.
> + * We assign a cluster to each CPU, so each CPU can allocate swap entry from
> + * its own cluster and swapout sequentially. The purpose is to optimize swapout
> + * throughput.
>   */
> +struct percpu_cluster {
> +       local_lock_t lock; /* Protect the percpu_cluster above */

I think you mean "below"?

> +       unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> +};
> +
>  struct swap_sequential_cluster {
> +       spinlock_t lock; /* Serialize usage of global cluster */
>         unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>  };
>
> @@ -278,8 +285,8 @@ struct swap_info_struct {
>                                         /* list of cluster that are fragmented or contented */
>         unsigned int pages;             /* total of usable pages of swap */
>         atomic_long_t inuse_pages;      /* number of those currently in use */
> +       struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
>         struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
> -       spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
>         struct rb_root swap_extent_root;/* root of the swap extent rbtree */
>         struct block_device *bdev;      /* swap device or bdev of swap file */
>         struct file *swap_file;         /* seldom referenced */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 44eb6a6e5800..3bb77c9a4879 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -118,18 +118,6 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
>
>  atomic_t nr_rotate_swap = ATOMIC_INIT(0);
>
> -struct percpu_swap_cluster {
> -       struct swap_info_struct *si[SWAP_NR_ORDERS];
> -       unsigned long offset[SWAP_NR_ORDERS];
> -       local_lock_t lock;
> -};
> -
> -static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
> -       .si = { NULL },
> -       .offset = { SWAP_ENTRY_INVALID },
> -       .lock = INIT_LOCAL_LOCK(),
> -};
> -
>  /* May return NULL on invalid type, caller must check for NULL return */
>  static struct swap_info_struct *swap_type_to_info(int type)
>  {
> @@ -497,7 +485,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
>          * Swap allocator uses percpu clusters and holds the local lock.
>          */
>         lockdep_assert_held(&ci->lock);
> -       lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
> +       lockdep_assert_held(&this_cpu_ptr(si->percpu_cluster)->lock);
>
>         /* The cluster must be free and was just isolated from the free list. */
>         VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
> @@ -515,8 +503,8 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
>          */
>         spin_unlock(&ci->lock);
>         if (!(si->flags & SWP_SOLIDSTATE))
> -               spin_unlock(&si->global_cluster_lock);
> -       local_unlock(&percpu_swap_cluster.lock);
> +               spin_unlock(&si->global_cluster->lock);
> +       local_unlock(&si->percpu_cluster->lock);
>
>         table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
>
> @@ -528,9 +516,9 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
>          * could happen with ignoring the percpu cluster is fragmentation,
>          * which is acceptable since this fallback and race is rare.
>          */
> -       local_lock(&percpu_swap_cluster.lock);
> +       local_lock(&si->percpu_cluster->lock);
>         if (!(si->flags & SWP_SOLIDSTATE))
> -               spin_lock(&si->global_cluster_lock);
> +               spin_lock(&si->global_cluster->lock);
>         spin_lock(&ci->lock);
>
>         /* Nothing except this helper should touch a dangling empty cluster. */
> @@ -642,7 +630,7 @@ static bool swap_do_scheduled_discard(struct swap_info_struct *si)
>                 ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
>                 /*
>                  * Delete the cluster from list to prepare for discard, but keep
> -                * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
> +                * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
>                  * pointing to it, or ran into by relocate_cluster.
>                  */
>                 list_del(&ci->list);
> @@ -939,12 +927,11 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
>  out:
>         relocate_cluster(si, ci);
>         swap_cluster_unlock(ci);
> -       if (si->flags & SWP_SOLIDSTATE) {
> -               this_cpu_write(percpu_swap_cluster.offset[order], next);
> -               this_cpu_write(percpu_swap_cluster.si[order], si);
> -       } else {
> +       if (si->flags & SWP_SOLIDSTATE)
> +               this_cpu_write(si->percpu_cluster->next[order], next);
> +       else
>                 si->global_cluster->next[order] = next;
> -       }
> +
>         return found;
>  }
>
> @@ -1037,13 +1024,17 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>         if (order && !(si->flags & SWP_BLKDEV))
>                 return 0;
>
> -       if (!(si->flags & SWP_SOLIDSTATE)) {
> +       if (si->flags & SWP_SOLIDSTATE) {
> +               /* Fast path using per CPU cluster */
> +               local_lock(&si->percpu_cluster->lock);
> +               offset = __this_cpu_read(si->percpu_cluster->next[order]);
> +       } else {
>                 /* Serialize HDD SWAP allocation for each device. */
> -               spin_lock(&si->global_cluster_lock);
> +               spin_lock(&si->global_cluster->lock);
>                 offset = si->global_cluster->next[order];
> -               if (offset == SWAP_ENTRY_INVALID)
> -                       goto new_cluster;
> +       }
>
> +       if (offset != SWAP_ENTRY_INVALID) {
>                 ci = swap_cluster_lock(si, offset);
>                 /* Cluster could have been used by another order */
>                 if (cluster_is_usable(ci, order)) {
> @@ -1058,7 +1049,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                         goto done;
>         }
>
> -new_cluster:
>         /*
>          * If the device need discard, prefer new cluster over nonfull
>          * to spread out the writes.
> @@ -1121,8 +1111,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                         goto done;
>         }
>  done:
> -       if (!(si->flags & SWP_SOLIDSTATE))
> -               spin_unlock(&si->global_cluster_lock);
> +       if (si->flags & SWP_SOLIDSTATE)
> +               local_unlock(&si->percpu_cluster->lock);
> +       else
> +               spin_unlock(&si->global_cluster->lock);
>
>         return found;
>  }
> @@ -1303,43 +1295,8 @@ static bool get_swap_device_info(struct swap_info_struct *si)
>         return true;
>  }
>
> -/*
> - * Fast path try to get swap entries with specified order from current
> - * CPU's swap entry pool (a cluster).
> - */
> -static bool swap_alloc_fast(swp_entry_t *entry,
> -                           int order)
> -{
> -       struct swap_cluster_info *ci;
> -       struct swap_info_struct *si;
> -       unsigned int offset, found = SWAP_ENTRY_INVALID;
> -
> -       /*
> -        * Once allocated, swap_info_struct will never be completely freed,
> -        * so checking it's liveness by get_swap_device_info is enough.
> -        */
> -       si = this_cpu_read(percpu_swap_cluster.si[order]);
> -       offset = this_cpu_read(percpu_swap_cluster.offset[order]);
> -       if (!si || !offset || !get_swap_device_info(si))
> -               return false;
> -
> -       ci = swap_cluster_lock(si, offset);
> -       if (cluster_is_usable(ci, order)) {
> -               if (cluster_is_empty(ci))
> -                       offset = cluster_offset(si, ci);
> -               found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
> -               if (found)
> -                       *entry = swp_entry(si->type, found);
> -       } else {
> -               swap_cluster_unlock(ci);
> -       }
> -
> -       put_swap_device(si);
> -       return !!found;
> -}
> -
>  /* Rotate the device and switch to a new cluster */
> -static bool swap_alloc_slow(swp_entry_t *entry,
> +static void swap_alloc_entry(swp_entry_t *entry,
>                             int order)

It seems you also changed the rotation rule here, so that every
allocation of any order causes a swap device rotation? Before
1b7e90020eb7, only every 64 allocations caused a rotation, since we
had the slot cache (swap_slot.c). The global cluster made the
rotation happen once per cluster, so the overhead was even lower on
average. But a rotation on every allocation looks like rather high
overhead and may cause serious fragmentation.
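
If we keep this structure, maybe the rotation could at least be
batched. An untested sketch of what I mean (ROTATE_BATCH and the
percpu counter are made up for illustration):

/* Rotate the device list only every ROTATE_BATCH allocations instead
 * of on every single one, amortizing the plist manipulation and
 * keeping allocations on one device long enough to stay sequential.
 */
#define ROTATE_BATCH    64

static DEFINE_PER_CPU(unsigned int, swap_alloc_rotate_count);

static inline bool swap_should_rotate(void)
{
        /* Only every ROTATE_BATCH-th allocation triggers a rotation. */
        return this_cpu_inc_return(swap_alloc_rotate_count) % ROTATE_BATCH == 0;
}

Then swap_alloc_entry() would only move the device to the plist tail
when swap_should_rotate() returns true, which roughly restores the
old slot cache rotation frequency.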
