Message-ID: <CAMgjq7COLbfGwd4CYxNBaLTi4UaPDkKQzkLhsV-caoA-xq1V-g@mail.gmail.com>
Date: Wed, 23 Jul 2025 01:44:49 +0800
From: Kairui Song <ryncsn@...il.com>
To: Youngjun Park <youngjun.park@....com>
Cc: akpm@...ux-foundation.org, hannes@...xchg.org, mhocko@...nel.org,
roman.gushchin@...ux.dev, shakeel.butt@...ux.dev, muchun.song@...ux.dev,
shikemeng@...weicloud.com, nphamcs@...il.com, bhe@...hat.com,
baohua@...nel.org, chrisl@...nel.org, cgroups@...r.kernel.org,
linux-mm@...ck.org, linux-kernel@...r.kernel.org, gunho.lee@....com,
iamjoonsoo.kim@....com, taejoon.song@....com
Subject: Re: [PATCH 4/4] mm: swap: Per-cgroup per-CPU swap device cache with
shared clusters
On Thu, Jul 17, 2025 at 4:21 AM Youngjun Park <youngjun.park@....com> wrote:
>
> This patch introduces a new swap allocation mechanism that supports
> per-cgroup per-CPU swap device caches, combined with per-device per-CPU
> cluster management.
>
> The existing global swap allocator uses a per-CPU device cache and
> cluster, shared by all cgroups. Under this model, per-cgroup swap
> priorities cannot be effectively honored on the fast path, as allocations
> do not distinguish between cgroups.
>
> To address this, we introduce per-cgroup per-CPU swap device caches.
> This allows fast-path swap allocations to respect each cgroup’s
> individual priority settings.
>
> To avoid an explosion of cluster structures proportional to the number
> of cgroups, clusters remain per-device and are shared across cgroups.
> This strikes a balance between performance and memory overhead.
>
> Suggested-by: Nhat Pham <nphamcs@...il.com>
> Suggested-by: Kairui Song <kasong@...cent.com>
> Signed-off-by: Youngjun Park <youngjun.park@....com>
> ---
> include/linux/swap.h | 7 ++
> mm/swap_cgroup_priority.c | 156 +++++++++++++++++++++++++++++++++++++-
> mm/swap_cgroup_priority.h | 39 ++++++++++
> mm/swapfile.c | 47 +++++++-----
> 4 files changed, 228 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index bfddbec2ee28..ab15f4c103a1 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -283,6 +283,12 @@ enum swap_cluster_flags {
> #define SWAP_NR_ORDERS 1
> #endif
>
> +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> +struct percpu_cluster {
> + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> +};
> +#endif
> +
> /*
> * We keep using same cluster for rotational device so IO will be sequential.
> * The purpose is to optimize SWAP throughput on these device.
> @@ -341,6 +347,7 @@ struct swap_info_struct {
> struct list_head discard_clusters; /* discard clusters list */
> #ifdef CONFIG_SWAP_CGROUP_PRIORITY
> u64 id;
> + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> #endif
> struct plist_node avail_lists[]; /*
> * entries in swap_avail_heads, one
> diff --git a/mm/swap_cgroup_priority.c b/mm/swap_cgroup_priority.c
> index 84e876b77f01..f960c3dcab48 100644
> --- a/mm/swap_cgroup_priority.c
> +++ b/mm/swap_cgroup_priority.c
> @@ -21,6 +21,17 @@
> #include "swap_cgroup_priority.h"
> #include "memcontrol-v1.h"
>
> +/*
> + * We do maintain a cache on a per-cgroup-per-swap-device basis.
> + * However, the underlying cluster cache itself is managed
> + * per-swap-device. This design prevents each individual
> + * swap_cgroup_priority entry from caching its own cluster data,
> + * even as the number of such entries increases.
> + */
> +struct percpu_swap_device {
> + struct swap_info_struct *si[SWAP_NR_ORDERS];
> +};
> +
> static DEFINE_MUTEX(swap_cgroup_priority_inherit_lck);
> static LIST_HEAD(swap_cgroup_priority_list);
>
> @@ -49,6 +60,7 @@ static LIST_HEAD(swap_cgroup_priority_list);
> * least_priority - Current lowest priority.
> * distance - Priority differences from global swap priority.
> * default_prio - Default priority for this cgroup.
> + * pcpu_swapdev - Per-CPU swap device.
> * plist - Priority list head.
> */
> struct swap_cgroup_priority {
> @@ -64,6 +76,7 @@ struct swap_cgroup_priority {
> int least_priority;
> s8 distance;
> int default_prio;
> + struct percpu_swap_device __percpu *pcpu_swapdev;
> struct plist_head plist[];
> };
>
> @@ -132,6 +145,21 @@ static struct swap_cgroup_priority *get_effective_swap_cgroup_priority(
> return swap_priority->effective;
> }
>
> +static struct swap_cgroup_priority *get_effective_swap_cgroup_priority_rcu(
> + struct mem_cgroup *memcg)
> +{
> + struct swap_cgroup_priority *swap_priority;
> +
> + if (!memcg)
> + return NULL;
> +
> + swap_priority = rcu_dereference(memcg->swap_priority);
> + if (!swap_priority)
> + return NULL;
> +
> + return rcu_dereference(swap_priority->effective);
> +}
> +
> static bool validate_effective_swap_cgroup_priority(
> struct mem_cgroup *memcg,
> struct swap_cgroup_priority **swap_priority)
> @@ -172,6 +200,9 @@ static void free_swap_cgroup_priority_pnode(
> static void free_swap_cgroup_priority(
> struct swap_cgroup_priority *swap_priority)
> {
> + if (swap_priority->pcpu_swapdev)
> + free_percpu(swap_priority->pcpu_swapdev);
> +
> for (int i = 0; i < MAX_SWAPFILES; i++)
> free_swap_cgroup_priority_pnode(swap_priority->pnode[i]);
>
> @@ -187,6 +218,12 @@ static struct swap_cgroup_priority *alloc_swap_cgroup_priority(void)
> if (!swap_priority)
> return NULL;
>
> + swap_priority->pcpu_swapdev = alloc_percpu(struct percpu_swap_device);
> + if (!swap_priority->pcpu_swapdev) {
> + kvfree(swap_priority);
> + return NULL;
> + }
> +
> /*
> * Pre-allocates pnode array up to nr_swapfiles at init.
> * Individual pnodes are assigned on swapon, but not freed
> @@ -326,10 +363,34 @@ bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
> unsigned long offset;
> int node;
>
> - /*
> - * TODO: Per-cpu swap cluster cache can't be used directly
> - * as cgroup-specific priorities may select different devices.
> - */
> + rcu_read_lock();
> + if (!(swap_priority = get_effective_swap_cgroup_priority_rcu(memcg))) {
> + rcu_read_unlock();
> + return false;
> + }
> +
> + /* Fast path */
> + si = this_cpu_read(swap_priority->pcpu_swapdev->si[order]);
> + if (si && get_swap_device_info(si)) {
> + offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
> + if (offset) {
> + *entry = swp_entry(si->type, offset);
> + /*
> + * Protected by 'percpu_swap_cluster' local_lock;
> + * CPU migration is disabled during this operation.
> + */
> + this_cpu_write(swap_priority->pcpu_swapdev->si[order],
> + si);
> + put_swap_device(si);
> + rcu_read_unlock();
> +
> + return true;
> + }
> + put_swap_device(si);
> + }
> + rcu_read_unlock();
> +
> + /* Slow path */
Hi Youngjun
One thing I noticed after a quick glance is that swap_alloc_cgroup_priority
has grown quite large and duplicates much of what folio_alloc_swap already
does.
I imagine we could instead have a struct (e.g. struct swap_percpu_info /
pi, might be a horrible name though, feel free to change it) that acts as
a closure over what the allocator needs: it contains the plist and the
fast-path device.
With slight changes, folio_alloc_swap could then respect either the
cgroup's pi or the global pi.
For example, the first thing swap_alloc_fast does would be:
`struct swap_percpu_info *pi = folio_swap_percpu_info(folio);`
where folio_swap_percpu_info returns the cgroup's swap_percpu_info or the
global one.
swap_alloc_slow can do the same thing; both can then just use pi->plist
and pi->pcpu_swapdev (cluster info will be in si) and ignore the cgroup
differences entirely.
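To make it concrete, roughly something like the below (just an untested
sketch; global_swap_pi, the embedded pi field and the MAX_NUMNODES-sized
plist are only placeholders, not a real proposal for the layout):

	struct swap_percpu_info {
		struct percpu_swap_device __percpu *pcpu_swapdev;
		struct plist_head plist[MAX_NUMNODES];	/* avail lists, per node */
	};

	/* Placeholder: the pi used when a cgroup has no priority configured. */
	static struct swap_percpu_info global_swap_pi;

	static struct swap_percpu_info *folio_swap_percpu_info(struct folio *folio)
	{
		/* Caller holds rcu_read_lock(), as in the fast path above. */
		struct mem_cgroup *memcg = folio_memcg(folio);
		struct swap_cgroup_priority *sp;

		sp = memcg ? rcu_dereference(memcg->swap_priority) : NULL;
		return sp ? &sp->pi : &global_swap_pi;
	}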
Also, please run your patches through ./scripts/checkpatch.pl; I'm seeing
some style issues.
I'll check your other patches later this week too. Thanks for the update
on this idea.
> spin_lock(&swap_avail_lock);
> node = numa_node_id();
>
> @@ -350,6 +411,14 @@ bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
> if (get_swap_device_info(si)) {
> offset = cluster_alloc_swap_entry(si, order,
> SWAP_HAS_CACHE);
> + /*
> + * Protected by 'percpu_swap_cluster' local_lock;
> + * CPU migration is disabled during this operation.
> + */
> + if (memcg->swap_priority == swap_priority)
> + this_cpu_write(
> + swap_priority->pcpu_swapdev->si[order],
> + si);
> put_swap_device(si);
> if (offset) {
> *entry = swp_entry(si->type, offset);
> @@ -687,6 +756,21 @@ static int __apply_swap_cgroup_priority(
> return 0;
> }
>
> +static int init_swap_cgroup_priority_pcpu_swapdev_cache(
> + struct swap_cgroup_priority *swap_priority)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct percpu_swap_device *pcp_swap_dev =
> + per_cpu_ptr(swap_priority->pcpu_swapdev, cpu);
> + for (int i = 0; i < SWAP_NR_ORDERS; i++)
> + pcp_swap_dev->si[i] = NULL;
> + }
> +
> + return 0;
> +}
> +
> /*
> * If this is the top-level swap_cgroup_priority, propagation is needed.
> * We traverse the 'mem_cgroup_tree' using 'for_each_mem_cgroup_tree'.
> @@ -795,6 +879,8 @@ int apply_swap_cgroup_priority(struct mem_cgroup *memcg, u64 id, int prio)
> for_each_node(nid)
> plist_head_init(&swap_priority->plist[nid]);
>
> + init_swap_cgroup_priority_pcpu_swapdev_cache(swap_priority);
> +
> prio_set:
> spin_lock(&swap_lock);
> spin_lock(&swap_avail_lock);
> @@ -843,6 +929,23 @@ int apply_swap_cgroup_priority(struct mem_cgroup *memcg, u64 id, int prio)
>
> spin_unlock(&swap_avail_lock);
> spin_unlock(&swap_lock);
> + /*
> + * XXX: We cannot fully synchronize with swap_alloc_cgroup_priority
> + * when updating the next si.
> + * Still, we ensure that flush operations inside swap_priority
> + * are performed as reliably as possible.
> + */
> + if (id != DEFAULT_ID &&
> + swap_priority == swap_priority->effective && !new) {
> + int cpu;
> + struct swap_info_struct **pcp_si;
> + for_each_possible_cpu(cpu) {
> + pcp_si = per_cpu_ptr(
> + swap_priority->pcpu_swapdev->si, cpu);
> + for (int i = 0; i < SWAP_NR_ORDERS; i++)
> + pcp_si[i] = NULL;
> + }
> + }
> mutex_unlock(&swap_cgroup_priority_inherit_lck);
> return 0;
>
> @@ -886,3 +989,48 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
> spin_unlock(&swap_avail_lock);
> mutex_unlock(&swap_cgroup_priority_inherit_lck);
> }
> +
> +void flush_swap_cgroup_priority_percpu_swapdev(struct swap_info_struct *si)
> +{
> + int cpu, i;
> + struct swap_info_struct **pcp_si;
> + struct swap_cgroup_priority *swap_priority;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(swap_priority,
> + &swap_cgroup_priority_list, link) {
> + for_each_possible_cpu(cpu) {
> + pcp_si = per_cpu_ptr(
> + swap_priority->pcpu_swapdev->si, cpu);
> +
> + for (i = 0; i < SWAP_NR_ORDERS; i++)
> + cmpxchg(&pcp_si[i], si, NULL);
> + }
> + }
> + rcu_read_unlock();
> +}
> +
> +bool alloc_percpu_swap_cluster(struct swap_info_struct *si)
> +{
> + si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> + if (!si->percpu_cluster)
> + return false;
> +
> + int cpu;
> + int i;
> + for_each_possible_cpu(cpu) {
> + struct percpu_cluster *cluster;
> +
> + cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> + for (i = 0; i < SWAP_NR_ORDERS; i++)
> + cluster->next[i] = SWAP_ENTRY_INVALID;
> + }
> +
> + return true;
> +}
> +
> +void free_percpu_swap_cluster(struct swap_info_struct *si)
> +{
> + free_percpu(si->percpu_cluster);
> + si->percpu_cluster = NULL;
> +}
> diff --git a/mm/swap_cgroup_priority.h b/mm/swap_cgroup_priority.h
> index 5d16b63d12e0..815822ebd0d1 100644
> --- a/mm/swap_cgroup_priority.h
> +++ b/mm/swap_cgroup_priority.h
> @@ -47,6 +47,22 @@ struct swap_cgroup_priority *inherit_swap_cgroup_priority(
> bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg, swp_entry_t *entry,
> int order);
> void delete_swap_cgroup_priority(struct mem_cgroup *memcg);
> +void flush_swap_cgroup_priority_percpu_swapdev(struct swap_info_struct *si);
> +
> +bool alloc_percpu_swap_cluster(struct swap_info_struct *si);
> +void free_percpu_swap_cluster(struct swap_info_struct *si);
> +static inline void write_percpu_swap_cluster_next(struct swap_info_struct *si,
> + int order,
> + unsigned int next)
> +{
> + this_cpu_write(si->percpu_cluster->next[order], next);
> +}
> +
> +static inline unsigned int read_percpu_swap_cluster_next(
> + struct swap_info_struct *si, int order)
> +{
> + return __this_cpu_read(si->percpu_cluster->next[order]);
> +}
> #else
> int swap_node(struct swap_info_struct *si);
> unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
> @@ -85,5 +101,28 @@ static inline bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
> static inline void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
> {
> }
> +static inline void flush_swap_cgroup_priority_percpu_swapdev(
> + struct swap_info_struct *si)
> +{
> +}
> +static inline bool alloc_percpu_swap_cluster(struct swap_info_struct *si)
> +{
> + return true;
> +}
> +static inline void free_percpu_swap_cluster(struct swap_info_struct *si)
> +{
> +}
> +static inline void write_percpu_swap_cluster_next(struct swap_info_struct *si,
> + int order,
> + unsigned int next)
> +{
> + return;
> +}
> +
> +static inline unsigned int read_percpu_swap_cluster_next(
> + struct swap_info_struct *si, int order)
> +{
> + return SWAP_ENTRY_INVALID;
> +}
> #endif
> #endif
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index bfd0532ad250..6a5ac9962e9f 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -817,12 +817,15 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
> out:
> relocate_cluster(si, ci);
> unlock_cluster(ci);
> +
> if (si->flags & SWP_SOLIDSTATE) {
> this_cpu_write(percpu_swap_cluster.offset[order], next);
Why not just remove `percpu_swap_cluster.offset` and share
si->percpu_cluster among all cgroups (including the root cgroup)?
Otherwise, if e.g. the root cgroup's per-CPU cluster and one cgroup's
per-CPU cluster end up pointing at the same cluster, they may contend on
allocations of different orders; even for the same order, performance may
suffer because multiple CPUs will race with each other.
It would also be easier to implement.
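I.e. the update in alloc_swap_scan_cluster() could then end up as just
something like this (untested, only to show what I mean):

	if (si->flags & SWP_SOLIDSTATE) {
		/* one per-device per-CPU cache, shared by every cgroup */
		this_cpu_write(si->percpu_cluster->next[order], next);
		this_cpu_write(percpu_swap_cluster.si[order], si);
	} else {
		si->global_cluster->next[order] = next;
	}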
> this_cpu_write(percpu_swap_cluster.si[order], si);
> + write_percpu_swap_cluster_next(si, order, next);
> } else {
> si->global_cluster->next[order] = next;
> }
> +
> return found;
> }
>
> @@ -892,26 +895,29 @@ unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
> if (order && !(si->flags & SWP_BLKDEV))
> return 0;
>
> - if (!(si->flags & SWP_SOLIDSTATE)) {
> + if (si->flags & SWP_SOLIDSTATE) {
> + offset = read_percpu_swap_cluster_next(si, order);
> + } else {
> /* Serialize HDD SWAP allocation for each device. */
> spin_lock(&si->global_cluster_lock);
> offset = si->global_cluster->next[order];
> - if (offset == SWAP_ENTRY_INVALID)
> - goto new_cluster;
> + }
>
> - ci = lock_cluster(si, offset);
> - /* Cluster could have been used by another order */
> - if (cluster_is_usable(ci, order)) {
> - if (cluster_is_empty(ci))
> - offset = cluster_offset(si, ci);
> - found = alloc_swap_scan_cluster(si, ci, offset,
> - order, usage);
> - } else {
> - unlock_cluster(ci);
> - }
> - if (found)
> - goto done;
> + if (offset == SWAP_ENTRY_INVALID)
> + goto new_cluster;
> +
> + ci = lock_cluster(si, offset);
> + /* Cluster could have been used by another order */
> + if (cluster_is_usable(ci, order)) {
> + if (cluster_is_empty(ci))
> + offset = cluster_offset(si, ci);
> + found = alloc_swap_scan_cluster(si, ci, offset,
> + order, usage);
> + } else {
> + unlock_cluster(ci);
> }
> + if (found)
> + goto done;
>
> new_cluster:
> ci = isolate_lock_cluster(si, &si->free_clusters);
> @@ -991,6 +997,7 @@ unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
> done:
> if (!(si->flags & SWP_SOLIDSTATE))
> spin_unlock(&si->global_cluster_lock);
> +
> return found;
> }
>
> @@ -2674,6 +2681,8 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si)
> for (i = 0; i < SWAP_NR_ORDERS; i++)
> cmpxchg(&pcp_si[i], si, NULL);
> }
> +
> + flush_swap_cgroup_priority_percpu_swapdev(si);
> }
>
>
> @@ -2802,6 +2811,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> arch_swap_invalidate_area(p->type);
> zswap_swapoff(p->type);
> mutex_unlock(&swapon_mutex);
> + free_percpu_swap_cluster(p);
> kfree(p->global_cluster);
> p->global_cluster = NULL;
> vfree(swap_map);
> @@ -2900,7 +2910,6 @@ static void swap_stop(struct seq_file *swap, void *v)
> mutex_unlock(&swapon_mutex);
> }
>
> -
> #ifndef CONFIG_SWAP_CGROUP_PRIORITY
> static int swap_show(struct seq_file *swap, void *v)
> {
> @@ -3239,7 +3248,10 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
> for (i = 0; i < nr_clusters; i++)
> spin_lock_init(&cluster_info[i].lock);
>
> - if (!(si->flags & SWP_SOLIDSTATE)) {
> + if (si->flags & SWP_SOLIDSTATE) {
> + if (!alloc_percpu_swap_cluster(si))
> + goto err_free;
> + } else {
> si->global_cluster = kmalloc(sizeof(*si->global_cluster),
> GFP_KERNEL);
> if (!si->global_cluster)
> @@ -3532,6 +3544,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> bad_swap_unlock_inode:
> inode_unlock(inode);
> bad_swap:
> + free_percpu_swap_cluster(si);
> kfree(si->global_cluster);
> si->global_cluster = NULL;
> inode = NULL;
> --
> 2.34.1
>
>