Message-ID: <CAMgjq7BAvsqWkGRZO6_u+6n-cUQ1nVHwMnerOs-s_RHkf90i2A@mail.gmail.com>
Date: Thu, 12 Jun 2025 19:16:56 +0800
From: Kairui Song <ryncsn@...il.com>
To: youngjun.park@....com
Cc: linux-mm@...ck.org, akpm@...ux-foundation.org, hannes@...xchg.org,
mhocko@...nel.org, roman.gushchin@...ux.dev, shakeel.butt@...ux.dev,
cgroups@...r.kernel.org, linux-kernel@...r.kernel.org,
shikemeng@...weicloud.com, nphamcs@...il.com, bhe@...hat.com,
baohua@...nel.org, chrisl@...nel.org, muchun.song@...ux.dev,
iamjoonsoo.kim@....com, taejoon.song@....com, gunho.lee@....com
Subject: Re: [RFC PATCH 2/2] mm: swap: apply per cgroup swap priority
mechanism on swap layer
On Thu, Jun 12, 2025 at 7:14 PM Kairui Song <ryncsn@...il.com> wrote:
>
> On Thu, Jun 12, 2025 at 6:43 PM <youngjun.park@....com> wrote:
> >
> > From: "youngjun.park" <youngjun.park@....com>
> >
>
> Hi, Youngjun,
>
> Thanks for sharing this series.
>
> > This patch implements swap device selection and swap on/off propagation
> > when a cgroup-specific swap priority is set.
> >
> > There is one workaround in this implementation, as follows: the
> > current per-CPU swap cluster enforces swap device selection based solely
> > on CPU locality, overriding the swap cgroup's configured priorities.
>
> I've been thinking about this: we could switch to a per-cgroup, per-CPU
> next-cluster selector. The problem with the current code is that the swap
> allocator is not designed with folios / cgroups in mind at all, so this
> is really ugly to implement, which is why I have the following two
> patches in the swap table series:
>
> https://lore.kernel.org/linux-mm/20250514201729.48420-18-ryncsn@gmail.com/
> https://lore.kernel.org/linux-mm/20250514201729.48420-22-ryncsn@gmail.com/
And BTW, this is not the only reason: these two are also quite critical
for getting rid of swap_cgroup_ctrl later, and maybe for switching to the
folio lock for more swap operations, etc.
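
Just to sketch what I mean by a per-cgroup, per-CPU next-cluster selector
(a rough idea only, none of these names exist in any tree): it would
basically mirror the global percpu_swap_cluster, but hang off the memcg,
so the fast path keeps per-CPU locality while still respecting that
cgroup's own device priorities:

/* Hypothetical sketch, not against any real tree. */
struct memcg_percpu_swap_cluster {
	local_lock_t lock;			/* protects the fields below */
	struct swap_info_struct *si[SWAP_NR_ORDERS];
	unsigned long offset[SWAP_NR_ORDERS];	/* likely next offsets */
};

/* e.g. in struct mem_cgroup:
 *	struct memcg_percpu_swap_cluster __percpu *swap_cluster;
 */
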
> The first one makes all swap allocations start with a folio, and the
> second one makes the allocator always folio aware. So you can know
> which cgroup is doing the allocation at any time inside the allocator
> (and it reduced the number of arguments, also improving performance :) )
>
> So the allocator can just use the cgroup's swap info (plist, percpu
> cluster) if available, and fall back to global locality in a very
> natural way.
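
Very roughly, the top of the allocation path could then read like this
(pseudo-C just to illustrate the fallback order; the function name and
alloc_from_cgroup_priority are made up, the other helpers are the ones
already used in this patch):

static bool swap_alloc_folio(struct folio *folio, swp_entry_t *entry,
			     int order)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	/* The allocator sees the folio, hence the cgroup, directly. */
	if (memcg && memcg->swap_priority &&
	    alloc_from_cgroup_priority(memcg, entry, order))
		return true;

	/* Otherwise fall back to global per-CPU locality, as today. */
	local_lock(&percpu_swap_cluster.lock);
	if (!swap_alloc_fast(entry, order))
		swap_alloc_slow(entry, order);
	local_unlock(&percpu_swap_cluster.lock);

	return !!entry->val;
}
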
>
>
> > Therefore, when a swap cgroup priority is assigned, we fall back to
> > using per-CPU clusters per swap device, similar to the previous behavior.
> >
> > A proper fix for this workaround will be evaluated in the next patch.
>
> Hmm, but this is already the last patch in the series?
>
> >
> > Signed-off-by: Youngjun park <youngjun.park@....com>
> > ---
> > include/linux/swap.h | 8 +++
> > mm/swap.h | 8 +++
> > mm/swap_cgroup_priority.c | 133 ++++++++++++++++++++++++++++++++++++++
> > mm/swapfile.c | 125 ++++++++++++++++++++++++-----------
> > 4 files changed, 238 insertions(+), 36 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 49b73911c1bd..d158b0d5c997 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -283,6 +283,13 @@ enum swap_cluster_flags {
> > #define SWAP_NR_ORDERS 1
> > #endif
> >
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > +struct percpu_cluster {
> > + local_lock_t lock; /* Protect the percpu_cluster above */
> > + unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> > +};
> > +#endif
> > +
> > /*
> > * We keep using same cluster for rotational device so IO will be sequential.
> > * The purpose is to optimize SWAP throughput on these device.
> > @@ -341,6 +348,7 @@ struct swap_info_struct {
> > struct list_head discard_clusters; /* discard clusters list */
> > #ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > int unique_id;
> > + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> > #endif
> > struct plist_node avail_lists[]; /*
> > * entries in swap_avail_heads, one
> > diff --git a/mm/swap.h b/mm/swap.h
> > index cd2649c632ed..cb6d653fe3f1 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -113,7 +113,15 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg);
> > void show_swap_device_unique_id(struct seq_file *m);
> > #else
> > static inline void delete_swap_cgroup_priority(struct mem_cgroup *memcg) {}
> > +static inline void activate_swap_cgroup_priority_pnode(struct swap_info_struct *swp, bool swapon) {}
> > +static inline void deactivate_swap_cgroup_priority_pnode(struct swap_info_struct *swp, bool swapoff){}
> > static inline void get_swap_unique_id(struct swap_info_struct *si) {}
> > +static inline bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
> > + swp_entry_t *entry, int order)
> > +{
> > + return false;
> > +}
> > +
> > #endif
> >
> > #else /* CONFIG_SWAP */
> > diff --git a/mm/swap_cgroup_priority.c b/mm/swap_cgroup_priority.c
> > index b3e20b676680..bb18cb251f60 100644
> > --- a/mm/swap_cgroup_priority.c
> > +++ b/mm/swap_cgroup_priority.c
> > @@ -54,6 +54,132 @@ static void get_swap_unique_id(struct swap_info_struct *si)
> > si->unique_id = atomic_add_return(1, &swap_unique_id_counter);
> > }
> >
> > +static bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
> > + swp_entry_t *entry, int order)
> > +{
> > + struct swap_cgroup_priority *swap_priority;
> > + struct swap_cgroup_priority_pnode *pnode, *next;
> > + unsigned long offset;
> > + int node;
> > +
> > + if (!memcg)
> > + return false;
> > +
> > + spin_lock(&swap_avail_lock);
> > +priority_check:
> > + swap_priority = memcg->swap_priority;
> > + if (!swap_priority) {
> > + spin_unlock(&swap_avail_lock);
> > + return false;
> > + }
> > +
> > + node = numa_node_id();
> > +start_over:
> > + plist_for_each_entry_safe(pnode, next, &swap_priority->plist[node],
> > + avail_lists[node]) {
> > + struct swap_info_struct *si = pnode->swap;
> > + plist_requeue(&pnode->avail_lists[node],
> > + &swap_priority->plist[node]);
> > + spin_unlock(&swap_avail_lock);
> > +
> > + if (get_swap_device_info(si)) {
> > + offset = cluster_alloc_swap_entry(si,
> > + order, SWAP_HAS_CACHE, true);
> > + put_swap_device(si);
> > + if (offset) {
> > + *entry = swp_entry(si->type, offset);
> > + return true;
> > + }
> > + if (order)
> > + return false;
> > + }
> > +
> > + spin_lock(&swap_avail_lock);
> > +
> > + /* swap_priority is remove or changed under us. */
> > + if (swap_priority != memcg->swap_priority)
> > + goto priority_check;
> > +
> > + if (plist_node_empty(&next->avail_lists[node]))
> > + goto start_over;
> > + }
> > + spin_unlock(&swap_avail_lock);
> > +
> > + return false;
> > +}
> > +
> > +/* add_to_avail_list (swapon / swapusage > 0) */
> > +static void activate_swap_cgroup_priority_pnode(struct swap_info_struct *swp,
> > + bool swapon)
> > +{
> > + struct swap_cgroup_priority *swap_priority;
> > + int i;
> > +
> > + list_for_each_entry(swap_priority, &swap_cgroup_priority_list, link) {
> > + struct swap_cgroup_priority_pnode *pnode
> > + = swap_priority->pnode[swp->type];
> > +
> > + if (swapon) {
> > + pnode->swap = swp;
> > + pnode->prio = swp->prio;
> > + }
> > +
> > + /* NUMA priority handling */
> > + for_each_node(i) {
> > + if (swapon) {
> > + if (swap_node(swp) == i) {
> > + plist_node_init(
> > + &pnode->avail_lists[i],
> > + 1);
> > + } else {
> > + plist_node_init(
> > + &pnode->avail_lists[i],
> > + -pnode->prio);
> > + }
> > + }
> > +
> > + plist_add(&pnode->avail_lists[i],
> > + &swap_priority->plist[i]);
> > + }
> > + }
> > +}
> > +
> > +/* del_from_avail_list (swapoff / swap usage <= 0) */
> > +static void deactivate_swap_cgroup_priority_pnode(struct swap_info_struct *swp,
> > + bool swapoff)
> > +{
> > + struct swap_cgroup_priority *swap_priority;
> > + int nid, i;
> > +
> > + list_for_each_entry(swap_priority, &swap_cgroup_priority_list, link) {
> > + struct swap_cgroup_priority_pnode *pnode;
> > +
> > + if (swapoff && swp->prio < 0) {
> > + /*
> > + * NUMA priority handling
> > + * mimic swapoff prio adjustment without plist
> > + */
> > + for (int i = 0; i < MAX_SWAPFILES; i++) {
> > + pnode = swap_priority->pnode[i];
> > + if (pnode->prio > swp->prio ||
> > + pnode->swap == swp)
> > + continue;
> > +
> > + pnode->prio++;
> > + for_each_node(nid) {
> > + if (pnode->avail_lists[nid].prio != 1)
> > + pnode->avail_lists[nid].prio--;
> > + }
> > + }
> > + }
> > +
> > + pnode = swap_priority->pnode[swp->type];
> > + for_each_node(i)
> > + plist_del(&pnode->avail_lists[i],
> > + &swap_priority->plist[i]);
> > + }
> > +}
> > +
> > int create_swap_cgroup_priority(struct mem_cgroup *memcg,
> > int unique[], int prio[], int nr)
> > {
> > @@ -183,6 +309,12 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
> > {
> > struct swap_cgroup_priority *swap_priority;
> >
> > + /*
> > + * XXX: Possible RCU wait? No. Cannot protect priority list addition.
> > + * swap_avail_lock gives protection.
> > + * Think about other object protection mechanism
> > + * might be solve it and better. (e.g object reference)
> > + */
> > spin_lock(&swap_avail_lock);
> > swap_priority = memcg->swap_priority;
> > if (!swap_priority) {
> > @@ -198,5 +330,6 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
> >
> > for (int i = 0; i < MAX_SWAPFILES; i++)
> > kvfree(swap_priority->pnode[i]);
> > +
> > kvfree(swap_priority);
> > }
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index f8e48dd2381e..28afe4ec0504 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -126,8 +126,12 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
> > .offset = { SWAP_ENTRY_INVALID },
> > .lock = INIT_LOCAL_LOCK(),
> > };
> > -/* TODO: better choice? */
> > +/* TODO: better arrangement */
> > #ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > +static bool get_swap_device_info(struct swap_info_struct *si);
> > +static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
> > + unsigned char usage, bool is_cgroup_priority);
> > +static int swap_node(struct swap_info_struct *si);
> > #include "swap_cgroup_priority.c"
> > #endif
> >
> > @@ -776,7 +780,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
> > struct swap_cluster_info *ci,
> > unsigned long offset,
> > unsigned int order,
> > - unsigned char usage)
> > + unsigned char usage,
> > + bool is_cgroup_priority)
> > {
> > unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
> > unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
> > @@ -820,12 +825,19 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
> > out:
> > relocate_cluster(si, ci);
> > unlock_cluster(ci);
> > +
> > if (si->flags & SWP_SOLIDSTATE) {
> > - this_cpu_write(percpu_swap_cluster.offset[order], next);
> > - this_cpu_write(percpu_swap_cluster.si[order], si);
> > - } else {
> > + if (!is_cgroup_priority) {
> > + this_cpu_write(percpu_swap_cluster.offset[order], next);
> > + this_cpu_write(percpu_swap_cluster.si[order], si);
> > + } else {
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > + __this_cpu_write(si->percpu_cluster->next[order], next);
> > +#endif
> > + }
> > + } else
> > si->global_cluster->next[order] = next;
> > - }
> > +
> > return found;
> > }
> >
> > @@ -883,7 +895,7 @@ static void swap_reclaim_work(struct work_struct *work)
> > * cluster for current CPU too.
> > */
> > static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
> > - unsigned char usage)
> > + unsigned char usage, bool is_cgroup_priority)
> > {
> > struct swap_cluster_info *ci;
> > unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
> > @@ -895,32 +907,38 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> > if (order && !(si->flags & SWP_BLKDEV))
> > return 0;
> >
> > - if (!(si->flags & SWP_SOLIDSTATE)) {
> > + if (si->flags & SWP_SOLIDSTATE) {
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > + local_lock(&si->percpu_cluster->lock);
> > + offset = __this_cpu_read(si->percpu_cluster->next[order]);
> > +#endif
> > + } else {
> > /* Serialize HDD SWAP allocation for each device. */
> > spin_lock(&si->global_cluster_lock);
> > offset = si->global_cluster->next[order];
> > - if (offset == SWAP_ENTRY_INVALID)
> > - goto new_cluster;
> > + }
> >
> > - ci = lock_cluster(si, offset);
> > - /* Cluster could have been used by another order */
> > - if (cluster_is_usable(ci, order)) {
> > - if (cluster_is_empty(ci))
> > - offset = cluster_offset(si, ci);
> > - found = alloc_swap_scan_cluster(si, ci, offset,
> > - order, usage);
> > - } else {
> > - unlock_cluster(ci);
> > - }
> > - if (found)
> > - goto done;
> > + if (offset == SWAP_ENTRY_INVALID)
> > + goto new_cluster;
> > +
> > + ci = lock_cluster(si, offset);
> > + /* Cluster could have been used by another order */
> > + if (cluster_is_usable(ci, order)) {
> > + if (cluster_is_empty(ci))
> > + offset = cluster_offset(si, ci);
> > + found = alloc_swap_scan_cluster(si, ci, offset,
> > + order, usage, is_cgroup_priority);
> > + } else {
> > + unlock_cluster(ci);
> > }
> > + if (found)
> > + goto done;
> >
> > new_cluster:
> > ci = isolate_lock_cluster(si, &si->free_clusters);
> > if (ci) {
> > found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> > - order, usage);
> > + order, usage, is_cgroup_priority);
> > if (found)
> > goto done;
> > }
> > @@ -934,7 +952,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> >
> > while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
> > found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> > - order, usage);
> > + order, usage, is_cgroup_priority);
> > if (found)
> > goto done;
> > /* Clusters failed to allocate are moved to frag_clusters */
> > @@ -952,7 +970,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> > * reclaimable (eg. lazy-freed swap cache) slots.
> > */
> > found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> > - order, usage);
> > + order, usage, is_cgroup_priority);
> > if (found)
> > goto done;
> > frags++;
> > @@ -979,21 +997,27 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
> > while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
> > atomic_long_dec(&si->frag_cluster_nr[o]);
> > found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> > - 0, usage);
> > + 0, usage, is_cgroup_priority);
> > if (found)
> > goto done;
> > }
> >
> > while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
> > found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
> > - 0, usage);
> > + 0, usage, is_cgroup_priority);
> > if (found)
> > goto done;
> > }
> > }
> > done:
> > - if (!(si->flags & SWP_SOLIDSTATE))
> > + if (si->flags & SWP_SOLIDSTATE) {
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > + local_unlock(&si->percpu_cluster->lock);
> > +#endif
> > + } else {
> > spin_unlock(&si->global_cluster_lock);
> > + }
> > +
> > return found;
> > }
> >
> > @@ -1032,6 +1056,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
> > for_each_node(nid)
> > plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
> >
> > + deactivate_swap_cgroup_priority_pnode(si, swapoff);
> > skip:
> > spin_unlock(&swap_avail_lock);
> > }
> > @@ -1075,6 +1100,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
> > for_each_node(nid)
> > plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
> >
> > + activate_swap_cgroup_priority_pnode(si, swapon);
> > skip:
> > spin_unlock(&swap_avail_lock);
> > }
> > @@ -1200,7 +1226,8 @@ static bool swap_alloc_fast(swp_entry_t *entry,
> > if (cluster_is_usable(ci, order)) {
> > if (cluster_is_empty(ci))
> > offset = cluster_offset(si, ci);
> > - found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
> > + found = alloc_swap_scan_cluster(si, ci, offset, order,
> > + SWAP_HAS_CACHE, false);
> > if (found)
> > *entry = swp_entry(si->type, found);
> > } else {
> > @@ -1227,7 +1254,7 @@ static bool swap_alloc_slow(swp_entry_t *entry,
> > plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
> > spin_unlock(&swap_avail_lock);
> > if (get_swap_device_info(si)) {
> > - offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
> > + offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE, false);
> > put_swap_device(si);
> > if (offset) {
> > *entry = swp_entry(si->type, offset);
> > @@ -1294,10 +1321,12 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
> > }
> > }
> >
> > - local_lock(&percpu_swap_cluster.lock);
> > - if (!swap_alloc_fast(&entry, order))
> > - swap_alloc_slow(&entry, order);
> > - local_unlock(&percpu_swap_cluster.lock);
> > + if (!swap_alloc_cgroup_priority(folio_memcg(folio), &entry, order)) {
> > + local_lock(&percpu_swap_cluster.lock);
> > + if (!swap_alloc_fast(&entry, order))
> > + swap_alloc_slow(&entry, order);
> > + local_unlock(&percpu_swap_cluster.lock);
> > + }
> >
> > /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
> > if (mem_cgroup_try_charge_swap(folio, entry))
> > @@ -1870,7 +1899,7 @@ swp_entry_t get_swap_page_of_type(int type)
> > /* This is called for allocating swap entry, not cache */
> > if (get_swap_device_info(si)) {
> > if (si->flags & SWP_WRITEOK) {
> > - offset = cluster_alloc_swap_entry(si, 0, 1);
> > + offset = cluster_alloc_swap_entry(si, 0, 1, false);
> > if (offset) {
> > entry = swp_entry(si->type, offset);
> > atomic_long_dec(&nr_swap_pages);
> > @@ -2800,6 +2829,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> > arch_swap_invalidate_area(p->type);
> > zswap_swapoff(p->type);
> > mutex_unlock(&swapon_mutex);
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > + free_percpu(p->percpu_cluster);
> > + p->percpu_cluster = NULL;
> > +#endif
> > kfree(p->global_cluster);
> > p->global_cluster = NULL;
> > vfree(swap_map);
> > @@ -3207,7 +3240,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
> > for (i = 0; i < nr_clusters; i++)
> > spin_lock_init(&cluster_info[i].lock);
> >
> > - if (!(si->flags & SWP_SOLIDSTATE)) {
> > + if (si->flags & SWP_SOLIDSTATE) {
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > + si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> > + if (!si->percpu_cluster)
> > + goto err_free;
> > +
> > + int cpu;
> > + for_each_possible_cpu(cpu) {
> > + struct percpu_cluster *cluster;
> > +
> > + cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> > + for (i = 0; i < SWAP_NR_ORDERS; i++)
> > + cluster->next[i] = SWAP_ENTRY_INVALID;
> > + local_lock_init(&cluster->lock);
> > + }
> > +#endif
> > + } else {
> > si->global_cluster = kmalloc(sizeof(*si->global_cluster),
> > GFP_KERNEL);
> > if (!si->global_cluster)
> > @@ -3495,6 +3544,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> > bad_swap_unlock_inode:
> > inode_unlock(inode);
> > bad_swap:
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > + free_percpu(si->percpu_cluster);
> > + si->percpu_cluster = NULL;
> > +#endif
> > kfree(si->global_cluster);
> > si->global_cluster = NULL;
> > inode = NULL;
> > --
> > 2.34.1
> >
> >