Message-ID: <vj7utcgrseaot6ktpbwgshthmjza7w7vk2glede273tza7yfi6@cyxppxoney5u>
Date: Sun, 7 Sep 2025 14:55:43 +0200
From: Klara Modin <klarasmodin@...il.com>
To: Kairui Song <kasong@...cent.com>
Cc: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
Matthew Wilcox <willy@...radead.org>, Hugh Dickins <hughd@...gle.com>, Chris Li <chrisl@...nel.org>,
Barry Song <baohua@...nel.org>, Baoquan He <bhe@...hat.com>, Nhat Pham <nphamcs@...il.com>,
Kemeng Shi <shikemeng@...weicloud.com>, Baolin Wang <baolin.wang@...ux.alibaba.com>,
Ying Huang <ying.huang@...ux.alibaba.com>, Johannes Weiner <hannes@...xchg.org>,
David Hildenbrand <david@...hat.com>, Yosry Ahmed <yosryahmed@...gle.com>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>, Zi Yan <ziy@...dia.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 11/15] mm, swap: use the swap table for the swap cache
and switch API
On 2025-09-06 03:13:53 +0800, Kairui Song wrote:
> From: Kairui Song <kasong@...cent.com>
>
> Introduce the basic swap table infrastructure, which for now is just a
> fixed-size flat array inside each swap cluster, with access wrappers.
>
> Each cluster contains a swap table of 512 entries. Each table entry is
> an opaque atomic long. It can be one of three types: a shadow type
> (XA_VALUE), a folio type (pointer), or NULL.
>
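As an aside, for readers following along: a minimal sketch of how such
an opaque atomic long can encode the three types, assuming shadows are
stored as XA_VALUE entries (low bit set) and folio pointers are
word-aligned. The helper names are illustrative, not necessarily the
exact ones in mm/swap_table.h:

#include <linux/xarray.h>	/* for xa_is_value() */

static inline bool swp_tb_is_null(unsigned long swp_tb)
{
	return !swp_tb;
}

static inline bool swp_tb_is_shadow(unsigned long swp_tb)
{
	/* XA_VALUE entries have the low bit set; aligned pointers do not */
	return xa_is_value((void *)swp_tb);
}

static inline bool swp_tb_is_folio(unsigned long swp_tb)
{
	return swp_tb && !xa_is_value((void *)swp_tb);
}
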
> In this first step, it only supports storing a folio or a shadow, and
> it is a drop-in replacement for the current swap cache. Convert all
> swap cache users to the new set of APIs. Chris Li has been suggesting
> a new infrastructure for the swap cache for better performance, and
> that idea combines well with the swap table as the new backing
> structure. The lock contention range is now reduced to 2M clusters
> (512 entries * 4K pages), much smaller than the 64M address_space
> (2^14 pages), and the multiple address_space design can be dropped.
>
> All the internal work is done with the swap_cache_get_* helpers. Swap
> cache lookup is still lock-less as before, and the helpers' contexts
> are the same as the original swap cache helpers'. They still require
> a pin on the swap device to prevent the backing data from being freed.
>
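A hedged sketch of what that contract looks like at a call site,
pinning the device with get_swap_device()/put_swap_device(); the
wrapper name is made up, and I'm assuming the lookup grabs a folio
reference on success:

static struct folio *lookup_swap_cache_pinned(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct folio *folio = NULL;

	si = get_swap_device(entry);	/* pin, so backing data stays */
	if (si) {
		folio = swap_cache_get_folio(entry);	/* lock-less */
		put_swap_device(si);
	}
	return folio;
}
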
> Swap cache updates are now protected by the swap cluster lock
> instead of the Xarray lock. This is mostly handled internally, but the
> new __swap_cache_* helpers require the caller to lock the cluster. So,
> a few new cluster access and locking helpers are also introduced.
>
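So an update under the cluster lock looks roughly like this (sketch
only, using the declarations quoted further down):

	struct swap_cluster_info *ci;
	void *shadow = NULL;	/* shadow to leave behind, if any */

	ci = swap_cluster_lock_by_folio(folio);
	__swap_cache_del_folio(ci, folio, folio->swap, shadow);
	swap_cluster_unlock(ci);
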
> A fully cluster-based unified swap table can be implemented on top
> of this to take care of all count tracking and synchronization work,
> with dynamic allocation. It should reduce memory usage while improving
> performance even further.
>
> Co-developed-by: Chris Li <chrisl@...nel.org>
> Signed-off-by: Chris Li <chrisl@...nel.org>
> Signed-off-by: Kairui Song <kasong@...cent.com>
> ---
> MAINTAINERS | 1 +
> include/linux/swap.h | 2 -
> mm/huge_memory.c | 13 +-
> mm/migrate.c | 19 ++-
> mm/shmem.c | 8 +-
> mm/swap.h | 157 +++++++++++++++++------
> mm/swap_state.c | 289 +++++++++++++++++++------------------------
> mm/swap_table.h | 97 +++++++++++++++
> mm/swapfile.c | 100 +++++++++++----
> mm/vmscan.c | 20 ++-
> 10 files changed, 458 insertions(+), 248 deletions(-)
> create mode 100644 mm/swap_table.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1c8292c0318d..de402ca91a80 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -16226,6 +16226,7 @@ F: include/linux/swapops.h
> F: mm/page_io.c
> F: mm/swap.c
> F: mm/swap.h
> +F: mm/swap_table.h
> F: mm/swap_state.c
> F: mm/swapfile.c
>
...
> diff --git a/mm/swap.h b/mm/swap.h
> index a139c9131244..bf4e54f1f6b6 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -2,6 +2,7 @@
> #ifndef _MM_SWAP_H
> #define _MM_SWAP_H
>
> +#include <linux/atomic.h> /* for atomic_long_t */
> struct mempolicy;
> struct swap_iocb;
>
> @@ -35,6 +36,7 @@ struct swap_cluster_info {
> u16 count;
> u8 flags;
> u8 order;
> + atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */
> struct list_head list;
> };
>
> @@ -55,6 +57,11 @@ enum swap_cluster_flags {
> #include <linux/swapops.h> /* for swp_offset */
Now that swp_offset() is used in folio_index(), should this perhaps also be
included for !CONFIG_SWAP?
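Something like this is what I have in mind (sketch only, assuming the
include currently sits under #ifdef CONFIG_SWAP):

#include <linux/swapops.h>	/* for swp_offset(), also used by folio_index() */

i.e. hoisted above the #ifdef so folio_index() builds either way.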
> #include <linux/blk_types.h> /* for bio_end_io_t */
>
> +static inline unsigned int swp_cluster_offset(swp_entry_t entry)
> +{
> + return swp_offset(entry) % SWAPFILE_CLUSTER;
> +}
> +
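For illustration, this is how the new helper would index a slot in the
per-cluster table (sketch; si and entry assumed valid and the device
pinned):

	struct swap_cluster_info *ci;
	atomic_long_t *slot;

	ci = __swap_offset_to_cluster(si, swp_offset(entry));
	slot = &ci->table[swp_cluster_offset(entry)];
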
> /*
> * Callers of all helpers below must ensure the entry, type, or offset is
> * valid, and protect the swap device with a reference count or locks.
> @@ -81,6 +88,25 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster(
> return &si->cluster_info[offset / SWAPFILE_CLUSTER];
> }
>
> +static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry)
> +{
> + return __swap_offset_to_cluster(__swap_entry_to_info(entry),
> + swp_offset(entry));
> +}
> +
> +static __always_inline struct swap_cluster_info *__swap_cluster_lock(
> + struct swap_info_struct *si, unsigned long offset, bool irq)
> +{
> + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
> +
> + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
> + if (irq)
> + spin_lock_irq(&ci->lock);
> + else
> + spin_lock(&ci->lock);
> + return ci;
> +}
> +
> /**
> * swap_cluster_lock - Lock and return the swap cluster of given offset.
> * @si: swap device the cluster belongs to.
> @@ -92,11 +118,48 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster(
> static inline struct swap_cluster_info *swap_cluster_lock(
> struct swap_info_struct *si, unsigned long offset)
> {
> - struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset);
> + return __swap_cluster_lock(si, offset, false);
> +}
>
> - VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */
> - spin_lock(&ci->lock);
> - return ci;
> +static inline struct swap_cluster_info *__swap_cluster_lock_by_folio(
> + const struct folio *folio, bool irq)
> +{
> + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
> + return __swap_cluster_lock(__swap_entry_to_info(folio->swap),
> + swp_offset(folio->swap), irq);
> +}
> +
> +/**
> + * swap_cluster_lock_by_folio - Locks the cluster that holds a folio's entries.
> + * @folio: The folio.
> + *
> + * This locks the swap cluster that contains a folio's swap entries. The
> + * swap entries of a folio are always in a single cluster, and a locked
> + * swap cache folio is enough to stabilize the entries and the swap device.
> + *
> + * Context: Caller must ensure the folio is locked and in the swap cache.
> + * Return: Pointer to the swap cluster.
> + */
> +static inline struct swap_cluster_info *swap_cluster_lock_by_folio(
> + const struct folio *folio)
> +{
> + return __swap_cluster_lock_by_folio(folio, false);
> +}
> +
> +/**
> + * swap_cluster_lock_by_folio_irq - Locks the cluster that holds a folio's entries.
> + * @folio: The folio.
> + *
> + * Same as swap_cluster_lock_by_folio but also disables IRQs.
> + *
> + * Context: Caller must ensure the folio is locked and in the swap cache.
> + * Return: Pointer to the swap cluster.
> + */
> +static inline struct swap_cluster_info *swap_cluster_lock_by_folio_irq(
> + const struct folio *folio)
> +{
> + return __swap_cluster_lock_by_folio(folio, true);
> }
>
> static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
> @@ -104,6 +167,11 @@ static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
> spin_unlock(&ci->lock);
> }
>
> +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
> +{
> + spin_unlock_irq(&ci->lock);
> +}
> +
> /* linux/mm/page_io.c */
> int sio_pool_init(void);
> struct swap_iocb;
> @@ -123,10 +191,11 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> #define SWAP_ADDRESS_SPACE_SHIFT 14
> #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
> #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1)
> -extern struct address_space *swapper_spaces[];
> -#define swap_address_space(entry) \
> - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
> - >> SWAP_ADDRESS_SPACE_SHIFT])
> +extern struct address_space swap_space;
> +static inline struct address_space *swap_address_space(swp_entry_t entry)
> +{
> + return &swap_space;
> +}
>
> /*
> * Return the swap device position of the swap entry.
> @@ -136,15 +205,6 @@ static inline loff_t swap_dev_pos(swp_entry_t entry)
> return ((loff_t)swp_offset(entry)) << PAGE_SHIFT;
> }
>
> -/*
> - * Return the swap cache index of the swap entry.
> - */
> -static inline pgoff_t swap_cache_index(swp_entry_t entry)
> -{
> - BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
> - return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
> -}
> -
> /**
> * folio_matches_swap_entry - Check if a folio matches a given swap entry.
> * @folio: The folio.
> @@ -177,16 +237,15 @@ static inline bool folio_matches_swap_entry(const struct folio *folio,
> */
> struct folio *swap_cache_get_folio(swp_entry_t entry);
> void *swap_cache_get_shadow(swp_entry_t entry);
> -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
> - gfp_t gfp, void **shadow);
> +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow);
> void swap_cache_del_folio(struct folio *folio);
> -void __swap_cache_del_folio(struct folio *folio,
> - swp_entry_t entry, void *shadow);
> -void __swap_cache_replace_folio(struct address_space *address_space,
> - swp_entry_t entry,
> - struct folio *old, struct folio *new);
> -void swap_cache_clear_shadow(int type, unsigned long begin,
> - unsigned long end);
> +/* Below helpers require the caller to lock and pass in the swap cluster. */
> +void __swap_cache_del_folio(struct swap_cluster_info *ci,
> + struct folio *folio, swp_entry_t entry, void *shadow);
> +void __swap_cache_replace_folio(struct swap_cluster_info *ci,
> + swp_entry_t entry, struct folio *old,
> + struct folio *new);
> +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents);
>
> void show_swap_cache_info(void);
> void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
> @@ -254,6 +313,32 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
>
> #else /* CONFIG_SWAP */
> struct swap_iocb;
> +static inline struct swap_cluster_info *swap_cluster_lock(
> + struct swap_info_struct *si, pgoff_t offset)
> +{
> + return NULL;
> +}
> +
> +static inline struct swap_cluster_info *swap_cluster_lock_by_folio(
> + const struct folio *folio)
> +{
> + return NULL;
> +}
> +
> +static inline struct swap_cluster_info *swap_cluster_lock_by_folio_irq(
> + const struct folio *folio)
> +{
> + return NULL;
> +}
> +
> +static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
> +{
> +}
> +
> +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci)
> +{
> +}
> +
> static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry)
> {
> return NULL;
> @@ -271,11 +356,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
> return NULL;
> }
>
> -static inline pgoff_t swap_cache_index(swp_entry_t entry)
> -{
> - return 0;
> -}
> -
> static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry)
> {
> return false;
> @@ -322,17 +402,22 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry)
> return NULL;
> }
>
> -static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio,
> - gfp_t gfp, void **shadow)
> +static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow)
> {
> - return -EINVAL;
> }
>
> static inline void swap_cache_del_folio(struct folio *folio)
> {
> }
>
> -static inline void __swap_cache_del_folio(swp_entry_t entry, struct folio *folio, void *shadow)
> +static inline void __swap_cache_del_folio(struct swap_cluster_info *ci,
> + struct folio *folio, swp_entry_t entry, void *shadow)
> +{
> +}
> +
> +static inline void __swap_cache_replace_folio(
> + struct swap_cluster_info *ci, swp_entry_t entry,
> + struct folio *old, struct folio *new)
> {
> }
>
> @@ -367,7 +452,7 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
> static inline pgoff_t folio_index(struct folio *folio)
> {
> if (unlikely(folio_test_swapcache(folio)))
> - return swap_cache_index(folio->swap);
> + return swp_offset(folio->swap);
This is outside CONFIG_SWAP, so swp_offset() must be visible even for
!CONFIG_SWAP builds, hence my include question above.
> return folio->index;
> }
...
Regards,
Klara Modin