linux-kernel - Re: [PATCH v2 1/1] swap: shmem: remove SWAP_MAP

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJD7tkaFv_KmF4gM=wb_Rwi7S1Dt4yy+TU=TyMd1R=gx=3eWuA@mail.gmail.com>
Date: Tue, 1 Oct 2024 18:33:12 -0700
From: Yosry Ahmed <yosryahmed@...gle.com>
To: Nhat Pham <nphamcs@...il.com>
Cc: akpm@...ux-foundation.org, hannes@...xchg.org, hughd@...gle.com, 
	shakeel.butt@...ux.dev, ryan.roberts@....com, ying.huang@...el.com, 
	chrisl@...nel.org, david@...hat.com, kasong@...cent.com, willy@...radead.org, 
	viro@...iv.linux.org.uk, baohua@...nel.org, chengming.zhou@...ux.dev, 
	v-songbaohua@...o.com, linux-mm@...ck.org, kernel-team@...a.com, 
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/1] swap: shmem: remove SWAP_MAP_SHMEM

On Tue, Oct 1, 2024 at 6:20 PM Nhat Pham <nphamcs@...il.com> wrote:
>
> The SWAP_MAP_SHMEM state was introduced in the commit aaa468653b4a
> ("swap_info: note SWAP_MAP_SHMEM"), to quickly determine if a swap entry
> belongs to shmem during swapoff.
>
> However, swapoff has since been rewritten in the commit b56a2d8af914
> ("mm: rid swapoff of quadratic complexity"). Now having swap count ==
> SWAP_MAP_SHMEM value is basically the same as having swap count == 1,
> and swap_shmem_alloc() behaves analogously to swap_duplicate(). The only
> difference of note is that swap_shmem_alloc() does not check for
> -ENOMEM returned from __swap_duplicate(), but it is OK because shmem
> never re-duplicates any swap entry it owns. This will stil be safe if we
> use (batched) swap_duplicate() instead.
>
> This commit adds swap_duplicate_nr(), the batched variant of
> swap_duplicate(), and removes the SWAP_MAP_SHMEM state and the
> associated swap_shmem_alloc() helper to simplify the state machine (both
> mentally and in terms of actual code). We will also have an extra
> state/special value that can be repurposed (for swap entries that never
> gets re-duplicated).
>
> Signed-off-by: Nhat Pham <nphamcs@...il.com>
> ---
>  include/linux/swap.h | 16 ++++++++--------
>  mm/shmem.c           |  2 +-
>  mm/swapfile.c        | 41 +++++++++++++++++++++--------------------
>  3 files changed, 30 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ca533b478c21..017f3c03ff7a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -232,7 +232,6 @@ enum {
>  /* Special value in first swap_map */
>  #define SWAP_MAP_MAX   0x3e    /* Max count */
>  #define SWAP_MAP_BAD   0x3f    /* Note page is bad */
> -#define SWAP_MAP_SHMEM 0xbf    /* Owned by shmem/tmpfs */
>
>  /* Special value in each swap_map continuation */
>  #define SWAP_CONT_MAX  0x7f    /* Max count */
> @@ -482,8 +481,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry);
>  extern swp_entry_t get_swap_page_of_type(int);
>  extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
>  extern int add_swap_count_continuation(swp_entry_t, gfp_t);
> -extern void swap_shmem_alloc(swp_entry_t, int);
> -extern int swap_duplicate(swp_entry_t);
> +extern int swap_duplicate_nr(swp_entry_t, int);
>  extern int swapcache_prepare(swp_entry_t entry, int nr);
>  extern void swap_free_nr(swp_entry_t entry, int nr_pages);
>  extern void swapcache_free_entries(swp_entry_t *entries, int n);
> @@ -549,11 +547,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
>         return 0;
>  }
>
> -static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
> -{
> -}
> -
> -static inline int swap_duplicate(swp_entry_t swp)
> +static inline int swap_duplicate_nr(swp_entry_t swp, int nr)
>  {
>         return 0;
>  }
> @@ -606,6 +600,12 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
>  }
>  #endif /* CONFIG_SWAP */
>
> +static inline int swap_duplicate(swp_entry_t entry)
> +{
> +       return swap_duplicate_nr(entry, 1);
> +}
> +
> +

Nit: extra blank line.

>  static inline void free_swap_and_cache(swp_entry_t entry)
>  {
>         free_swap_and_cache_nr(entry, 1);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 0613421e09e7..e3f72f99be32 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1561,7 +1561,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
>                         __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
>                         NULL) == 0) {
>                 shmem_recalc_inode(inode, 0, nr_pages);
> -               swap_shmem_alloc(swap, nr_pages);
> +               swap_duplicate_nr(swap, nr_pages);
>                 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
>
>                 mutex_unlock(&shmem_swaplist_mutex);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 0cded32414a1..9bb94e618914 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1381,12 +1381,6 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
>         if (usage == SWAP_HAS_CACHE) {
>                 VM_BUG_ON(!has_cache);
>                 has_cache = 0;
> -       } else if (count == SWAP_MAP_SHMEM) {
> -               /*
> -                * Or we could insist on shmem.c using a special
> -                * swap_shmem_free() and free_shmem_swap_and_cache()...
> -                */
> -               count = 0;
>         } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
>                 if (count == COUNT_CONTINUED) {
>                         if (swap_count_continued(si, offset, count))
> @@ -3626,7 +3620,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>
>         offset = swp_offset(entry);
>         VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
> -       VM_WARN_ON(usage == 1 && nr > 1);
>         ci = lock_cluster_or_swap_info(si, offset);
>
>         err = 0;
> @@ -3652,6 +3645,13 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>                                 err = -EEXIST;
>                 } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
>                         err = -EINVAL;
> +               } else {
> +                       /*
> +                        * The only swap_duplicate_nr() caller that passes nr > 1 is shmem,
> +                        * who never re-duplicates any swap entry it owns. So this should

nit: I think "which" is the right word here, but I am not a native speaker :)

> +                        * not happen.
> +                        */
> +                       VM_WARN_ON(nr > 1 && (count & ~COUNT_CONTINUED) == SWAP_MAP_MAX);

Why not return an error in this case? I think we should add recovery
for bugs when it's possible and simple, which I believe is the case
here.

In shmem_writepage() we can add a WARN if swap_duplicate_nr() fails,
or propagate an error to the caller as well (perhaps this belongs in a
separate patch that does this for swap_shmem_alloc() first).

Sorry if I am being paranoid here, please let me know if this is the case.

>                 }
>
>                 if (err)
> @@ -3686,27 +3686,28 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
>         return err;
>  }
>
> -/*
> - * Help swapoff by noting that swap entry belongs to shmem/tmpfs
> - * (in which case its reference count is never incremented).
> - */
> -void swap_shmem_alloc(swp_entry_t entry, int nr)
> -{
> -       __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
> -}
> -
> -/*
> - * Increase reference count of swap entry by 1.
> +/**
> + * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries
> + *                       by 1.

Can we avoid the line break by using "refcount" instead of "reference count"?

> + *
> + * @entry: first swap entry from which we want to increase the refcount.
> + * @nr: Number of entries in range.
> + *
>   * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
>   * but could not be atomically allocated.  Returns 0, just as if it succeeded,
>   * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
>   * might occur if a page table entry has got corrupted.
> + *
> + * Note that we are currently not handling the case where nr > 1 and we need to
> + * add swap count continuation. This is OK, because no such user exists - shmem
> + * is the only user that can pass nr > 1, and it never re-duplicates any swap
> + * entry it owns.

Do we need this comment when we have the WARN + comment in __swap_duplicate()?

>   */
> -int swap_duplicate(swp_entry_t entry)
> +int swap_duplicate_nr(swp_entry_t entry, int nr)
>  {
>         int err = 0;
>
> -       while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
> +       while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM)
>                 err = add_swap_count_continuation(entry, GFP_ATOMIC);
>         return err;
>  }
> --
> 2.43.5