linux-kernel - Re: [PATCH v5 5/9] mm: swap: skip slot cache on freeing for mTHP

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240803091118.84274-1-21cnbao@gmail.com>
Date: Sat,  3 Aug 2024 21:11:18 +1200
From: Barry Song <21cnbao@...il.com>
To: chrisl@...nel.org
Cc: akpm@...ux-foundation.org,
	baohua@...nel.org,
	hughd@...gle.com,
	kaleshsingh@...gle.com,
	kasong@...cent.com,
	linux-kernel@...r.kernel.org,
	linux-mm@...ck.org,
	ryan.roberts@....com,
	ying.huang@...el.com,
	Barry Song <v-songbaohua@...o.com>
Subject: Re: [PATCH v5 5/9] mm: swap: skip slot cache on freeing for mTHP

On Wed, Jul 31, 2024 at 6:49 PM <chrisl@...nel.org> wrote:
>
> From: Kairui Song <kasong@...cent.com>
>
> Currently when we are freeing mTHP folios from swap cache, we free
> then one by one and put each entry into swap slot cache. Slot
> cache is designed to reduce the overhead by batching the freeing,
> but mTHP swap entries are already continuous so they can be batch
> freed without it already, it saves litle overhead, or even increase
> overhead for larger mTHP.
>
> What's more, mTHP entries could stay in swap cache for a while.
> Contiguous swap entry is an rather rare resource so releasing them
> directly can help improve mTHP allocation success rate when under
> pressure.
>
> Signed-off-by: Kairui Song <kasong@...cent.com>

Acked-by: Barry Song <baohua@...nel.org>

I believe this is the right direction to take. Currently, entries are released
one by one, even when they are contiguous in the swap file(those nr_pages
entries are definitely in the same cluster and same si), leading to numerous
lock and unlock operations.
This approach provides batched support.

free_swap_and_cache_nr() has the same issue, so I drafted a patch based on
your code. I wonder if you can also help test and review before I send it
officially:

>From 4bed5c08bc0f7769ee2849812acdad70c4e32ead Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@...o.com>
Date: Sat, 3 Aug 2024 20:21:14 +1200
Subject: [PATCH RFC] mm: attempt to batch free swap entries for zap_pte_range()

Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swaps.
Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/

Signed-off-by: Barry Song <v-songbaohua@...o.com>
---
 mm/swapfile.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..9def6dba8d26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
 	return true;
 }
 
+static bool swap_is_last_map(struct swap_info_struct *si,
+			      unsigned long offset, int nr_pages,
+			      bool *any_only_cache)
+{
+	unsigned char *map = si->swap_map + offset;
+	unsigned char *map_end = map + nr_pages;
+	bool cached = false;
+
+	do {
+		if ((*map & ~SWAP_HAS_CACHE) != 1)
+			return false;
+		if (*map & SWAP_HAS_CACHE)
+			cached = true;
+	} while (++map < map_end);
+
+	*any_only_cache = cached;
+	return true;
+}
+
 /*
  * returns number of pages in the folio that backs the swap entry. If positive,
  * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1808,6 +1827,29 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (WARN_ON(end_offset > si->max))
 		goto out;
 
+	if (nr > 1) {
+		struct swap_cluster_info *ci;
+		bool batched_free;
+		int i;
+
+		ci = lock_cluster_or_swap_info(si, start_offset);
+		if ((batched_free = swap_is_last_map(si, start_offset, nr, &any_only_cache))) {
+			for (i = 0; i < nr; i++)
+				WRITE_ONCE(si->swap_map[start_offset + i], SWAP_HAS_CACHE);
+		}
+		unlock_cluster_or_swap_info(si, ci);
+
+		if (batched_free) {
+			spin_lock(&si->lock);
+			pr_err("%s offset:%lx nr:%lx\n", __func__,start_offset, nr);
+			swap_entry_range_free(si, entry, nr);
+			spin_unlock(&si->lock);
+			if (any_only_cache)
+				goto reclaim;
+			goto out;
+		}
+	}
+
 	/*
 	 * First free all entries in the range.
 	 */
@@ -1828,6 +1870,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (!any_only_cache)
 		goto out;
 
+reclaim:
 	/*
 	 * Now go back over the range trying to reclaim the swap cache. This is
 	 * more efficient for large folios because we will only try to reclaim
-- 
2.34.1



> ---
>  mm/swapfile.c | 59 ++++++++++++++++++++++++++---------------------------------
>  1 file changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 34e6ea13e8e4..9b63b2262cc2 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -479,20 +479,21 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
>  }
>
>  /*
> - * The cluster ci decreases one usage. If the usage counter becomes 0,
> + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0,
>   * which means no page in the cluster is in use, we can optionally discard
>   * the cluster and add it to free cluster list.
>   */
> -static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci)
> +static void dec_cluster_info_page(struct swap_info_struct *p,
> +                                 struct swap_cluster_info *ci, int nr_pages)
>  {
>         if (!p->cluster_info)
>                 return;
>
> -       VM_BUG_ON(ci->count == 0);
> +       VM_BUG_ON(ci->count < nr_pages);
>         VM_BUG_ON(cluster_is_free(ci));
>         lockdep_assert_held(&p->lock);
>         lockdep_assert_held(&ci->lock);
> -       ci->count--;
> +       ci->count -= nr_pages;
>
>         if (!ci->count) {
>                 free_cluster(p, ci);
> @@ -998,19 +999,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
>         return n_ret;
>  }
>
> -static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
> -{
> -       unsigned long offset = idx * SWAPFILE_CLUSTER;
> -       struct swap_cluster_info *ci;
> -
> -       ci = lock_cluster(si, offset);
> -       memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> -       ci->count = 0;
> -       free_cluster(si, ci);
> -       unlock_cluster(ci);
> -       swap_range_free(si, offset, SWAPFILE_CLUSTER);
> -}
> -
>  int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  {
>         int order = swap_entry_order(entry_order);
> @@ -1269,21 +1257,28 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
>         return usage;
>  }
>
> -static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
> +/*
> + * Drop the last HAS_CACHE flag of swap entries, caller have to
> + * ensure all entries belong to the same cgroup.
> + */
> +static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
> +                                 unsigned int nr_pages)
>  {
> -       struct swap_cluster_info *ci;
>         unsigned long offset = swp_offset(entry);
> -       unsigned char count;
> +       unsigned char *map = p->swap_map + offset;
> +       unsigned char *map_end = map + nr_pages;
> +       struct swap_cluster_info *ci;
>
>         ci = lock_cluster(p, offset);
> -       count = p->swap_map[offset];
> -       VM_BUG_ON(count != SWAP_HAS_CACHE);
> -       p->swap_map[offset] = 0;
> -       dec_cluster_info_page(p, ci);
> +       do {
> +               VM_BUG_ON(*map != SWAP_HAS_CACHE);
> +               *map = 0;
> +       } while (++map < map_end);
> +       dec_cluster_info_page(p, ci, nr_pages);
>         unlock_cluster(ci);
>
> -       mem_cgroup_uncharge_swap(entry, 1);
> -       swap_range_free(p, offset, 1);
> +       mem_cgroup_uncharge_swap(entry, nr_pages);
> +       swap_range_free(p, offset, nr_pages);
>  }
>
>  static void cluster_swap_free_nr(struct swap_info_struct *sis,
> @@ -1343,7 +1338,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
>  void put_swap_folio(struct folio *folio, swp_entry_t entry)
>  {
>         unsigned long offset = swp_offset(entry);
> -       unsigned long idx = offset / SWAPFILE_CLUSTER;
>         struct swap_cluster_info *ci;
>         struct swap_info_struct *si;
>         unsigned char *map;
> @@ -1356,19 +1350,18 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
>                 return;
>
>         ci = lock_cluster_or_swap_info(si, offset);
> -       if (size == SWAPFILE_CLUSTER) {
> +       if (size > 1) {
>                 map = si->swap_map + offset;
> -               for (i = 0; i < SWAPFILE_CLUSTER; i++) {
> +               for (i = 0; i < size; i++) {
>                         val = map[i];
>                         VM_BUG_ON(!(val & SWAP_HAS_CACHE));
>                         if (val == SWAP_HAS_CACHE)
>                                 free_entries++;
>                 }
> -               if (free_entries == SWAPFILE_CLUSTER) {
> +               if (free_entries == size) {
>                         unlock_cluster_or_swap_info(si, ci);
>                         spin_lock(&si->lock);
> -                       mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
> -                       swap_free_cluster(si, idx);
> +                       swap_entry_range_free(si, entry, size);
>                         spin_unlock(&si->lock);
>                         return;
>                 }
> @@ -1413,7 +1406,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
>         for (i = 0; i < n; ++i) {
>                 p = swap_info_get_cont(entries[i], prev);
>                 if (p)
> -                       swap_entry_free(p, entries[i]);
> +                       swap_entry_range_free(p, entries[i], 1);
>                 prev = p;
>         }
>         if (p)
>
> --
> 2.46.0.rc1.232.g9752f9e123-goog
>

Thanks
Barry