linux-kernel - Re: [PATCH v3 1/2] mm: store zero pages to be swapped out in a bitmap

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJD7tkZC8e8ZTBSOGZH-1srTeC=jqxwWchd-BjvNsV2FR0oT8Q@mail.gmail.com>
Date: Tue, 11 Jun 2024 10:51:08 -0700
From: Yosry Ahmed <yosryahmed@...gle.com>
To: Usama Arif <usamaarif642@...il.com>
Cc: 21cnbao@...il.com, akpm@...ux-foundation.org, hannes@...xchg.org, 
	david@...hat.com, ying.huang@...el.com, hughd@...gle.com, willy@...radead.org, 
	nphamcs@...il.com, chengming.zhou@...ux.dev, linux-mm@...ck.org, 
	linux-kernel@...r.kernel.org, kernel-team@...a.com, 
	Shakeel Butt <shakeel.butt@...ux.dev>
Subject: Re: [PATCH v3 1/2] mm: store zero pages to be swapped out in a bitmap

[..]
> >> I think its better to handle this in Barrys patch. I feel this series is
> >> close to its final state, i.e. the only diff I have for the next
> >> revision is below to remove start/end_writeback for zer_filled case. I
> >> will comment on Barrys patch once the I send out the next revision of this.
> > Sorry I did not make myself clearer. I did not mean that you should
> > handle the large folio swapin here. This needs to be handled at a
> > higher level because as you mentioned, a large folio may be partially
> > in the zeromap, zswap, swapcache, disk, etc.
> >
> > What I meant is that we should probably have a debug check to make
> > sure this doesn't go unhandled. For zswap, I am trying to add a
> > warning and fail the swapin operation if a large folio slips through
> > to zswap. We can do something similar here if folks agree this is the
> > right way in the interim:
> > https://lore.kernel.org/lkml/20240611024516.1375191-3-yosryahmed@google.com/.
> >
> > Maybe I am too paranoid, but I think it's easy to mess up these things
> > when working on large folio swapin imo.
>
> So there is a difference between zswap and this optimization. In this
> optimization, if the zeromap is set for all the folio bits, then we
> should do large folio swapin. There still needs to be a change in Barrys
> patch in alloc_swap_folio, but apart from that does the below diff over
> v3 make it better? I will send a v4 with this if it sounds good.
>
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 6400be6e4291..bf01364748a9 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -234,18 +234,24 @@ static void swap_zeromap_folio_clear(struct folio
> *folio)
>          }
>   }
>
> -static bool swap_zeromap_folio_test(struct folio *folio)
> +/*
> + * Return the index of the first subpage which is not zero-filled
> + * according to swap_info_struct->zeromap.
> + * If all pages are zero-filled according to zeromap, it will return
> + * folio_nr_pages(folio).
> + */
> +static long swap_zeromap_folio_test(struct folio *folio)
>   {
>          struct swap_info_struct *sis = swp_swap_info(folio->swap);
>          swp_entry_t entry;
> -       unsigned int i;
> +       long i;

Why long?

>
>          for (i = 0; i < folio_nr_pages(folio); i++) {
>                  entry = page_swap_entry(folio_page(folio, i));
>                  if (!test_bit(swp_offset(entry), sis->zeromap))
> -                       return false;
> +                       return i;
>          }
> -       return true;
> +       return i;
>   }
>
>   /*
> @@ -581,6 +587,7 @@ void swap_read_folio(struct folio *folio, bool
> synchronous,
>   {
>          struct swap_info_struct *sis = swp_swap_info(folio->swap);
>          bool workingset = folio_test_workingset(folio);
> +       long first_non_zero_page_idx;
>          unsigned long pflags;
>          bool in_thrashing;
>
> @@ -598,10 +605,19 @@ void swap_read_folio(struct folio *folio, bool
> synchronous,
>                  psi_memstall_enter(&pflags);
>          }
>          delayacct_swapin_start();
> -       if (swap_zeromap_folio_test(folio)) {
> +       first_non_zero_page_idx = swap_zeromap_folio_test(folio);
> +       if (first_non_zero_page_idx == folio_nr_pages(folio)) {
>                  folio_zero_fill(folio);
>                  folio_mark_uptodate(folio);
>                  folio_unlock(folio);
> +       } else if (first_non_zero_page_idx != 0) {
> +               /*
> +                * The case for when only *some* of subpages being
> swapped-in were recorded
> +                * in sis->zeromap, while the rest are in zswap/disk is
> currently not handled.
> +                * WARN in this case and return without marking the
> folio uptodate so that
> +                * an IO error is emitted (e.g. do_swap_page() will sigbus).
> +                */
> +                WARN_ON_ONCE(1);
>          } else if (zswap_load(folio)) {
>                  folio_mark_uptodate(folio);
>                  folio_unlock(folio);
>
>

This is too much noise for swap_read_folio(). How about adding
swap_read_folio_zeromap() that takes care of this and decides whether
or not to call folio_mark_uptodate()?

-static bool swap_zeromap_folio_test(struct folio *folio)
+/*
+ * Return the index of the first subpage which is not zero-filled according to
+ * swap_info_struct->zeromap.  If all pages are zero-filled according to
+ * zeromap, it will return folio_nr_pages(folio).
+ */
+static unsigned int swap_zeromap_folio_test(struct folio *folio)
 {
        struct swap_info_struct *sis = swp_swap_info(folio->swap);
        swp_entry_t entry;
@@ -243,9 +248,9 @@ static bool swap_zeromap_folio_test(struct folio *folio)
        for (i = 0; i < folio_nr_pages(folio); i++) {
                entry = page_swap_entry(folio_page(folio, i));
                if (!test_bit(swp_offset(entry), sis->zeromap))
-                       return false;
+                       return i;
        }
-       return true;
+       return i;
 }

 /*
@@ -511,6 +516,25 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
        mempool_free(sio, sio_pool);
 }

+static bool swap_read_folio_zeromap(struct folio *folio)
+{
+       unsigned int idx = swap_zeromap_folio_test(folio);
+
+       if (idx == 0)
+               return false;
+
+       /*
+        * Swapping in a large folio that is partially in the zeromap is not
+        * currently handled. Return true without marking the folio uptodate so
+        * that an IO error is emitted (e.g.  do_swap_page() will sigbus).
+        */
+       if (WARN_ON_ONCE(idx < folio_nr_pages(folio)))
+               return true;
+
+       folio_zero_fill(folio);
+       folio_mark_uptodate(folio);
+       return true
+}
+
 static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
 {
        struct swap_info_struct *sis = swp_swap_info(folio->swap);
@@ -600,9 +624,7 @@ void swap_read_folio(struct folio *folio, bool synchronous,
                psi_memstall_enter(&pflags);
        }
        delayacct_swapin_start();
-       if (swap_zeromap_folio_test(folio)) {
-               folio_zero_fill(folio);
-               folio_mark_uptodate(folio);
+       if (swap_read_folio_zeromap(folio)) {
                folio_unlock(folio);
        } else if (zswap_load(folio)) {
                folio_mark_uptodate(folio);