Date: Tue, 11 Jun 2024 19:43:41 +0100
From: Usama Arif <usamaarif642@...il.com>
To: Yosry Ahmed <yosryahmed@...gle.com>
Cc: 21cnbao@...il.com, akpm@...ux-foundation.org, hannes@...xchg.org,
 david@...hat.com, ying.huang@...el.com, hughd@...gle.com,
 willy@...radead.org, nphamcs@...il.com, chengming.zhou@...ux.dev,
 linux-mm@...ck.org, linux-kernel@...r.kernel.org, kernel-team@...a.com,
 Shakeel Butt <shakeel.butt@...ux.dev>
Subject: Re: [PATCH v3 1/2] mm: store zero pages to be swapped out in a bitmap


On 11/06/2024 18:51, Yosry Ahmed wrote:
> [..]
>>>> I think it's better to handle this in Barry's patch. I feel this series is
>>>> close to its final state, i.e. the only diff I have for the next
>>>> revision is below, to remove start/end_writeback for the zero_filled case. I
>>>> will comment on Barry's patch once I send out the next revision of this.
>>> Sorry I did not make myself clearer. I did not mean that you should
>>> handle the large folio swapin here. This needs to be handled at a
>>> higher level because as you mentioned, a large folio may be partially
>>> in the zeromap, zswap, swapcache, disk, etc.
>>>
>>> What I meant is that we should probably have a debug check to make
>>> sure this doesn't go unhandled. For zswap, I am trying to add a
>>> warning and fail the swapin operation if a large folio slips through
>>> to zswap. We can do something similar here if folks agree this is the
>>> right way in the interim:
>>> https://lore.kernel.org/lkml/20240611024516.1375191-3-yosryahmed@google.com/.
>>>
>>> Maybe I am too paranoid, but I think it's easy to mess up these things
>>> when working on large folio swapin.
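
[ A hedged sketch of the kind of guard described above, at the top of
  zswap_load() -- this shows the general idea only, not the linked patch
  itself: ]

	/*
	 * Large folios are not supported by zswap swapin yet. Warn and fail
	 * the swapin: returning true without marking the folio uptodate
	 * prevents a bogus fallback read from disk and surfaces an IO error
	 * to the fault path instead.
	 */
	if (WARN_ON_ONCE(folio_test_large(folio)))
		return true;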
>> So there is a difference between zswap and this optimization. In this
>> optimization, if the zeromap is set for all the folio bits, then we
>> should do large folio swapin. There still needs to be a change in Barry's
>> patch in alloc_swap_folio, but apart from that, does the below diff over
>> v3 make it better? I will send a v4 with this if it sounds good.
>>
>>
>> diff --git a/mm/page_io.c b/mm/page_io.c
>> index 6400be6e4291..bf01364748a9 100644
>> --- a/mm/page_io.c
>> +++ b/mm/page_io.c
>> @@ -234,18 +234,24 @@ static void swap_zeromap_folio_clear(struct folio *folio)
>>           }
>>    }
>>
>> -static bool swap_zeromap_folio_test(struct folio *folio)
>> +/*
>> + * Return the index of the first subpage which is not zero-filled
>> + * according to swap_info_struct->zeromap.
>> + * If all pages are zero-filled according to zeromap, it will return
>> + * folio_nr_pages(folio).
>> + */
>> +static long swap_zeromap_folio_test(struct folio *folio)
>>    {
>>           struct swap_info_struct *sis = swp_swap_info(folio->swap);
>>           swp_entry_t entry;
>> -       unsigned int i;
>> +       long i;
> Why long?


folio_nr_pages() returns long, but I just checked that
folio->_folio_nr_pages is unsigned int, so that will just be promoted to
long :). I will switch to unsigned int, as the count is not really going
to need long even with CONFIG_64BIT.
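
[ For reference, a rough sketch of how folio_nr_pages() is defined in recent
  kernels (paraphrased from include/linux/mm.h; the exact definition varies by
  version), showing that the long return type is just a promotion of the
  underlying unsigned int field: ]

static inline long folio_nr_pages(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return 1;
#ifdef CONFIG_64BIT
	/* _folio_nr_pages is an unsigned int; promoted to long on return */
	return folio->_folio_nr_pages;
#else
	return 1L << folio->_folio_order;
#endif
}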

>>           for (i = 0; i < folio_nr_pages(folio); i++) {
>>                   entry = page_swap_entry(folio_page(folio, i));
>>                   if (!test_bit(swp_offset(entry), sis->zeromap))
>> -                       return false;
>> +                       return i;
>>           }
>> -       return true;
>> +       return i;
>>    }
>>
>>    /*
>> @@ -581,6 +587,7 @@ void swap_read_folio(struct folio *folio, bool synchronous,
>>    {
>>           struct swap_info_struct *sis = swp_swap_info(folio->swap);
>>           bool workingset = folio_test_workingset(folio);
>> +       long first_non_zero_page_idx;
>>           unsigned long pflags;
>>           bool in_thrashing;
>>
>> @@ -598,10 +605,19 @@ void swap_read_folio(struct folio *folio, bool synchronous,
>>                   psi_memstall_enter(&pflags);
>>           }
>>           delayacct_swapin_start();
>> -       if (swap_zeromap_folio_test(folio)) {
>> +       first_non_zero_page_idx = swap_zeromap_folio_test(folio);
>> +       if (first_non_zero_page_idx == folio_nr_pages(folio)) {
>>                   folio_zero_fill(folio);
>>                   folio_mark_uptodate(folio);
>>                   folio_unlock(folio);
>> +       } else if (first_non_zero_page_idx != 0) {
>> +               /*
>> +                * The case where only *some* of the subpages being swapped in
>> +                * are recorded in sis->zeromap, while the rest are in
>> +                * zswap/disk, is currently not handled. WARN in this case and
>> +                * return without marking the folio uptodate, so that an IO
>> +                * error is emitted (e.g. do_swap_page() will sigbus).
>> +                */
>> +               WARN_ON_ONCE(1);
>>           } else if (zswap_load(folio)) {
>>                   folio_mark_uptodate(folio);
>>                   folio_unlock(folio);
>>
>>
> This is too much noise for swap_read_folio(). How about adding
> swap_read_folio_zeromap() that takes care of this and decides whether
> or not to call folio_mark_uptodate()?

Sounds good, will do as below. Thanks!

>
> -static bool swap_zeromap_folio_test(struct folio *folio)
> +/*
> + * Return the index of the first subpage which is not zero-filled according to
> + * swap_info_struct->zeromap.  If all pages are zero-filled according to
> + * zeromap, it will return folio_nr_pages(folio).
> + */
> +static unsigned int swap_zeromap_folio_test(struct folio *folio)
>   {
>          struct swap_info_struct *sis = swp_swap_info(folio->swap);
>          swp_entry_t entry;
> @@ -243,9 +248,9 @@ static bool swap_zeromap_folio_test(struct folio *folio)
>          for (i = 0; i < folio_nr_pages(folio); i++) {
>                  entry = page_swap_entry(folio_page(folio, i));
>                  if (!test_bit(swp_offset(entry), sis->zeromap))
> -                       return false;
> +                       return i;
>          }
> -       return true;
> +       return i;
>   }
>
>   /*
> @@ -511,6 +516,25 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
>          mempool_free(sio, sio_pool);
>   }
>
> +static bool swap_read_folio_zeromap(struct folio *folio)
> +{
> +       unsigned int idx = swap_zeromap_folio_test(folio);
> +
> +       if (idx == 0)
> +               return false;
> +
> +       /*
> +        * Swapping in a large folio that is partially in the zeromap is not
> +        * currently handled. Return true without marking the folio uptodate so
> +        * that an IO error is emitted (e.g. do_swap_page() will sigbus).
> +        */
> +       if (WARN_ON_ONCE(idx < folio_nr_pages(folio)))
> +               return true;
> +
> +       folio_zero_fill(folio);
> +       folio_mark_uptodate(folio);
> +       return true;
> +}
> +
>   static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
>   {
>          struct swap_info_struct *sis = swp_swap_info(folio->swap);
> @@ -600,9 +624,7 @@ void swap_read_folio(struct folio *folio, bool synchronous,
>                  psi_memstall_enter(&pflags);
>          }
>          delayacct_swapin_start();
> -       if (swap_zeromap_folio_test(folio)) {
> -               folio_zero_fill(folio);
> -               folio_mark_uptodate(folio);
> +       if (swap_read_folio_zeromap(folio)) {
>                  folio_unlock(folio);
>          } else if (zswap_load(folio)) {
>                  folio_mark_uptodate(folio);

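[ For context, the "IO error"/sigbus behaviour mentioned above comes from the
  uptodate check in do_swap_page(); roughly (paraphrased from mm/memory.c,
  exact code may differ by version): ]

	if (unlikely(!folio_test_uptodate(folio))) {
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	}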