[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <622bc591-ad14-448e-a9f3-988976fbb98a@gmail.com>
Date: Tue, 11 Jun 2024 19:43:41 +0100
From: Usama Arif <usamaarif642@...il.com>
To: Yosry Ahmed <yosryahmed@...gle.com>
Cc: 21cnbao@...il.com, akpm@...ux-foundation.org, hannes@...xchg.org,
david@...hat.com, ying.huang@...el.com, hughd@...gle.com,
willy@...radead.org, nphamcs@...il.com, chengming.zhou@...ux.dev,
linux-mm@...ck.org, linux-kernel@...r.kernel.org, kernel-team@...a.com,
Shakeel Butt <shakeel.butt@...ux.dev>
Subject: Re: [PATCH v3 1/2] mm: store zero pages to be swapped out in a bitmap
On 11/06/2024 18:51, Yosry Ahmed wrote:
> [..]
>>>> I think its better to handle this in Barrys patch. I feel this series is
>>>> close to its final state, i.e. the only diff I have for the next
>>>> revision is below to remove start/end_writeback for zer_filled case. I
>>>> will comment on Barrys patch once the I send out the next revision of this.
>>> Sorry I did not make myself clearer. I did not mean that you should
>>> handle the large folio swapin here. This needs to be handled at a
>>> higher level because as you mentioned, a large folio may be partially
>>> in the zeromap, zswap, swapcache, disk, etc.
>>>
>>> What I meant is that we should probably have a debug check to make
>>> sure this doesn't go unhandled. For zswap, I am trying to add a
>>> warning and fail the swapin operation if a large folio slips through
>>> to zswap. We can do something similar here if folks agree this is the
>>> right way in the interim:
>>> https://lore.kernel.org/lkml/20240611024516.1375191-3-yosryahmed@google.com/.
>>>
>>> Maybe I am too paranoid, but I think it's easy to mess up these things
>>> when working on large folio swapin imo.
>> So there is a difference between zswap and this optimization. In this
>> optimization, if the zeromap is set for all the folio bits, then we
>> should do large folio swapin. There still needs to be a change in Barrys
>> patch in alloc_swap_folio, but apart from that does the below diff over
>> v3 make it better? I will send a v4 with this if it sounds good.
>>
>>
>> diff --git a/mm/page_io.c b/mm/page_io.c
>> index 6400be6e4291..bf01364748a9 100644
>> --- a/mm/page_io.c
>> +++ b/mm/page_io.c
>> @@ -234,18 +234,24 @@ static void swap_zeromap_folio_clear(struct folio
>> *folio)
>> }
>> }
>>
>> -static bool swap_zeromap_folio_test(struct folio *folio)
>> +/*
>> + * Return the index of the first subpage which is not zero-filled
>> + * according to swap_info_struct->zeromap.
>> + * If all pages are zero-filled according to zeromap, it will return
>> + * folio_nr_pages(folio).
>> + */
>> +static long swap_zeromap_folio_test(struct folio *folio)
>> {
>> struct swap_info_struct *sis = swp_swap_info(folio->swap);
>> swp_entry_t entry;
>> - unsigned int i;
>> + long i;
> Why long?
folio_nr_pages returns long, but I just checked that
folio->_folio_nr_pages is unsigned int, but that will probably be
typecasted to long :). I will switch to unsigned int as its not really
going to go to long for CONFIG_64BIT
>> for (i = 0; i < folio_nr_pages(folio); i++) {
>> entry = page_swap_entry(folio_page(folio, i));
>> if (!test_bit(swp_offset(entry), sis->zeromap))
>> - return false;
>> + return i;
>> }
>> - return true;
>> + return i;
>> }
>>
>> /*
>> @@ -581,6 +587,7 @@ void swap_read_folio(struct folio *folio, bool
>> synchronous,
>> {
>> struct swap_info_struct *sis = swp_swap_info(folio->swap);
>> bool workingset = folio_test_workingset(folio);
>> + long first_non_zero_page_idx;
>> unsigned long pflags;
>> bool in_thrashing;
>>
>> @@ -598,10 +605,19 @@ void swap_read_folio(struct folio *folio, bool
>> synchronous,
>> psi_memstall_enter(&pflags);
>> }
>> delayacct_swapin_start();
>> - if (swap_zeromap_folio_test(folio)) {
>> + first_non_zero_page_idx = swap_zeromap_folio_test(folio);
>> + if (first_non_zero_page_idx == folio_nr_pages(folio)) {
>> folio_zero_fill(folio);
>> folio_mark_uptodate(folio);
>> folio_unlock(folio);
>> + } else if (first_non_zero_page_idx != 0) {
>> + /*
>> + * The case for when only *some* of subpages being
>> swapped-in were recorded
>> + * in sis->zeromap, while the rest are in zswap/disk is
>> currently not handled.
>> + * WARN in this case and return without marking the
>> folio uptodate so that
>> + * an IO error is emitted (e.g. do_swap_page() will sigbus).
>> + */
>> + WARN_ON_ONCE(1);
>> } else if (zswap_load(folio)) {
>> folio_mark_uptodate(folio);
>> folio_unlock(folio);
>>
>>
> This is too much noise for swap_read_folio(). How about adding
> swap_read_folio_zeromap() that takes care of this and decides whether
> or not to call folio_mark_uptodate()?
Sounds good, will do as below. Thanks!
>
> -static bool swap_zeromap_folio_test(struct folio *folio)
> +/*
> + * Return the index of the first subpage which is not zero-filled according to
> + * swap_info_struct->zeromap. If all pages are zero-filled according to
> + * zeromap, it will return folio_nr_pages(folio).
> + */
> +static unsigned int swap_zeromap_folio_test(struct folio *folio)
> {
> struct swap_info_struct *sis = swp_swap_info(folio->swap);
> swp_entry_t entry;
> @@ -243,9 +248,9 @@ static bool swap_zeromap_folio_test(struct folio *folio)
> for (i = 0; i < folio_nr_pages(folio); i++) {
> entry = page_swap_entry(folio_page(folio, i));
> if (!test_bit(swp_offset(entry), sis->zeromap))
> - return false;
> + return i;
> }
> - return true;
> + return i;
> }
>
> /*
> @@ -511,6 +516,25 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
> mempool_free(sio, sio_pool);
> }
>
> +static bool swap_read_folio_zeromap(struct folio *folio)
> +{
> + unsigned int idx = swap_zeromap_folio_test(folio);
> +
> + if (idx == 0)
> + return false;
> +
> + /*
> + * Swapping in a large folio that is partially in the zeromap is not
> + * currently handled. Return true without marking the folio uptodate so
> + * that an IO error is emitted (e.g. do_swap_page() will sigbus).
> + */
> + if (WARN_ON_ONCE(idx < folio_nr_pages(folio)))
> + return true;
> +
> + folio_zero_fill(folio);
> + folio_mark_uptodate(folio);
> + return true
> +}
> +
> static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
> {
> struct swap_info_struct *sis = swp_swap_info(folio->swap);
> @@ -600,9 +624,7 @@ void swap_read_folio(struct folio *folio, bool synchronous,
> psi_memstall_enter(&pflags);
> }
> delayacct_swapin_start();
> - if (swap_zeromap_folio_test(folio)) {
> - folio_zero_fill(folio);
> - folio_mark_uptodate(folio);
> + if (swap_read_folio_zeromap(folio)) {
> folio_unlock(folio);
> } else if (zswap_load(folio)) {
> folio_mark_uptodate(folio);
Powered by blists - more mailing lists