[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Z8CofKw5Klcus_LM@google.com>
Date: Thu, 27 Feb 2025 18:01:32 +0000
From: Yosry Ahmed <yosry.ahmed@...ux.dev>
To: Johannes Weiner <hannes@...xchg.org>
Cc: Nhat Pham <nphamcs@...il.com>, akpm@...ux-foundation.org,
chengming.zhou@...ux.dev, linux-mm@...ck.org, kernel-team@...a.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2] zswap: do not crash the kernel on decompression
failure
On Thu, Feb 27, 2025 at 11:05:28AM -0500, Johannes Weiner wrote:
> On Thu, Feb 27, 2025 at 07:29:45AM +0000, Yosry Ahmed wrote:
> > Maybe we can do something like this:
> >
> > /* Returns true if the folio was in the zeromap or zswap */
> > bool swap_read_folio_in_memory(struct folio *folio)
> > {
> > int ret;
> >
> > ret = swap_read_folio_zeromap(folio);
> > if (ret == -ENOENT)
> > ret = zswap_load(folio);
> >
> > if (ret == 0) {
> > folio_mark_uptodate(folio);
> > folio_unlock(folio);
> > return true;
> > } else if (ret != -ENOENT) {
> > folio_unlock(folio);
> > return true;
> > } else {
> > return false;
> > }
> > }
>
> Eh, I think we're getting colder again.
>
> This looks repetitive, zswap_load() is kind of awkward in that error
> leg, and combining the two into one function is a bit of a stretch.
>
> There is also something to be said about folio_mark_uptodate() and
> folio_unlock() usually being done by the backend implementation - in
> what the page cache would call the "filler" method - to signal when
> it's done reading, and what the outcome was.
>
> E.g. for fs it's always in the specific ->read implementation:
>
> static int simple_read_folio(struct file *file, struct folio *folio)
> {
> folio_zero_range(folio, 0, folio_size(folio));
> flush_dcache_folio(folio);
> folio_mark_uptodate(folio);
> folio_unlock(folio);
> return 0;
> }
>
> and not in the generic manifold:
>
> $ grep -c folio_mark_uptodate mm/filemap.c
> 0
>
> I'd actually rather push those down into zeromap and zswap as well to
> follow that pattern more closely:
Hmm yeah, for the sake of consistency I think we can do that. My main
concern was that, in a lot of places in the zswap and zeromap code, we'd
need comments clarifying that returning 'true' without marking the folio
uptodate signals a failure.
However, I think returning an error code as you suggested makes it more
obvious and reduces the need for comments.
So yes I think your proposed approach is better (with a few comments).
>
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 9b983de351f9..1fb5ce1884bd 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -538,6 +538,7 @@ static bool swap_read_folio_zeromap(struct folio *folio)
>
> folio_zero_range(folio, 0, folio_size(folio));
> folio_mark_uptodate(folio);
> + folio_unlock(folio);
> return true;
So I think we should double down and follow the same pattern for
swap_read_folio_zeromap() too: have it return an error code as well, to
make the case where we skip marking the folio uptodate explicit.
> }
>
> @@ -635,13 +636,11 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> }
> delayacct_swapin_start();
>
> - if (swap_read_folio_zeromap(folio)) {
> - folio_unlock(folio);
> + if (swap_read_folio_zeromap(folio))
> goto finish;
and this ends up being closer to the zswap pattern:
if (swap_read_folio_zeromap(folio) != -ENOENT)
goto finish;
> - } else if (zswap_load(folio)) {
> - folio_unlock(folio);
> +
> + if (zswap_load(folio) != -ENOENT)
> goto finish;
> - }
>
> /* We have to read from slower devices. Increase zswap protection. */
> zswap_folio_swapin(folio);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 6dbf31bd2218..76b2a964b0cd 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1620,7 +1620,7 @@ bool zswap_store(struct folio *folio)
> return ret;
> }
>
And here we should explain the different return codes and when we return
with the folio locked/unlocked or marked uptodate.
> -bool zswap_load(struct folio *folio)
> +int zswap_load(struct folio *folio)
> {
> swp_entry_t swp = folio->swap;
> pgoff_t offset = swp_offset(swp);
> @@ -1631,7 +1631,7 @@ bool zswap_load(struct folio *folio)
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
>
> if (zswap_never_enabled())
> - return false;
> + return -ENOENT;
>
> /*
> * Large folios should not be swapped in while zswap is being used, as
> @@ -1641,8 +1641,25 @@ bool zswap_load(struct folio *folio)
> * Return true without marking the folio uptodate so that an IO error is
> * emitted (e.g. do_swap_page() will sigbus).
> */
> - if (WARN_ON_ONCE(folio_test_large(folio)))
> - return true;
> + if (WARN_ON_ONCE(folio_test_large(folio))) {
> + folio_unlock(folio);
> + return -EINVAL;
> + }
> +
> + entry = xa_load(tree, offset);
> + if (!entry)
> + return -ENOENT;
> +
> + if (!zswap_decompress(entry, folio)) {
> + folio_unlock(folio);
> + return -EIO;
> + }
> +
> + folio_mark_uptodate(folio);
> +
> + count_vm_event(ZSWPIN);
> + if (entry->objcg)
> + count_objcg_events(entry->objcg, ZSWPIN, 1);
>
> /*
> * When reading into the swapcache, invalidate our entry. The
> @@ -1656,27 +1673,14 @@ bool zswap_load(struct folio *folio)
> * files, which reads into a private page and may free it if
> * the fault fails. We remain the primary owner of the entry.)
> */
> - if (swapcache)
> - entry = xa_erase(tree, offset);
> - else
> - entry = xa_load(tree, offset);
> -
> - if (!entry)
> - return false;
> -
> - zswap_decompress(entry, folio);
> -
> - count_vm_event(ZSWPIN);
> - if (entry->objcg)
> - count_objcg_events(entry->objcg, ZSWPIN, 1);
> -
> if (swapcache) {
> - zswap_entry_free(entry);
> folio_mark_dirty(folio);
> + xa_erase(tree, offset);
> + zswap_entry_free(entry);
> }
>
> - folio_mark_uptodate(folio);
> - return true;
> + folio_unlock(folio);
> + return 0;
> }
>
> void zswap_invalidate(swp_entry_t swp)
Powered by blists - more mailing lists