Message-ID: <Z76AVZ_tjq2NvmLT@google.com>
Date: Wed, 26 Feb 2025 02:45:41 +0000
From: Yosry Ahmed <yosry.ahmed@...ux.dev>
To: Johannes Weiner <hannes@...xchg.org>
Cc: Nhat Pham <nphamcs@...il.com>, akpm@...ux-foundation.org,
	chengming.zhou@...ux.dev, linux-mm@...ck.org, kernel-team@...a.com,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH] zswap: do not crash the kernel on decompression failure

On Tue, Feb 25, 2025 at 07:51:49PM -0500, Johannes Weiner wrote:
> On Tue, Feb 25, 2025 at 01:32:00PM -0800, Nhat Pham wrote:
> > Currently, we crash the kernel when a decompression failure occurs in
> > zswap (either because of memory corruption, or a bug in the compression
> > algorithm). This is overkill. We should only SIGBUS the unfortunate
> > process asking for the zswap entry on zswap load, and skip the corrupted
> > entry in zswap writeback.
> > 
> > See [1] for a recent upstream discussion about this.
> > 
> > [1]: https://lore.kernel.org/all/ZsiLElTykamcYZ6J@casper.infradead.org/
> > 
> > Suggested-by: Matthew Wilcox <willy@...radead.org>
> > Suggested-by: Yosry Ahmed <yosryahmed@...gle.com>
> > Signed-off-by: Nhat Pham <nphamcs@...il.com>
> > ---
> >  mm/zswap.c | 85 +++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 58 insertions(+), 27 deletions(-)
> > 
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index f6316b66fb23..31d4397eed61 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -62,6 +62,8 @@ static u64 zswap_reject_reclaim_fail;
> >  static u64 zswap_reject_compress_fail;
> >  /* Compressed page was too big for the allocator to (optimally) store */
> >  static u64 zswap_reject_compress_poor;
> > +/* Load and writeback failed due to decompression failure */
> > +static u64 zswap_reject_decompress_fail;
> 
> "reject" refers to "rejected store", so the name doesn't quite make
> sense. zswap_decompress_fail?
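> 
> Presumably the matching debugfs entry in zswap_debugfs_init() would
> follow suit, e.g. (untested sketch):
> 
> 	/* Load or writeback failed due to decompression failure */
> 	static u64 zswap_decompress_fail;
> 
> 	debugfs_create_u64("decompress_fail", 0444,
> 			   zswap_debugfs_root, &zswap_decompress_fail);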
> 
> >  /* Store failed because underlying allocator could not get memory */
> >  static u64 zswap_reject_alloc_fail;
> >  /* Store failed because the entry metadata could not be allocated (rare) */
> > @@ -953,11 +955,12 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> >  	return comp_ret == 0 && alloc_ret == 0;
> >  }
> >  
> > -static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> > +static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> >  {
> >  	struct zpool *zpool = entry->pool->zpool;
> >  	struct scatterlist input, output;
> >  	struct crypto_acomp_ctx *acomp_ctx;
> > +	bool ret = true;
> >  	u8 *src;
> >  
> >  	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > @@ -984,12 +987,19 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> >  	sg_init_table(&output, 1);
> >  	sg_set_folio(&output, folio, PAGE_SIZE, 0);
> >  	acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, PAGE_SIZE);
> > -	BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait));
> > -	BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE);
> > +	if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait) ||
> > +			acomp_ctx->req->dlen != PAGE_SIZE) {
> > +		ret = false;
> > +		zswap_reject_decompress_fail++;
> > +		pr_alert_ratelimited(
> > +			"decompression failed on zswap entry with offset %08lx\n",
> > +			entry->swpentry.val);
> 
> Since this is a pretty dramatic failure scenario, IMO it would be
> useful to dump as much info as possible.
> 
> The exact return value of crypto_wait_req() could be useful,
> entry->length and req->dlen too.
> 
> entry->pool->tfm_name just to make absolute sure there is no
> confusion, as the compressor can be switched for new stores.
> 
> Maybe swp_type() and swp_offset() of entry->swpentry? Those could be
> easy markers to see if the entry was corrupted, for example.
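> 
> E.g. something along these lines (completely untested, and using the
> counter rename suggested above):
> 
> 	int err = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req),
> 				  &acomp_ctx->wait);
> 
> 	if (err || acomp_ctx->req->dlen != PAGE_SIZE) {
> 		ret = false;
> 		zswap_decompress_fail++;
> 		pr_alert_ratelimited("zswap: decompression failed: err %d, comp %s, type %u, offset %lu, len %u, dlen %u\n",
> 				     err, entry->pool->tfm_name,
> 				     swp_type(entry->swpentry),
> 				     swp_offset(entry->swpentry),
> 				     entry->length, acomp_ctx->req->dlen);
> 	}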
> 
> > +	}
> >  	mutex_unlock(&acomp_ctx->mutex);
> >  
> >  	if (src != acomp_ctx->buffer)
> >  		zpool_unmap_handle(zpool, entry->handle);
> > +	return ret;
> >  }
> >  
> >  /*********************************
> > @@ -1018,6 +1028,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >  	struct writeback_control wbc = {
> >  		.sync_mode = WB_SYNC_NONE,
> >  	};
> > +	int ret = 0;
> >  
> >  	/* try to allocate swap cache folio */
> >  	mpol = get_task_policy(current);
> > @@ -1034,8 +1045,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >  	 * and freed when invalidated by the concurrent shrinker anyway.
> >  	 */
> >  	if (!folio_was_allocated) {
> > -		folio_put(folio);
> > -		return -EEXIST;
> > +		ret = -EEXIST;
> > +		goto put_folio;
> >  	}
> >  
> >  	/*
> > @@ -1048,14 +1059,17 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >  	 * be dereferenced.
> >  	 */
> >  	tree = swap_zswap_tree(swpentry);
> > -	if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
> > -		delete_from_swap_cache(folio);
> > -		folio_unlock(folio);
> > -		folio_put(folio);
> > -		return -ENOMEM;
> > +	if (entry != xa_load(tree, offset)) {
> > +		ret = -ENOMEM;
> > +		goto fail;
> >  	}
> >  
> > -	zswap_decompress(entry, folio);
> > +	if (!zswap_decompress(entry, folio)) {
> > +		ret = -EIO;
> > +		goto fail;
> > +	}
> > +
> > +	xa_erase(tree, offset);
> >  
> >  	count_vm_event(ZSWPWB);
> >  	if (entry->objcg)
> > @@ -1071,9 +1085,14 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >  
> >  	/* start writeback */
> >  	__swap_writepage(folio, &wbc);
> > -	folio_put(folio);
> > +	goto put_folio;
> >  
> > -	return 0;
> > +fail:
> > +	delete_from_swap_cache(folio);
> > +	folio_unlock(folio);
> > +put_folio:
> > +	folio_put(folio);
> > +	return ret;
> >  }
> 
> Nice, yeah it's time for factoring out the error unwinding. If you
> write it like this, you can save a jump in the main sequence:
> 
> 	__swap_writepage(folio, &wbc);
> 	ret = 0;
> put:
> 	folio_put(folio);
> 	return ret;
> delete_unlock:

(I like how you sneaked the label rename in here; I didn't like 'fail'
either :P)

> 	delete_from_swap_cache(folio);
> 	folio_unlock(folio);
> 	goto put;

I would go even further and avoid gotos completely (and make it super
clear what gets executed in the normal path vs the failure path):

	if (ret) {
		delete_from_swap_cache(folio);
		folio_unlock(folio);
	} else {
		__swap_writepage(folio, &wbc);
	}
	folio_put(folio);
	return ret;

> 
> >  
> >  /*********************************
> > @@ -1600,6 +1619,29 @@ bool zswap_load(struct folio *folio)
> >  	if (WARN_ON_ONCE(folio_test_large(folio)))
> >  		return true;
> >  
> > +	/*
> > +	 * We cannot invalidate the zswap entry before decompressing it. If
> > +	 * decompression fails, we must keep the entry in the tree so that
> > +	 * a future read by another process on the same swap entry will also
> > +	 * have to go through zswap. Otherwise, we risk silently reading
> > +	 * corrupted data for the other process.
> > +	 */
> > +	entry = xa_load(tree, offset);
> > +	if (!entry)
> > +		return false;
> 
> The explanation in the comment makes sense in the context of this
> change. But fresh eyes reading this function and having never seen
> that this *used to* open with xa_erase() will be confused. It answers
> questions the reader doesn't have at this point - it's just a function
> called zswap_load() beginning with an xa_load(), so what?
> 
> At first I was going to suggest moving it down to the swapcache
> branch. But honestly after reading *that* comment again, in the new
> sequence, I don't think the question will arise there either. It's
> pretty self-evident that the whole "we can invalidate when reading
> into the swapcache" does not apply if the read failed.

+1

> 
> > +	/*
> > +	 * If decompression fails, we return true to notify the caller that the
> > +	 * folio's data were in zswap, but do not mark the folio as up-to-date.
> > +	 * This will effectively SIGBUS the calling process.
> > +	 */
> 
> It would be good to put a lampshade on this weirdness that the return
> value has nothing to do with success or failure. This distinction
> wasn't as important before, but with actual decompression failures I
> think it is.

+1

> 
> Something like this?
> 
> 	if (!zswap_decompress(entry, folio)) {
> 		/*
> 		 * The zswap_load() return value doesn't indicate success or
> 		 * failure, but whether zswap owns the swapped out contents.
> 		 * This MUST return true here, otherwise swap_readpage() will
> 		 * read garbage from the backend.
> 		 *
> 		 * Success is signaled by marking the folio uptodate.
> 		 */

We use the same trick in the folio_test_large() branch, so maybe this
should be moved above the function definition. Then we can refer to it
from the places where we return true without setting uptodate, for
added clarity if needed.
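
E.g. a rough sketch of a function-level comment:

	/*
	 * Note: zswap_load()'s return value does not indicate success or
	 * failure, but whether zswap owns the swapped out contents (true)
	 * or not (false). A load failure is signaled by returning true
	 * *without* marking the folio uptodate, which SIGBUSes the
	 * faulting process instead of letting swap_readpage() read
	 * garbage from the backend.
	 */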

> 		return true;
> 	}
> 
> 	folio_mark_uptodate(folio);
> 
> > +	if (!zswap_decompress(entry, folio))
> > +		return true;
> > +
> > +	count_vm_event(ZSWPIN);
> > +	if (entry->objcg)
> > +		count_objcg_events(entry->objcg, ZSWPIN, 1);
> > +
> >  	/*
> >  	 * When reading into the swapcache, invalidate our entry. The
> >  	 * swapcache can be the authoritative owner of the page and
> > @@ -1612,21 +1654,8 @@ bool zswap_load(struct folio *folio)
> >  	 * files, which reads into a private page and may free it if
> >  	 * the fault fails. We remain the primary owner of the entry.)
> >  	 */
> > -	if (swapcache)
> > -		entry = xa_erase(tree, offset);
> > -	else
> > -		entry = xa_load(tree, offset);
> > -
> > -	if (!entry)
> > -		return false;
> > -
> > -	zswap_decompress(entry, folio);
> > -
> > -	count_vm_event(ZSWPIN);
> > -	if (entry->objcg)
> > -		count_objcg_events(entry->objcg, ZSWPIN, 1);
> > -
> >  	if (swapcache) {
> > +		xa_erase(tree, offset);
> >  		zswap_entry_free(entry);
> >  		folio_mark_dirty(folio);
> >  	}
> 
