linux-kernel - Re: [PATCH] erofs: fix infinite loop due to a race of filling compressed

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240125221248.00005540.zbestahu@gmail.com>
Date: Thu, 25 Jan 2024 22:12:48 +0800
From: Yue Hu <zbestahu@...il.com>
To: Gao Xiang <hsiangkao@...ux.alibaba.com>
Cc: linux-erofs@...ts.ozlabs.org, LKML <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH] erofs: fix infinite loop due to a race of filling
 compressed_bvecs

On Thu, 25 Jan 2024 20:00:39 +0800
Gao Xiang <hsiangkao@...ux.alibaba.com> wrote:

> I encountered a race issue after lengthy (~594647 sec) stress tests on
> a 64k-page arm64 VM with several 4k-block EROFS images.  The timing
> is like below:
> 
> z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
>   cmpxchg(&compressed_bvecs[].page,
>           NULL, ..)
>                                         [access bufvec]
>   compressed_bvecs[] = *bvec;
> 
> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
> other fields in bufvec didn't matter.  After the subpage block support
> is landed, .offset and .end can be used too, but filling bufvec isn't
> an atomic operation which can cause inconsistency.
> 
> Let's use a spinlock to keep the atomicity of each bufvec.  More
> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
> since it's rarely used (also it takes a short time if even used) as long
> as the pcluster has a reference.
> 
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <hsiangkao@...ux.alibaba.com>
> ---
>  fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
>  1 file changed, 38 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>  			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
>  	unsigned int i;
>  
> -	if (i_blocksize(fe->inode) != PAGE_SIZE)
> -		return;
> -	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> +	if (i_blocksize(fe->inode) != PAGE_SIZE ||
> +	    fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>  		return;
>  
>  	for (i = 0; i < pclusterpages; ++i) {
>  		struct page *page, *newpage;
>  		void *t;	/* mark pages just found for debugging */
>  
> -		/* the compressed page was loaded before */
> +		/* Inaccurate check w/o locking to avoid unneeded lookups */
>  		if (READ_ONCE(pcl->compressed_bvecs[i].page))
>  			continue;
>  
>  		page = find_get_page(mc, pcl->obj.index + i);
> -
>  		if (page) {
>  			t = (void *)((unsigned long)page | 1);
>  			newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>  			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
>  			t = (void *)((unsigned long)newpage | 1);
>  		}
> -
> -		if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> +		spin_lock(&pcl->obj.lockref.lock);
> +		if (!pcl->compressed_bvecs[i].page) {
> +			pcl->compressed_bvecs[i].page = t;
> +			spin_unlock(&pcl->obj.lockref.lock);
>  			continue;
> +		}
> +		spin_unlock(&pcl->obj.lockref.lock);
>  
>  		if (page)
>  			put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
>  	return 0;
>  }
>  
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> -				   struct z_erofs_bvec *bvec)
> -{
> -	struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> -	while (fe->icur > 0) {
> -		if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> -			     NULL, bvec->page)) {
> -			pcl->compressed_bvecs[fe->icur] = *bvec;
> -			return true;
> -		}
> -	}
> -	return false;
> -}
> -
>  /* callers must be with pcluster lock held */
>  static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
>  			       struct z_erofs_bvec *bvec, bool exclusive)
>  {
> +	struct z_erofs_pcluster *pcl = fe->pcl;
>  	int ret;
>  
>  	if (exclusive) {
>  		/* give priority for inplaceio to use file pages first */
> -		if (z_erofs_try_inplace_io(fe, bvec))
> +		spin_lock(&pcl->obj.lockref.lock);
> +		while (fe->icur > 0) {
> +			if (pcl->compressed_bvecs[--fe->icur].page)
> +				continue;
> +			pcl->compressed_bvecs[fe->icur] = *bvec;
> +			spin_unlock(&pcl->obj.lockref.lock);
>  			return 0;
> +		}
> +		spin_unlock(&pcl->obj.lockref.lock);
> +
>  		/* otherwise, check if it can be used as a bvpage */
>  		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
>  		    !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>  {
>  	gfp_t gfp = mapping_gfp_mask(mc);
>  	bool tocache = false;
> -	struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> +	struct z_erofs_bvec zbv;
>  	struct address_space *mapping;
> -	struct page *page, *oldpage;
> +	struct page *page;
>  	int justfound, bs = i_blocksize(f->inode);
>  
>  	/* Except for inplace pages, the entire page can be used for I/Os */
>  	bvec->bv_offset = 0;
>  	bvec->bv_len = PAGE_SIZE;
>  repeat:
> -	oldpage = READ_ONCE(zbv->page);
> -	if (!oldpage)
> +	spin_lock(&pcl->obj.lockref.lock);
> +	zbv = pcl->compressed_bvecs[nr];
> +	page = zbv.page;
> +	justfound = (unsigned long)page & 1UL;
> +	page = (struct page *)((unsigned long)page & ~1UL);
> +	pcl->compressed_bvecs[nr].page = page;
> +	spin_unlock(&pcl->obj.lockref.lock);
> +	if (!page)
>  		goto out_allocpage;
>  
> -	justfound = (unsigned long)oldpage & 1UL;
> -	page = (struct page *)((unsigned long)oldpage & ~1UL);
>  	bvec->bv_page = page;
> -
>  	DBG_BUGON(z_erofs_is_shortlived_page(page));
>  	/*
>  	 * Handle preallocated cached pages.  We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>  	 */
>  	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
>  		set_page_private(page, 0);
> -		WRITE_ONCE(zbv->page, page);
>  		tocache = true;
>  		goto out_tocache;
>  	}
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>  	 * therefore it is impossible for `mapping` to be NULL.
>  	 */
>  	if (mapping && mapping != mc) {
> -		if (zbv->offset < 0)
> -			bvec->bv_offset = round_up(-zbv->offset, bs);
> -		bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> +		if (zbv.offset < 0)
> +			bvec->bv_offset = round_up(-zbv.offset, bs);
> +		bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
>  		return;
>  	}
>  
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>  
>  	/* the cached page is still in managed cache */
>  	if (page->mapping == mc) {
> -		WRITE_ONCE(zbv->page, page);
>  		/*
>  		 * The cached page is still available but without a valid
>  		 * `->private` pcluster hint.  Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>  	put_page(page);
>  out_allocpage:
>  	page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> -	if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> +	spin_lock(&pcl->obj.lockref.lock);
> +	if (pcl->compressed_bvecs[nr].page) {
>  		erofs_pagepool_add(&f->pagepool, page);
> +		spin_unlock(&pcl->obj.lockref.lock);
>  		cond_resched();
>  		goto repeat;
>  	}
> +	pcl->compressed_bvecs[nr].page = page;
> +	spin_unlock(&pcl->obj.lockref.lock);
>  	bvec->bv_page = page;
>  out_tocache:
>  	if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>  
>  			if (cur + bvec.bv_len > end)
>  				bvec.bv_len = end - cur;
> +			DBG_BUGON(bvec.bv_len < sb->s_blocksize);
>  			if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
>  					  bvec.bv_offset))
>  				goto submit_bio_retry;

Looks good to me.

Reviewed-by: Yue Hu <huyue2@...lpad.com>