linux-kernel - Re: [PATCH v4] fs: Fix data race in inode_set_ctime_to

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241124174435.GB620578@frogsfrogsfrogs>
Date: Sun, 24 Nov 2024 09:44:35 -0800
From: "Darrick J. Wong" <djwong@...nel.org>
To: Hao-ran Zheng <zhenghaoran@...a.edu.cn>
Cc: viro@...iv.linux.org.uk, brauner@...nel.org, jack@...e.cz,
	linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
	baijiaju1990@...il.com, 21371365@...a.edu.cn
Subject: Re: [PATCH v4] fs: Fix data race in inode_set_ctime_to_ts

On Sun, Nov 24, 2024 at 05:42:53PM +0800, Hao-ran Zheng wrote:
> A data race may occur when the function `inode_set_ctime_to_ts()` and
> the function `inode_get_ctime_sec()` are executed concurrently. When
> two threads call `aio_read` and `aio_write` respectively, they will
> be distributed to the read and write functions of the corresponding
> file system respectively. Taking the btrfs file system as an example,
> the `btrfs_file_read_iter` and `btrfs_file_write_iter` functions are
> finally called. These two functions created a data race when they
> finally called `inode_get_ctime_sec()` and `inode_set_ctime_to_ns()`.
> The specific call stack that appears during testing is as follows:
> 
> ============DATA_RACE============
> btrfs_delayed_update_inode+0x1f61/0x7ce0 [btrfs]
> btrfs_update_inode+0x45e/0xbb0 [btrfs]
> btrfs_dirty_inode+0x2b8/0x530 [btrfs]
> btrfs_update_time+0x1ad/0x230 [btrfs]
> touch_atime+0x211/0x440
> filemap_read+0x90f/0xa20
> btrfs_file_read_iter+0xeb/0x580 [btrfs]
> aio_read+0x275/0x3a0
> io_submit_one+0xd22/0x1ce0
> __se_sys_io_submit+0xb3/0x250
> do_syscall_64+0xc1/0x190
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> ============OTHER_INFO============
> btrfs_write_check+0xa15/0x1390 [btrfs]
> btrfs_buffered_write+0x52f/0x29d0 [btrfs]
> btrfs_do_write_iter+0x53d/0x1590 [btrfs]
> btrfs_file_write_iter+0x41/0x60 [btrfs]
> aio_write+0x41e/0x5f0
> io_submit_one+0xd42/0x1ce0
> __se_sys_io_submit+0xb3/0x250
> do_syscall_64+0xc1/0x190
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> 
> To address this issue, it is recommended to add WRITE_ONCE
> and READ_ONCE when reading or writing the `inode->i_ctime_sec`
> and `inode->i_ctime_nsec` variable.

Excuse my ignorance, but how exactly does this annotation fix the race?
Does it prevent reordering of the memory accesses or something?  "it is
recommended" is not enough for dunces such as myself to figure out why
this fixes the problem. :(

--D

> Signed-off-by: Hao-ran Zheng <zhenghaoran@...a.edu.cn>
> ---
> V3 -> V4: Fixed patch for latest code
> V2 -> V3: Added READ_ONCE in inode_get_ctime_nsec() and addressed review comments
> V1 -> V2: Added READ_ONCE in inode_get_ctime_sec()
> ---
>  fs/inode.c | 16 ++++++++--------
>  fs/stat.c  |  2 +-
>  2 files changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index b13b778257ae..bfab370c8622 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2713,8 +2713,8 @@ struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 t
>  {
>  	trace_inode_set_ctime_to_ts(inode, &ts);
>  	set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
> -	inode->i_ctime_sec = ts.tv_sec;
> -	inode->i_ctime_nsec = ts.tv_nsec;
> +	WRITE_ONCE(inode->i_ctime_sec, ts.tv_sec);
> +	WRITE_ONCE(inode->i_ctime_nsec, ts.tv_nsec);
>  	return ts;
>  }
>  EXPORT_SYMBOL(inode_set_ctime_to_ts);
> @@ -2788,7 +2788,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
>  	 */
>  	cns = smp_load_acquire(&inode->i_ctime_nsec);
>  	if (cns & I_CTIME_QUERIED) {
> -		struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
> +		struct timespec64 ctime = { .tv_sec = READ_ONCE(inode->i_ctime_sec),
>  					    .tv_nsec = cns & ~I_CTIME_QUERIED };
>  
>  		if (timespec64_compare(&now, &ctime) <= 0) {
> @@ -2809,7 +2809,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
>  	/* Try to swap the nsec value into place. */
>  	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
>  		/* If swap occurred, then we're (mostly) done */
> -		inode->i_ctime_sec = now.tv_sec;
> +		WRITE_ONCE(inode->i_ctime_sec, now.tv_sec);
>  		trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
>  		mgtime_counter_inc(mg_ctime_swaps);
>  	} else {
> @@ -2824,7 +2824,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
>  			goto retry;
>  		}
>  		/* Otherwise, keep the existing ctime */
> -		now.tv_sec = inode->i_ctime_sec;
> +		now.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  		now.tv_nsec = cur & ~I_CTIME_QUERIED;
>  	}
>  out:
> @@ -2857,7 +2857,7 @@ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
>  	/* pairs with try_cmpxchg below */
>  	cur = smp_load_acquire(&inode->i_ctime_nsec);
>  	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
> -	cur_ts.tv_sec = inode->i_ctime_sec;
> +	cur_ts.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  
>  	/* If the update is older than the existing value, skip it. */
>  	if (timespec64_compare(&update, &cur_ts) <= 0)
> @@ -2883,7 +2883,7 @@ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
>  retry:
>  	old = cur;
>  	if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
> -		inode->i_ctime_sec = update.tv_sec;
> +		WRITE_ONCE(inode->i_ctime_sec, update.tv_sec);
>  		mgtime_counter_inc(mg_ctime_swaps);
>  		return update;
>  	}
> @@ -2899,7 +2899,7 @@ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 u
>  		goto retry;
>  
>  	/* Otherwise, it was a new timestamp. */
> -	cur_ts.tv_sec = inode->i_ctime_sec;
> +	cur_ts.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  	cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
>  	return cur_ts;
>  }
> diff --git a/fs/stat.c b/fs/stat.c
> index 0870e969a8a0..e39c78cd62f3 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -53,7 +53,7 @@ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
>  	}
>  
>  	stat->mtime = inode_get_mtime(inode);
> -	stat->ctime.tv_sec = inode->i_ctime_sec;
> +	stat->ctime.tv_sec = READ_ONCE(inode->i_ctime_sec);
>  	stat->ctime.tv_nsec = (u32)atomic_read(pcn);
>  	if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
>  		stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
> -- 
> 2.34.1
> 
>