Message-ID: <20250415165520.GS25675@frogsfrogsfrogs>
Date: Tue, 15 Apr 2025 09:55:20 -0700
From: "Darrick J. Wong" <djwong@...nel.org>
To: John Garry <john.g.garry@...cle.com>
Cc: brauner@...nel.org, hch@....de, viro@...iv.linux.org.uk, jack@...e.cz,
	cem@...nel.org, linux-fsdevel@...r.kernel.org, dchinner@...hat.com,
	linux-xfs@...r.kernel.org, linux-kernel@...r.kernel.org,
	ojaswin@...ux.ibm.com, ritesh.list@...il.com,
	martin.petersen@...cle.com, linux-ext4@...r.kernel.org,
	linux-block@...r.kernel.org, catherine.hoang@...cle.com,
	linux-api@...r.kernel.org
Subject: Re: [PATCH v7 14/14] xfs: allow sysadmins to specify a maximum
 atomic write limit at mount time

On Tue, Apr 15, 2025 at 12:14:25PM +0000, John Garry wrote:
> From: "Darrick J. Wong" <djwong@...nel.org>
> 
> Introduce a mount option to allow sysadmins to specify the maximum size
> of an atomic write.  When this happens, we dynamically recompute the
> tr_atomic_write transaction reservation based on the given block size,
> and then check that we don't violate any of the minimum log size
> constraints.
> 
> The actual software atomic write max is still computed based on
> tr_atomic the same way it has been for the past few commits.
> 
> Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
> Signed-off-by: John Garry <john.g.garry@...cle.com>
> ---
>  Documentation/admin-guide/xfs.rst |  8 +++++
>  fs/xfs/libxfs/xfs_trans_resv.c    | 54 +++++++++++++++++++++++++++++++
>  fs/xfs/libxfs/xfs_trans_resv.h    |  1 +
>  fs/xfs/xfs_mount.c                |  8 ++++-
>  fs/xfs/xfs_mount.h                |  5 +++
>  fs/xfs/xfs_super.c                | 28 +++++++++++++++-
>  fs/xfs/xfs_trace.h                | 33 +++++++++++++++++++
>  7 files changed, 135 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
> index b67772cf36d6..715019ec4f24 100644
> --- a/Documentation/admin-guide/xfs.rst
> +++ b/Documentation/admin-guide/xfs.rst
> @@ -143,6 +143,14 @@ When mounting an XFS filesystem, the following options are accepted.
>  	optional, and the log section can be separate from the data
>  	section or contained within it.
>  
> +  max_atomic_write=value
> +	Set the maximum size of an atomic write.  The size may be
> +	specified in bytes, in kilobytes with a "k" suffix, in megabytes
> +	with a "m" suffix, or in gigabytes with a "g" suffix.
> +
> +	The default is to set the maximum I/O completion size so that
> +	each CPU can handle one completion at a time.
> +
>    noalign
>  	Data allocations will not be aligned at stripe unit
>  	boundaries. This is only relevant to filesystems created
> diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
> index f530aa5d72f5..36e47ec3c3c2 100644
> --- a/fs/xfs/libxfs/xfs_trans_resv.c
> +++ b/fs/xfs/libxfs/xfs_trans_resv.c
> @@ -1475,3 +1475,57 @@ xfs_calc_max_atomic_write_fsblocks(
>  
>  	return ret;
>  }
> +
> +/*
> + * Compute the log reservation needed to complete an atomic write of a given
> + * number of blocks.  Worst case, each block requires separate handling.
> + * Returns true if the blockcount is supported, false otherwise.
> + */
> +bool
> +xfs_calc_atomic_write_reservation(
> +	struct xfs_mount	*mp,
> +	int			bytes)

Hmm, the comment says this should be a block count, not a byte count.

	xfs_extlen_t		blockcount)
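
The declaration in xfs_trans_resv.h would want the same tweak; roughly,
assuming the byte-to-block conversion moves out to the caller:

bool xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
		xfs_extlen_t blockcount);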

> +{
> +	struct xfs_trans_res	*curr_res = &M_RES(mp)->tr_atomic_ioend;
> +	unsigned int		per_intent, step_size;
> +	unsigned int		logres;
> +	xfs_extlen_t		blockcount = XFS_B_TO_FSBT(mp, bytes);
> +	uint			old_logres =
> +		M_RES(mp)->tr_atomic_ioend.tr_logres;
> +	int			min_logblocks;
> +
> +	/*
> +	 * If the caller doesn't ask for a specific atomic write size, then
> +	 * we'll conservatively use tr_itruncate as the basis for computing
> +	 * a reasonable maximum.
> +	 */
> +	if (blockcount == 0) {
> +		curr_res->tr_logres = M_RES(mp)->tr_itruncate.tr_logres;
> +		return true;
> +	}
> +
> +	/* Untorn write completions require out of place write remapping */
> +	if (!xfs_has_reflink(mp))
> +		return false;
> +
> +	per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
> +
> +	if (check_mul_overflow(blockcount, per_intent, &logres))
> +		return false;
> +	if (check_add_overflow(logres, step_size, &logres))
> +		return false;
> +
> +	curr_res->tr_logres = logres;
> +	min_logblocks = xfs_log_calc_minimum_size(mp);
> +
> +	trace_xfs_calc_max_atomic_write_reservation(mp, per_intent, step_size,
> +			blockcount, min_logblocks, curr_res->tr_logres);
> +
> +	if (min_logblocks > mp->m_sb.sb_logblocks) {
> +		/* Log too small, revert changes. */
> +		curr_res->tr_logres = old_logres;
> +		return false;
> +	}
> +
> +	return true;
> +}
> diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
> index a6d303b83688..af974f920556 100644
> --- a/fs/xfs/libxfs/xfs_trans_resv.h
> +++ b/fs/xfs/libxfs/xfs_trans_resv.h
> @@ -122,5 +122,6 @@ unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
>  unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
>  
>  xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
> +bool xfs_calc_atomic_write_reservation(struct xfs_mount *mp, int bytes);
>  
>  #endif	/* __XFS_TRANS_RESV_H__ */
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 860fc3c91fd5..b8dd9e956c2a 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -671,7 +671,7 @@ static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
>  	return 1 << (ffs(nr) - 1);
>  }
>  
> -static inline void
> +void
>  xfs_compute_atomic_write_unit_max(
>  	struct xfs_mount	*mp)
>  {
> @@ -1160,6 +1160,12 @@ xfs_mountfs(
>  	 * derived from transaction reservations, so we must do this after the
>  	 * log is fully initialized.
>  	 */
> +	if (!xfs_calc_atomic_write_reservation(mp, mp->m_awu_max_bytes)) {
> +		xfs_warn(mp, "cannot support atomic writes of %u bytes",
> +				mp->m_awu_max_bytes);
> +		error = -EINVAL;
> +		goto out_agresv;
> +	}

Hmm.  I don't think this is sufficient validation of m_awu_max_bytes.
xfs_compute_atomic_write_unit_max never allows an atomic write that is
larger than MAX_RW_COUNT or larger than the allocation group, because
it's not possible to land a single write larger than either of those
sizes.  The parsing code doesn't reject values that aren't a multiple
of the fsblock size, and suffix_kstrtoint overflows its signed int
result if you feed it values that are too large (like "2g").  I
propose something like this:

/*
 * Try to set the atomic write maximum to a new value that we got from
 * userspace via mount option.
 */
int
xfs_set_max_atomic_write_opt(
	struct xfs_mount	*mp,
	unsigned long long	new_max_bytes)
{
	xfs_filblks_t		new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);

	if (new_max_bytes) {
		xfs_extlen_t	max_write_fsbs =
			rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
		xfs_extlen_t	max_group_fsbs =
			max(mp->m_groups[XG_TYPE_AG].blocks,
			    mp->m_groups[XG_TYPE_RTG].blocks);

		ASSERT(max_write_fsbs <= U32_MAX);

		if (new_max_bytes % mp->m_sb.sb_blocksize > 0) {
			xfs_warn(mp,
 "max atomic write size of %llu bytes not aligned with fsblock",
					new_max_bytes);
			return -EINVAL;
		}

		if (new_max_fsbs > max_write_fsbs) {
			xfs_warn(mp,
 "max atomic write size of %lluk cannot be larger than max write size %lluk",
					new_max_bytes >> 10,
					XFS_FSB_TO_B(mp, max_write_fsbs) >> 10);
			return -EINVAL;
		}

		if (new_max_fsbs > max_group_fsbs) {
			xfs_warn(mp,
 "max atomic write size of %lluk cannot be larger than allocation group size %lluk",
					new_max_bytes >> 10,
					XFS_FSB_TO_B(mp, max_group_fsbs) >> 10);
			return -EINVAL;
		}
	}

	if (!xfs_calc_atomic_write_reservation(mp, new_max_fsbs)) {
		xfs_warn(mp,
 "cannot support completing atomic writes of %lluk",
				new_max_bytes >> 10);
		return -EINVAL;
	}

	xfs_compute_atomic_write_unit_max(mp);
	return 0;
}

and then we get some nicer error messages about what exactly failed
validation.
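
Untested sketch of how xfs_mountfs would then pick this up in place of
the open-coded check above:

	error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
	if (error)
		goto out_agresv;

and xfs_fs_reconfigure could call the same helper when the option
changes across a remount.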

>  	xfs_compute_atomic_write_unit_max(mp);
>  
>  	return 0;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index c0eff3adfa31..a5037db4ecff 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -236,6 +236,9 @@ typedef struct xfs_mount {
>  	bool			m_update_sb;	/* sb needs update in mount */
>  	unsigned int		m_max_open_zones;
>  
> +	/* max_atomic_write mount option value */
> +	unsigned int		m_awu_max_bytes;
> +
>  	/*
>  	 * Bitsets of per-fs metadata that have been checked and/or are sick.
>  	 * Callers must hold m_sb_lock to access these two fields.
> @@ -798,4 +801,6 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
>  	percpu_counter_add(&mp->m_delalloc_blks, delta);
>  }
>  
> +void xfs_compute_atomic_write_unit_max(struct xfs_mount *mp);
> +
>  #endif	/* __XFS_MOUNT_H__ */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b2dd0c0bf509..f7849052e5ff 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -111,7 +111,7 @@ enum {
>  	Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
>  	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
>  	Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
> -	Opt_lifetime, Opt_nolifetime,
> +	Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
>  };
>  
>  static const struct fs_parameter_spec xfs_fs_parameters[] = {
> @@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
>  	fsparam_u32("max_open_zones",	Opt_max_open_zones),
>  	fsparam_flag("lifetime",	Opt_lifetime),
>  	fsparam_flag("nolifetime",	Opt_nolifetime),
> +	fsparam_string("max_atomic_write",	Opt_max_atomic_write),
>  	{}
>  };
>  
> @@ -241,6 +242,9 @@ xfs_fs_show_options(
>  
>  	if (mp->m_max_open_zones)
>  		seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
> +	if (mp->m_awu_max_bytes)
> +		seq_printf(m, ",max_atomic_write=%uk",
> +				mp->m_awu_max_bytes >> 10);
>  
>  	return 0;
>  }
> @@ -1518,6 +1522,13 @@ xfs_fs_parse_param(
>  	case Opt_nolifetime:
>  		parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
>  		return 0;
> +	case Opt_max_atomic_write:
> +		if (suffix_kstrtoint(param->string, 10,
> +				     &parsing_mp->m_awu_max_bytes))

Let's replace this with a new suffix_kstrtoull helper that returns an
unsigned long long quantity that won't overflow.  This has the
unfortunate consequence that we have to burn a u64 in xfs_mount instead
of a u32.
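
Something along these lines, cribbed from the existing suffix_kstrtoint
helper -- untested sketch, name and placement are just a suggestion:

/*
 * Parse a number with an optional k/m/g suffix into an unsigned long
 * long so that values like "2g" don't overflow a signed int.
 */
static int
suffix_kstrtoull(
	const char		*s,
	unsigned int		base,
	unsigned long long	*res)
{
	int			last, shift_left_factor = 0;
	unsigned long long	_res = 0;
	char			*value;
	int			ret = 0;

	value = kstrdup(s, GFP_KERNEL);
	if (!value)
		return -ENOMEM;

	last = strlen(value) - 1;
	if (value[last] == 'K' || value[last] == 'k') {
		shift_left_factor = 10;
		value[last] = '\0';
	}
	if (value[last] == 'M' || value[last] == 'm') {
		shift_left_factor = 20;
		value[last] = '\0';
	}
	if (value[last] == 'G' || value[last] == 'g') {
		shift_left_factor = 30;
		value[last] = '\0';
	}

	if (kstrtoull(value, base, &_res))
		ret = -EINVAL;
	kfree(value);
	*res = _res << shift_left_factor;
	return ret;
}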

--D

> +			return -EINVAL;
> +		if (parsing_mp->m_awu_max_bytes < 0)
> +			return -EINVAL;
> +		return 0;
>  	default:
>  		xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
>  		return -EINVAL;
> @@ -2114,6 +2125,16 @@ xfs_fs_reconfigure(
>  	if (error)
>  		return error;
>  
> +	/* validate new max_atomic_write option before making other changes */
> +	if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
> +		if (!xfs_calc_atomic_write_reservation(mp,
> +					new_mp->m_awu_max_bytes)) {
> +			xfs_warn(mp, "cannot support atomic writes of %u bytes",
> +					new_mp->m_awu_max_bytes);
> +			return -EINVAL;
> +		}
> +	}
> +
>  	/* inode32 -> inode64 */
>  	if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
>  		mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
> @@ -2140,6 +2161,11 @@ xfs_fs_reconfigure(
>  			return error;
>  	}
>  
> +	/* set new atomic write max here */
> +	if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
> +		xfs_compute_atomic_write_unit_max(mp);
> +		mp->m_awu_max_bytes = new_mp->m_awu_max_bytes;
> +	}
>  	return 0;
>  }
>  
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 24d73e9bbe83..d41885f1efe2 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -230,6 +230,39 @@ TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
>  		  __entry->blockcount)
>  );
>  
> +TRACE_EVENT(xfs_calc_max_atomic_write_reservation,
> +	TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
> +		 unsigned int step_size, unsigned int blockcount,
> +		 unsigned int min_logblocks, unsigned int logres),
> +	TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(unsigned int, per_intent)
> +		__field(unsigned int, step_size)
> +		__field(unsigned int, blockcount)
> +		__field(unsigned int, min_logblocks)
> +		__field(unsigned int, cur_logblocks)
> +		__field(unsigned int, logres)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->per_intent = per_intent;
> +		__entry->step_size = step_size;
> +		__entry->blockcount = blockcount;
> +		__entry->min_logblocks = min_logblocks;
> +		__entry->cur_logblocks = mp->m_sb.sb_logblocks;
> +		__entry->logres = logres;
> +	),
> +	TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->per_intent,
> +		  __entry->step_size,
> +		  __entry->blockcount,
> +		  __entry->min_logblocks,
> +		  __entry->cur_logblocks,
> +		  __entry->logres)
> +);
> +
>  TRACE_EVENT(xlog_intent_recovery_failed,
>  	TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
>  		 int error),
> -- 
> 2.31.1
> 
> 
