[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d4474c49-1000-4553-bd21-c0a9ad41bba4@oracle.com>
Date: Thu, 11 Jul 2024 08:17:26 +0100
From: John Garry <john.g.garry@...cle.com>
To: "Darrick J. Wong" <djwong@...nel.org>
Cc: chandan.babu@...cle.com, dchinner@...hat.com, hch@....de,
viro@...iv.linux.org.uk, brauner@...nel.org, jack@...e.cz,
linux-xfs@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org, catherine.hoang@...cle.com,
martin.petersen@...cle.com
Subject: Re: [PATCH v2 07/13] xfs: Introduce FORCEALIGN inode flag
On 11/07/2024 03:59, Darrick J. Wong wrote:
> On Fri, Jul 05, 2024 at 04:24:44PM +0000, John Garry wrote:
>> From: "Darrick J. Wong" <djwong@...nel.org>
>>
>> Add a new inode flag to require that all file data extent mappings must
>> be aligned (both the file offset range and the allocated space itself)
>> to the extent size hint. Having a separate COW extent size hint is no
>> longer allowed.
>>
>> The goal here is to enable sysadmins and users to mandate that all space
>> mappings in a file must have a startoff/blockcount that are aligned to
>> (say) a 2MB alignment and that the startblock/blockcount will follow the
>> same alignment.
>>
>> Allocated space will be aligned to start of the AG, and not necessarily
>> aligned with disk blocks. The upcoming atomic writes feature will rely and
>> forcealign and will also require allocated space will also be aligned to
>> disk blocks.
>>
>> Currently RT is not be supported for forcealign, so reject a mount under
>> that condition. In future, it should be possible to support forcealign for
>> RT. In this case, the extent size hint will need to be aligned with
>> rtextsize, so add inode verification for that now.
>>
>> reflink link will not be supported for forcealign. This is because we have
>> the limitation of pageache writeback not knowing how to writeback an
>> entire allocation unut, so reject a mount with relink.
>>
>> Signed-off-by: "Darrick J. Wong" <djwong@...nel.org>
>> Co-developed-by: John Garry <john.g.garry@...cle.com>
>> [jpg: many changes from orig, including forcealign inode verification
>> rework, disallow RT and forcealign mount, ioctl setattr rework,
>> disallow reflink a forcealign inode]
>> Signed-off-by: John Garry <john.g.garry@...cle.com>
>> ---
>> fs/xfs/libxfs/xfs_format.h | 6 +++-
>> fs/xfs/libxfs/xfs_inode_buf.c | 55 +++++++++++++++++++++++++++++++++++
>> fs/xfs/libxfs/xfs_inode_buf.h | 3 ++
>> fs/xfs/libxfs/xfs_sb.c | 2 ++
>> fs/xfs/xfs_inode.c | 13 +++++++++
>> fs/xfs/xfs_inode.h | 20 ++++++++++++-
>> fs/xfs/xfs_ioctl.c | 51 ++++++++++++++++++++++++++++++--
>> fs/xfs/xfs_mount.h | 2 ++
>> fs/xfs/xfs_reflink.c | 5 ++--
>> fs/xfs/xfs_reflink.h | 10 -------
>> fs/xfs/xfs_super.c | 11 +++++++
>> include/uapi/linux/fs.h | 2 ++
>> 12 files changed, 164 insertions(+), 16 deletions(-)
>>
>> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
>> index 61f51becff4f..b48cd75d34a6 100644
>> --- a/fs/xfs/libxfs/xfs_format.h
>> +++ b/fs/xfs/libxfs/xfs_format.h
>> @@ -353,6 +353,7 @@ xfs_sb_has_compat_feature(
>> #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
>> #define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
>> #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
>> +#define XFS_SB_FEAT_RO_COMPAT_FORCEALIGN (1 << 30) /* aligned file data extents */
>> #define XFS_SB_FEAT_RO_COMPAT_ALL \
>> (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
>> XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
>> @@ -1094,16 +1095,19 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
>> #define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
>> #define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
>> #define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
>> +/* data extent mappings for regular files must be aligned to extent size hint */
>> +#define XFS_DIFLAG2_FORCEALIGN_BIT 5
>>
>> #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
>> #define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
>> #define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
>> #define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
>> #define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
>> +#define XFS_DIFLAG2_FORCEALIGN (1 << XFS_DIFLAG2_FORCEALIGN_BIT)
>>
>> #define XFS_DIFLAG2_ANY \
>> (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
>> - XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
>> + XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_FORCEALIGN)
>>
>> static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
>> {
>> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
>> index 513b50da6215..5c61a1d1bb2b 100644
>> --- a/fs/xfs/libxfs/xfs_inode_buf.c
>> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
>> @@ -657,6 +657,15 @@ xfs_dinode_verify(
>> !xfs_has_bigtime(mp))
>> return __this_address;
>>
>> + if (flags2 & XFS_DIFLAG2_FORCEALIGN) {
>> + fa = xfs_inode_validate_forcealign(mp,
>> + be32_to_cpu(dip->di_extsize),
>> + be32_to_cpu(dip->di_cowextsize),
>> + mode, flags, flags2);
>> + if (fa)
>> + return fa;
>> + }
>> +
>> return NULL;
>> }
>>
>> @@ -824,3 +833,49 @@ xfs_inode_validate_cowextsize(
>>
>> return NULL;
>> }
>> +
>> +/* Validate the forcealign inode flag */
>> +xfs_failaddr_t
>> +xfs_inode_validate_forcealign(
>> + struct xfs_mount *mp,
>> + uint32_t extsize,
>> + uint32_t cowextsize,
>> + uint16_t mode,
>> + uint16_t flags,
>> + uint64_t flags2)
>> +{
>> + bool rt = flags & XFS_DIFLAG_REALTIME;
>> +
>> + /* superblock rocompat feature flag */
>> + if (!xfs_has_forcealign(mp))
>> + return __this_address;
>> +
>> + /* Only regular files and directories */
>> + if (!S_ISDIR(mode) && !S_ISREG(mode))
>> + return __this_address;
>> +
>> + /* We require EXTSIZE or EXTSZINHERIT */
>> + if (!(flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)))
>> + return __this_address;
>> +
>> + /* We require a non-zero extsize */
>> + if (!extsize)
>> + return __this_address;
>> +
>> + /* Reflink'ed disallowed */
>> + if (flags2 & XFS_DIFLAG2_REFLINK)
>> + return __this_address;
>
> Hmm. If we don't support reflink + forcealign ATM, then shouldn't the
> superblock verifier or xfs_fs_fill_super fail the mount so that old
> kernels won't abruptly emit EFSCORRUPTED errors if a future kernel adds
> support for forcealign'd cow and starts writing out files with both
> iflags set?
Fine, I see that we do something similar now for rtdev.
However why even have the rt inode check, below, to disallow for reflink
cp for rt inode (if we can't even mount with rt and reflink together)?
>> * Mount features
>> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
>> index 265a2a418bc7..8da293e8bfa2 100644
>> --- a/fs/xfs/xfs_reflink.c
>> +++ b/fs/xfs/xfs_reflink.c
>> @@ -1467,8 +1467,9 @@ xfs_reflink_remap_prep(
>>
>> /* Check file eligibility and prepare for block sharing. */
>> ret = -EINVAL;
>> - /* Don't reflink realtime inodes */
>> - if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
>> + /* Don't reflink realtime or forcealign inodes */
>> + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest) ||
>> + xfs_inode_has_forcealign(src) || xfs_inode_has_forcealign(dest))
>> goto out_unlock;
>>
>> /* Don't share DAX file data with non-DAX file. */
>> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
>> index 65c5dfe17ecf..fb55e4ce49fa 100644
>> --- a/fs/xfs/xfs_reflink.h
>> +++ b/fs/xfs/xfs_reflink.h
>> @@ -6,16 +6,6 @@
>> #ifndef __XFS_REFLINK_H
>> #define __XFS_REFLINK_H 1
>>
Powered by blists - more mailing lists