[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <76c85021-dd9e-49e3-80e3-25a17c7ca455@oracle.com>
Date: Tue, 19 Dec 2023 12:41:37 +0000
From: John Garry <john.g.garry@...cle.com>
To: Christoph Hellwig <hch@....de>, "Darrick J. Wong" <djwong@...nel.org>
Cc: axboe@...nel.dk, kbusch@...nel.org, sagi@...mberg.me, jejb@...ux.ibm.com,
martin.petersen@...cle.com, viro@...iv.linux.org.uk,
brauner@...nel.org, dchinner@...hat.com, jack@...e.cz,
linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-nvme@...ts.infradead.org, linux-xfs@...r.kernel.org,
linux-fsdevel@...r.kernel.org, tytso@....edu, jbongio@...gle.com,
linux-scsi@...r.kernel.org, ming.lei@...hat.com, jaswin@...ux.ibm.com,
bvanassche@....org
Subject: Re: [PATCH v2 00/16] block atomic writes
On 19/12/2023 05:21, Christoph Hellwig wrote:
> On Mon, Dec 18, 2023 at 09:14:56PM -0800, Darrick J. Wong wrote:
>> /me stumbles back in with plenty of covidbrain to go around.
>>
>> So ... Christoph, you're asking for a common API for
>> sysadmins/applications to signal to the filesystem that they want all
>> data allocations aligned to a given value, right?
>>
>> e.g. if a nvme device advertises a capability for untorn writes between
>> $lbasize and 64k, then we need a way to set up each untorn-file with
>> some alignment between $lbasize and 64k?
>>
>> or if cxl storage becomes not ung-dly expensive, then we'd want a way to
>> set up files with an alignment of 2M (x86) or 512M (arm64lol) to take
>> advantage of PMD mmap mappings?
>
> The most important point is to not mix these up.
>
> If we want to use a file for atomic writes I should tell the fs about
> it, and preferably in a way that does not require me to know about weird
> internal implementation details of every file system. I really just
> want to use atomic writes. Preferably I'd just start using them after
> asking for the limits. But that's obviously not going to work for
> file systems that use the hardware offload and don't otherwise align
> to the limit (which would suck for very small files anyway :))
>
> So as a compromise I tell the file system before writing or otherwise
> adding any data [1] to file that I want to use atomic writes so that
> the fs can take the right decisions.
>
> [1] reflinking data into a such marked file will be ... interesting.
>
How about something based on fcntl, like below? We will prob also
require some per-FS flag for enabling atomic writes without HW support.
That flag might be also useful for XFS for differentiating forcealign
for atomic writes with just forcealign.
---8<----
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..d4a50655565a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -313,6 +313,15 @@ static long fcntl_rw_hint(struct file *file,
unsigned int cmd,
}
}
+static long fcntl_atomic_write(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ if (file->f_op->enable_atomic_writes)
+ return file->f_op->enable_atomic_writes(file, arg);
+
+ return -EOPNOTSUPP;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -419,6 +428,9 @@ static long do_fcntl(int fd, unsigned int cmd,
unsigned long arg,
case F_SET_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
+ case F_SET_ATOMIC_WRITE_SIZE:
+ err = fcntl_atomic_write(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..dba206cbe1ab 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1478,6 +1478,46 @@ xfs_file_mmap(
return 0;
}
+STATIC int
+xfs_enable_atomic_writes(
+ struct file *file,
+ unsigned int unit_max)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+ struct request_queue *q = bdev->bd_queue;
+ struct inode *inode = &ip->i_vnode;
+
+ if (queue_atomic_write_unit_max_bytes(q) == 0) {
+ if (unit_max != 0) {
+ /* We expect unbounded CoW size if no HW support */
+ return -ENOTBLK;
+ }
+ /* Do something for CoW support ... */
+
+ return 0;
+ }
+
+ if (!unit_max)
+ return -EINVAL;
+
+ /* bodge alert */
+ if (inode->i_op->fileattr_set) {
+ struct fileattr fa = {
+ .fsx_extsize = unit_max,
+ .fsx_xflags = FS_XFLAG_EXTSIZE | FS_XFLAG_FORCEALIGN,
+ .fsx_valid = 1,
+ };
+
+ return inode->i_op->fileattr_set(idmap, dentry, &fa);
+ }
+
+ return -EOPNOTSUPP;
+}
+
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
.read_iter = xfs_file_read_iter,
@@ -1498,6 +1538,7 @@ const struct file_operations xfs_file_operations = {
.fallocate = xfs_file_fallocate,
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
+ .enable_atomic_writes = xfs_enable_atomic_writes,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..a715f98057b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1956,6 +1956,7 @@ struct file_operations {
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
+ int (*enable_atomic_writes)(struct file *, unsigned int unit_max);
} __randomize_layout;
/* Wrap a directory iterator that needs exclusive inode access */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6c80f96049bd..69780c49622e 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -56,6 +56,8 @@
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)
+#define F_SET_ATOMIC_WRITE_SIZE (F_LINUX_SPECIFIC_BASE + 15)
+
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
--
--->8----
Powered by blists - more mailing lists