linux-kernel - Re: [PATCH v2 00/16] block atomic writes

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <76c85021-dd9e-49e3-80e3-25a17c7ca455@oracle.com>
Date: Tue, 19 Dec 2023 12:41:37 +0000
From: John Garry <john.g.garry@...cle.com>
To: Christoph Hellwig <hch@....de>, "Darrick J. Wong" <djwong@...nel.org>
Cc: axboe@...nel.dk, kbusch@...nel.org, sagi@...mberg.me, jejb@...ux.ibm.com,
        martin.petersen@...cle.com, viro@...iv.linux.org.uk,
        brauner@...nel.org, dchinner@...hat.com, jack@...e.cz,
        linux-block@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-nvme@...ts.infradead.org, linux-xfs@...r.kernel.org,
        linux-fsdevel@...r.kernel.org, tytso@....edu, jbongio@...gle.com,
        linux-scsi@...r.kernel.org, ming.lei@...hat.com, jaswin@...ux.ibm.com,
        bvanassche@....org
Subject: Re: [PATCH v2 00/16] block atomic writes

On 19/12/2023 05:21, Christoph Hellwig wrote:
> On Mon, Dec 18, 2023 at 09:14:56PM -0800, Darrick J. Wong wrote:
>> /me stumbles back in with plenty of covidbrain to go around.
>>
>> So ... Christoph, you're asking for a common API for
>> sysadmins/applications to signal to the filesystem that they want all
>> data allocations aligned to a given value, right?
>>
>> e.g. if a nvme device advertises a capability for untorn writes between
>> $lbasize and 64k, then we need a way to set up each untorn-file with
>> some alignment between $lbasize and 64k?
>>
>> or if cxl storage becomes not ung-dly expensive, then we'd want a way to
>> set up files with an alignment of 2M (x86) or 512M (arm64lol) to take
>> advantage of PMD mmap mappings?
> 
> The most important point is to not mix these up.
> 
> If we want to use a file for atomic writes I should tell the fs about
> it, and preferably in a way that does not require me to know about weird
> internal implementation details of every file system.  I really just
> want to use atomic writes.  Preferably I'd just start using them after
> asking for the limits.  But that's obviously not going to work for
> file systems that use the hardware offload and don't otherwise align
> to the limit (which would suck for very small files anyway :))
> 
> So as a compromise I tell the file system before writing or otherwise
> adding any data [1] to file that I want to use atomic writes so that
> the fs can take the right decisions.
> 
> [1] reflinking data into a such marked file will be ... interesting.
> 

How about something based on fcntl, like below? We will prob also 
require some per-FS flag for enabling atomic writes without HW support. 
That flag might be also useful for XFS for differentiating forcealign 
for atomic writes with just forcealign.

---8<----

diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..d4a50655565a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -313,6 +313,15 @@ static long fcntl_rw_hint(struct file *file, 
unsigned int cmd,
  	}
  }

+static long fcntl_atomic_write(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	if (file->f_op->enable_atomic_writes)
+		return file->f_op->enable_atomic_writes(file, arg);
+
+	return -EOPNOTSUPP;
+}
+
  static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
  		struct file *filp)
  {
@@ -419,6 +428,9 @@ static long do_fcntl(int fd, unsigned int cmd, 
unsigned long arg,
  	case F_SET_RW_HINT:
  		err = fcntl_rw_hint(filp, cmd, arg);
  		break;
+	case F_SET_ATOMIC_WRITE_SIZE:
+		err = fcntl_atomic_write(filp, cmd, arg);
+		break;
  	default:
  		break;
  	}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..dba206cbe1ab 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1478,6 +1478,46 @@ xfs_file_mmap(
  	return 0;
  }

+STATIC int
+xfs_enable_atomic_writes(
+	struct file	*file,
+	unsigned int unit_max)
+{
+	struct xfs_inode *ip = XFS_I(file_inode(file));
+	struct mnt_idmap *idmap = file_mnt_idmap(file);
+	struct dentry *dentry = file->f_path.dentry;
+	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+	struct block_device	*bdev = target->bt_bdev;
+	struct request_queue	*q = bdev->bd_queue;
+	struct inode		*inode = &ip->i_vnode;
+
+	if (queue_atomic_write_unit_max_bytes(q) == 0) {
+		if (unit_max != 0) {
+			/* We expect unbounded CoW size if no HW support */
+			return -ENOTBLK;
+		}
+		/* Do something for CoW support ... */
+
+		return 0;
+	}
+
+	if (!unit_max)
+		return -EINVAL;
+
+	/* bodge alert */
+	if (inode->i_op->fileattr_set) {
+		struct fileattr fa = {
+			.fsx_extsize = unit_max,
+			.fsx_xflags = FS_XFLAG_EXTSIZE | FS_XFLAG_FORCEALIGN,
+			.fsx_valid = 1,
+		};
+
+		return inode->i_op->fileattr_set(idmap, dentry, &fa);
+	}
+
+	return -EOPNOTSUPP;
+}
+
  const struct file_operations xfs_file_operations = {
  	.llseek		= xfs_file_llseek,
  	.read_iter	= xfs_file_read_iter,
@@ -1498,6 +1538,7 @@ const struct file_operations xfs_file_operations = {
  	.fallocate	= xfs_file_fallocate,
  	.fadvise	= xfs_file_fadvise,
  	.remap_file_range = xfs_file_remap_range,
+	.enable_atomic_writes = xfs_enable_atomic_writes,
  };

  const struct file_operations xfs_dir_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..a715f98057b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1956,6 +1956,7 @@ struct file_operations {
  	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
  	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
  				unsigned int poll_flags);
+	int (*enable_atomic_writes)(struct file *, unsigned int unit_max);
  } __randomize_layout;

  /* Wrap a directory iterator that needs exclusive inode access */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6c80f96049bd..69780c49622e 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -56,6 +56,8 @@
  #define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
  #define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)

+#define F_SET_ATOMIC_WRITE_SIZE  (F_LINUX_SPECIFIC_BASE + 15)
+
  /*
   * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
   * used to clear any hints previously set.
-- 


--->8----