lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250911160208.69086-1-sergeybashirov@gmail.com>
Date: Thu, 11 Sep 2025 19:02:03 +0300
From: Sergey Bashirov <sergeybashirov@...il.com>
To: Chuck Lever <chuck.lever@...cle.com>,
	Jeff Layton <jlayton@...nel.org>,
	NeilBrown <neil@...wn.name>,
	Olga Kornievskaia <okorniev@...hat.com>,
	Dai Ngo <Dai.Ngo@...cle.com>,
	Tom Talpey <tom@...pey.com>
Cc: linux-nfs@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Sergey Bashirov <sergeybashirov@...il.com>
Subject: [PATCH] NFSD: Impl multiple extents in block/scsi layoutget

This patch allows the pNFS server to respond with multiple extents
in a layoutget request. As a result, the number of layoutget requests
is significantly reduced for various file access patterns, including
random and parallel writes, avoiding unnecessary load to the server.
On the client side, this improves the performance of writing large
files and allows requesting layouts with minimum length greater than
PAGE_SIZE.

Signed-off-by: Sergey Bashirov <sergeybashirov@...il.com>
---
Checked with smatch, tested on pNFS block volume setup.

 fs/nfsd/blocklayout.c    | 167 +++++++++++++++++++++++++++++----------
 fs/nfsd/blocklayoutxdr.c |  36 ++++++---
 fs/nfsd/blocklayoutxdr.h |   5 ++
 3 files changed, 157 insertions(+), 51 deletions(-)

diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index fde5539cf6a6..d53f3ec8823a 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -17,48 +17,39 @@
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
 
+/**
+ * nfsd4_block_map_extent - get extent that covers the start of segment
+ * @inode: inode of the file requested by the client
+ * @fhp: handle of the file requested by the client
+ * @seg: layout subrange requested by the client
+ * @minlength: layout min length requested by the client
+ * @bex: output block extent structure
+ *
+ * Get an extent from the file system that starts at @seg->offset or below,
+ * but may be shorter than @seg->length.
+ *
+ * Return values:
+ *   %nfs_ok: Success, @bex is initialized and valid
+ *   %nfserr_layoutunavailable: Failed to get extent for requested @seg
+ *   OS errors converted to NFS errors
+ */
 static __be32
-nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
-		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
+nfsd4_block_map_extent(struct inode *inode, const struct svc_fh *fhp,
+		const struct nfsd4_layout_seg *seg, u64 minlength,
+		struct pnfs_block_extent *bex)
 {
-	struct nfsd4_layout_seg *seg = &args->lg_seg;
 	struct super_block *sb = inode->i_sb;
-	u32 block_size = i_blocksize(inode);
-	struct pnfs_block_extent *bex;
 	struct iomap iomap;
 	u32 device_generation = 0;
 	int error;
 
-	if (locks_in_grace(SVC_NET(rqstp)))
-		return nfserr_grace;
-
-	if (seg->offset & (block_size - 1)) {
-		dprintk("pnfsd: I/O misaligned\n");
-		goto out_layoutunavailable;
-	}
-
-	/*
-	 * Some clients barf on non-zero block numbers for NONE or INVALID
-	 * layouts, so make sure to zero the whole structure.
-	 */
-	error = -ENOMEM;
-	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
-	if (!bex)
-		goto out_error;
-	args->lg_content = bex;
-
 	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
 					    &iomap, seg->iomode != IOMODE_READ,
 					    &device_generation);
 	if (error) {
 		if (error == -ENXIO)
-			goto out_layoutunavailable;
-		goto out_error;
-	}
-
-	if (iomap.length < args->lg_minlength) {
-		dprintk("pnfsd: extent smaller than minlength\n");
-		goto out_layoutunavailable;
+			return nfserr_layoutunavailable;
+		return nfserrno(error);
 	}
 
 	switch (iomap.type) {
@@ -74,9 +65,9 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
 			/*
 			 * Crack monkey special case from section 2.3.1.
 			 */
-			if (args->lg_minlength == 0) {
+			if (minlength == 0) {
 				dprintk("pnfsd: no soup for you!\n");
-				goto out_layoutunavailable;
+				return nfserr_layoutunavailable;
 			}
 
 			bex->es = PNFS_BLOCK_INVALID_DATA;
@@ -93,27 +84,119 @@ nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
 	case IOMAP_DELALLOC:
 	default:
 		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
-		goto out_layoutunavailable;
+		return nfserr_layoutunavailable;
 	}
 
 	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
 	if (error)
-		goto out_error;
+		return nfserrno(error);
+
 	bex->foff = iomap.offset;
 	bex->len = iomap.length;
+	return nfs_ok;
+}
 
-	seg->offset = iomap.offset;
-	seg->length = iomap.length;
+/**
+ * nfsd4_block_map_segment - get extent array for the requested layout
+ * @inode: inode of the file requested by the client
+ * @fhp: handle of the file requested by the client
+ * @seg: layout range requested by the client
+ * @minlength: layout min length requested by the client
+ * @bl: output array of block extents
+ *
+ * Get an array of consecutive block extents that span the requested
+ * layout range. The resulting range may be shorter than requested if
+ * all preallocated block extents are used.
+ *
+ * Return values:
+ *   %nfs_ok: Success, @bl initialized and valid
+ *   %nfserr_layoutunavailable: Failed to get extents for requested @seg
+ *   OS errors converted to NFS errors
+ */
+static __be32
+nfsd4_block_map_segment(struct inode *inode, const struct svc_fh *fhp,
+		const struct nfsd4_layout_seg *seg, u64 minlength,
+		struct pnfs_block_layout *bl)
+{
+	struct nfsd4_layout_seg subseg = *seg;
+	u32 i;
+	__be32 nfserr;
 
-	dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
-	return 0;
+	for (i = 0; i < bl->nr_extents; i++) {
+		struct pnfs_block_extent *extent = bl->extents + i;
+		u64 extent_len;
+
+		nfserr = nfsd4_block_map_extent(inode, fhp, &subseg,
+				minlength, extent);
+		if (nfserr != nfs_ok)
+			return nfserr;
+
+		extent_len = extent->len - (subseg.offset - extent->foff);
+		if (extent_len >= subseg.length) {
+			bl->nr_extents = i + 1;
+			return nfs_ok;
+		}
+
+		subseg.offset = extent->foff + extent->len;
+		subseg.length -= extent_len;
+	}
+
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode,
+		const struct svc_fh *fhp, struct nfsd4_layoutget *args)
+{
+	struct nfsd4_layout_seg *seg = &args->lg_seg;
+	u64 seg_length;
+	struct pnfs_block_extent *first_bex, *last_bex;
+	struct pnfs_block_layout *bl;
+	u32 nr_extents_max = PAGE_SIZE / sizeof(bl->extents[0]) - 1;
+	u32 block_size = i_blocksize(inode);
+	__be32 nfserr;
+
+	if (locks_in_grace(SVC_NET(rqstp)))
+		return nfserr_grace;
+
+	nfserr = nfserr_layoutunavailable;
+	if (seg->offset & (block_size - 1)) {
+		dprintk("pnfsd: I/O misaligned\n");
+		goto out_error;
+	}
+
+	/*
+	 * Some clients barf on non-zero block numbers for NONE or INVALID
+	 * layouts, so make sure to zero the whole structure.
+	 */
+	nfserr = nfserrno(-ENOMEM);
+	bl = kzalloc(struct_size(bl, extents, nr_extents_max), GFP_KERNEL);
+	if (!bl)
+		goto out_error;
+	bl->nr_extents = nr_extents_max;
+	args->lg_content = bl;
+
+	nfserr = nfsd4_block_map_segment(inode, fhp, seg,
+			args->lg_minlength, bl);
+	if (nfserr != nfs_ok)
+		goto out_error;
+	first_bex = bl->extents;
+	last_bex = bl->extents + bl->nr_extents - 1;
+
+	nfserr = nfserr_layoutunavailable;
+	seg_length = last_bex->foff + last_bex->len - seg->offset;
+	if (seg_length < args->lg_minlength) {
+		dprintk("pnfsd: extent smaller than minlength\n");
+		goto out_error;
+	}
+
+	seg->offset = first_bex->foff;
+	seg->length = last_bex->foff - first_bex->foff + last_bex->len;
+	return nfs_ok;
 
 out_error:
 	seg->length = 0;
-	return nfserrno(error);
-out_layoutunavailable:
-	seg->length = 0;
-	return nfserr_layoutunavailable;
+	return nfserr;
 }
 
 static __be32
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index e50afe340737..68c112d47cee 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -14,12 +14,25 @@
 #define NFSDDBG_FACILITY	NFSDDBG_PNFS
 
 
+/**
+ * nfsd4_block_encode_layoutget - encode block/scsi layout extent array
+ * @xdr: stream for data encoding
+ * @lgp: layoutget content, actually an array of extents to encode
+ *
+ * This function encodes the opaque loc_body field in the layoutget response.
+ * Since the pnfs_block_layout4 and pnfs_scsi_layout4 structures on the wire
+ * are the same, this function is used by both layout drivers.
+ *
+ * Return values:
+ *   %nfs_ok: Success, all extents encoded into @xdr
+ *   %nfserr_toosmall: Not enough space in @xdr to encode all the data
+ */
 __be32
 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		const struct nfsd4_layoutget *lgp)
 {
-	const struct pnfs_block_extent *b = lgp->lg_content;
-	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+	const struct pnfs_block_layout *bl = lgp->lg_content;
+	u32 i, len = sizeof(__be32) + bl->nr_extents * PNFS_BLOCK_EXTENT_SIZE;
 	__be32 *p;
 
 	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
@@ -27,14 +40,19 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
 		return nfserr_toosmall;
 
 	*p++ = cpu_to_be32(len);
-	*p++ = cpu_to_be32(1);		/* we always return a single extent */
+	*p++ = cpu_to_be32(bl->nr_extents);
 
-	p = svcxdr_encode_deviceid4(p, &b->vol_id);
-	p = xdr_encode_hyper(p, b->foff);
-	p = xdr_encode_hyper(p, b->len);
-	p = xdr_encode_hyper(p, b->soff);
-	*p++ = cpu_to_be32(b->es);
-	return 0;
+	for (i = 0; i < bl->nr_extents; i++) {
+		const struct pnfs_block_extent *bex = bl->extents + i;
+
+		p = svcxdr_encode_deviceid4(p, &bex->vol_id);
+		p = xdr_encode_hyper(p, bex->foff);
+		p = xdr_encode_hyper(p, bex->len);
+		p = xdr_encode_hyper(p, bex->soff);
+		*p++ = cpu_to_be32(bex->es);
+	}
+
+	return nfs_ok;
 }
 
 static int
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 7d25ef689671..54fe7f517a94 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -21,6 +21,11 @@ struct pnfs_block_range {
 	u64				len;
 };
 
+struct pnfs_block_layout {
+	u32				nr_extents;
+	struct pnfs_block_extent	extents[] __counted_by(nr_extents);
+};
+
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
  * Not actually limited by the protocol.
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ