[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250610011818.3232-1-sergeybashirov@gmail.com>
Date: Tue, 10 Jun 2025 04:18:03 +0300
From: Sergey Bashirov <sergeybashirov@...il.com>
To: Chuck Lever <chuck.lever@...cle.com>,
Christoph Hellwig <hch@...radead.org>,
Jeff Layton <jlayton@...nel.org>,
NeilBrown <neil@...wn.name>,
Olga Kornievskaia <okorniev@...hat.com>,
Dai Ngo <Dai.Ngo@...cle.com>,
Tom Talpey <tom@...pey.com>
Cc: linux-nfs@...r.kernel.org,
linux-kernel@...r.kernel.org,
Konstantin Evtushenko <koevtushenko@...dex.com>,
Sergey Bashirov <sergeybashirov@...il.com>
Subject: [PATCH v2] nfsd: Implement large extent array support in pNFS
When pNFS client in block layout mode sends layoutcommit RPC to MDS,
a variable length array of modified extents is supplied within request.
This patch allows NFS server to accept such extent arrays if they do not
fit within single memory page.
The issue can be reproduced when writing to a 1GB file using FIO with small
block size and large I/O depth. Server returns NFSERR_BADXDR to the client.
Co-developed-by: Konstantin Evtushenko <koevtushenko@...dex.com>
Signed-off-by: Konstantin Evtushenko <koevtushenko@...dex.com>
Signed-off-by: Sergey Bashirov <sergeybashirov@...il.com>
---
Changes in v2:
- Drop the len argument in *_decode_layoutupdate()
- Remove lc_up_len field from nfsd4_layoutcommit structure
- Fix code formatting
fs/nfsd/blocklayout.c | 8 ++--
fs/nfsd/blocklayoutxdr.c | 89 +++++++++++++++++++++++++++++-----------
fs/nfsd/blocklayoutxdr.h | 8 ++--
fs/nfsd/nfs4xdr.c | 11 +++--
fs/nfsd/xdr4.h | 3 +-
5 files changed, 78 insertions(+), 41 deletions(-)
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 08a20e5bcf7f..e74c3a19aeb9 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -179,8 +179,8 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
struct iomap *iomaps;
int nr_iomaps;
- nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
+ nr_iomaps = nfsd4_block_decode_layoutupdate(&lcp->lc_up_layout,
+ &iomaps, i_blocksize(inode));
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
@@ -317,8 +317,8 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
struct iomap *iomaps;
int nr_iomaps;
- nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
+ nr_iomaps = nfsd4_scsi_decode_layoutupdate(&lcp->lc_up_layout,
+ &iomaps, i_blocksize(inode));
if (nr_iomaps < 0)
return nfserrno(nr_iomaps);
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index ce78f74715ee..b76dbb119e19 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -113,26 +113,27 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
}
int
-nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+nfsd4_block_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
- u32 nr_iomaps, i;
+ u32 nr_iomaps, expected, len, i;
+ struct xdr_stream xdr;
+ char scratch[sizeof(struct pnfs_block_extent)];
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
- return -EINVAL;
- }
- len -= sizeof(u32);
- if (len % PNFS_BLOCK_EXTENT_SIZE) {
- dprintk("%s: extent array invalid: %u\n", __func__, len);
+ xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
+ xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
+
+ if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
+ dprintk("%s: failed to decode extent array size\n", __func__);
return -EINVAL;
}
- nr_iomaps = be32_to_cpup(p++);
- if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
+ len = sizeof(__be32) + xdr_stream_remaining(&xdr);
+ expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
+ if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, nr_iomaps);
+ __func__, len, expected);
return -EINVAL;
}
@@ -144,29 +145,54 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
for (i = 0; i < nr_iomaps; i++) {
struct pnfs_block_extent bex;
+ ssize_t ret;
- memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
- p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+ ret = xdr_stream_decode_opaque_fixed(&xdr,
+ &bex.vol_id, sizeof(bex.vol_id));
+ if (ret < sizeof(bex.vol_id)) {
+ dprintk("%s: failed to decode device id for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
- p = xdr_decode_hyper(p, &bex.foff);
+ if (xdr_stream_decode_u64(&xdr, &bex.foff)) {
+ dprintk("%s: failed to decode offset for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.foff & (block_size - 1)) {
dprintk("%s: unaligned offset 0x%llx\n",
__func__, bex.foff);
goto fail;
}
- p = xdr_decode_hyper(p, &bex.len);
+
+ if (xdr_stream_decode_u64(&xdr, &bex.len)) {
+ dprintk("%s: failed to decode length for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.len & (block_size - 1)) {
dprintk("%s: unaligned length 0x%llx\n",
__func__, bex.foff);
goto fail;
}
- p = xdr_decode_hyper(p, &bex.soff);
+
+ if (xdr_stream_decode_u64(&xdr, &bex.soff)) {
+ dprintk("%s: failed to decode soffset for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.soff & (block_size - 1)) {
dprintk("%s: unaligned disk offset 0x%llx\n",
__func__, bex.soff);
goto fail;
}
- bex.es = be32_to_cpup(p++);
+
+ if (xdr_stream_decode_u32(&xdr, &bex.es)) {
+ dprintk("%s: failed to decode estate for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
dprintk("%s: incorrect extent state %d\n",
__func__, bex.es);
@@ -185,18 +211,23 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
}
int
-nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
- u32 nr_iomaps, expected, i;
+ u32 nr_iomaps, expected, len, i;
+ struct xdr_stream xdr;
+ char scratch[sizeof(struct pnfs_block_range)];
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
+ xdr_init_decode(&xdr, buf, buf->head[0].iov_base, NULL);
+ xdr_set_scratch_buffer(&xdr, scratch, sizeof(scratch));
+
+ if (xdr_stream_decode_u32(&xdr, &nr_iomaps)) {
+ dprintk("%s: failed to decode extent array size\n", __func__);
return -EINVAL;
}
- nr_iomaps = be32_to_cpup(p++);
+ len = sizeof(__be32) + xdr_stream_remaining(&xdr);
expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
@@ -213,14 +244,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
for (i = 0; i < nr_iomaps; i++) {
u64 val;
- p = xdr_decode_hyper(p, &val);
+ if (xdr_stream_decode_u64(&xdr, &val)) {
+ dprintk("%s: failed to decode offset for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (val & (block_size - 1)) {
dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].offset = val;
- p = xdr_decode_hyper(p, &val);
+ if (xdr_stream_decode_u64(&xdr, &val)) {
+ dprintk("%s: failed to decode length for entry %u\n",
+ __func__, i);
+ goto fail;
+ }
if (val & (block_size - 1)) {
dprintk("%s: unaligned length 0x%llx\n", __func__, val);
goto fail;
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 4e28ac8f1127..6e603cfec0a7 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
const struct nfsd4_layoutget *lgp);
-int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
-int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
+int nfsd4_block_decode_layoutupdate(struct xdr_buf *buf,
+ struct iomap **iomapp, u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(struct xdr_buf *buf,
+ struct iomap **iomapp, u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 3afcdbed6e14..659e60b85d5f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -604,6 +604,8 @@ static __be32
nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
struct nfsd4_layoutcommit *lcp)
{
+ u32 len;
+
if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0)
return nfserr_bad_xdr;
if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES)
@@ -611,13 +613,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX)
return nfserr_bad_xdr;
- if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
+ return nfserr_bad_xdr;
+ if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len))
return nfserr_bad_xdr;
- if (lcp->lc_up_len > 0) {
- lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len);
- if (!lcp->lc_up_layout)
- return nfserr_bad_xdr;
- }
return nfs_ok;
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index aa2a356da784..02887029a81c 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -630,8 +630,7 @@ struct nfsd4_layoutcommit {
u64 lc_last_wr; /* request */
struct timespec64 lc_mtime; /* request */
u32 lc_layout_type; /* request */
- u32 lc_up_len; /* layout length */
- void *lc_up_layout; /* decoded by callback */
+ struct xdr_buf lc_up_layout; /* decoded by callback */
bool lc_size_chg; /* response */
u64 lc_newsize; /* response */
};
--
2.43.0
Powered by blists - more mailing lists