Message-ID: <1899437526.7296629.1692017818178.JavaMail.zimbra@desy.de>
Date: Mon, 14 Aug 2023 14:56:58 +0200 (CEST)
From: "Mkrtchyan, Tigran" <tigran.mkrtchyan@...y.de>
To: Jeff Layton <jlayton@...nel.org>
Cc: Chuck Lever <cel@...nel.org>, linux-nfs <linux-nfs@...r.kernel.org>,
netdev@...r.kernel.org, Chuck Lever <chuck.lever@...cle.com>,
dhowells@...hat.com
Subject: Re: [PATCH v2 1/4] SUNRPC: Convert svc_tcp_sendmsg to use bio_vecs directly
----- Original Message -----
> From: "Jeff Layton" <jlayton@...nel.org>
> To: "Chuck Lever" <cel@...nel.org>, "linux-nfs" <linux-nfs@...r.kernel.org>, netdev@...r.kernel.org
> Cc: "Chuck Lever" <chuck.lever@...cle.com>, dhowells@...hat.com
> Sent: Saturday, 12 August, 2023 14:04:57
> Subject: Re: [PATCH v2 1/4] SUNRPC: Convert svc_tcp_sendmsg to use bio_vecs directly
> On Fri, 2023-07-14 at 14:10 -0400, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@...cle.com>
>>
>> Add a helper to convert a whole xdr_buf directly into an array of
>> bio_vecs, then send this array instead of iterating piecemeal over
>> the xdr_buf containing the outbound RPC message.
>>
>> Note that the rules of the RPC protocol mean there can be only one
>> outstanding send at a time on a transport socket. The kernel's
>> SunRPC server enforces this via the transport's xpt_mutex. Thus we
>> can use a per-transport shared array for the xdr_buf conversion
>> rather than allocate one every time or use one that is part of
>> struct svc_rqst.
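As an illustration only, here is a hypothetical userspace analogue of that single-sender argument (all names below are made up; this is not kernel code): because a per-transport lock admits at most one sender at a time, a scratch segment array can live inside the transport object and be reused on every send instead of being allocated per request.

#include <pthread.h>
#include <stddef.h>

#define MAX_SEGS 256			/* stands in for RPCSVC_MAXPAGES */

struct seg {				/* stands in for struct bio_vec */
	void	*base;
	size_t	 len;
};

struct transport {			/* stands in for struct svc_sock */
	pthread_mutex_t	send_lock;	/* stands in for the xpt_mutex */
	struct seg	send_segs[MAX_SEGS];	/* shared scratch, like sk_send_bvec */
};

/*
 * Reusing t->send_segs on every call is safe because send_lock ensures
 * at most one thread is sending on a given transport at any time.
 */
int transport_send(struct transport *t, void *msg, size_t len)
{
	pthread_mutex_lock(&t->send_lock);
	t->send_segs[0].base = msg;	/* the "conversion" step, like xdr_buf_to_bvec() */
	t->send_segs[0].len = len;
	/* ... hand send_segs to the socket layer here ... */
	pthread_mutex_unlock(&t->send_lock);
	return 0;
}
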
>>
>> Signed-off-by: Chuck Lever <chuck.lever@...cle.com>
>> ---
>> include/linux/sunrpc/svcsock.h | 3 ++
>> include/linux/sunrpc/xdr.h | 2 +
>> net/sunrpc/svcsock.c | 59 ++++++++++++++--------------------------
>> net/sunrpc/xdr.c | 50 ++++++++++++++++++++++++++++++++++
>> 4 files changed, 75 insertions(+), 39 deletions(-)
>>
>
> I've seen some pynfs test regressions in mainline (v6.5-rc5-ish)
> kernels. Here's one failing test:
BTW, we have built a container to run pynfs tests as part of our CI process:
podman run -ti --rm dcache/pynfs:0.3 /run-nfs4.0.sh --help
podman run -ti --rm dcache/pynfs:0.3 /run-nfs4.1.sh --help
Maybe others will find it useful as well.
Tigran.
>
> _text = b'write data' # len=10
>
> [...]
>
> def testSimpleWrite2(t, env):
>     """WRITE with stateid=zeros changing size
>
>     FLAGS: write all
>     DEPEND: MKFILE
>     CODE: WRT1b
>     """
>     c = env.c1
>     c.init_connection()
>     attrs = {FATTR4_SIZE: 32, FATTR4_MODE: 0o644}
>     fh, stateid = c.create_confirm(t.word(), attrs=attrs,
>                                    deny=OPEN4_SHARE_DENY_NONE)
>     res = c.write_file(fh, _text, 30)
>     check(res, msg="WRITE with stateid=zeros changing size")
>     res = c.read_file(fh, 25, 20)
>     _compare(t, res, b'\0'*5 + _text, True)
>
> This test writes 10 bytes of data to a file at offset 30, and then does
> a 20-byte read starting at offset 25. The READ reply has NULs where the
> written data should be.
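For illustration only, a standalone userspace sketch of the byte layout the test expects (not part of pynfs; the file name and output format are made up): the file is created with size 32, the WRITE puts 10 bytes at offset 30, and the 20-byte READ at offset 25 should return 15 bytes, the first 5 of which are NUL.

#define _XOPEN_SOURCE 700
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[20] = { 0 };
	ssize_t n;
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 32) < 0)			/* FATTR4_SIZE: 32 */
		return 1;
	if (pwrite(fd, "write data", 10, 30) != 10)	/* the 10-byte WRITE at offset 30 */
		return 1;
	n = pread(fd, buf, 20, 25);			/* the 20-byte READ at offset 25 */
	/* a correct server returns 15 bytes: 5 NULs, then the written data */
	printf("read %zd bytes, match=%d\n", n,
	       n == 15 && !memcmp(buf, "\0\0\0\0\0write data", 15));
	close(fd);
	return 0;
}
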
>
> The patch that broke things is this one:
>
> 5df5dd03a8f7 sunrpc: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage
>
> This patch fixes the problem and gets the test run "green" again. I
> think we will probably want to send this patch to mainline for v6.5, but
> it'd be good to understand what's broken and how this fixes it.
>
> Do you (or David) have any insight?
>
> It'd also be good to understand whether we also need to fix UDP. pynfs
> is tcp-only, so I can't run the same test there as easily.
>
>> diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
>> index a7116048a4d4..a9bfeadf4cbe 100644
>> --- a/include/linux/sunrpc/svcsock.h
>> +++ b/include/linux/sunrpc/svcsock.h
>> @@ -40,6 +40,9 @@ struct svc_sock {
>>
>>  	struct completion	sk_handshake_done;
>>
>> +	struct bio_vec		sk_send_bvec[RPCSVC_MAXPAGES]
>> +					____cacheline_aligned;
>> +
>>  	struct page *		sk_pages[RPCSVC_MAXPAGES];	/* received data */
>>  };
>>
>> diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
>> index f89ec4b5ea16..42f9d7eb9a1a 100644
>> --- a/include/linux/sunrpc/xdr.h
>> +++ b/include/linux/sunrpc/xdr.h
>> @@ -139,6 +139,8 @@ void xdr_terminate_string(const struct xdr_buf *, const u32);
>>  size_t	xdr_buf_pagecount(const struct xdr_buf *buf);
>>  int	xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp);
>>  void	xdr_free_bvec(struct xdr_buf *buf);
>> +unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size,
>> +			     const struct xdr_buf *xdr);
>>
>>  static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int len)
>>  {
>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>> index e43f26382411..e35e5afe4b81 100644
>> --- a/net/sunrpc/svcsock.c
>> +++ b/net/sunrpc/svcsock.c
>> @@ -36,6 +36,8 @@
>> #include <linux/skbuff.h>
>> #include <linux/file.h>
>> #include <linux/freezer.h>
>> +#include <linux/bvec.h>
>> +
>> #include <net/sock.h>
>> #include <net/checksum.h>
>> #include <net/ip.h>
>> @@ -1194,72 +1196,52 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
>>  	return 0;	/* record not complete */
>>  }
>>
>> -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
>> -			     int flags)
>> -{
>> -	struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, };
>> -
>> -	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, vec, 1, vec->iov_len);
>> -	return sock_sendmsg(sock, &msg);
>> -}
>> -
>>  /*
>>   * MSG_SPLICE_PAGES is used exclusively to reduce the number of
>>   * copy operations in this path. Therefore the caller must ensure
>>   * that the pages backing @xdr are unchanging.
>>   *
>> - * In addition, the logic assumes that * .bv_len is never larger
>> - * than PAGE_SIZE.
>> + * Note that the send is non-blocking. The caller has incremented
>> + * the reference count on each page backing the RPC message, and
>> + * the network layer will "put" these pages when transmission is
>> + * complete.
>> + *
>> + * This is safe for our RPC services because the memory backing
>> + * the head and tail components is never kmalloc'd. These always
>> + * come from pages in the svc_rqst::rq_pages array.
>>   */
>> -static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr,
>> +static int svc_tcp_sendmsg(struct svc_sock *svsk, struct xdr_buf *xdr,
>>  			   rpc_fraghdr marker, unsigned int *sentp)
>>  {
>> -	const struct kvec *head = xdr->head;
>> -	const struct kvec *tail = xdr->tail;
>>  	struct kvec rm = {
>>  		.iov_base	= &marker,
>>  		.iov_len	= sizeof(marker),
>>  	};
>>  	struct msghdr msg = {
>> -		.msg_flags	= 0,
>> +		.msg_flags	= MSG_MORE,
>>  	};
>> +	unsigned int count;
>>  	int ret;
>>
>>  	*sentp = 0;
>> -	ret = xdr_alloc_bvec(xdr, GFP_KERNEL);
>> -	if (ret < 0)
>> -		return ret;
>>
>> -	ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len);
>> +	ret = kernel_sendmsg(svsk->sk_sock, &msg, &rm, 1, rm.iov_len);
>>  	if (ret < 0)
>>  		return ret;
>>  	*sentp += ret;
>>  	if (ret != rm.iov_len)
>>  		return -EAGAIN;
>>
>> -	ret = svc_tcp_send_kvec(sock, head, 0);
>> -	if (ret < 0)
>> -		return ret;
>> -	*sentp += ret;
>> -	if (ret != head->iov_len)
>> -		goto out;
>> +	count = xdr_buf_to_bvec(svsk->sk_send_bvec,
>> +				ARRAY_SIZE(svsk->sk_send_bvec), xdr);
>>
>>  	msg.msg_flags = MSG_SPLICE_PAGES;
>> -	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec,
>> -		      xdr_buf_pagecount(xdr), xdr->page_len);
>> -	ret = sock_sendmsg(sock, &msg);
>> +	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_send_bvec,
>> +		      count, xdr->len);
>> +	ret = sock_sendmsg(svsk->sk_sock, &msg);
>>  	if (ret < 0)
>>  		return ret;
>>  	*sentp += ret;
>> -
>> -	if (tail->iov_len) {
>> -		ret = svc_tcp_send_kvec(sock, tail, 0);
>> -		if (ret < 0)
>> -			return ret;
>> -		*sentp += ret;
>> -	}
>> -
>> -out:
>>  	return 0;
>>  }
>>
>> @@ -1290,8 +1272,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
>>  	if (svc_xprt_is_dead(xprt))
>>  		goto out_notconn;
>>  	tcp_sock_set_cork(svsk->sk_sk, true);
>> -	err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent);
>> -	xdr_free_bvec(xdr);
>> +	err = svc_tcp_sendmsg(svsk, xdr, marker, &sent);
>>  	trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent);
>>  	if (err < 0 || sent != (xdr->len + sizeof(marker)))
>>  		goto out_close;
>> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
>> index 2a22e78af116..358e6de91775 100644
>> --- a/net/sunrpc/xdr.c
>> +++ b/net/sunrpc/xdr.c
>> @@ -164,6 +164,56 @@ xdr_free_bvec(struct xdr_buf *buf)
>>  	buf->bvec = NULL;
>>  }
>>
>> +/**
>> + * xdr_buf_to_bvec - Copy components of an xdr_buf into a bio_vec array
>> + * @bvec: bio_vec array to populate
>> + * @bvec_size: element count of @bio_vec
>> + * @xdr: xdr_buf to be copied
>> + *
>> + * Returns the number of entries consumed in @bvec.
>> + */
>> +unsigned int xdr_buf_to_bvec(struct bio_vec *bvec, unsigned int bvec_size,
>> +			     const struct xdr_buf *xdr)
>> +{
>> +	const struct kvec *head = xdr->head;
>> +	const struct kvec *tail = xdr->tail;
>> +	unsigned int count = 0;
>> +
>> +	if (head->iov_len) {
>> +		bvec_set_virt(bvec++, head->iov_base, head->iov_len);
>> +		++count;
>> +	}
>> +
>> +	if (xdr->page_len) {
>> +		unsigned int offset, len, remaining;
>> +		struct page **pages = xdr->pages;
>> +
>> +		offset = offset_in_page(xdr->page_base);
>> +		remaining = xdr->page_len;
>> +		while (remaining > 0) {
>> +			len = min_t(unsigned int, remaining,
>> +				    PAGE_SIZE - offset);
>> +			bvec_set_page(bvec++, *pages++, len, offset);
>> +			remaining -= len;
>> +			offset = 0;
>> +			if (unlikely(++count > bvec_size))
>> +				goto bvec_overflow;
>> +		}
>> +	}
>> +
>> +	if (tail->iov_len) {
>> +		bvec_set_virt(bvec, tail->iov_base, tail->iov_len);
>> +		if (unlikely(++count > bvec_size))
>> +			goto bvec_overflow;
>> +	}
>> +
>> +	return count;
>> +
>> +bvec_overflow:
>> +	pr_warn_once("%s: bio_vec array overflow\n", __func__);
>> +	return count - 1;
>> +}
>> +
>>  /**
>>   * xdr_inline_pages - Prepare receive buffer for a large reply
>>   * @xdr: xdr_buf into which reply will be placed
>>
>>
>
> --
> Jeff Layton <jlayton@...nel.org>