Message-ID: <SA0PR15MB391946EE72BC969337CF695299899@SA0PR15MB3919.namprd15.prod.outlook.com>
Date: Wed, 29 Mar 2023 15:18:00 +0000
From: Bernard Metzler <BMT@...ich.ibm.com>
To: David Howells <dhowells@...hat.com>,
Matthew Wilcox <willy@...radead.org>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
CC: David Howells <dhowells@...hat.com>,
Al Viro <viro@...iv.linux.org.uk>,
Christoph Hellwig <hch@...radead.org>,
Jens Axboe <axboe@...nel.dk>, Jeff Layton <jlayton@...nel.org>,
Christian Brauner <brauner@...nel.org>,
Chuck Lever III <chuck.lever@...cle.com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"linux-fsdevel@...r.kernel.org" <linux-fsdevel@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
Tom Talpey <tom@...pey.com>,
"linux-rdma@...r.kernel.org" <linux-rdma@...r.kernel.org>
Subject: RE: [RFC PATCH v2 30/48] siw: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage to transmit
> -----Original Message-----
> From: David Howells <dhowells@...hat.com>
> Sent: Wednesday, 29 March 2023 16:14
> To: Matthew Wilcox <willy@...radead.org>; David S. Miller <davem@...emloft.net>;
> Eric Dumazet <edumazet@...gle.com>; Jakub Kicinski <kuba@...nel.org>;
> Paolo Abeni <pabeni@...hat.com>
> Cc: David Howells <dhowells@...hat.com>; Al Viro <viro@...iv.linux.org.uk>;
> Christoph Hellwig <hch@...radead.org>; Jens Axboe <axboe@...nel.dk>;
> Jeff Layton <jlayton@...nel.org>; Christian Brauner <brauner@...nel.org>;
> Chuck Lever III <chuck.lever@...cle.com>;
> Linus Torvalds <torvalds@...ux-foundation.org>; netdev@...r.kernel.org;
> linux-fsdevel@...r.kernel.org; linux-kernel@...r.kernel.org;
> linux-mm@...ck.org; Bernard Metzler <BMT@...ich.ibm.com>;
> Tom Talpey <tom@...pey.com>; linux-rdma@...r.kernel.org
> Subject: [EXTERNAL] [RFC PATCH v2 30/48] siw: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage to transmit
>
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced, rather than
> performing several sendmsg and sendpage calls to transmit the header,
> data pages and trailer.
>
> To make this work, the data is assembled in a bio_vec array and attached to
> a BVEC-type iterator. The header and trailer (if present) are copied into
> page fragments that can be freed with put_page().

I like it a lot if it still keeps the zero-copy sendpage() semantics for
the cases where the driver can transfer data without a copy. Is
'msg.msg_flags |= MSG_SPLICE_PAGES' doing that magic? 'Splicing'
suggests just merging pages to me.
It would simplify the transmit code path substantially, and also get rid
of the kmap_local_page()/kunmap_local() sequences for multi-fragment
sendmsg() calls.
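
To check my understanding, the new pattern boiled down to a minimal
sketch would be something like the below (MSG_SPLICE_PAGES and
page_frag_memdup() are both introduced earlier in this series; the
helper and its arguments are illustrative only, not the literal siw
code):

    /*
     * Sketch: send a header that is not page-backed plus one user data
     * page in a single spliced sendmsg. Error handling trimmed.
     */
    static int xmit_spliced(struct socket *s, void *hdr, size_t hdr_len,
                            struct page *data, unsigned int off,
                            size_t data_len)
    {
            struct bio_vec bvec[2];
            struct msghdr msg = {
                    .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
            };
            void *h;
            int rv;

            /*
             * The header cannot be spliced from where it lives, so copy
             * it into a page fragment the network stack can take its
             * own reference on, released later with folio_put().
             */
            h = page_frag_memdup(NULL, hdr, hdr_len, GFP_NOFS, ULONG_MAX);
            if (!h)
                    return -ENOMEM;
            bvec_set_virt(&bvec[0], h, hdr_len);

            /*
             * The data page is referenced in place - this is where the
             * zero-copy property of the old sendpage() path survives.
             */
            bvec_set_page(&bvec[1], data, data_len, off);

            iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, 2,
                          hdr_len + data_len);
            rv = sock_sendmsg(s, &msg);

            /* Drop our reference; TCP holds its own if it spliced. */
            folio_put(page_folio(bvec[0].bv_page));
            return rv;
    }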
>
> Signed-off-by: David Howells <dhowells@...hat.com>
> cc: Bernard Metzler <bmt@...ich.ibm.com>
> cc: Tom Talpey <tom@...pey.com>
> cc: "David S. Miller" <davem@...emloft.net>
> cc: Eric Dumazet <edumazet@...gle.com>
> cc: Jakub Kicinski <kuba@...nel.org>
> cc: Paolo Abeni <pabeni@...hat.com>
> cc: Jens Axboe <axboe@...nel.dk>
> cc: Matthew Wilcox <willy@...radead.org>
> cc: linux-rdma@...r.kernel.org
> cc: netdev@...r.kernel.org
> ---
> drivers/infiniband/sw/siw/siw_qp_tx.c | 234 ++++++--------------------
> 1 file changed, 48 insertions(+), 186 deletions(-)
>
> diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
> index fa5de40d85d5..fbe80c06d0ca 100644
> --- a/drivers/infiniband/sw/siw/siw_qp_tx.c
> +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
> @@ -312,114 +312,8 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
> return rv;
> }
>
> -/*
> - * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES.
> - *
> - * Using sendpage to push page by page appears to be less efficient
> - * than using sendmsg, even if data are copied.
> - *
> - * A general performance limitation might be the extra four bytes
> - * trailer checksum segment to be pushed after user data.
> - */
> -static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
> - size_t size)
> -{
> - struct bio_vec bvec;
> - struct msghdr msg = {
> - .msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST |
> - MSG_SPLICE_PAGES),
> - };
> - struct sock *sk = s->sk;
> - int i = 0, rv = 0, sent = 0;
> -
> - while (size) {
> - size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
> -
> - if (size + offset <= PAGE_SIZE)
> - msg.msg_flags = MSG_MORE | MSG_DONTWAIT;
> -
> - tcp_rate_check_app_limited(sk);
> - bvec_set_page(&bvec, page[i], bytes, offset);
> - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
> -
> -try_page_again:
> - lock_sock(sk);
> - rv = tcp_sendmsg_locked(sk, &msg, size);
> - release_sock(sk);
> -
> - if (rv > 0) {
> - size -= rv;
> - sent += rv;
> - if (rv != bytes) {
> - offset += rv;
> - bytes -= rv;
> - goto try_page_again;
> - }
> - offset = 0;
> - } else {
> - if (rv == -EAGAIN || rv == 0)
> - break;
> - return rv;
> - }
> - i++;
> - }
> - return sent;
> -}
> -
> -/*
> - * siw_0copy_tx()
> - *
> - * Pushes list of pages to TCP socket. If pages from multiple
> - * SGE's, all referenced pages of each SGE are pushed in one
> - * shot.
> - */
> -static int siw_0copy_tx(struct socket *s, struct page **page,
> - struct siw_sge *sge, unsigned int offset,
> - unsigned int size)
> -{
> - int i = 0, sent = 0, rv;
> - int sge_bytes = min(sge->length - offset, size);
> -
> - offset = (sge->laddr + offset) & ~PAGE_MASK;
> -
> - while (sent != size) {
> - rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
> - if (rv >= 0) {
> - sent += rv;
> - if (size == sent || sge_bytes > rv)
> - break;
> -
> - i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
> - sge++;
> - sge_bytes = min(sge->length, size - sent);
> - offset = sge->laddr & ~PAGE_MASK;
> - } else {
> - sent = rv;
> - break;
> - }
> - }
> - return sent;
> -}
> -
> #define MAX_TRAILER (MPA_CRC_SIZE + 4)
>
> -static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len)
> -{
> - int i;
> -
> - /*
> - * Work backwards through the array to honor the kmap_local_page()
> - * ordering requirements.
> - */
> - for (i = (len-1); i >= 0; i--) {
> - if (kmap_mask & BIT(i)) {
> - unsigned long addr = (unsigned long)iov[i].iov_base;
> -
> - kunmap_local((void *)(addr & PAGE_MASK));
> - }
> - }
> -}
> -
> /*
> * siw_tx_hdt() tries to push a complete packet to TCP where all
> * packet fragments are referenced by the elements of one iovec.
> @@ -439,15 +333,14 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> {
> struct siw_wqe *wqe = &c_tx->wqe_active;
> struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
> - struct kvec iov[MAX_ARRAY];
> - struct page *page_array[MAX_ARRAY];
> + struct bio_vec bvec[MAX_ARRAY];
> struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
> + void *trl, *t;
>
> int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
> unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
> sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
> pbl_idx = c_tx->pbl_idx;
> - unsigned long kmap_mask = 0L;
>
> if (c_tx->state == SIW_SEND_HDR) {
> if (c_tx->use_sendpage) {
> @@ -457,10 +350,15 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
>
> c_tx->state = SIW_SEND_DATA;
> } else {
> - iov[0].iov_base =
> - (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
> - iov[0].iov_len = hdr_len =
> - c_tx->ctrl_len - c_tx->ctrl_sent;
> + const void *hdr = &c_tx->pkt.ctrl + c_tx->ctrl_sent;
> + void *h;
> +
> + rv = -ENOMEM;
> + hdr_len = c_tx->ctrl_len - c_tx->ctrl_sent;
> + h = page_frag_memdup(NULL, hdr, hdr_len, GFP_NOFS, ULONG_MAX);
> + if (!h)
> + goto done;
> + bvec_set_virt(&bvec[0], h, hdr_len);
> seg = 1;
> }
> }
> @@ -478,28 +376,9 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> } else {
> is_kva = 1;
> }
> - if (is_kva && !c_tx->use_sendpage) {
> - /*
> - * tx from kernel virtual address: either inline data
> - * or memory region with assigned kernel buffer
> - */
> - iov[seg].iov_base =
> - (void *)(uintptr_t)(sge->laddr + sge_off);
> - iov[seg].iov_len = sge_len;
> -
> - if (do_crc)
> - crypto_shash_update(c_tx->mpa_crc_hd,
> - iov[seg].iov_base,
> - sge_len);
> - sge_off += sge_len;
> - data_len -= sge_len;
> - seg++;
> - goto sge_done;
> - }
>
> while (sge_len) {
> size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
> - void *kaddr;
>
> if (!is_kva) {
> struct page *p;
> @@ -512,33 +391,12 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> p = siw_get_upage(mem->umem,
> sge->laddr + sge_off);
> if (unlikely(!p)) {
> - siw_unmap_pages(iov, kmap_mask, seg);
> wqe->processed -= c_tx->bytes_unsent;
> rv = -EFAULT;
> goto done_crc;
> }
> - page_array[seg] = p;
> -
> - if (!c_tx->use_sendpage) {
> - void *kaddr = kmap_local_page(p);
> -
> - /* Remember for later kunmap() */
> - kmap_mask |= BIT(seg);
> - iov[seg].iov_base = kaddr + fp_off;
> - iov[seg].iov_len = plen;
> -
> - if (do_crc)
> - crypto_shash_update(
> - c_tx->mpa_crc_hd,
> - iov[seg].iov_base,
> - plen);
> - } else if (do_crc) {
> - kaddr = kmap_local_page(p);
> - crypto_shash_update(c_tx->mpa_crc_hd,
> - kaddr + fp_off,
> - plen);
> - kunmap_local(kaddr);
> - }
> +
> + bvec_set_page(&bvec[seg], p, plen, fp_off);
> } else {
> /*
> * Cast to an uintptr_t to preserve all 64 bits
> @@ -552,12 +410,15 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> * bits on a 64 bit platform and 32 bits on a
> * 32 bit platform.
> */
> - page_array[seg] = virt_to_page((void *)(va & PAGE_MASK));
> - if (do_crc)
> - crypto_shash_update(
> - c_tx->mpa_crc_hd,
> - (void *)va,
> - plen);
> + bvec_set_virt(&bvec[seg], (void *)va, plen);
> + }
> +
> + if (do_crc) {
> + void *kaddr = kmap_local_page(bvec[seg].bv_page);
> + crypto_shash_update(c_tx->mpa_crc_hd,
> + kaddr + bvec[seg].bv_offset,
> + bvec[seg].bv_len);
> + kunmap_local(kaddr);
> }
>
> sge_len -= plen;
> @@ -567,13 +428,12 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
>
> if (++seg > (int)MAX_ARRAY) {
> siw_dbg_qp(tx_qp(c_tx), "to many fragments\n");
> - siw_unmap_pages(iov, kmap_mask, seg-1);
> wqe->processed -= c_tx->bytes_unsent;
> rv = -EMSGSIZE;
> goto done_crc;
> }
> }
> -sge_done:
> +
> /* Update SGE variables at end of SGE */
> if (sge_off == sge->length &&
> (data_len != 0 || wqe->processed < wqe->bytes)) {
> @@ -582,15 +442,8 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> sge_off = 0;
> }
> }
> - /* trailer */
> - if (likely(c_tx->state != SIW_SEND_TRAILER)) {
> - iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
> - iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
> - } else {
> - iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
> - iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
> - }
>
> + /* Set the CRC in the trailer */
> if (c_tx->pad) {
> *(u32 *)c_tx->trailer.pad = 0;
> if (do_crc)
> @@ -603,23 +456,29 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> else if (do_crc)
> crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
>
> - data_len = c_tx->bytes_unsent;
> -
> - if (c_tx->use_sendpage) {
> - rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
> - c_tx->sge_off, data_len);
> - if (rv == data_len) {
> - rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
> - if (rv > 0)
> - rv += data_len;
> - else
> - rv = data_len;
> - }
> + /* Copy the trailer and add it to the output list */
> + if (likely(c_tx->state != SIW_SEND_TRAILER)) {
> + trl = &c_tx->trailer.pad[4 - c_tx->pad];
> + trl_len = MAX_TRAILER - (4 - c_tx->pad);
> } else {
> - rv = kernel_sendmsg(s, &msg, iov, seg + 1,
> - hdr_len + data_len + trl_len);
> - siw_unmap_pages(iov, kmap_mask, seg);
> + trl = &c_tx->trailer.pad[c_tx->ctrl_sent];
> + trl_len = MAX_TRAILER - c_tx->ctrl_sent;
> }
> +
> + rv = -ENOMEM;
> + t = page_frag_memdup(NULL, trl, trl_len, GFP_NOFS, ULONG_MAX);
> + if (!t)
> + goto done_crc;
> + bvec_set_virt(&bvec[seg], t, trl_len);
> +
> + data_len = c_tx->bytes_unsent;
> +
> + if (c_tx->use_sendpage)
> + msg.msg_flags |= MSG_SPLICE_PAGES;
> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, seg + 1,
> + hdr_len + data_len + trl_len);
> + rv = sock_sendmsg(s, &msg);
> +
> if (rv < (int)hdr_len) {
> /* Not even complete hdr pushed or negative rv */
> wqe->processed -= data_len;
> @@ -680,6 +539,9 @@ static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
> }
> done_crc:
> c_tx->do_crc = 0;
> + if (c_tx->state == SIW_SEND_HDR)
> + folio_put(page_folio(bvec[0].bv_page));
> + folio_put(page_folio(bvec[seg].bv_page));
> done:
> return rv;
> }