lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <SN7PR15MB5755AC0B56FDBE597F586695995DA@SN7PR15MB5755.namprd15.prod.outlook.com>
Date:   Wed, 21 Jun 2023 08:57:49 +0000
From:   Bernard Metzler <BMT@...ich.ibm.com>
To:     David Howells <dhowells@...hat.com>,
        "netdev@...r.kernel.org" <netdev@...r.kernel.org>
CC:     David Howells <dhowells@...hat.com>,
        Alexander Duyck <alexander.duyck@...il.com>,
        "David S. Miller" <davem@...emloft.net>,
        Eric Dumazet <edumazet@...gle.com>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>,
        Willem de Bruijn <willemdebruijn.kernel@...il.com>,
        David Ahern <dsahern@...nel.org>,
        Matthew Wilcox <willy@...radead.org>,
        Jens Axboe <axboe@...nel.dk>,
        "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Tom Talpey <tom@...pey.com>, Jason Gunthorpe <jgg@...pe.ca>,
        Leon Romanovsky <leon@...nel.org>,
        "linux-rdma@...r.kernel.org" <linux-rdma@...r.kernel.org>
Subject: RE:  [PATCH net-next v3 04/18] siw: Use sendmsg(MSG_SPLICE_PAGES) rather
 than sendpage to transmit



> -----Original Message-----
> From: David Howells <dhowells@...hat.com>
> Sent: Tuesday, 20 June 2023 16:53
> To: netdev@...r.kernel.org
> Cc: David Howells <dhowells@...hat.com>; Alexander Duyck
> <alexander.duyck@...il.com>; David S. Miller <davem@...emloft.net>; Eric
> Dumazet <edumazet@...gle.com>; Jakub Kicinski <kuba@...nel.org>; Paolo
> Abeni <pabeni@...hat.com>; Willem de Bruijn
> <willemdebruijn.kernel@...il.com>; David Ahern <dsahern@...nel.org>;
> Matthew Wilcox <willy@...radead.org>; Jens Axboe <axboe@...nel.dk>; linux-
> mm@...ck.org; linux-kernel@...r.kernel.org; Bernard Metzler
> <BMT@...ich.ibm.com>; Tom Talpey <tom@...pey.com>; Jason Gunthorpe
> <jgg@...pe.ca>; Leon Romanovsky <leon@...nel.org>; linux-
> rdma@...r.kernel.org
> Subject: [PATCH net-next v3 04/18] siw: Use
> sendmsg(MSG_SPLICE_PAGES) rather than sendpage to transmit
> 
> When transmitting data, call down into TCP using a single sendmsg with
> MSG_SPLICE_PAGES to indicate that content should be spliced rather than
> performing several sendmsg and sendpage calls to transmit header, data
> pages and trailer.
> 
> To make this work, the data is assembled in a bio_vec array and attached to
> a BVEC-type iterator.  The header and trailer (if present) are copied into
> page fragments that can be freed with put_page().
> 
> Signed-off-by: David Howells <dhowells@...hat.com>
> cc: Bernard Metzler <bmt@...ich.ibm.com>
> cc: Tom Talpey <tom@...pey.com>
> cc: Jason Gunthorpe <jgg@...pe.ca>
> cc: Leon Romanovsky <leon@...nel.org>
> cc: "David S. Miller" <davem@...emloft.net>
> cc: Eric Dumazet <edumazet@...gle.com>
> cc: Jakub Kicinski <kuba@...nel.org>
> cc: Paolo Abeni <pabeni@...hat.com>
> cc: Jens Axboe <axboe@...nel.dk>
> cc: Matthew Wilcox <willy@...radead.org>
> cc: linux-rdma@...r.kernel.org
> cc: netdev@...r.kernel.org
> ---
> 
> Notes:
>     ver #2)
>      - Wrap lines at 80.
> 
>  drivers/infiniband/sw/siw/siw_qp_tx.c | 231 ++++----------------------
>  1 file changed, 36 insertions(+), 195 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c
> b/drivers/infiniband/sw/siw/siw_qp_tx.c
> index ffb16beb6c30..2584f9da0dd8 100644
> --- a/drivers/infiniband/sw/siw/siw_qp_tx.c
> +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
> @@ -311,114 +311,8 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx,
> struct socket *s,
>  	return rv;
>  }
> 
> -/*
> - * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES.
> - *
> - * Using sendpage to push page by page appears to be less efficient
> - * than using sendmsg, even if data are copied.
> - *
> - * A general performance limitation might be the extra four bytes
> - * trailer checksum segment to be pushed after user data.
> - */
> -static int siw_tcp_sendpages(struct socket *s, struct page **page, int
> offset,
> -			     size_t size)
> -{
> -	struct bio_vec bvec;
> -	struct msghdr msg = {
> -		.msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST |
> -			      MSG_SPLICE_PAGES),
> -	};
> -	struct sock *sk = s->sk;
> -	int i = 0, rv = 0, sent = 0;
> -
> -	while (size) {
> -		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
> -
> -		if (size + offset <= PAGE_SIZE)
> -			msg.msg_flags &= ~MSG_SENDPAGE_NOTLAST;
> -
> -		tcp_rate_check_app_limited(sk);
> -		bvec_set_page(&bvec, page[i], bytes, offset);
> -		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
> -
> -try_page_again:
> -		lock_sock(sk);
> -		rv = tcp_sendmsg_locked(sk, &msg, size);
> -		release_sock(sk);
> -
> -		if (rv > 0) {
> -			size -= rv;
> -			sent += rv;
> -			if (rv != bytes) {
> -				offset += rv;
> -				bytes -= rv;
> -				goto try_page_again;
> -			}
> -			offset = 0;
> -		} else {
> -			if (rv == -EAGAIN || rv == 0)
> -				break;
> -			return rv;
> -		}
> -		i++;
> -	}
> -	return sent;
> -}
> -
> -/*
> - * siw_0copy_tx()
> - *
> - * Pushes list of pages to TCP socket. If pages from multiple
> - * SGE's, all referenced pages of each SGE are pushed in one
> - * shot.
> - */
> -static int siw_0copy_tx(struct socket *s, struct page **page,
> -			struct siw_sge *sge, unsigned int offset,
> -			unsigned int size)
> -{
> -	int i = 0, sent = 0, rv;
> -	int sge_bytes = min(sge->length - offset, size);
> -
> -	offset = (sge->laddr + offset) & ~PAGE_MASK;
> -
> -	while (sent != size) {
> -		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
> -		if (rv >= 0) {
> -			sent += rv;
> -			if (size == sent || sge_bytes > rv)
> -				break;
> -
> -			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
> -			sge++;
> -			sge_bytes = min(sge->length, size - sent);
> -			offset = sge->laddr & ~PAGE_MASK;
> -		} else {
> -			sent = rv;
> -			break;
> -		}
> -	}
> -	return sent;
> -}
> -
>  #define MAX_TRAILER (MPA_CRC_SIZE + 4)
> 
> -static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int
> len)
> -{
> -	int i;
> -
> -	/*
> -	 * Work backwards through the array to honor the kmap_local_page()
> -	 * ordering requirements.
> -	 */
> -	for (i = (len-1); i >= 0; i--) {
> -		if (kmap_mask & BIT(i)) {
> -			unsigned long addr = (unsigned long)iov[i].iov_base;
> -
> -			kunmap_local((void *)(addr & PAGE_MASK));
> -		}
> -	}
> -}
> -
>  /*
>   * siw_tx_hdt() tries to push a complete packet to TCP where all
>   * packet fragments are referenced by the elements of one iovec.

Just a nit - maybe change 'iovec' -> 'bio_vec' in this comment?

I successfully tested with nvmeof client and ib_read_bw/ib_write_bw.
Looks good, I very much appreciate the resultant code
simplifications in the tx path!

Reviewed-by: Bernard Metzler <bmt@...ich.ibm.com>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ