[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180416173339.6310-5-edumazet@google.com>
Date: Mon, 16 Apr 2018 10:33:38 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>
Cc: netdev <netdev@...r.kernel.org>,
Eric Dumazet <edumazet@...gle.com>,
Neal Cardwell <ncardwell@...gle.com>,
Yuchung Cheng <ycheng@...gle.com>,
Soheil Hassas Yeganeh <soheil@...gle.com>,
Eric Dumazet <eric.dumazet@...il.com>
Subject: [PATCH net-next 4/5] tcp: implement mmap() for zero copy receive
Some networks can make sure TCP payload can exactly fit 4KB pages,
with well chosen MSS/MTU and architectures.
Implement mmap() system call so that applications can avoid
copying data without complex splice() games.
Note that a successful mmap() of X bytes on a TCP socket consumes those
bytes, as if recvmsg() had been done (tp->copied_seq += X).
Only PROT_READ mappings are accepted, as skb page frags
are fundamentally shared and read only.
If tcp_mmap() finds data that does not fill a whole page, or a patch of
urgent data, -EINVAL is returned and no bytes are consumed.
The application must fall back to recvmsg() to read the problematic sequence.
mmap() won't block, regardless of the socket being in blocking or
non-blocking mode. If not enough bytes are in the receive queue,
mmap() returns -EAGAIN, or -EIO if the socket is in a state
where no other bytes can be added into the receive queue.
An application might use SO_RCVLOWAT, poll() and/or ioctl(FIONREAD)
to use mmap() efficiently.
On the sender side, MSG_EOR might help to clearly separate unaligned
headers and 4K-aligned chunks if necessary.
Tested:
mlx4 (cx-3) 40Gbit NIC, with tcp_mmap program provided in following patch.
MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header)
Without mmap() (tcp_mmap -s)
received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit,
cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit,
cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit,
cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit,
cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches
With mmap() on receiver (tcp_mmap -s -z)
received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit,
cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit,
cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit,
cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit,
cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
include/net/tcp.h | 2 +
net/ipv4/af_inet.c | 2 +-
net/ipv4/tcp.c | 113 ++++++++++++++++++++++++++++++++++++++++++++
net/ipv6/af_inet6.c | 2 +-
4 files changed, 117 insertions(+), 2 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0ee85c47c185afcb8e1017d59e02313cb5df78ec..833154e3df173ea41aa16dd1ec739a175c679c5c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
+int tcp_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
int estab, struct tcp_fastopen_cookie *foc);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f5c562aaef3522519bcf1ae37782a7e14e278723..3ebf599cebaea4926decc1aad7274b12ec7e1566 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = {
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
- .mmap = sock_no_mmap,
+ .mmap = tcp_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
.read_sock = tcp_read_sock,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c768d306b65714bb8740c60110c43042508af6b7..438fbca96cd3100d722e1bd8bcc6f49624495a21 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_set_rcvlowat);
+/* tcp_mmap - map received TCP payload pages into a user vma (zero copy).
+ *
+ * When user wants to mmap X pages, we first need to perform the mapping
+ * before freeing any skbs in receive queue, otherwise user would be unable
+ * to fall back to standard recvmsg(). This happens if some data in the
+ * requested block is not exactly fitting in a page.
+ *
+ * We only support order-0 pages for the moment.
+ * mmap() on TCP is very strict, there is no point
+ * trying to accommodate pathological layouts.
+ *
+ * Outline:
+ *  1) The vma is checked (vm_pgoff, size, VM_WRITE) before the socket
+ *     is locked; everything after that runs between lock_sock() and
+ *     release_sock().
+ *  2) Starting at tp->copied_seq, skbs are walked and one page pointer
+ *     per requested page is stored in a temporary array. The walk
+ *     stops with -EINVAL at the first piece of data that is not a
+ *     whole page held in an skb page frag.
+ *  3) The collected pages are inserted into the vma, and only after
+ *     all insertions succeeded is tp->copied_seq advanced. No socket
+ *     data is consumed on any error path.
+ *
+ * Parameters:
+ *  file - not referenced by this function.
+ *  sock - socket whose sock->sk is read from.
+ *  vma  - target area; vm_pgoff must be 0, the area must span at least
+ *         one page, and VM_WRITE must not be set. VM_MAYWRITE is
+ *         cleared on it.
+ *
+ * Returns 0 on success, otherwise:
+ *  -EINVAL    non-zero vm_pgoff or less than one page requested, urgent
+ *             data located inside the requested range, or data that is
+ *             not laid out as full pages in page frags
+ *  -EPERM     VM_WRITE is set on the vma
+ *  -ENOTCONN  socket is in TCP_LISTEN state
+ *  -EAGAIN    tcp_inq() reports fewer bytes than requested
+ *             (-EIO instead when SOCK_DONE is set)
+ *  -ENOMEM    the temporary page array could not be allocated
+ *  or the non-zero value returned by vm_insert_page().
+ */
+int tcp_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+ unsigned int nr_pages = size >> PAGE_SHIFT;
+ struct page **pages_array = NULL;
+ u32 seq, len, offset, nr = 0;
+ struct sock *sk = sock->sk;
+ const skb_frag_t *frags;
+ struct tcp_sock *tp;
+ struct sk_buff *skb;
+ int ret;
+
+ if (vma->vm_pgoff || !nr_pages)
+ return -EINVAL;
+
+ if (vma->vm_flags & VM_WRITE)
+ return -EPERM;
+ /* Writable mappings were rejected just above; VM_MAYWRITE is dropped
+ * as well.
+ * TODO: Maybe the following is not needed if pages are COW
+ */
+ vma->vm_flags &= ~VM_MAYWRITE;
+
+ lock_sock(sk);
+
+ ret = -ENOTCONN;
+ if (sk->sk_state == TCP_LISTEN)
+ goto out;
+
+ sock_rps_record_flow(sk);
+
+ if (tcp_inq(sk) < size) {
+ ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+ goto out;
+ }
+ tp = tcp_sk(sk);
+ seq = tp->copied_seq;
+ /* Abort if urgent data is in the area.
+ * urg_offset is held in a u32, so an urgent sequence number that
+ * lies before seq wraps to a large value instead of going negative.
+ * When the test below does not trigger, ret is overwritten by the
+ * -ENOMEM assignment that follows.
+ */
+ if (unlikely(tp->urg_data)) {
+ u32 urg_offset = tp->urg_seq - seq;
+
+ ret = -EINVAL;
+ if (urg_offset < size)
+ goto out;
+ }
+ ret = -ENOMEM;
+ pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!pages_array)
+ goto out;
+ skb = tcp_recv_skb(sk, seq, &offset);
+ ret = -EINVAL;
+skb_start:
+ /* We do not support anything not in page frags.
+ *
+ * This label is reached once for every skb that is visited. For
+ * skbs after the first one, offset is seq minus the skb's starting
+ * sequence number (see the bottom of the loop below); for the first
+ * skb it is whatever tcp_recv_skb() stored (presumably the same
+ * quantity - TODO confirm).
+ *
+ * offset is rebased by skb_headlen() so that it counts from the
+ * start of the frags; if the result is negative when viewed as an
+ * int, seq does not point into the frags and the request is
+ * rejected. skbs with a frag list are rejected too.
+ *
+ * The first loop then steps over frags that end at or before
+ * offset, and rejects the request if offset falls inside a frag.
+ * The second loop takes one frag per page: each must have size
+ * PAGE_SIZE and page_offset 0, and at least PAGE_SIZE bytes must
+ * remain in len. When len reaches 0 the walk moves to skb->next
+ * and jumps back here.
+ * NOTE(review): skb->next is followed without a check of its own;
+ * this presumably relies on the tcp_inq() test above guaranteeing
+ * that enough data is queued - confirm.
+ */
+ offset -= skb_headlen(skb);
+ if ((int)offset < 0)
+ goto out;
+ if (skb_has_frag_list(skb))
+ goto out;
+ len = skb->data_len - offset;
+ frags = skb_shinfo(skb)->frags;
+ while (offset) {
+ if (frags->size > offset)
+ goto out;
+ offset -= frags->size;
+ frags++;
+ }
+ while (nr < nr_pages) {
+ if (len) {
+ if (len < PAGE_SIZE)
+ goto out;
+ if (frags->size != PAGE_SIZE || frags->page_offset)
+ goto out;
+ pages_array[nr++] = skb_frag_page(frags);
+ frags++;
+ len -= PAGE_SIZE;
+ seq += PAGE_SIZE;
+ continue;
+ }
+ skb = skb->next;
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ goto skb_start;
+ }
+ /* OK, we have a full set of pages ready to be inserted into vma.
+ * Page number nr goes to address vm_start + nr * PAGE_SIZE.
+ * NOTE(review): if a later vm_insert_page() fails, pages inserted
+ * by earlier iterations are not removed here; presumably the
+ * mmap() caller tears the vma down on error - confirm.
+ */
+ for (nr = 0; nr < nr_pages; nr++) {
+ ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
+ pages_array[nr]);
+ if (ret)
+ goto out;
+ }
+ /* operation is complete, we can 'consume' all skbs.
+ * seq was advanced by PAGE_SIZE for every page collected above.
+ */
+ tp->copied_seq = seq;
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames.
+ * The skb returned by tcp_recv_skb() is not used here; the call is
+ * presumably made for its effect on skbs that are now fully
+ * consumed (TODO confirm against tcp_recv_skb()).
+ */
+ tcp_recv_skb(sk, seq, &offset);
+ tcp_cleanup_rbuf(sk, size);
+
+ ret = 0;
+out:
+ release_sock(sk);
+ kvfree(pages_array);
+ return ret;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
static void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping *tss)
{
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e70d59fb26e16ace1eb484d23964946092a2cd57..2c694912df2e77b414de5cc2aa43e2ec59286836 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = {
.getsockopt = sock_common_getsockopt, /* ok */
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = inet_recvmsg, /* ok */
- .mmap = sock_no_mmap,
+ .mmap = tcp_mmap,
.sendpage = inet_sendpage,
.sendmsg_locked = tcp_sendmsg_locked,
.sendpage_locked = tcp_sendpage_locked,
--
2.17.0.484.g0c8726318c-goog
Powered by blists - more mailing lists