Message-ID: <1461034241.10638.145.camel@edumazet-glaptop3.roam.corp.google.com>
Date: Mon, 18 Apr 2016 19:50:41 -0700
From: Eric Dumazet <eric.dumazet@...il.com>
To: Martin KaFai Lau <kafai@...com>
Cc: netdev@...r.kernel.org, Eric Dumazet <edumazet@...gle.com>,
Neal Cardwell <ncardwell@...gle.com>,
Soheil Hassas Yeganeh <soheil.kdev@...il.com>,
Willem de Bruijn <willemb@...gle.com>,
Yuchung Cheng <ycheng@...gle.com>,
Kernel Team <kernel-team@...com>
Subject: Re: [RFC PATCH v2 net-next 4/7] tcp: Make use of MSG_EOR flag in tcp_sendmsg
On Mon, 2016-04-18 at 19:27 -0700, Martin KaFai Lau wrote:
> On Mon, Apr 18, 2016 at 05:06:57PM -0700, Eric Dumazet wrote:
> > It should only be a request from user space to ask TCP to not aggregate
> > stuff on future sendpage()/sendmsg() on the skb carrying this new flag.
> >
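(For context, the user-space pattern this enables would look roughly like the
sketch below. It is an illustrative example only, not part of either patch, and
it assumes MSG_EOR behaves as described above.)

#include <string.h>
#include <sys/socket.h>

/* Illustrative only: write two logical records back to back.  With
 * MSG_EOR on the first send(), TCP is asked not to append the second
 * record's data to the skb that carries the first one.
 */
static void send_two_records(int fd)
{
	const char *rec1 = "first record";
	const char *rec2 = "second record";

	if (send(fd, rec1, strlen(rec1), MSG_EOR) < 0)
		return;
	(void)send(fd, rec2, strlen(rec2), 0);
}
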
> How about something like this. Please advise if tcp_sendmsg_noappend can
> be simpler.
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index c0ef054..ac31798 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -762,7 +762,8 @@ struct tcp_skb_cb {
>
> __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
> __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
> - unused:7;
> + eor:1, /* Is skb MSG_EOR marked */
> + unused:6;
> __u32 ack_seq; /* Sequence number ACK'd */
> union {
> struct inet_skb_parm h4;
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 4d73858..12772be 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -874,6 +874,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
> return mss_now;
> }
>
> +static bool tcp_sendmsg_noappend(const struct sock *sk)
> +{
> + const struct sk_buff *skb = tcp_write_queue_tail(sk);
> +
> + return (!skb || TCP_SKB_CB(skb)->eor);
> +}
> +
> static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
> size_t size, int flags)
> {
> @@ -903,6 +910,9 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
> if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
> goto out_err;
>
> + if (tcp_sendmsg_noappend(sk))
> + goto new_segment;
> +
> while (size > 0) {
> struct sk_buff *skb = tcp_write_queue_tail(sk);
> int copy, i;
> @@ -960,6 +970,7 @@ new_segment:
> size -= copy;
> if (!size) {
> tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
> + TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
> goto out;
> }
>
> @@ -1145,6 +1156,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
>
> sg = !!(sk->sk_route_caps & NETIF_F_SG);
>
> + if (tcp_sendmsg_noappend(sk))
> + goto new_segment;
> +
> while (msg_data_left(msg)) {
> int copy = 0;
> int max = size_goal;
> @@ -1250,6 +1264,7 @@ new_segment:
> copied += copy;
> if (!msg_data_left(msg)) {
> tcp_tx_timestamp(sk, sockc.tsflags, skb);
> + TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
> goto out;
> }
I believe it is slightly wrong to do the goto new_segment if there is
no data to send.

I would instead keep this test in the fast path, doing it _when_ we already
have an skb to test against.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index fd40f8c64d5f..015851634195 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -760,7 +760,8 @@ struct tcp_skb_cb {
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
- unused:7;
+ eor:1, /* Is skb MSG_EOR marked */
+ unused:6;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct inet_skb_parm h4;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4d73858991af..1944c19885ab 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -908,7 +908,9 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+ if (!tcp_send_head(sk) ||
+ (copy = size_goal - skb->len) <= 0 ||
+ TCP_SKB_CB(skb)->eor) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -960,6 +962,7 @@ new_segment:
size -= copy;
if (!size) {
tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
+ TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
goto out;
}
@@ -1156,7 +1159,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
copy = max - skb->len;
}
- if (copy <= 0) {
+ if (copy <= 0 || TCP_SKB_CB(skb)->eor) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
@@ -1250,6 +1253,7 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
tcp_tx_timestamp(sk, sockc.tsflags, skb);
+ TCP_SKB_CB(skb)->eor = !!(flags & MSG_EOR);
goto out;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6451b83d81e9..acfbff81ef47 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1171,6 +1171,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+ TCP_SKB_CB(buff)->eor = TCP_SKB_CB(skb)->eor;
+ TCP_SKB_CB(skb)->eor = 0;
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
/* Copy and checksum data tail into the new buffer. */
@@ -1730,6 +1732,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
+ TCP_SKB_CB(buff)->eor = TCP_SKB_CB(skb)->eor;
+ TCP_SKB_CB(skb)->eor = 0;
buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
@@ -2471,6 +2475,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
/* Merge over control information. This moves PSH/FIN etc. over */
TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;
+ TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
@@ -2502,7 +2507,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
/* Some heurestics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
-
+ if (TCP_SKB_CB(skb)->eor)
+ return false;
return true;
}
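
With the change above, both tcp_sendmsg() and do_tcp_sendpages() end up making
the same tail-skb decision. The standalone sketch below (illustrative only; it
uses made-up types rather than the kernel's struct sk_buff) summarizes when a
new segment is started:

/* Illustrative-only sketch (not kernel code): "tail" stands for
 * tcp_write_queue_tail(sk), "eor" for TCP_SKB_CB(tail)->eor.
 */
struct tail_state {
	int present;   /* is there an skb at the tail of the write queue? */
	int len;       /* bytes already carried by that skb               */
	int eor;       /* was it written with MSG_EOR?                    */
};

static int need_new_segment(const struct tail_state *tail, int size_goal)
{
	return !tail->present ||             /* nothing queued yet            */
	       size_goal - tail->len <= 0 || /* tail skb is already full      */
	       tail->eor;                    /* previous write ended a record */
}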