[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1326831948.2606.2.camel@edumazet-laptop>
Date: Tue, 17 Jan 2012 21:25:48 +0100
From: Eric Dumazet <eric.dumazet@...il.com>
To: Tore Anderson <tore@....no>
Cc: netdev <netdev@...r.kernel.org>
Subject: Re: [RFC] ipv6: dst_allfrag() not taken into account by TCP
Le mardi 17 janvier 2012 à 21:03 +0100, Tore Anderson a écrit :
> * Eric Dumazet
>
> > Bugzilla reference :
> >
> > https://bugzilla.kernel.org/show_bug.cgi?id=42572
>
> Hi, and thanks for taking an interest in this issue!
>
> I've got some general comments regarding running IPv6-only Linux servers
> behind stateless IPv4/IPv6 translators. (They are not strictly related
> to the above bug, but not completely off-topic either I hope.)
>
> 1) The Linux kernel doesn't allow reducing the effective IPv6 link MTU
> (as recorded in the routing cache) to anything less than 1280. This
> means that it can end up in a situation where the effective IPv6 link
> MTU is greater than the actual IPv6 Path MTU. In the PCAP in the
> bugzilla, they are 1280 and 1279, respectively. However, the kernel
> doesn't appear to record the actual Path MTU anywhere, instead setting
> the allfrag feature.
>
> While this is perfectly legal behaviour according to the RFC, from an
> operational point of view it would have been nice if there were some way
> (e.g. a sysctl) to tell the kernel to also actually allow an ICMPv6 PTB
> to reduce the effective IPv6 link MTU to values less than 1280 (at least
> down to the minimum IPv4 MTU + 20 bytes). That would have avoided the
> need for the allfrag feature to come into play completely.
>
> The RFC allows for this behaviour, too.
>
> 2) Since the kernel doesn't keep track of the actual Path MTU (if it's
> lower than 1280), when the allfrag feature gets set on a route, *every*
> packet gets a fragmentation header. (Which is to be expected, really,
> given it's name.) However, this means that even tiny packets such as a
> TCP SYN/ACK gets the fragmentation header added. This is clearly not
> particularly useful.
>
> If the kernel had kept track of the effective Path MTU, and only
> included the IPv6 Fragmentation header on packets that were larger than
> it *only*, this wouldn't have been a problem. (Alternatively, if it
> allowed the effective link MTU to drop below 1280 that would also have
> avoided this problem.)
>
> 3) There seems to be a bug related to generating the TCP checksum of
> SYN/ACK packets to destinations with the allfrag features set. I just
> submitted a bug report about this:
>
> https://bugzilla.kernel.org/show_bug.cgi?id=42595
>
> This makes the allfrag feature pretty much useless for me, as I can only
> successfully establish a single TCP session from a client behind a <1280
> MTU link for the entire lifetime of the routing cache entry.
>
> Best regards,
Thanks Tore, we'll take a look at this second problem.
Could you please test following patch ?
It should apply on current 'net' or 'linux' tree
include/net/inet_connection_sock.h | 1 +
include/net/tcp.h | 4 ++--
net/ipv4/tcp_output.c | 19 +++++++++++++++++--
net/ipv6/tcp_ipv6.c | 1 +
net/sctp/ipv6.c | 1 +
5 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index dbf9aab..8297691 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -45,6 +45,7 @@ struct inet_connection_sock_af_ops {
struct dst_entry *dst);
struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it);
u16 net_header_len;
+ u16 net_frag_header_len;
u16 sockaddr_len;
int (*setsockopt)(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0118ea9..86e5ef4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -531,8 +531,8 @@ extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
extern void tcp_initialize_rcv_mss(struct sock *sk);
-extern int tcp_mtu_to_mss(const struct sock *sk, int pmtu);
-extern int tcp_mss_to_mtu(const struct sock *sk, int mss);
+extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
+extern int tcp_mss_to_mtu(struct sock *sk, int mss);
extern void tcp_mtup_init(struct sock *sk);
extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8de27..5d27b68 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1151,7 +1151,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
}
/* Calculate MSS. Not accounting for SACKs here. */
-int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1162,6 +1162,14 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mss_now -= icsk->icsk_af_ops->net_frag_header_len;
+ }
+
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
@@ -1180,7 +1188,7 @@ int tcp_mtu_to_mss(const struct sock *sk, int pmtu)
}
/* Inverse of above */
-int tcp_mss_to_mtu(const struct sock *sk, int mss)
+int tcp_mss_to_mtu(struct sock *sk, int mss)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1191,6 +1199,13 @@ int tcp_mss_to_mtu(const struct sock *sk, int mss)
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
+ /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
+ if (icsk->icsk_af_ops->net_frag_header_len) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_allfrag(dst))
+ mtu += icsk->icsk_af_ops->net_frag_header_len;
+ }
return mtu;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 906c7ca..d90c007 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1883,6 +1883,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
.syn_recv_sock = tcp_v6_syn_recv_sock,
.get_peer = tcp_v6_get_peer,
.net_header_len = sizeof(struct ipv6hdr),
+ .net_frag_header_len = sizeof(struct frag_hdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 91f4791..13174e5 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1000,6 +1000,7 @@ static struct sctp_af sctp_af_inet6 = {
.seq_dump_addr = sctp_v6_seq_dump_addr,
.ecn_capable = sctp_v6_ecn_capable,
.net_header_len = sizeof(struct ipv6hdr),
+ .net_frag_header_len = sizeof(struct frag_hdr),
.sockaddr_len = sizeof(struct sockaddr_in6),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists