[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1428704189-31247-4-git-send-email-fw@strlen.de>
Date: Sat, 11 Apr 2015 00:16:29 +0200
From: Florian Westphal <fw@...len.de>
To: <netdev@...r.kernel.org>
Cc: kaber@...sh.net, Florian Westphal <fw@...len.de>
Subject: [PATCH -next 3/3] ipv4: don't remove df bit when refragmenting
We always send fragments without DF bit set.
Thus, given the following setup:
mtu1500 - mtu1500:1400 - mtu1400:1280 - mtu1280
   A           R1             R2           B
where R1 and R2 run Linux with netfilter defragmentation/conntrack
enabled: if host A sends a fragmented packet _with_ DF set to B, R1
will respond with an ICMP "too big" error if one of these fragments
exceeds 1400 bytes. So far, so good.
However, host A will never learn about the smaller 1280-byte MTU link.
The next packet presumably sent by A would use 1400 as the new fragment
size, but R1 will strip DF bit when refragmenting.
What's worse: if R1 receives fragments of sizes 1200 and 100, it would
forward the reassembled packet without refragmenting, i.e.
R2 would send an icmp error in response to a packet that was never sent,
citing mtu that the original sender never exceeded.
In order to 'replay' the original fragments to preserve their semantics,
one solution is to
1. set DF bit on the new fragments if it was set on original ones.
2. set the size of the new fragments generated by R1 during
refragmentation to the largest size seen when defragmenting.
R2 will then notice the problem and will send the expected
'too big, use 1280' icmp error, and further fragments of this size
would not grow anymore to 1400 link mtu when R1 refragments.
There is, however, one important caveat. We cannot just use the existing
IPCB(skb)->frag_max_size as upper boundary for refragmentation.
We have to consider a case where we receive a large fragment without DF,
followed by a small fragment with DF set.
In such a scenario we must not generate a large spew of small DF-fragments
(else we induce packet/traffic amplification).
This modifies the defragmentation code (ip_frag_queue() in
net/ipv4/ip_fragment.c) so that we track the largest fragment size seen
both for DF and non-DF packets.
Then, when we find that we had at least one DF fragment AND the largest
non-DF fragment did not exceed the largest one with DF set, let
ip_fragment() know that it should refragment using the original fragment
size and also set the DF bit on the newly created fragments.
Joint work with Hannes Frederic Sowa.
Reported-by: Jesse Gross <jesse@...ira.com>
Signed-off-by: Florian Westphal <fw@...len.de>
---
include/net/inet_frag.h | 2 +-
include/net/ip.h | 1 +
net/ipv4/ip_fragment.c | 27 ++++++++++++++++++++++-----
net/ipv4/ip_output.c | 43 ++++++++++++++++++++++++++++++++++---------
4 files changed, 58 insertions(+), 15 deletions(-)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 8d17655..e1300b3 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -43,7 +43,7 @@ enum {
* @len: total length of the original datagram
* @meat: length of received fragments so far
* @flags: fragment queue flags
- * @max_size: (ipv4 only) maximum received fragment size with IP_DF set
+ * @max_size: maximum received fragment size
* @net: namespace that this frag belongs to
*/
struct inet_frag_queue {
diff --git a/include/net/ip.h b/include/net/ip.h
index d14af7e..428e694 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm {
#define IPSKB_FRAG_COMPLETE BIT(3)
#define IPSKB_REROUTED BIT(4)
#define IPSKB_DOREDIRECT BIT(5)
+#define IPSKB_FRAG_PMTU BIT(6)
u16 frag_max_size;
};
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cc1da6d..863d10d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -75,6 +75,7 @@ struct ipq {
__be16 id;
u8 protocol;
u8 ecn; /* RFC3168 support */
+ u16 max_df_size; /* largest frag with DF set seen */
int iif;
unsigned int rid;
struct inet_peer *peer;
@@ -319,6 +320,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
struct net_device *dev;
+ unsigned int fragsize;
int flags, offset;
int ihl, end;
int err = -ENOENT;
@@ -474,9 +476,14 @@ found:
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
+ fragsize = skb->len + ihl;
+
+ if (fragsize > qp->q.max_size)
+ qp->q.max_size = fragsize;
+
if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
- skb->len + ihl > qp->q.max_size)
- qp->q.max_size = skb->len + ihl;
+ fragsize > qp->max_df_size)
+ qp->max_df_size = fragsize;
if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
qp->q.meat == qp->q.len) {
@@ -606,13 +613,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
head->next = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = qp->q.max_size;
+ IPCB(head)->frag_max_size = qp->max_df_size;
iph = ip_hdr(head);
- /* max_size != 0 implies at least one fragment had IP_DF set */
- iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
iph->tot_len = htons(len);
iph->tos |= ecn;
+
+ /* if largest size with df is also largest seen we should also
+ * call ip_fragment & set DF on the new fragments when forwarding
+ * to not break path mtu discovery.
+ */
+ if (qp->max_df_size == qp->q.max_size) {
+ IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ iph->frag_off = htons(IP_DF);
+ } else {
+ iph->frag_off = 0;
+ }
+
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c65b93a..1b7e16b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -270,7 +270,8 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
if (skb_is_gso(skb))
return ip_finish_output_gso(sk, skb);
- if (skb->len > ip_skb_dst_mtu(skb))
+ if (skb->len > ip_skb_dst_mtu(skb) ||
+ (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
return ip_fragment(sk, skb, ip_finish_output2);
return ip_finish_output2(sk, skb);
@@ -507,14 +508,29 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
mtu = ip_skb_dst_mtu(skb);
- if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
- (IPCB(skb)->frag_max_size &&
- IPCB(skb)->frag_max_size > mtu))) {
- IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- kfree_skb(skb);
- return -EMSGSIZE;
+
+ if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->ignore_df))
+ goto fail_toobig;
+
+ /* if frag_max_size is set, then at least one fragment
+ * had DF bit set when defragmentation took place.
+ */
+ if (unlikely(IPCB(skb)->frag_max_size)) {
+ if (IPCB(skb)->frag_max_size > mtu)
+ goto fail_toobig;
+
+ /* PMTU discovery wanted?
+ * Don't send larger fragments than what we received.
+ *
+ * We don't cap unconditionally because we don't
+ * want to send tiny packets if skb was built from
+ * small df-fragment and one huge fragment without df.
+ *
+ * IPSKB_FRAG_PMTU tells us that all fragments had same size
+ * so we won't create more packets than we originally received.
+ */
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ mtu = IPCB(skb)->frag_max_size;
}
/*
@@ -711,6 +727,9 @@ slow_path:
iph = ip_hdr(skb2);
iph->frag_off = htons((offset >> 3));
+ if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+ iph->frag_off |= htons(IP_DF);
+
/* ANK: dirty, but effective trick. Upgrade options only if
* the segment to be fragmented was THE FIRST (otherwise,
* options are already fixed) and make it ONCE
@@ -750,6 +769,12 @@ fail:
kfree_skb(skb);
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
+fail_toobig:
+ IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ kfree_skb(skb);
+ return -EMSGSIZE;
+
}
EXPORT_SYMBOL(ip_fragment);
--
2.0.5
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists