lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1430772888-5682-4-git-send-email-fw@strlen.de>
Date:	Mon,  4 May 2015 22:54:46 +0200
From:	Florian Westphal <fw@...len.de>
To:	<netdev@...r.kernel.org>
Cc:	hannes@...essinduktion.org, jesse@...ira.com,
	Florian Westphal <fw@...len.de>
Subject: [PATCH V2 -next 3/5] ip_fragment: don't remove df bit from defragmented packets

We always send fragments without DF bit set.

Thus, given the following setup:

mtu1500 - mtu1500:1400 - mtu1400:1280 - mtu1280
   A           R1               R2         B

Where R1 and R2 run Linux with netfilter defragmentation/conntrack
enabled: if host A sends a fragmented packet _with_ DF set to B, R1
will respond with an ICMP 'too big' error if one of these fragments
exceeds 1400 bytes.  So far, so good.

However, the host A will never learn about the lower 1280 link.
The next packet presumably sent by A would use 1400 as the new fragment
size, but R1 will strip DF bit when refragmenting.

What's worse: If R1 receives fragment sizes 1200 and 100, it would
forward the reassembled packet without refragmenting, i.e.
R2 would send an icmp error in response to a packet that was never sent,
citing a mtu that the original sender never exceeded.

In order to 'replay' the original fragments to preserve their semantics,
the simple solution is to

 1. set DF bit on the new fragments if it was set on original ones.
 2. set the size of the new fragments generated by R1 during
    refragmentation to the largest size seen when defragmenting.

R2 will then notice the problem and will send the expected
'too big, use 1280' icmp error, and further fragments of this size
would not grow anymore to 1400 link mtu when R1 refragments.

There is, however, one important caveat. We cannot just use existing
IPCB(skb)->frag_max_size as upper boundary for refragmentation.

We have to consider a case where we receive a large fragment without DF,
followed by a small fragment with DF set.

In such a scenario we must not generate a large spew of small DF-fragments
(else we induce packet/traffic amplification).

This modifies ip_fragment so that we track largest fragment size seen
both for DF and non-DF packets, and set frag_max_size to the largest
value.

If the largest fragment seen with DF set is not smaller than the largest
fragment seen overall, we will let the output path know that it needs to
refragment using frag_max_size sized packets, also setting DF bit on each
fragment.

If frag_max_size exceeds the mtu, we already drop the skb and send the
needed icmp error.

Joint work with Hannes Frederic Sowa.

Reported-by: Jesse Gross <jesse@...ira.com>
Signed-off-by: Florian Westphal <fw@...len.de>
---
 Changes since v1:
  - always store frag_max_size and use it to avoid increasing
    the size of the fragments when refragmenting.

 include/net/inet_frag.h |  2 +-
 include/net/ip.h        |  1 +
 net/ipv4/ip_fragment.c  | 31 ++++++++++++++++++++++++++-----
 net/ipv4/ip_output.c    | 31 ++++++++++++++++++++++---------
 4 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 8d17655..e1300b3 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -43,7 +43,7 @@ enum {
  * @len: total length of the original datagram
  * @meat: length of received fragments so far
  * @flags: fragment queue flags
- * @max_size: (ipv4 only) maximum received fragment size with IP_DF set
+ * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
  */
 struct inet_frag_queue {
diff --git a/include/net/ip.h b/include/net/ip.h
index d14af7e..428e694 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -45,6 +45,7 @@ struct inet_skb_parm {
 #define IPSKB_FRAG_COMPLETE	BIT(3)
 #define IPSKB_REROUTED		BIT(4)
 #define IPSKB_DOREDIRECT	BIT(5)
+#define IPSKB_FRAG_PMTU		BIT(6)
 
 	u16			frag_max_size;
 };
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cc1da6d..ad2404f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -75,6 +75,7 @@ struct ipq {
 	__be16		id;
 	u8		protocol;
 	u8		ecn; /* RFC3168 support */
+	u16		max_df_size; /* largest frag with DF set seen */
 	int             iif;
 	unsigned int    rid;
 	struct inet_peer *peer;
@@ -319,6 +320,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct sk_buff *prev, *next;
 	struct net_device *dev;
+	unsigned int fragsize;
 	int flags, offset;
 	int ihl, end;
 	int err = -ENOENT;
@@ -474,9 +476,14 @@ found:
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
+	fragsize = skb->len + ihl;
+
+	if (fragsize > qp->q.max_size)
+		qp->q.max_size = fragsize;
+
 	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
-	    skb->len + ihl > qp->q.max_size)
-		qp->q.max_size = skb->len + ihl;
+	    fragsize > qp->max_df_size)
+		qp->max_df_size = fragsize;
 
 	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len) {
@@ -606,13 +613,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
-	IPCB(head)->frag_max_size = qp->q.max_size;
+	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
 
 	iph = ip_hdr(head);
-	/* max_size != 0 implies at least one fragment had IP_DF set */
-	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
 	iph->tot_len = htons(len);
 	iph->tos |= ecn;
+
+	/* if largest size with df is also largest seen we should also
+	 * call ip_fragment & set DF on the new fragments when forwarding
+	 * to not break path mtu discovery.
+	 *
+	 * This check exists because we don't want to send tiny packets if
+	 * skb was built from small df-fragment and one huge fragment
+	 * without df.
+	 */
+	if (qp->max_df_size == qp->q.max_size) {
+		IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+		iph->frag_off = htons(IP_DF);
+	} else {
+		iph->frag_off = 0;
+	}
+
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c65b93a..26847e4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -270,7 +270,8 @@ static int ip_finish_output(struct sock *sk, struct sk_buff *skb)
 	if (skb_is_gso(skb))
 		return ip_finish_output_gso(sk, skb);
 
-	if (skb->len > ip_skb_dst_mtu(skb))
+	if (skb->len > ip_skb_dst_mtu(skb) ||
+	    (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
 		return ip_fragment(sk, skb, ip_finish_output2);
 
 	return ip_finish_output2(sk, skb);
@@ -507,16 +508,19 @@ int ip_fragment(struct sock *sk, struct sk_buff *skb,
 	iph = ip_hdr(skb);
 
 	mtu = ip_skb_dst_mtu(skb);
-	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
-		     (IPCB(skb)->frag_max_size &&
-		      IPCB(skb)->frag_max_size > mtu))) {
-		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
-		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-			  htonl(mtu));
-		kfree_skb(skb);
-		return -EMSGSIZE;
+
+	if (unlikely((iph->frag_off & htons(IP_DF)))) {
+		if (!skb->ignore_df)
+			goto fail_toobig;
+
+		if (IPCB(skb)->frag_max_size &&
+		    IPCB(skb)->frag_max_size > mtu)
+			goto fail_toobig;
 	}
 
+	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
+		mtu = IPCB(skb)->frag_max_size;
+
 	/*
 	 *	Setup starting values.
 	 */
@@ -711,6 +715,9 @@ slow_path:
 		iph = ip_hdr(skb2);
 		iph->frag_off = htons((offset >> 3));
 
+		if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
+			iph->frag_off |= htons(IP_DF);
+
 		/* ANK: dirty, but effective trick. Upgrade options only if
 		 * the segment to be fragmented was THE FIRST (otherwise,
 		 * options are already fixed) and make it ONCE
@@ -750,6 +757,12 @@ fail:
 	kfree_skb(skb);
 	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
 	return err;
+fail_toobig:
+	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+	kfree_skb(skb);
+	return -EMSGSIZE;
+
 }
 EXPORT_SYMBOL(ip_fragment);
 
-- 
2.0.5

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ