[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALx6S35aNoSgP5UyA70xE=3h_haWw8NU11ViaqUqG=uGbtzUgQ@mail.gmail.com>
Date: Tue, 22 Jan 2019 16:37:45 -0800
From: Tom Herbert <tom@...bertland.com>
To: Peter Oskolkov <posk@...gle.com>
Cc: David Miller <davem@...emloft.net>,
Linux Kernel Network Developers <netdev@...r.kernel.org>,
Peter Oskolkov <posk.devel@...il.com>,
Eric Dumazet <edumazet@...gle.com>,
Florian Westphal <fw@...len.de>
Subject: Re: [PATCH net-next 2/4] net: IP6 defrag: use rbtrees for IPv6 defrag
On Tue, Jan 22, 2019 at 10:03 AM Peter Oskolkov <posk@...gle.com> wrote:
>
> Currently, IPv6 defragmentation code drops non-last fragments that
> are smaller than 1280 bytes: see
> commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu")
>
Peter,
I'm a bit confused on why so much code is required to undo this patch
which was a whole three lines of code to begin with.
> This behavior is not specified in IPv6 RFCs and appears to break
> compatibility with some IPv6 implemenations, as reported here:
> https://www.spinics.net/lists/netdev/msg543846.html
>
I am not yet convinced that the patch should be reverted. While it is
not conformant, it is a potential mitigation against DOS attack. For
instance, a single fragmented packet could be composed of over 8000
fragments with the minimum fragment size of eight bytes. Receiving
packets like that attacks both memory and CPU, and I don't see any
purpose to sending tiny fragments other than DOS attack. For practical
implemenation, I believe a limit is justified. The 1280 MTU limit in
the patch was too austere, I think it should be smaller than that.
Also, any such limit should apply to fragments payload length and not
length of the packet since a clever attacker could stuff the packet
with various other extension headers and still end up sending tiny
fragments. I have a patch along these lines with limit configurable by
sysctl. I will post shortly.
Tom
> This patch re-uses common IP defragmentation queueing and reassembly
> code in IPv6, removing the 1280 byte restriction.
>
> Signed-off-by: Peter Oskolkov <posk@...gle.com>
> Reported-by: Tom Herbert <tom@...bertland.com>
> Cc: Eric Dumazet <edumazet@...gle.com>
> Cc: Florian Westphal <fw@...len.de>
> ---
> include/net/ipv6_frag.h | 11 +-
> net/ipv6/reassembly.c | 233 +++++++++++-----------------------------
> 2 files changed, 71 insertions(+), 173 deletions(-)
>
> diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
> index 6ced1e6899b6..28aa9b30aece 100644
> --- a/include/net/ipv6_frag.h
> +++ b/include/net/ipv6_frag.h
> @@ -82,8 +82,15 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
> __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
>
> /* Don't send error if the first segment did not arrive. */
> - head = fq->q.fragments;
> - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
> + if (!(fq->q.flags & INET_FRAG_FIRST_IN))
> + goto out;
> +
> + /* sk_buff::dev and sk_buff::rbnode are unionized. So we
> + * pull the head out of the tree in order to be able to
> + * deal with head->dev.
> + */
> + head = inet_frag_pull_head(&fq->q);
> + if (!head)
> goto out;
>
> head->dev = dev;
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index 36a3d8dc61f5..24264d0a4b85 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -69,8 +69,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
>
> static struct inet_frags ip6_frags;
>
> -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
> - struct net_device *dev);
> +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
> + struct sk_buff *prev_tail, struct net_device *dev);
>
> static void ip6_frag_expire(struct timer_list *t)
> {
> @@ -111,21 +111,26 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
> struct frag_hdr *fhdr, int nhoff,
> u32 *prob_offset)
> {
> - struct sk_buff *prev, *next;
> - struct net_device *dev;
> - int offset, end, fragsize;
> struct net *net = dev_net(skb_dst(skb)->dev);
> + int offset, end, fragsize;
> + struct sk_buff *prev_tail;
> + struct net_device *dev;
> + int err = -ENOENT;
> u8 ecn;
>
> if (fq->q.flags & INET_FRAG_COMPLETE)
> goto err;
>
> + err = -EINVAL;
> offset = ntohs(fhdr->frag_off) & ~0x7;
> end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
> ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
>
> if ((unsigned int)end > IPV6_MAXPLEN) {
> *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
> + /* note that if prob_offset is set, the skb is freed elsewhere,
> + * we do not free it here.
> + */
> return -1;
> }
>
> @@ -170,62 +175,27 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
> if (end == offset)
> goto discard_fq;
>
> + err = -ENOMEM;
> /* Point into the IP datagram 'data' part. */
> if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
> goto discard_fq;
>
> - if (pskb_trim_rcsum(skb, end - offset))
> + err = pskb_trim_rcsum(skb, end - offset);
> + if (err)
> goto discard_fq;
>
> - /* Find out which fragments are in front and at the back of us
> - * in the chain of fragments so far. We must know where to put
> - * this fragment, right?
> - */
> - prev = fq->q.fragments_tail;
> - if (!prev || prev->ip_defrag_offset < offset) {
> - next = NULL;
> - goto found;
> - }
> - prev = NULL;
> - for (next = fq->q.fragments; next != NULL; next = next->next) {
> - if (next->ip_defrag_offset >= offset)
> - break; /* bingo! */
> - prev = next;
> - }
> -
> -found:
> - /* RFC5722, Section 4, amended by Errata ID : 3089
> - * When reassembling an IPv6 datagram, if
> - * one or more its constituent fragments is determined to be an
> - * overlapping fragment, the entire datagram (and any constituent
> - * fragments) MUST be silently discarded.
> - */
> -
> - /* Check for overlap with preceding fragment. */
> - if (prev &&
> - (prev->ip_defrag_offset + prev->len) > offset)
> - goto discard_fq;
> -
> - /* Look for overlap with succeeding segment. */
> - if (next && next->ip_defrag_offset < end)
> - goto discard_fq;
> -
> - /* Note : skb->ip_defrag_offset and skb->sk share the same location */
> + /* Note : skb->rbnode and skb->dev share the same location. */
> dev = skb->dev;
> - if (dev)
> - fq->iif = dev->ifindex;
> /* Makes sure compiler wont do silly aliasing games */
> barrier();
> - skb->ip_defrag_offset = offset;
>
> - /* Insert this fragment in the chain of fragments. */
> - skb->next = next;
> - if (!next)
> - fq->q.fragments_tail = skb;
> - if (prev)
> - prev->next = skb;
> - else
> - fq->q.fragments = skb;
> + prev_tail = fq->q.fragments_tail;
> + err = inet_frag_queue_insert(&fq->q, skb, offset, end);
> + if (err)
> + goto insert_error;
> +
> + if (dev)
> + fq->iif = dev->ifindex;
>
> fq->q.stamp = skb->tstamp;
> fq->q.meat += skb->len;
> @@ -246,44 +216,48 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
>
> if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
> fq->q.meat == fq->q.len) {
> - int res;
> unsigned long orefdst = skb->_skb_refdst;
>
> skb->_skb_refdst = 0UL;
> - res = ip6_frag_reasm(fq, prev, dev);
> + err = ip6_frag_reasm(fq, skb, prev_tail, dev);
> skb->_skb_refdst = orefdst;
> - return res;
> + return err;
> }
>
> skb_dst_drop(skb);
> - return -1;
> + return -EINPROGRESS;
>
> +insert_error:
> + if (err == IPFRAG_DUP) {
> + kfree_skb(skb);
> + return -EINVAL;
> + }
> + err = -EINVAL;
> + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
> + IPSTATS_MIB_REASM_OVERLAPS);
> discard_fq:
> inet_frag_kill(&fq->q);
> -err:
> __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
> IPSTATS_MIB_REASMFAILS);
> +err:
> kfree_skb(skb);
> - return -1;
> + return err;
> }
>
> /*
> * Check if this packet is complete.
> - * Returns NULL on failure by any reason, and pointer
> - * to current nexthdr field in reassembled frame.
> *
> * It is called with locked fq, and caller must check that
> * queue is eligible for reassembly i.e. it is not COMPLETE,
> * the last and the first frames arrived and all the bits are here.
> */
> -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
> - struct net_device *dev)
> +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
> + struct sk_buff *prev_tail, struct net_device *dev)
> {
> struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
> - struct sk_buff *fp, *head = fq->q.fragments;
> - int payload_len, delta;
> unsigned int nhoff;
> - int sum_truesize;
> + void *reasm_data;
> + int payload_len;
> u8 ecn;
>
> inet_frag_kill(&fq->q);
> @@ -292,121 +266,40 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
> if (unlikely(ecn == 0xff))
> goto out_fail;
>
> - /* Make the one we just received the head. */
> - if (prev) {
> - head = prev->next;
> - fp = skb_clone(head, GFP_ATOMIC);
> -
> - if (!fp)
> - goto out_oom;
> -
> - fp->next = head->next;
> - if (!fp->next)
> - fq->q.fragments_tail = fp;
> - prev->next = fp;
> -
> - skb_morph(head, fq->q.fragments);
> - head->next = fq->q.fragments->next;
> -
> - consume_skb(fq->q.fragments);
> - fq->q.fragments = head;
> - }
> -
> - WARN_ON(head == NULL);
> - WARN_ON(head->ip_defrag_offset != 0);
> + reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
> + if (!reasm_data)
> + goto out_oom;
>
> - /* Unfragmented part is taken from the first segment. */
> - payload_len = ((head->data - skb_network_header(head)) -
> + payload_len = ((skb->data - skb_network_header(skb)) -
> sizeof(struct ipv6hdr) + fq->q.len -
> sizeof(struct frag_hdr));
> if (payload_len > IPV6_MAXPLEN)
> goto out_oversize;
>
> - delta = - head->truesize;
> -
> - /* Head of list must not be cloned. */
> - if (skb_unclone(head, GFP_ATOMIC))
> - goto out_oom;
> -
> - delta += head->truesize;
> - if (delta)
> - add_frag_mem_limit(fq->q.net, delta);
> -
> - /* If the first fragment is fragmented itself, we split
> - * it to two chunks: the first with data and paged part
> - * and the second, holding only fragments. */
> - if (skb_has_frag_list(head)) {
> - struct sk_buff *clone;
> - int i, plen = 0;
> -
> - clone = alloc_skb(0, GFP_ATOMIC);
> - if (!clone)
> - goto out_oom;
> - clone->next = head->next;
> - head->next = clone;
> - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
> - skb_frag_list_init(head);
> - for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
> - plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
> - clone->len = clone->data_len = head->data_len - plen;
> - head->data_len -= clone->len;
> - head->len -= clone->len;
> - clone->csum = 0;
> - clone->ip_summed = head->ip_summed;
> - add_frag_mem_limit(fq->q.net, clone->truesize);
> - }
> -
> /* We have to remove fragment header from datagram and to relocate
> * header in order to calculate ICV correctly. */
> nhoff = fq->nhoffset;
> - skb_network_header(head)[nhoff] = skb_transport_header(head)[0];
> - memmove(head->head + sizeof(struct frag_hdr), head->head,
> - (head->data - head->head) - sizeof(struct frag_hdr));
> - if (skb_mac_header_was_set(head))
> - head->mac_header += sizeof(struct frag_hdr);
> - head->network_header += sizeof(struct frag_hdr);
> -
> - skb_reset_transport_header(head);
> - skb_push(head, head->data - skb_network_header(head));
> -
> - sum_truesize = head->truesize;
> - for (fp = head->next; fp;) {
> - bool headstolen;
> - int delta;
> - struct sk_buff *next = fp->next;
> -
> - sum_truesize += fp->truesize;
> - if (head->ip_summed != fp->ip_summed)
> - head->ip_summed = CHECKSUM_NONE;
> - else if (head->ip_summed == CHECKSUM_COMPLETE)
> - head->csum = csum_add(head->csum, fp->csum);
> -
> - if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
> - kfree_skb_partial(fp, headstolen);
> - } else {
> - fp->sk = NULL;
> - if (!skb_shinfo(head)->frag_list)
> - skb_shinfo(head)->frag_list = fp;
> - head->data_len += fp->len;
> - head->len += fp->len;
> - head->truesize += fp->truesize;
> - }
> - fp = next;
> - }
> - sub_frag_mem_limit(fq->q.net, sum_truesize);
> + skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0];
> + memmove(skb->head + sizeof(struct frag_hdr), skb->head,
> + (skb->data - skb->head) - sizeof(struct frag_hdr));
> + if (skb_mac_header_was_set(skb))
> + skb->mac_header += sizeof(struct frag_hdr);
> + skb->network_header += sizeof(struct frag_hdr);
> +
> + skb_reset_transport_header(skb);
> +
> + inet_frag_reasm_finish(&fq->q, skb, reasm_data);
>
> - skb_mark_not_on_list(head);
> - head->dev = dev;
> - head->tstamp = fq->q.stamp;
> - ipv6_hdr(head)->payload_len = htons(payload_len);
> - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
> - IP6CB(head)->nhoff = nhoff;
> - IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
> - IP6CB(head)->frag_max_size = fq->q.max_size;
> + skb->dev = dev;
> + ipv6_hdr(skb)->payload_len = htons(payload_len);
> + ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
> + IP6CB(skb)->nhoff = nhoff;
> + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
> + IP6CB(skb)->frag_max_size = fq->q.max_size;
>
> /* Yes, and fold redundant checksum back. 8) */
> - skb_postpush_rcsum(head, skb_network_header(head),
> - skb_network_header_len(head));
> + skb_postpush_rcsum(skb, skb_network_header(skb),
> + skb_network_header_len(skb));
>
> rcu_read_lock();
> __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
> @@ -414,6 +307,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
> fq->q.fragments = NULL;
> fq->q.rb_fragments = RB_ROOT;
> fq->q.fragments_tail = NULL;
> + fq->q.last_run_head = NULL;
> return 1;
>
> out_oversize:
> @@ -464,10 +358,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
> return 1;
> }
>
> - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
> - fhdr->frag_off & htons(IP6_MF))
> - goto fail_hdr;
> -
> iif = skb->dev ? skb->dev->ifindex : 0;
> fq = fq_find(net, fhdr->identification, hdr, iif);
> if (fq) {
> @@ -485,6 +375,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
> if (prob_offset) {
> __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
> IPSTATS_MIB_INHDRERRORS);
> + /* icmpv6_param_prob() calls kfree_skb(skb) */
> icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
> }
> return ret;
> --
> 2.20.1.321.g9e740568ce-goog
>
Powered by blists - more mailing lists