lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:   Sun, 27 Jun 2021 11:11:04 +0300
From:   Leon Romanovsky <leon@...nel.org>
To:     Jakub Kicinski <kuba@...nel.org>
Cc:     davem@...emloft.net, netdev@...r.kernel.org, willemb@...gle.com,
        eric.dumazet@...il.com, dsahern@...il.com, yoshfuji@...ux-ipv6.org,
        brouer@...hat.com, Dave Jones <dsj@...com>
Subject: Re: [PATCH net-next v5] net: ip: avoid OOM kills with large UDP
 sends over loopback

On Thu, Jun 24, 2021 at 10:57:24AM -0700, Jakub Kicinski wrote:
> Dave observed number of machines hitting OOM on the UDP send
> path. The workload seems to be sending large UDP packets over
> loopback. Since loopback has MTU of 64k kernel will try to
> allocate an skb with up to 64k of head space. This has a good
> chance of failing under memory pressure. What's worse if
> the message length is <32k the allocation may trigger an
> OOM killer.
> 
> This is entirely avoidable, we can use an skb with page frags.
> 
> af_unix solves a similar problem by limiting the head
> length to SKB_MAX_ALLOC. This seems like a good and simple
> approach. It means that UDP messages > 16kB will now
> use fragments if underlying device supports SG, if extra
> allocator pressure causes regressions in real workloads
> we can switch to trying the large allocation first and
> falling back.
> 
> v4: pre-calculate all the additions to alloclen so
>     we can be sure it won't go over order-2
> v5: s/< MAX/<= MAX/ ; reorder variable declaration

Jakub,

Can you please put changelog under "---" below SOB?
It gives no value to see all vX in the final git log.

Thanks

> 
> Reported-by: Dave Jones <dsj@...com>
> Signed-off-by: Jakub Kicinski <kuba@...nel.org>
> ---
>  net/ipv4/ip_output.c  | 32 ++++++++++++++++++--------------
>  net/ipv6/ip6_output.c | 32 +++++++++++++++++---------------
>  2 files changed, 35 insertions(+), 29 deletions(-)
> 
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index c3efc7d658f6..f5c398036efa 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -1050,11 +1050,11 @@ static int __ip_append_data(struct sock *sk,
>  		if (copy < length)
>  			copy = maxfraglen - skb->len;
>  		if (copy <= 0) {
> +			unsigned int alloclen, alloc_extra;
>  			char *data;
>  			unsigned int datalen;
>  			unsigned int fraglen;
>  			unsigned int fraggap;
> -			unsigned int alloclen;
>  			unsigned int pagedlen;
>  			struct sk_buff *skb_prev;
>  alloc_new_skb:
> @@ -1074,35 +1074,39 @@ static int __ip_append_data(struct sock *sk,
>  			fraglen = datalen + fragheaderlen;
>  			pagedlen = 0;
>  
> +			alloc_extra = hh_len + 15;
> +			alloc_extra += exthdrlen;
> +
> +			/* The last fragment gets additional space at tail.
> +			 * Note, with MSG_MORE we overallocate on fragments,
> +			 * because we have no idea what fragment will be
> +			 * the last.
> +			 */
> +			if (datalen == length + fraggap)
> +				alloc_extra += rt->dst.trailer_len;
> +
>  			if ((flags & MSG_MORE) &&
>  			    !(rt->dst.dev->features&NETIF_F_SG))
>  				alloclen = mtu;
> -			else if (!paged)
> +			else if (!paged &&
> +				 (fraglen + alloc_extra <= SKB_MAX_ALLOC ||
> +				  !(rt->dst.dev->features & NETIF_F_SG)))
>  				alloclen = fraglen;
>  			else {
>  				alloclen = min_t(int, fraglen, MAX_HEADER);
>  				pagedlen = fraglen - alloclen;
>  			}
>  
> -			alloclen += exthdrlen;
> -
> -			/* The last fragment gets additional space at tail.
> -			 * Note, with MSG_MORE we overallocate on fragments,
> -			 * because we have no idea what fragment will be
> -			 * the last.
> -			 */
> -			if (datalen == length + fraggap)
> -				alloclen += rt->dst.trailer_len;
> +			alloclen += alloc_extra;
>  
>  			if (transhdrlen) {
> -				skb = sock_alloc_send_skb(sk,
> -						alloclen + hh_len + 15,
> +				skb = sock_alloc_send_skb(sk, alloclen,
>  						(flags & MSG_DONTWAIT), &err);
>  			} else {
>  				skb = NULL;
>  				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
>  				    2 * sk->sk_sndbuf)
> -					skb = alloc_skb(alloclen + hh_len + 15,
> +					skb = alloc_skb(alloclen,
>  							sk->sk_allocation);
>  				if (unlikely(!skb))
>  					err = -ENOBUFS;
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index ff4f9ebcf7f6..afe887dd5e35 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -1551,11 +1551,11 @@ static int __ip6_append_data(struct sock *sk,
>  			copy = maxfraglen - skb->len;
>  
>  		if (copy <= 0) {
> +			unsigned int alloclen, alloc_extra;
>  			char *data;
>  			unsigned int datalen;
>  			unsigned int fraglen;
>  			unsigned int fraggap;
> -			unsigned int alloclen;
>  			unsigned int pagedlen;
>  alloc_new_skb:
>  			/* There's no room in the current skb */
> @@ -1582,17 +1582,28 @@ static int __ip6_append_data(struct sock *sk,
>  			fraglen = datalen + fragheaderlen;
>  			pagedlen = 0;
>  
> +			alloc_extra = hh_len;
> +			alloc_extra += dst_exthdrlen;
> +			alloc_extra += rt->dst.trailer_len;
> +
> +			/* We just reserve space for fragment header.
> +			 * Note: this may be overallocation if the message
> +			 * (without MSG_MORE) fits into the MTU.
> +			 */
> +			alloc_extra += sizeof(struct frag_hdr);
> +
>  			if ((flags & MSG_MORE) &&
>  			    !(rt->dst.dev->features&NETIF_F_SG))
>  				alloclen = mtu;
> -			else if (!paged)
> +			else if (!paged &&
> +				 (fraglen + alloc_extra <= SKB_MAX_ALLOC ||
> +				  !(rt->dst.dev->features & NETIF_F_SG)))
>  				alloclen = fraglen;
>  			else {
>  				alloclen = min_t(int, fraglen, MAX_HEADER);
>  				pagedlen = fraglen - alloclen;
>  			}
> -
> -			alloclen += dst_exthdrlen;
> +			alloclen += alloc_extra;
>  
>  			if (datalen != length + fraggap) {
>  				/*
> @@ -1602,30 +1613,21 @@ static int __ip6_append_data(struct sock *sk,
>  				datalen += rt->dst.trailer_len;
>  			}
>  
> -			alloclen += rt->dst.trailer_len;
>  			fraglen = datalen + fragheaderlen;
>  
> -			/*
> -			 * We just reserve space for fragment header.
> -			 * Note: this may be overallocation if the message
> -			 * (without MSG_MORE) fits into the MTU.
> -			 */
> -			alloclen += sizeof(struct frag_hdr);
> -
>  			copy = datalen - transhdrlen - fraggap - pagedlen;
>  			if (copy < 0) {
>  				err = -EINVAL;
>  				goto error;
>  			}
>  			if (transhdrlen) {
> -				skb = sock_alloc_send_skb(sk,
> -						alloclen + hh_len,
> +				skb = sock_alloc_send_skb(sk, alloclen,
>  						(flags & MSG_DONTWAIT), &err);
>  			} else {
>  				skb = NULL;
>  				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
>  				    2 * sk->sk_sndbuf)
> -					skb = alloc_skb(alloclen + hh_len,
> +					skb = alloc_skb(alloclen,
>  							sk->sk_allocation);
>  				if (unlikely(!skb))
>  					err = -ENOBUFS;
> -- 
> 2.31.1
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ