linux-kernel - Re: [PATCH bpf-next v2 3/8] bpf, net, frags: Add bpf_ip_check

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Y/5X7BF9CDYZMuMl@google.com>
Date:   Tue, 28 Feb 2023 11:37:16 -0800
From:   Stanislav Fomichev <sdf@...gle.com>
To:     Daniel Xu <dxu@...uu.xyz>
Cc:     kuba@...nel.org, edumazet@...gle.com, davem@...emloft.net,
        dsahern@...nel.org, pabeni@...hat.com,
        linux-kernel@...r.kernel.org, netdev@...r.kernel.org,
        bpf@...r.kernel.org
Subject: Re: [PATCH bpf-next v2 3/8] bpf, net, frags: Add bpf_ip_check_defrag()
 kfunc

On 02/27, Daniel Xu wrote:
> This kfunc is used to defragment IPv4 packets. The idea is that if you
> see a fragmented packet, you call this kfunc. If the kfunc returns 0,
> then the skb has been updated to contain the entire reassembled packet.

> If the kfunc returns an error (most likely -EINPROGRESS), then it means
> the skb is part of a yet-incomplete original packet. A reasonable
> response to -EINPROGRESS is to drop the packet, as the ip defrag
> infrastructure is already hanging onto the frag for future reassembly.

> Care has been taken to ensure the prog skb remains valid no matter what
> the underlying ip_check_defrag() call does. This is in contrast to
> ip_defrag(), which may consume the skb if the skb is part of a
> yet-incomplete original packet.

> So far this kfunc is only callable from TC clsact progs.

> Signed-off-by: Daniel Xu <dxu@...uu.xyz>
> ---
>   include/net/ip.h           | 11 +++++
>   net/ipv4/Makefile          |  1 +
>   net/ipv4/ip_fragment.c     |  2 +
>   net/ipv4/ip_fragment_bpf.c | 98 ++++++++++++++++++++++++++++++++++++++
>   4 files changed, 112 insertions(+)
>   create mode 100644 net/ipv4/ip_fragment_bpf.c

> diff --git a/include/net/ip.h b/include/net/ip.h
> index c3fffaa92d6e..f3796b1b5cac 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -680,6 +680,7 @@ enum ip_defrag_users {
>   	IP_DEFRAG_VS_FWD,
>   	IP_DEFRAG_AF_PACKET,
>   	IP_DEFRAG_MACVLAN,
> +	IP_DEFRAG_BPF,
>   };

>   /* Return true if the value of 'user' is between 'lower_bond'
> @@ -693,6 +694,16 @@ static inline bool ip_defrag_user_in_between(u32  
> user,
>   }

>   int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
> +
> +#ifdef CONFIG_DEBUG_INFO_BTF
> +int register_ip_frag_bpf(void);
> +#else
> +static inline int register_ip_frag_bpf(void)
> +{
> +	return 0;
> +}
> +#endif
> +
>   #ifdef CONFIG_INET
>   struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb,  
> u32 user);
>   #else
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index 880277c9fd07..950efb166d37 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -65,6 +65,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
>   obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
>   obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
>   obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
> +obj-$(CONFIG_DEBUG_INFO_BTF) += ip_fragment_bpf.o

>   obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
>   		      xfrm4_output.o xfrm4_protocol.o
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index 959d2c4260ea..e3fda5203f09 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -759,5 +759,7 @@ void __init ipfrag_init(void)
>   	if (inet_frags_init(&ip4_frags))
>   		panic("IP: failed to allocate ip4_frags cache\n");
>   	ip4_frags_ctl_register();
> +	if (register_ip_frag_bpf())
> +		panic("IP: bpf: failed to register ip_frag_bpf\n");
>   	register_pernet_subsys(&ip4_frags_ops);
>   }
> diff --git a/net/ipv4/ip_fragment_bpf.c b/net/ipv4/ip_fragment_bpf.c
> new file mode 100644
> index 000000000000..a9e5908ed216
> --- /dev/null
> +++ b/net/ipv4/ip_fragment_bpf.c
> @@ -0,0 +1,98 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Unstable ipv4 fragmentation helpers for TC-BPF hook
> + *
> + * These are called from SCHED_CLS BPF programs. Note that it is allowed  
> to
> + * break compatibility for these functions since the interface they are  
> exposed
> + * through to BPF programs is explicitly unstable.
> + */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/ip.h>
> +#include <linux/filter.h>
> +#include <linux/netdevice.h>
> +#include <net/ip.h>
> +#include <net/sock.h>
> +
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global functions as their definitions will be in ip_fragment BTF");
> +
> +/* bpf_ip_check_defrag - Defragment an ipv4 packet
> + *
> + * This helper takes an skb as input. If this skb successfully  
> reassembles
> + * the original packet, the skb is updated to contain the original,  
> reassembled
> + * packet.
> + *
> + * Otherwise (on error or incomplete reassembly), the input skb remains
> + * unmodified.
> + *
> + * Parameters:
> + * @ctx		- Pointer to program context (skb)
> + * @netns	- Child network namespace id. If value is a negative signed
> + *		  32-bit integer, the netns of the device in the skb is used.
> + *
> + * Return:
> + * 0 on successfully reassembly or non-fragmented packet. Negative value  
> on
> + * error or incomplete reassembly.
> + */
> +int bpf_ip_check_defrag(struct __sk_buff *ctx, u64 netns)

Needs a __bpf_kfunc tag as well?

> +{
> +	struct sk_buff *skb = (struct sk_buff *)ctx;
> +	struct sk_buff *skb_cpy, *skb_out;
> +	struct net *caller_net;
> +	struct net *net;
> +	int mac_len;
> +	void *mac;
> +

[..]

> +	if (unlikely(!((s32)netns < 0 || netns <= S32_MAX)))
> +		return -EINVAL;

Can you explain what it does? Is it checking for -1 explicitly? Not sure
it works :-/

Maybe we can spell out the cases explicitly?

if (unlikely(
	     ((s32)netns < 0 && netns != S32_MAX) || /* -1 */
	     netns > U32_MAX /* higher 4 bytes */
	    )
	return -EINVAL;

> +
> +	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
> +	if ((s32)netns < 0) {
> +		net = caller_net;
> +	} else {
> +		net = get_net_ns_by_id(caller_net, netns);
> +		if (unlikely(!net))
> +			return -EINVAL;
> +	}
> +
> +	mac_len = skb->mac_len;
> +	skb_cpy = skb_copy(skb, GFP_ATOMIC);
> +	if (!skb_cpy)
> +		return -ENOMEM;
> +
> +	skb_out = ip_check_defrag(net, skb_cpy, IP_DEFRAG_BPF);
> +	if (IS_ERR(skb_out))
> +		return PTR_ERR(skb_out);
> +
> +	skb_morph(skb, skb_out);
> +	kfree_skb(skb_out);
> +
> +	/* ip_check_defrag() does not maintain mac header, so push empty header
> +	 * in so prog sees the correct layout. The empty mac header will be
> +	 * later pulled from cls_bpf.
> +	 */
> +	mac = skb_push(skb, mac_len);
> +	memset(mac, 0, mac_len);
> +	bpf_compute_data_pointers(skb);
> +
> +	return 0;
> +}
> +
> +__diag_pop()
> +
> +BTF_SET8_START(ip_frag_kfunc_set)
> +BTF_ID_FLAGS(func, bpf_ip_check_defrag, KF_CHANGES_PKT)
> +BTF_SET8_END(ip_frag_kfunc_set)
> +
> +static const struct btf_kfunc_id_set ip_frag_bpf_kfunc_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &ip_frag_kfunc_set,
> +};
> +
> +int register_ip_frag_bpf(void)
> +{
> +	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
> +					 &ip_frag_bpf_kfunc_set);
> +}
> --
> 2.39.1