netdev - Re: [PATCH nf-next v8 2/3] net: netfilter: Add IPIP flowtable tx sw acceleration

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <aQqDnjv8KLtQJaOW@calendula>
Date: Tue, 4 Nov 2025 23:52:14 +0100
From: Pablo Neira Ayuso <pablo@...filter.org>
To: Lorenzo Bianconi <lorenzo@...nel.org>
Cc: "David S. Miller" <davem@...emloft.net>,
	David Ahern <dsahern@...nel.org>,
	Eric Dumazet <edumazet@...gle.com>,
	Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
	Simon Horman <horms@...nel.org>,
	Jozsef Kadlecsik <kadlec@...filter.org>,
	Shuah Khan <shuah@...nel.org>, Andrew Lunn <andrew+netdev@...n.ch>,
	Phil Sutter <phil@....cc>, Florian Westphal <fw@...len.de>,
	netdev@...r.kernel.org, netfilter-devel@...r.kernel.org,
	coreteam@...filter.org, linux-kselftest@...r.kernel.org
Subject: Re: [PATCH nf-next v8 2/3] net: netfilter: Add IPIP flowtable tx sw
 acceleration

On Thu, Oct 23, 2025 at 10:50:16AM +0200, Lorenzo Bianconi wrote:
[...]
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0355461960ce3c0db49e00a6f77f48b031a635dc..eb8058fd7139a2b5457008146f979590f9f03c1d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -897,6 +897,9 @@ struct net_device_path {
>  			};
>  
>  			u8	l3_proto;
> +			u8	tos;
> +			u8	ttl;
> +			__be16	df;
>  		} tun;
>  		struct {
>  			enum {
> diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
> index 6d00a8aa52584ad96d200683297c1b02bf1f6d4f..fe792f5a8f0528de021c27382b235688532614e4 100644
> --- a/include/net/netfilter/nf_flow_table.h
> +++ b/include/net/netfilter/nf_flow_table.h
> @@ -119,6 +119,9 @@ struct flow_offload_tunnel {
>  	};
>  
>  	u8	l3_proto;
> +	u8	tos;
> +	u8	ttl;
> +	__be16	df;

These fields are now included in the hash that is used for the lookup.
Is it intentional to include them here? For rx, we cannot know the ttl
of the received packet?

Maybe this needs to be moved after the placeholder:

        struct { }                      __hash;

>  };
>  
>  struct flow_offload_tuple {
[...]
> diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> index 76081d5d2f71c10e0c65e906b3fb2769e3ab1466..a66ffa0c7fbe780a9f9a545e42d44dfe408e7cb2 100644
> --- a/net/netfilter/nf_flow_table_ip.c
> +++ b/net/netfilter/nf_flow_table_ip.c
[...]
> @@ -533,6 +589,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
>  	struct flow_offload *flow;
>  	struct neighbour *neigh;
>  	struct rtable *rt;
> +	__be32 dest;
>  	int ret;
>  
>  	tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb);
> @@ -555,8 +612,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
>  
>  	dir = tuplehash->tuple.dir;
>  	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
> +	reply_tuple = &flow->tuplehash[!dir].tuple;

Nit: I'd suggest 'other_tuple' instead of 'reply_tuple', given this is
not strictly the reply tuple, just the tuple from the other direction.

> -	if (nf_flow_encap_push(skb, &flow->tuplehash[!dir].tuple) < 0)
> +	if (nf_flow_encap_push(state->net, skb, reply_tuple))
>  		return NF_DROP;
>  
>  	switch (tuplehash->tuple.xmit_type) {
> @@ -567,7 +625,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
>  			flow_offload_teardown(flow);
>  			return NF_DROP;
>  		}
> -		neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr));
> +		dest = reply_tuple->tun_num ? reply_tuple->tun.src_v4.s_addr
> +					    : reply_tuple->src_v4.s_addr;
> +		neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, dest));
>  		if (IS_ERR(neigh)) {
>  			flow_offload_teardown(flow);
>  			return NF_DROP;
> diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
> index bd5e9bf1ca393ab793976ba98a027b60f84882ba..cd0be2efe97596d0947621a5ea604373d5b61da8 100644
> --- a/net/netfilter/nf_flow_table_path.c
> +++ b/net/netfilter/nf_flow_table_path.c
> @@ -190,7 +190,43 @@ static bool nft_flowtable_find_dev(const struct net_device *dev,
>  	return found;
>  }
>  
> -static void nft_dev_forward_path(struct nf_flow_route *route,
> +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt,
> +					struct nf_flow_route *route,
> +					enum ip_conntrack_dir dir)
> +{
> +	struct dst_entry *tun_dst = NULL;
> +	struct flowi fl = {};
> +
> +	switch (nft_pf(pkt)) {
> +	case NFPROTO_IPV4:
> +		fl.u.ip4.daddr = route->tuple[!dir].in.tun.src_v4.s_addr;
> +		fl.u.ip4.saddr = route->tuple[!dir].in.tun.dst_v4.s_addr;
> +		fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex;
> +		fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
> +		fl.u.ip4.flowi4_mark = pkt->skb->mark;
> +		fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
> +		break;
> +	case NFPROTO_IPV6:
> +		fl.u.ip6.daddr = route->tuple[!dir].in.tun.src_v6;
> +		fl.u.ip6.saddr = route->tuple[!dir].in.tun.dst_v6;
> +		fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex;
> +		fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
> +		fl.u.ip6.flowi6_mark = pkt->skb->mark;
> +		fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
> +		break;
> +	}
> +
> +	nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt));
> +	if (!tun_dst)
> +		return -ENOENT;
> +
> +	nft_default_forward_path(route, tun_dst, dir);

This overrides the dst that was previously set here. Does this leak
the previous dst?

> +
> +	return 0;
> +}
> +
> +static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
> +				 struct nf_flow_route *route,
>  				 const struct nf_conn *ct,
>  				 enum ip_conntrack_dir dir,
>  				 struct nft_flowtable *ft)
> @@ -218,6 +254,12 @@ static void nft_dev_forward_path(struct nf_flow_route *route,
>  		route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6;
>  		route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6;
>  		route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto;
> +		route->tuple[!dir].in.tun.tos = info.tun.tos;
> +		route->tuple[!dir].in.tun.ttl = info.tun.ttl;
> +		route->tuple[!dir].in.tun.df = info.tun.df;
> +
> +		if (nft_flow_tunnel_update_route(pkt, route, dir))
> +			return;

If the tunnel route is not found (nft_flow_tunnel_update_route() fails)...

>  	}
>
>  	route->tuple[!dir].in.num_encaps = info.num_encaps;

... num_encaps is never set?

Would you also extend the selftest to combine IPIP with vlan? Thanks.

> @@ -274,9 +316,9 @@ int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
>  	nft_default_forward_path(route, other_dst, !dir);
>  
>  	if (route->tuple[dir].xmit_type	== FLOW_OFFLOAD_XMIT_NEIGH)
> -		nft_dev_forward_path(route, ct, dir, ft);
> +		nft_dev_forward_path(pkt, route, ct, dir, ft);
>  	if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
> -		nft_dev_forward_path(route, ct, !dir, ft);
> +		nft_dev_forward_path(pkt, route, ct, !dir, ft);
>  
>  	return 0;
>  }
> 
> -- 
> 2.51.0
>