Message-ID: <ZWirZo4FrvZOi1Ik@hog>
Date: Thu, 30 Nov 2023 16:33:58 +0100
From: Sabrina Dubroca <sd@...asysnail.net>
To: Christian Hopps <chopps@...pps.org>
Cc: devel@...ux-ipsec.org, Steffen Klassert <steffen.klassert@...unet.com>,
netdev@...r.kernel.org, Christian Hopps <chopps@...n.net>
Subject: Re: [RFC ipsec-next v2 8/8] iptfs: impl: add new iptfs xfrm mode impl
2023-11-12, 22:52:19 -0500, Christian Hopps wrote:
> From: Christian Hopps <chopps@...n.net>
>
> Add a new xfrm mode implementing AggFrag/IP-TFS from RFC9347.
>
> This utilizes the new xfrm_mode_cbs to implement demand-driven IP-TFS
> functionality. This functionality can be used to increase bandwidth
> utilization through small packet aggregation, as well as help solve PMTU
> issues through it's efficient use of fragmentation.
>
> Link: https://www.rfc-editor.org/rfc/rfc9347.txt
>
> Signed-off-by: Christian Hopps <chopps@...n.net>
> ---
> include/net/iptfs.h | 18 +
> net/xfrm/Makefile | 1 +
> net/xfrm/trace_iptfs.h | 224 ++++
> net/xfrm/xfrm_iptfs.c | 2696 ++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 2939 insertions(+)
> create mode 100644 include/net/iptfs.h
> create mode 100644 net/xfrm/trace_iptfs.h
> create mode 100644 net/xfrm/xfrm_iptfs.c
>
> diff --git a/include/net/iptfs.h b/include/net/iptfs.h
> new file mode 100644
> index 000000000000..d8f2e494f251
> --- /dev/null
> +++ b/include/net/iptfs.h
Is this header needed? It's only included by net/xfrm/xfrm_iptfs.c,
why not put those #defines directly in the file?
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _NET_IPTFS_H
> +#define _NET_IPTFS_H
> +
> +#include <linux/types.h>
> +#include <linux/ip.h>
> +
> +#define IPTFS_SUBTYPE_BASIC 0
> +#define IPTFS_SUBTYPE_CC 1
> +#define IPTFS_SUBTYPE_LAST IPTFS_SUBTYPE_CC
_LAST is never used.
> +#define IPTFS_CC_FLAGS_ECN_CE 0x1
> +#define IPTFS_CC_FLAGS_PLMTUD 0x2
Not used either.
> +extern void xfrm_iptfs_get_rtt_and_delays(struct ip_iptfs_cc_hdr *cch, u32 *rtt,
> + u32 *actual_delay, u32 *xmit_delay);
Not implemented in this series, drop this.
[...]
> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
> new file mode 100644
> index 000000000000..65f7acdbe6a8
> --- /dev/null
> +++ b/net/xfrm/xfrm_iptfs.c
[...]
> +struct sk_buff *iptfs_pskb_add_frags(struct sk_buff *tpl,
nit: static? afaict it's not used outside this file.
> + struct skb_frag_walk *walk, u32 off,
> + u32 len, struct skb_seq_state *st,
> + u32 copy_len)
> +{
[...]
> +
> +/**
> + * iptfs_input_ordered() - handle next in order IPTFS payload.
> + *
> + * Process the IPTFS payload in `skb` and consume it afterwards.
> + */
> +static int iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
> +{
Can we try to not introduce a worse problem than xfrm_input already
is? 326 lines and 20+ local variables is way too much. And then it
calls another 200+ line function...
I did try to understand what the main loop does but I got completely
lost the 3 times I tried :/
> +static u32 __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
> + struct list_head *list)
> +
nit: extra blank line
> +{
[...]
> +/**
> + * iptfs_input() - handle receipt of iptfs payload
> + * @x: xfrm state.
> + * @skb: the packet.
> + *
> + * We have an IPTFS payload order it if needed, then process newly in order
> + * packetsA.
typo? "packetsA"
[...]
> +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
> + * (i.e., aggregating or fragmenting as appropriate).
> + * This is set in dst->output for an SA.
> + */
> +static int iptfs_output_collect(struct net *net, struct sock *sk,
> + struct sk_buff *skb)
> +{
> + struct dst_entry *dst = skb_dst(skb);
> + struct xfrm_state *x = dst->xfrm;
> + struct xfrm_iptfs_data *xtfs = x->mode_data;
> + struct sk_buff *segs, *nskb;
> + u32 count, qcount;
> + u32 pmtu = 0;
> + bool ok = true;
> + bool was_gso;
> +
> + /* We have hooked into dst_entry->output which means we have skipped the
> + * protocol specific netfilter (see xfrm4_output, xfrm6_output).
> + * when our timer runs we will end up calling xfrm_output directly on
> + * the encapsulated traffic.
> + *
> + * For both cases this is the NF_INET_POST_ROUTING hook which allows
> + * changing the skb->dst entry which then may not be xfrm based anymore
> + * in which case a REROUTED flag is set. and dst_output is called.
> + *
> + * For IPv6 we are also skipping fragmentation handling for local
> + * sockets, which may or may not be good depending on our tunnel DF
> + * setting. Normally with fragmentation supported we want to skip this
> + * fragmentation.
> + */
> +
> + BUG_ON(!xtfs);
Or drop the packet and add a DEBUG_NET_WARN_ON_ONCE? This should never
happen, but why crash the system when we have a way to deal with this
error?
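Something like this maybe (untested sketch, -EINVAL is just a
placeholder error code):

	if (!xtfs) {
		/* not an IPTFS SA, nothing sane we can do with this skb */
		DEBUG_NET_WARN_ON_ONCE(1);
		kfree_skb(skb);
		return -EINVAL;
	}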
> +
> + if (xtfs->cfg.dont_frag)
> + pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
> +
> + /* Break apart GSO skbs. If the queue is nearing full then we want the
> + * accounting and queuing to be based on the individual packets not on the
> + * aggregate GSO buffer.
> + */
> + was_gso = skb_is_gso(skb);
> + if (!was_gso) {
> + segs = skb;
> + } else {
> + segs = skb_gso_segment(skb, 0);
> + if (IS_ERR_OR_NULL(segs)) {
> + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
> + kfree_skb(skb);
> + return PTR_ERR(segs);
> + }
> + consume_skb(skb);
> + skb = NULL;
> + }
> +
> + count = 0;
> + qcount = 0;
nit: both of those get incremented through the main loop but never read
> +
> + /* We can be running on multiple cores and from the network softirq or
> + * from user context depending on where the packet is coming from.
> + */
> + spin_lock_bh(&x->lock);
> +
> + skb_list_walk_safe(segs, skb, nskb) {
> + skb_mark_not_on_list(skb);
> + count++;
> +
> + /* Once we drop due to no queue space we continue to drop the
> + * rest of the packets from that GRO.
> + */
> + if (!ok) {
> +nospace:
> + trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso);
> + XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTNOQSPACE);
> + kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
> + continue;
> + }
> +
> + /* If the user indicated no iptfs fragmenting check before
> + * enqueue.
> + */
> + if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) {
> + trace_iptfs_too_big(skb, xtfs, pmtu, was_gso);
> + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
> + continue;
> + }
> +
> + /* Enqueue to send in tunnel */
> +
nit: unneeded blank line
> + ok = iptfs_enqueue(xtfs, skb);
> + if (!ok)
> + goto nospace;
> +
> + trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso);
> + qcount++;
> + }
> +
> + /* Start a delay timer if we don't have one yet */
> + if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
> + /* softirq blocked lest the timer fire and interrupt us */
> + BUG_ON(!in_interrupt());
Why is that a fatal condition?
> + hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns,
> + IPTFS_HRTIMER_MODE);
> +
> + xtfs->iptfs_settime = ktime_get_raw_fast_ns();
> + trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns);
> + }
> +
> + spin_unlock_bh(&x->lock);
> + return 0;
> +}
> +
[...]
> +static int iptfs_copy_create_frags(struct sk_buff **skbp,
> + struct xfrm_iptfs_data *xtfs, u32 mtu)
> +{
[...]
> + /* prepare the initial fragment with an iptfs header */
> + iptfs_output_prepare_skb(skb, 0);
> +
> + /* Send all but last fragment. */
> + list_for_each_entry_safe(skb, nskb, &sublist, list) {
> + skb_list_del_init(skb);
> + xfrm_output(NULL, skb);
Should we stop if xfrm_output fails? Or is it still useful to send the
rest of the iptfs frags if we lose one in the middle?
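If we decide the rest of the fragments are useless once one is lost,
maybe something like this (rough sketch, err would need to be declared
and initialized to 0 with the other locals):

	list_for_each_entry_safe(skb, nskb, &sublist, list) {
		skb_list_del_init(skb);
		if (err)
			/* a previous fragment failed, the receiver can't
			 * reassemble the inner packet anymore
			 */
			kfree_skb(skb);
		else
			err = xfrm_output(NULL, skb);
	}

	return err;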
> + }
> +
> + return 0;
> +}
> +
[...]
> +static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs,
> + u32 mtu)
> +{
> + struct sk_buff *skb = *skbp;
> + int err;
> +
> + /* Classic ESP skips the don't fragment ICMP error if DF is clear on
> + * the inner packet or ignore_df is set. Otherwise it will send an ICMP
> + * or local error if the inner packet won't fit it's MTU.
> + *
> + * With IPTFS we do not care about the inner packet DF bit. If the
> + * tunnel is configured to "don't fragment" we error back if things
> + * don't fit in our max packet size. Otherwise we iptfs-fragment as
> + * normal.
> + */
> +
> + /* The opportunity for HW offload has ended */
> + if (skb->ip_summed == CHECKSUM_PARTIAL) {
> + err = skb_checksum_help(skb);
> + if (err)
> + return err;
> + }
> +
> + /* We've split these up before queuing */
> + BUG_ON(skb_is_gso(skb));
Drop and DEBUG_NET_WARN_ON_ONCE?
> +
> + trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb));
> +
> + /* Simple case -- it fits. `mtu` accounted for all the overhead
> + * including the basic IPTFS header.
> + */
> + if (skb->len <= mtu) {
> + iptfs_output_prepare_skb(skb, 0);
> + return 0;
> + }
> +
> + BUG_ON(xtfs->cfg.dont_frag);
and here?
> + if (iptfs_first_should_copy(skb, mtu))
> + return iptfs_copy_create_frags(skbp, xtfs, mtu);
Since we end up copying anyway, drop this (and
iptfs_first_should_copy). You can introduce the optimization later on.
> + /* For now we always copy */
> + return iptfs_copy_create_frags(skbp, xtfs, mtu);
> +}
> +
> +static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp,
> + struct sk_buff *child)
> +{
> + u32 fllen = 0;
> +
> + BUG_ON(!skb_has_frag_list(child));
Not needed, this was tested just before calling this function.
> +
> + /* It might be possible to account for a frag list in addition to page
> + * fragment if it's a valid state to be in. The page fragments size
> + * should be kept as data_len so only the frag_list size is removed,
> + * this must be done above as well took
> + */
> + BUG_ON(skb_shinfo(child)->nr_frags);
Again not worth crashing the system?
> + *nextp = skb_shinfo(child)->frag_list;
> + while (*nextp) {
> + fllen += (*nextp)->len;
> + nextp = &(*nextp)->next;
> + }
> + skb_frag_list_init(child);
> + BUG_ON(fllen > child->data_len);
> + child->len -= fllen;
> + child->data_len -= fllen;
> +
> + return nextp;
> +}
[...]
> +static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
> +{
> + struct xfrm_iptfs_data *xtfs = x->mode_data;
> + u32 payload_mtu = xtfs->payload_mtu;
> + struct sk_buff *skb, *skb2, **nextp;
> + struct skb_shared_info *shi, *shi2;
> +
> + /* For now we are just outputting packets as fast as we can, so if we
> + * are fragmenting we will do so until the last inner packet has been
> + * consumed.
> + *
> + * When we are fragmenting we need to output all outer packets that
> + * contain the fragments of a single inner packet, consecutively (ESP
> + * seq-wise). So we need a lock to keep another CPU from sending the
> + * next batch of packets (it's `list`) and trying to output those, while
> + * we output our `list` resuling with interleaved non-spec-client inner
> + * packet streams. Thus we need to lock the IPTFS output on a per SA
> + * basis while we process this list.
> + */
This talks about a lock but I don't see one. What am I missing?
> +
> + /* NOTE: for the future, for timed packet sends, if our queue is not
> + * growing longer (i.e., we are keeping up) and a packet we are about to
> + * fragment will not fragment in then next outer packet, we might consider
> + * holding on to it to send whole in the next slot. The question then is
> + * does this introduce a continuous delay in the inner packet stream
> + * with certain packet rates and sizes?
> + */
> +
> + /* and send them on their way */
> +
> + while ((skb = __skb_dequeue(list))) {
> + struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
> + u32 mtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
> + bool share_ok = true;
> + int remaining;
> +
> + /* protocol comes to us cleared sometimes */
> + skb->protocol = x->outer_mode.family == AF_INET ?
> + htons(ETH_P_IP) :
> + htons(ETH_P_IPV6);
> +
> + if (payload_mtu && payload_mtu < mtu)
> + mtu = payload_mtu;
Isn't that iptfs_get_cur_pmtu?
[...]
> +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
> +{
> + struct sk_buff_head list;
> + struct xfrm_iptfs_data *xtfs;
> + struct xfrm_state *x;
> + time64_t settime;
> + size_t osize;
> +
> + xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
> + x = xtfs->x;
> +
> + /* Process all the queued packets
> + *
> + * softirq execution order: timer > tasklet > hrtimer
> + *
> + * Network rx will have run before us giving one last chance to queue
> + * ingress packets for us to process and transmit.
> + */
> +
> + spin_lock(&x->lock);
> + __skb_queue_head_init(&list);
> + skb_queue_splice_init(&xtfs->queue, &list);
> + osize = xtfs->queue_size;
Unused variable?
[...]
> +static int iptfs_user_init(struct net *net, struct xfrm_state *x,
> + struct nlattr **attrs)
> +{
> + struct xfrm_iptfs_data *xtfs = x->mode_data;
> + struct xfrm_iptfs_config *xc;
> +
> + if (x->props.mode != XFRM_MODE_IPTFS)
> + return -EINVAL;
Is that necessary? This only gets called via ->user_init for this
mode.
> + xc = &xtfs->cfg;
> + xc->reorder_win_size = net->xfrm.sysctl_iptfs_rewin;
> + xc->max_queue_size = net->xfrm.sysctl_iptfs_maxqsize;
> + xc->init_delay_us = net->xfrm.sysctl_iptfs_idelay;
> + xc->drop_time_us = net->xfrm.sysctl_iptfs_drptime;
> +
> + if (attrs[XFRMA_IPTFS_DONT_FRAG])
> + xc->dont_frag = true;
> + if (attrs[XFRMA_IPTFS_REORD_WIN])
> + xc->reorder_win_size =
> + nla_get_u16(attrs[XFRMA_IPTFS_REORD_WIN]);
> + /* saved array is for saving 1..N seq nums from wantseq */
> + if (xc->reorder_win_size)
> + xtfs->w_saved = kcalloc(xc->reorder_win_size,
> + sizeof(*xtfs->w_saved), GFP_KERNEL);
We probably need a reasonable bound on reorder_win_size so that we
don't try to allocate crazy amounts of memory here.
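e.g. something like this (sketch: the limit name and value are made up,
and it assumes an extack eventually gets plumbed into ->user_init):

#define IPTFS_MAX_REORDER_WINDOW 255

	if (attrs[XFRMA_IPTFS_REORD_WIN]) {
		xc->reorder_win_size =
			nla_get_u16(attrs[XFRMA_IPTFS_REORD_WIN]);
		if (xc->reorder_win_size > IPTFS_MAX_REORDER_WINDOW) {
			NL_SET_ERR_MSG(extack,
				       "Reorder window size too large");
			return -EINVAL;
		}
	}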
> + if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
> + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
> + if (!xc->pkt_size)
> + xtfs->payload_mtu = 0;
That's already set to 0 via kzalloc, right? So passing 0 as
XFRMA_IPTFS_PKT_SIZE is equivalent to not providing it?
> + else if (xc->pkt_size > x->props.header_len)
> + xtfs->payload_mtu = xc->pkt_size - x->props.header_len;
> + else
> + return -EINVAL;
This could probably use an extack to explain why the value was rejected.
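Something like this, if extack is available here (sketch):

		else {
			NL_SET_ERR_MSG(extack,
				       "pkt_size must be 0 or larger than the ESP/IPTFS overhead");
			return -EINVAL;
		}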
> + }
> + if (attrs[XFRMA_IPTFS_MAX_QSIZE])
> + xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
> + if (attrs[XFRMA_IPTFS_DROP_TIME])
> + xc->drop_time_us = nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]);
> + if (attrs[XFRMA_IPTFS_INIT_DELAY])
> + xc->init_delay_us = nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]);
> +
> + xtfs->ecn_queue_size = (u64)xc->max_queue_size * 95 / 100;
> + xtfs->drop_time_ns = xc->drop_time_us * NSECS_IN_USEC;
> + xtfs->init_delay_ns = xc->init_delay_us * NSECS_IN_USEC;
Can't we get rid of the _us version? Why store both in kernel memory?
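i.e. keep only the _ns values and convert back when dumping, roughly
(sketch, I haven't looked closely at iptfs_copy_to_user):

	/* in iptfs_copy_to_user(), sketch */
	ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME,
			  div_u64(xtfs->drop_time_ns, NSECS_IN_USEC));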
[...]
> +static void iptfs_delete_state(struct xfrm_state *x)
> +{
> + struct xfrm_iptfs_data *xtfs = x->mode_data;
> +
> + if (IS_ERR_OR_NULL(xtfs))
Can mode_data ever be an error pointer?
> + return;
> +
> + spin_lock(&xtfs->drop_lock);
> + hrtimer_cancel(&xtfs->iptfs_timer);
> + hrtimer_cancel(&xtfs->drop_timer);
> + spin_unlock(&xtfs->drop_lock);
> +
> + kfree_sensitive(xtfs->w_saved);
> + kfree_sensitive(xtfs);
> +}
> +
> +static const struct xfrm_mode_cbs iptfs_mode_cbs = {
> + .owner = THIS_MODULE,
> + .create_state = iptfs_create_state,
> + .delete_state = iptfs_delete_state,
> + .user_init = iptfs_user_init,
> + .copy_to_user = iptfs_copy_to_user,
> + .get_inner_mtu = iptfs_get_inner_mtu,
> + .input = iptfs_input,
> + .output = iptfs_output_collect,
> + .prepare_output = iptfs_prepare_output,
> +};
> +
> +static int __init xfrm_iptfs_init(void)
> +{
> + int err;
> +
> + pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n");
> +
> + err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs);
> + if (err < 0)
> + pr_info("%s: can't register IP-TFS\n", __func__);
> +
> + return err;
> +}
> +
> +static void __exit xfrm_iptfs_fini(void)
> +{
> + xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS);
> +}
If the module is unloaded, existing xfrm states will be left in place
but silently broken?
--
Sabrina