Message-ID: <Zeh2YFt4AWF7oNzE@hog>
Date: Wed, 6 Mar 2024 14:57:52 +0100
From: Sabrina Dubroca <sd@...asysnail.net>
To: Christian Hopps <chopps@...pps.org>
Cc: Steffen Klassert <steffen.klassert@...unet.com>, netdev@...r.kernel.org,
	Christian Hopps <chopps@...n.net>, devel@...ux-ipsec.org
Subject: [PATCH ipsec-next v1 8/8] iptfs: impl: add new iptfs xfrm mode impl

2024-02-19, 03:57:35 -0500, Christian Hopps wrote:
>  net/xfrm/Makefile      |    1 +
>  net/xfrm/trace_iptfs.h |  218 ++++
>  net/xfrm/xfrm_iptfs.c  | 2762 ++++++++++++++++++++++++++++++++++++++++

This should probably be split into a few patches (maybe user config,
output, reordering, input, tracepoints) to help reviewers and keep the
replies to a more reasonable length. I dropped some of the feedback I
had because this email was getting ridiculously long.

> diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
> new file mode 100644

[...]
> +static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len,
> +				       bool l3resv)
> +{
> +	struct sk_buff *skb;
> +	u32 resv;
> +
> +	if (!l3resv) {
> +		resv = XFRM_IPTFS_MIN_L2HEADROOM;
> +	} else {
> +		resv = skb_headroom(tpl);
> +		if (resv < XFRM_IPTFS_MIN_L3HEADROOM)
> +			resv = XFRM_IPTFS_MIN_L3HEADROOM;
> +	}
> +
> +	skb = alloc_skb(len + resv, GFP_ATOMIC);
> +	if (!skb) {
> +		XFRM_INC_STATS(dev_net(tpl->dev), LINUX_MIB_XFRMINERROR);

Counting this as LINUX_MIB_XFRMINERROR is confusing, as
iptfs_alloc_skb can be called from the output path as well.

> +		return NULL;
> +	}
> +
> +	skb_reserve(skb, resv);
> +
> +	/* Code from __copy_skb_header() -- we do not want any of the
> +	 * tpl->headers copied over, so we aren't using `skb_copy_header()`.
> +	 */

This is a bit of a bad sign for the implementation. It also worries
me, as this may not be updated when changes are made to
__copy_skb_header().

> +	skb->tstamp = tpl->tstamp;
> +	skb->dev = tpl->dev;
> +	memcpy(skb->cb, tpl->cb, sizeof(skb->cb));
> +	skb_dst_copy(skb, tpl);
> +	__skb_ext_copy(skb, tpl);
> +	__nf_copy(skb, tpl, false);
> +
> +	return skb;
> +}



> +static void skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag)
> +static void skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset,
> +static u32 __skb_reset_frag_walk(struct skb_frag_walk *walk, u32 offset)
> +static bool skb_can_add_frags(const struct sk_buff *skb,
> +static int skb_add_frags(struct sk_buff *skb, struct skb_frag_walk *walk,

That's a lot of new helpers. Is there no existing API that fits IPTFS's needs?


> +static int skb_copy_bits_seq(struct skb_seq_state *st, int offset, void *to, int len)

Probably belongs in net/core/skbuff.c (if this is really the right way
to implement iptfs).


[...]
> +static int iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
> +{
[...]
> +	skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
> +
> +	/* Get the IPTFS header and validate it */
> +
> +	if (skb_copy_bits_seq(&skbseq, 0, ipth, sizeof(*ipth))) {

Could you use pskb_may_pull here, like the rest of networking does
when parsing headers from skbs?
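
Untested sketch, and assuming ipth can point directly into the linear
data instead of being a copy on the stack:

	if (!pskb_may_pull(skb, sizeof(*ipth))) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
		goto done;
	}
	ipth = (void *)skb->data;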

> +		XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
> +		goto done;
> +	}
> +	data = sizeof(*ipth);
> +
> +	trace_iptfs_egress_recv(skb, xtfs, htons(ipth->block_offset));
> +
> +	/* Set data past the basic header */
> +	if (ipth->subtype == IPTFS_SUBTYPE_CC) {
> +		/* Copy the rest of the CC header */
> +		remaining = sizeof(iptcch) - sizeof(*ipth);
> +		if (skb_copy_bits_seq(&skbseq, data, ipth + 1, remaining)) {
> +			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
> +			goto done;
> +		}
> +		data += remaining;
> +	} else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) {
> +		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
> +		goto done;
> +	}

[...]
> +		} else {
> +			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
> +			goto done;
> +		}
> +
> +		if (unlikely(skbseq.stepped_offset)) {

I don't think users of skb_seq_* should look into the internal state
of the skbseq.


[...]
> +				/* all pointers could be changed now reset walk */
> +				skb_abort_seq_read(&skbseq);
> +				skb_prepare_seq_read(skb, data, tail, &skbseq);

The fact that you have to reset the walk indicates that this is
probably not the right way to implement this.

> +			} else if (skb->head_frag &&
> +				   /* We have the IP header right now */
> +				   remaining >= iphlen) {
> +				fragwalk = &_fragwalk;
> +				skb_prepare_frag_walk(skb, data, fragwalk);
> +				defer = skb;
> +				skb = NULL;
> +			} else {
> +				/* We couldn't reuse the input skb so allocate a
> +				 * new one.
> +				 */
> +				defer = skb;
> +				skb = NULL;
> +			}

I still don't get at all how the overall defrag/reassembly process is
implemented. I can tell you're walking the skb looking for IP headers,
and (I suppose) carving out new skbs containing a single packet built
from that header and all the payload that goes with it (and then
saving the last chunk to merge with the next incoming packet), but I
don't understand how.

This chunk looks like it might be an optimization, but I'm not sure.
If that's the case, I'd suggest removing it for now. For the initial
submission it would be nice to have a slightly dumber version that
reviewers can fully understand; you can add the optimizations back
once the code is merged.



[...]
> +/* ------------------------------- */
> +/* Input (Egress) Re-ordering Code */
> +/* ------------------------------- */

nit: ingress? The whole patch seems to mix up ingress/egress and
send/receive.

[...]
> +static u32 __reorder_future_fits(struct xfrm_iptfs_data *xtfs,
> +				 struct sk_buff *inskb,
> +				 struct list_head *freelist, u32 *fcount)
> +{
[...]
> +	BUG_ON(distance >= nslots);

Really not needed. I saw what you wrote about assert philosophy, but
that's not how BUG_ON is used in the kernel. DEBUG_NET_WARN_ON_ONCE
is a better fit for conditions that really should never happen.
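
ie. the same check, but only compiled in with CONFIG_DEBUG_NET:

	DEBUG_NET_WARN_ON_ONCE(distance >= nslots);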

> +	if (xtfs->w_saved[index].skb) {
> +		/* a dup of a future */
> +		list_add_tail(&inskb->list, freelist);

Why not just free it immediately?
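
Unless there's a context problem I'm missing, a plain

	kfree_skb(inskb);

here would do, and the freelist plumbing could go away.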


> +static u32 __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
> +				   struct sk_buff *inskb,
> +				   struct list_head *list,
> +				   struct list_head *freelist, u32 *fcount)
> +{

[...]
> +	/* ex: slot count is 4, array size is 3 savedlen is 2, slot 0 is the
> +	 * missing sequence number.

It would help to show the state of the reorder window (and all the
related state) at the end of this function (and maybe intermediate
states, after the loop and again after the following if block).

I also struggled to parse the formatting of the diagrams (for example,
what [b] and [-] and --- mean).


> +	 * the final slot at savedlen (index savedlen - 1) is always occupied.
> +	 *
> +	 * beyond is "beyond array size" not savedlen.
> +	 *
> +	 *          +--------- array length (savedlen == 2)
> +	 *          |   +----- array size (nslots - 1 == 3)
> +	 *          |   |   +- window boundary (nslots == 4)
> +	 *          V   V | V
> +	 *                |
> +	 *  0   1   2   3 |   slot number
> +	 * ---  0   1   2 |   array index
> +	 *     [b] [c] : :|   array
> +	 *                |
> +	 * "2" "3" "4" "5"|*6*  seq numbers
> +	 *
> +	 * We receive seq number 6
> +	 * distance == 4 [inseq(6) - w_wantseq(2)]
> +	 * newslot == distance
> +	 * index == 3 [distance(4) - 1]
> +	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
> +	 * shifting == 1 [min(savedlen(2), beyond(1)]
> +	 * slot0_skb == [b], and should match w_wantseq

A separator between each example (maybe a long line of '*') would be
helpful.

> +	 *
> +	 *                +--- window boundary (nslots == 4)
> +	 *  0   1   2   3 | 4   slot number
> +	 * ---  0   1   2 | 3   array index
> +	 *     [b] : : : :|     array
> +	 * "2" "3" "4" "5" *6*  seq numbers
> +	 *
> +	 * We receive seq number 6
> +	 * distance == 4 [inseq(6) - w_wantseq(2)]
> +	 * newslot == distance
> +	 * index == 3 [distance(4) - 1]
> +	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
> +	 * shifting == 1 [min(savedlen(1), beyond(1)]
> +	 * slot0_skb == [b] and should match w_wantseq
> +	 *
> +	 *                +-- window boundary (nslots == 4)
> +	 *  0   1   2   3 | 4   5   6   slot number
> +	 * ---  0   1   2 | 3   4   5   array index
> +	 *     [-] [c] : :|             array
> +	 * "2" "3" "4" "5" "6" "7" *8*  seq numbers
> +	 *
> +	 * savedlen = 2, beyond = 3
> +	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
> +	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]

I'm not quite sure what "iter 1" and "iter 2" refer to.

> +	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
> +	 *
> +	 * We receive seq number 8
> +	 * distance == 6 [inseq(8) - w_wantseq(2)]
> +	 * newslot == distance
> +	 * index == 5 [distance(6) - 1]
> +	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
> +	 * shifting == 2 [min(savedlen(2), beyond(3)]
> +	 *
> +	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
> +	 */
> +
> +	/* Now send any packets that are being shifted out of saved, and account
> +	 * for missing packets that are exiting the window as we shift it.
> +	 */
> +
> +	/* If savedlen > beyond we are shifting some, else all. */
> +	shifting = min(savedlen, beyond);
> +
> +	/* slot0 is the buf that just shifted out and into slot0 */
> +	slot0 = NULL;
> +	s0seq = wantseq;
> +	last_drop_seq = s0seq;

Set but never read.

> +	wnext = xtfs->w_saved;
> +	for (slot = 1; slot <= shifting; slot++, wnext++) {
> +		/* handle what was in slot0 before we occupy it */
> +		if (!slot0) {
> +			last_drop_seq = s0seq;
> +			missed++;

Set but never read.

> +		} else {
> +			list_add_tail(&slot0->list, list);
> +			count++;
> +		}
> +		s0seq++;
> +		slot0 = wnext->skb;
> +		wnext->skb = NULL;
> +	}

I wonder if this code would become more readable by "wasting" an array
element for slot0 (which can never be set except temporarily while
shifting and reordering packets). This function is more than 50%
comments :/

> +	/* slot0 is now either NULL (in which case it's what we now are waiting
> +	 * for, or a buf in which case we need to handle it like we received it;
> +	 * however, we may be advancing past that buffer as well..
> +	 */
> +
> +	/* Handle case where we need to shift more than we had saved, slot0 will
> +	 * be NULL iff savedlen is 0, otherwise slot0 will always be
> +	 * non-NULL b/c we shifted the final element, which is always set if
> +	 * there is any saved, into slot0.
> +	 */
> +	if (savedlen < beyond) {
> +		extra_drops = beyond - savedlen;
> +		if (savedlen == 0) {
> +			BUG_ON(slot0);
> +			s0seq += extra_drops;
> +			last_drop_seq = s0seq - 1;
> +		} else {
> +			extra_drops--; /* we aren't dropping what's in slot0 */
> +			BUG_ON(!slot0);
> +			list_add_tail(&slot0->list, list);
> +			/* if extra_drops then we are going past this slot0
> +			 * so we can safely advance last_drop_seq
> +			 */
> +			if (extra_drops)
> +				last_drop_seq = s0seq + extra_drops;
> +			s0seq += extra_drops + 1;
> +			count++;
> +		}
> +		missed += extra_drops;
> +		slot0 = NULL;
> +		/* slot0 has had an empty slot pushed into it */
> +	}
> +	(void)last_drop_seq;	/* we want this for CC code */

Ewww. Please no breadcrumbs.

> +
> +	/* Remove the entries */

This comment doesn't explain much...

> +	__vec_shift(xtfs, beyond);
> +
> +	/* Advance want seq */

and this one even less.

> +	xtfs->w_wantseq += beyond;
> +
> +	/* Process drops here when implementing congestion control */

[...]
> +static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
> +{
> +	struct list_head freelist, list;
> +	struct xfrm_iptfs_data *xtfs = x->mode_data;
> +	struct sk_buff *next;
> +	u32 count, fcount;
> +
> +	/* Fast path for no reorder window. */
> +	if (xtfs->cfg.reorder_win_size == 0) {
> +		iptfs_input_ordered(x, skb);
> +		goto done;
> +	}
> +
> +	/* Fetch list of in-order packets from the reordering window as well as
> +	 * a list of buffers we need to now free.
> +	 */
> +	INIT_LIST_HEAD(&list);
> +	INIT_LIST_HEAD(&freelist);
> +	fcount = 0;
> +
> +	spin_lock(&xtfs->drop_lock);
> +	count = iptfs_input_reorder(xtfs, skb, &list, &freelist, &fcount);

iptfs_input_reorder isn't that long; it would be less messy to just
inline it here.

> +	spin_unlock(&xtfs->drop_lock);
> +
> +	if (count) {

Do we really need those counts? list_for_each_entry_safe deals just
fine with an empty list.
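
ie. this could simply be:

	list_for_each_entry_safe(skb, next, &list, list) {
		skb_list_del_init(skb);
		iptfs_input_ordered(x, skb);
	}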

> +		list_for_each_entry_safe(skb, next, &list, list) {
> +			skb_list_del_init(skb);
> +			(void)iptfs_input_ordered(x, skb);
> +		}
> +	}
> +
> +	if (fcount) {
> +		list_for_each_entry_safe(skb, next, &freelist, list) {
> +			skb_list_del_init(skb);
> +			kfree_skb(skb);

Given that freelist processing is so simple, why not just free
everything directly? It would remove the need to pass more arguments
down to the other functions.

> +		}
> +	}


[...]
> +/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
> + * (i.e., aggregating or fragmenting as appropriate).
> + * This is set in dst->output for an SA.
> + */
> +static int iptfs_output_collect(struct net *net, struct sock *sk,
> +				struct sk_buff *skb)
> +{
[...]
> +	} else {
> +		segs = skb_gso_segment(skb, 0);
> +		if (IS_ERR_OR_NULL(segs)) {
> +			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
> +			kfree_skb(skb);
> +			return PTR_ERR(segs);

If skb_gso_segment returned NULL, this will end up returning 0. Is
that correct?
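
If an error return is also what you want for the NULL case, maybe
something like (untested, error code up to you):

	segs = skb_gso_segment(skb, 0);
	if (IS_ERR_OR_NULL(segs)) {
		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
		kfree_skb(skb);
		if (IS_ERR(segs))
			return PTR_ERR(segs);
		return -EINVAL;
	}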



> +static int iptfs_user_init(struct net *net, struct xfrm_state *x,
> +			   struct nlattr **attrs,
> +			   struct netlink_ext_ack *extack)
> +{
[...]
> +	if (xc->reorder_win_size)
> +		xtfs->w_saved = kcalloc(xc->reorder_win_size,
> +					sizeof(*xtfs->w_saved), GFP_KERNEL);

if (!xtfs->w_saved)
    return -ENOMEM;



> +static int iptfs_create_state(struct xfrm_state *x)
> +{
> +	struct xfrm_iptfs_data *xtfs;
> +	int err;
> +
> +	xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL);
> +	if (!xtfs)
> +		return -ENOMEM;
> +
> +	err = __iptfs_init_state(x, xtfs);
> +	if (err)
> +		return err;

kfree(xtfs) here?
iptfs_delete_state can't free it since we haven't set x->mode_data yet
when __iptfs_init_state fails.
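
ie:

	err = __iptfs_init_state(x, xtfs);
	if (err) {
		kfree(xtfs);
		return err;
	}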

> +
> +	return 0;
> +}
> +
> +static void iptfs_delete_state(struct xfrm_state *x)
> +{
> +	struct xfrm_iptfs_data *xtfs = x->mode_data;
> +	struct skb_wseq *s, *se;
> +
> +	if (!xtfs)
> +		return;
> +
> +	spin_lock_bh(&xtfs->drop_lock);
> +	hrtimer_cancel(&xtfs->iptfs_timer);
> +	hrtimer_cancel(&xtfs->drop_timer);
> +	spin_unlock_bh(&xtfs->drop_lock);

Does this guarantee that xtfs->queue has been flushed? If not, I guess
we need to do it now.
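
Assuming xtfs->queue is a plain sk_buff_head, an unconditional

	skb_queue_purge(&xtfs->queue);

after cancelling the timers should take care of it.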

> +
> +	if (xtfs->ra_newskb)
> +		kfree_skb(xtfs->ra_newskb);
> +
> +	for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++)
> +		if (s->skb)
> +			kfree_skb(s->skb);

nit: possibly better if hidden in a free_reorder_pending helper (or
some other similar name), implemented alongside all the reorder code.
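
Something like this, say (name and exact signature up to you):

static void iptfs_free_reorder_pending(struct xfrm_iptfs_data *xtfs)
{
	struct skb_wseq *s = xtfs->w_saved;
	struct skb_wseq *se = s + xtfs->w_savedlen;

	for (; s < se; s++)
		if (s->skb)
			kfree_skb(s->skb);
}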


> +	kfree_sensitive(xtfs->w_saved);
> +	kfree_sensitive(xtfs);
> +
> +	module_put(x->mode_cbs->owner);
> +}

-- 
Sabrina

