netdev - [PATCH RFC 1/2] net: Support flow sorted RX skb lists for IPv4.

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Date:   Wed, 12 Sep 2018 12:23:29 +0200
From:   Steffen Klassert <steffen.klassert@...unet.com>
To:     <netdev@...r.kernel.org>
CC:     Steffen Klassert <steffen.klassert@...unet.com>
Subject: [PATCH RFC 1/2] net: Support flow sorted RX skb lists for IPv4.

This patch sorts RX skb lists into separate flows, using
a flow dissector, at the IP input layer. Packets of the
same flow are chained at the frag_list pointer of the first
skb of this flow.

After ip_list_rcv_finish() the skb list has this layout:

|---------|        |---------|        |---------|
|flow 1   |        |flow 1   |        |flow 1   |
|---------|        |---------|        |---------|
|frag_list|<-\     |frag_list|        |frag_list|
|---------|   \    |---------|        |---------|
|next     |<-\ \---|next     |<-------|next     |
|---------|   \    |---------|        |---------|
              |
              |
              |    |---------|        |---------|        |---------|
              |    |flow 2   |        |flow 2   |        |flow 2   |
              |    |---------|        |---------|        |---------|
              |    |frag_list|<-\     |frag_list|        |frag_list|
              |    |---------|   \    |---------|        |---------|
              |----|next     |<-\ \---|next     |<-------|next     |
                   |---------|   \    |---------|        |---------|
                                 |
                                 |
                                 |    |---------|        |---------|       |---------|
                                 |    |flow 3   |        |flow 3   |       |flow 3   |
                                 |    |---------|        |---------|       |---------|
                                 |    |frag_list|<-\     |frag_list|       |frag_list|
                                 |    |---------|   \    |---------|       |---------|
                                 |----|next     |    \---|next     |<------|next     |
                                      |---------|        |---------|       |---------|

With this approach, route lookups etc. are done just for one
representative packet of a given flow instead of for each packet.

ip_sublist_rcv_finish() then splits these lists into:

|---------|        |---------|        |---------|
|flow 1   |        |flow 1   |        |flow 1   |
|---------|        |---------|        |---------|
|frag_list|<-\     |frag_list|        |frag_list|
|---------|   \    |---------|        |---------|
|next     |    \---|next     |<-------|next     |
|---------|        |---------|        |---------|

Packets of the same flow can still travel together after that point.

On input, this is plumbed through to ip_local_deliver_finish(),
where the skb chain is split back into single packets.

My hope is that this can be plumbed through to the socket's
receive queue. I have a patch for UDP, but it still has
problems with UDP encapsulation, so it is not included here.

On forward, the skb chain can travel together to the TX path.
__skb_gso_segment() will build a standard skb list from this.

For now, this is only enabled if the receiving device allows
forwarding, as the forwarding path currently gains the most
from this.

Known issues:

- I don't have a NIC whose driver supports building skb lists
  to be received by netif_receive_skb_list(). To test this
  codepath I used a hack that builds skb lists at the napi
  layer.

- Performance measurements were done with this hack, so I don't
  know if these measurements are really meaningful.

- This is early-stage work, so the functional tests are only
  done on a basic level; it might still be buggy.

- This still uses the skb->next, skb->prev pointers to build
  skb lists. So this needs to be converted to standard list
  handling at some point.

Signed-off-by: Steffen Klassert <steffen.klassert@...unet.com>
---
 include/linux/skbuff.h    |   5 ++
 net/core/dev.c            |  45 +++++++++++-
 net/core/flow_dissector.c |  40 +++++++++++
 net/core/skbuff.c         |  52 ++++++++++++++
 net/ipv4/ip_input.c       | 139 ++++++++++++++++++++++++++++++++++----
 net/ipv4/ip_output.c      |   3 +-
 6 files changed, 270 insertions(+), 14 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..d070d073a1dc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -575,6 +575,8 @@ enum {
 	SKB_GSO_UDP = 1 << 16,
 
 	SKB_GSO_UDP_L4 = 1 << 17,
+
+	SKB_GSO_FRAGLIST = 1 << 18,
 };
 
 #if BITS_PER_LONG > 32
@@ -1226,6 +1228,8 @@ skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb,
 				  data, proto, nhoff, hlen, flags);
 }
 
+u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
@@ -3302,6 +3306,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
+void skb_segment_list(struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..147da35d7380 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2998,6 +2998,34 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 	return skb->ip_summed == CHECKSUM_NONE;
 }
 
+static void skb_segment_list_ip(struct sk_buff *skb)
+{
+	unsigned int tnl_hlen = 0;
+	struct sk_buff *nskb;
+	int id;
+
+	id = ntohs(ip_hdr(skb)->id);
+	skb_segment_list(skb);
+
+	tnl_hlen = skb_tnl_header_len(skb);
+
+	nskb = skb->next;
+
+	do {
+		skb_push(nskb, skb_network_header(nskb) - skb_mac_header(nskb));
+		skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+						 nskb->data - tnl_hlen,
+						 skb_transport_header(nskb) -
+						 skb_mac_header(nskb) +
+						 tnl_hlen);
+
+		ip_hdr(nskb)->id = htons(++id);
+		ip_send_check(ip_hdr(nskb));
+		nskb = nskb->next;
+	} while (nskb);
+}
+
 /**
  *	__skb_gso_segment - Perform segmentation on skb.
  *	@skb: buffer to segment
@@ -3016,6 +3044,21 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 {
 	struct sk_buff *segs;
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+		int dummy;
+
+		if (skb_network_protocol(skb, &dummy) != htons(ETH_P_IP))
+			return ERR_PTR(-EINVAL);
+
+		skb_segment_list_ip(skb);
+
+		if (skb_needs_linearize(skb, features) &&
+		    __skb_linearize(skb))
+			return ERR_PTR(-EINVAL);
+
+		return skb;
+	}
+
 	if (unlikely(skb_needs_check(skb, tx_path))) {
 		int err;
 
@@ -3289,7 +3332,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		segs = skb_gso_segment(skb, features);
 		if (IS_ERR(segs)) {
 			goto out_kfree_skb;
-		} else if (segs) {
+		} else if (segs && segs != skb) {
 			consume_skb(skb);
 			skb = segs;
 		}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..8ca7e09dca5e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1211,6 +1211,46 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 	return __flow_hash_from_keys(keys, keyval);
 }
 
+struct _flow_keys_rx_digest_data {
+	__be16	n_proto;
+	u8	ip_proto;
+	u8	poff;
+	__be32	ports;
+	__be32	src;
+	__be32	dst;
+};
+
+u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest)
+{
+	struct flow_keys keys;
+	struct _flow_keys_rx_digest_data *data =
+	    (struct _flow_keys_rx_digest_data *)digest;
+	struct flow_keys_basic *bkeys;
+	u32 poff;
+
+	__flow_hash_secret_init();
+
+	skb_flow_dissect_flow_keys(skb, &keys,
+				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+	bkeys = (struct flow_keys_basic *)&keys;
+	poff = __skb_get_poff(skb, skb->data, bkeys, skb_headlen(skb));
+	if (poff > 255)
+		poff = 0;
+
+	BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+	data->n_proto = keys.basic.n_proto;
+	data->ip_proto = keys.basic.ip_proto;
+	data->ports = keys.ports.ports;
+	data->poff = poff;
+	data->src = keys.addrs.v4addrs.src;
+	data->dst = keys.addrs.v4addrs.dst;
+
+	return poff;
+}
+EXPORT_SYMBOL(skb_flow_keys_rx_digest);
+
 struct _flow_keys_digest_data {
 	__be16	n_proto;
 	u8	ip_proto;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c996c09d095f..8f725a78dc93 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3495,6 +3495,58 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
 	return head_frag;
 }
 
+void skb_segment_list(struct sk_buff *skb)
+{
+	struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
+	unsigned int delta_truesize = 0;
+	unsigned int delta_len = 0;
+	struct sk_buff *tail = NULL;
+	struct sk_buff *nskb;
+
+
+	skb_shinfo(skb)->frag_list = NULL;
+
+	do {
+		nskb = list_skb;
+		list_skb = list_skb->next;
+
+		if (!tail)
+			skb->next = nskb;
+		else
+			tail->next = nskb;
+
+		tail = nskb;
+
+		delta_len += nskb->len;
+		delta_truesize += nskb->truesize;
+
+		if (!secpath_exists(nskb))
+			nskb->sp = secpath_get(skb->sp);
+
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+
+		nskb->tstamp = skb->tstamp;
+		nskb->dev = skb->dev;
+		nskb->queue_mapping = skb->queue_mapping;
+
+		nskb->mac_len = skb->mac_len;
+		nskb->mac_header = skb->mac_header;
+		nskb->transport_header = skb->transport_header;
+		nskb->network_header = skb->network_header;
+		skb_dst_copy(nskb, skb);
+
+	} while (list_skb);
+
+	skb->truesize = skb->truesize - delta_truesize;
+	skb->data_len = skb->data_len - delta_len;
+	skb->len = skb->len - delta_len;
+
+	skb_gso_reset(skb);
+
+	skb->prev = tail;
+}
+EXPORT_SYMBOL_GPL(skb_segment_list);
+
 /**
  *	skb_segment - Perform protocol segmentation on skb.
  *	@head_skb: buffer to segment
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..bf710bf95fea 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -190,14 +190,20 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 
 static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	__skb_pull(skb, skb_network_header_len(skb));
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+		skb_segment_list(skb);
 
 	rcu_read_lock();
-	{
+	do {
 		int protocol = ip_hdr(skb)->protocol;
 		const struct net_protocol *ipprot;
+		struct sk_buff *nskb = skb->next;
 		int raw;
 
+		skb->next = NULL;
+
+		__skb_pull(skb, skb_network_header_len(skb));
+
 	resubmit:
 		raw = raw_local_deliver(skb, protocol);
 
@@ -208,7 +214,7 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
 			if (!ipprot->no_policy) {
 				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 					kfree_skb(skb);
-					goto out;
+					continue;
 				}
 				nf_reset(skb);
 			}
@@ -231,8 +237,8 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
 				consume_skb(skb);
 			}
 		}
-	}
- out:
+		skb = nskb;
+	} while (skb);
 	rcu_read_unlock();
 
 	return 0;
@@ -403,6 +409,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	int ret;
 
+	/* Remove any debris in the socket control block */
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	IPCB(skb)->iif = skb->skb_iif;
+
 	/* if ingress device is enslaved to an L3 master device pass the
 	 * skb to its handler for processing
 	 */
@@ -416,10 +426,108 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return ret;
 }
 
+struct dissect_skb_cb {
+	struct sk_buff *last;
+	struct	flow_keys_digest keys;
+};
+
+static inline struct dissect_skb_cb *dissect_skb_cb(const struct sk_buff *skb) {
+	return (struct dissect_skb_cb *)skb->cb;
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net);
+
+static struct sk_buff *ip_flow_dissect(struct sk_buff *skb, struct list_head *rx_list)
+{
+	unsigned int maclen = skb->dev->hard_header_len;
+	const struct iphdr *iph  = ip_hdr(skb);
+	unsigned int gso_type = 0;
+	struct sk_buff *p;
+	u32 poff;
+
+	if (*(u8 *)iph != 0x45)
+		goto out;
+
+	if (ip_is_fragment(iph))
+		goto out;
+
+	dissect_skb_cb(skb)->last = NULL;
+	poff = skb_flow_keys_rx_digest(skb, &dissect_skb_cb(skb)->keys);
+	if (!poff)
+		goto out;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		gso_type = SKB_GSO_TCPV4;
+		break;
+	case IPPROTO_UDP:
+		gso_type = SKB_GSO_UDP_L4;
+		break;
+	default:
+		goto out;
+	}
+
+	list_for_each_entry(p, rx_list, list) {
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= p->vlan_tci ^ skb->vlan_tci;
+		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
+		if (maclen == ETH_HLEN)
+			diffs |= compare_ether_header(skb_mac_header(p),
+						      skb_mac_header(skb));
+		else if (!diffs)
+			diffs = memcmp(skb_mac_header(p),
+				       skb_mac_header(skb),
+				       maclen);
+
+		if (diffs)
+			continue;
+
+		if (memcmp(&dissect_skb_cb(p)->keys,
+			   &dissect_skb_cb(skb)->keys,
+			   sizeof(dissect_skb_cb(skb)->keys)))
+			continue;
+
+		if (p->len != skb->len) {
+			if (!list_empty(rx_list))
+				ip_sublist_rcv(rx_list, p->dev, dev_net(p->dev));
+			INIT_LIST_HEAD(rx_list);
+			goto out;
+	}
+
+		skb->next = NULL;
+		skb->prev = NULL;
+
+		if (!dissect_skb_cb(p)->last) {
+			skb_shinfo(p)->gso_size = p->len - poff;
+			skb_shinfo(p)->gso_type |= (SKB_GSO_FRAGLIST | gso_type);
+			skb_shinfo(p)->frag_list = skb;
+			skb_shinfo(p)->gso_segs = 1;
+		} else {
+			dissect_skb_cb(p)->last->next = skb;
+		}
+
+		dissect_skb_cb(p)->last = skb;
+
+		skb_shinfo(p)->gso_segs++;
+		p->data_len += skb->len;
+		p->truesize += skb->truesize;
+		p->len += skb->len;
+
+		return NULL;
+	}
+
+out:
+	return skb;
+}
+
 /*
  * 	Main IP Receive routine.
  */
-static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
+static struct sk_buff *ip_rcv_core(struct list_head *head, struct sk_buff *skb, struct net *net)
 {
 	const struct iphdr *iph;
 	u32 len;
@@ -491,13 +599,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 
 	skb->transport_header = skb->network_header + iph->ihl*4;
 
-	/* Remove any debris in the socket control block */
-	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-	IPCB(skb)->iif = skb->skb_iif;
-
 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);
 
+	if (IN_DEV_FORWARD(__in_dev_get_rcu(skb->dev))) {
+		if (head)
+			return ip_flow_dissect(skb, head);
+	}
+
 	return skb;
 
 csum_error:
@@ -518,9 +627,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 {
 	struct net *net = dev_net(dev);
 
-	skb = ip_rcv_core(skb, net);
+	skb = ip_rcv_core(NULL, skb, net);
 	if (skb == NULL)
 		return NET_RX_DROP;
+
 	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
 		       net, NULL, skb, dev, NULL,
 		       ip_rcv_finish);
@@ -552,6 +662,11 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		struct dst_entry *dst;
 
 		list_del(&skb->list);
+
+		/* Remove any debris in the socket control block */
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		IPCB(skb)->iif = skb->skb_iif;
+
 		/* if ingress device is enslaved to an L3 master device pass the
 		 * skb to its handler for processing
 		 */
@@ -599,7 +714,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 		struct net *net = dev_net(dev);
 
 		list_del(&skb->list);
-		skb = ip_rcv_core(skb, net);
+		skb = ip_rcv_core(&sublist, skb, net);
 		if (skb == NULL)
 			continue;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..00d8a2576266 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -272,7 +272,8 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 		return -ENOMEM;
 	}
 
-	consume_skb(skb);
+	if (segs != skb)
+		consume_skb(skb);
 
 	do {
 		struct sk_buff *nskb = segs->next;
-- 
2.17.1