Message-ID: <20180912102330.24790-2-steffen.klassert@secunet.com>
Date: Wed, 12 Sep 2018 12:23:29 +0200
From: Steffen Klassert <steffen.klassert@...unet.com>
To: <netdev@...r.kernel.org>
CC: Steffen Klassert <steffen.klassert@...unet.com>
Subject: [PATCH RFC 1/2] net: Support flow sorted RX skb lists for IPv4.
This patch sorts RX skb lists into separate flows at the
IP input layer, using a flow dissector. Packets of the
same flow are chained at the frag_list pointer of the
first skb of that flow.
After ip_list_rcv_finish() the skb list has this layout:
|---------|        |---------|        |---------|
|flow 1   |        |flow 1   |        |flow 1   |
|---------|        |---------|        |---------|
|frag_list|<-\     |frag_list|        |frag_list|
|---------|   \    |---------|        |---------|
|next     |<-\ \---|next     |<-------|next     |
|---------|   \    |---------|        |---------|
              |
              |
              |    |---------|        |---------|        |---------|
              |    |flow 2   |        |flow 2   |        |flow 2   |
              |    |---------|        |---------|        |---------|
              |    |frag_list|<-\     |frag_list|        |frag_list|
              |    |---------|   \    |---------|        |---------|
              |----|next     |<-\ \---|next     |<-------|next     |
                   |---------|   \    |---------|        |---------|
                                 |
                                 |
                                 |    |---------|        |---------|        |---------|
                                 |    |flow 3   |        |flow 3   |        |flow 3   |
                                 |    |---------|        |---------|        |---------|
                                 |    |frag_list|<-\     |frag_list|        |frag_list|
                                 |    |---------|   \    |---------|        |---------|
                                 |----|next     |    \---|next     |<-------|next     |
                                      |---------|        |---------|        |---------|
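Whether a packet belongs to an already collected flow is
decided by comparing a digest of the dissected flow keys
(together with the MAC header); condensed from the
ip_flow_dissect() hunk below:

	if (memcmp(&dissect_skb_cb(p)->keys,
		   &dissect_skb_cb(skb)->keys,
		   sizeof(dissect_skb_cb(skb)->keys)))
		continue;	/* skb is not part of flow p */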
With this approach, route lookups etc. are done just for
one representative packet of a given flow instead of for
each packet. ip_sublist_rcv_finish() then splits these
lists into:
|---------|        |---------|        |---------|
|flow 1   |        |flow 1   |        |flow 1   |
|---------|        |---------|        |---------|
|frag_list|<-\     |frag_list|        |frag_list|
|---------|   \    |---------|        |---------|
|next     |    \---|next     |<-------|next     |
|---------|        |---------|        |---------|
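The chained packets inherit the route lookup done for the
head skb; condensed, the relevant part of the
skb_segment_list() loop below is (this assumes a
SKB_GSO_FRAGLIST skb, i.e. a non-empty frag_list):

	struct sk_buff *nskb = skb_shinfo(skb)->frag_list;

	do {
		/* reuse the dst of the representative packet */
		skb_dst_copy(nskb, skb);
		nskb = nskb->next;
	} while (nskb);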
Packets of the same flow can still travel together after
that point. On input, this is plumbed through to
ip_local_deliver_finish(), where the skb chain is split
back into single packets.
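Condensed from the ip_local_deliver_finish() hunk below:
the chain is first relinked into a regular skb->next list
and each packet is then delivered on its own:

	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
		skb_segment_list(skb);

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = NULL;
		/* ... hand skb to the L4 protocol handler ... */
		skb = nskb;
	} while (skb);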
My hope is that this can be plumbed through to the socket's
receive queue. I have a patch for UDP, but it still has
problems with UDP encapsulation, so it is not included here.
On forward, the skb chain can travel together to the TX
path, where __skb_gso_segment() will build a standard skb
list from it. For now, this is only enabled if the
receiving device allows forwarding, as the forwarding path
currently has the most to gain from this.
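Note that __skb_gso_segment() returns the original skb
(now heading a regular ->next chain) in this case, instead
of a freshly built segment list, so callers must not free
it. This is what the validate_xmit_skb() hunk below
accounts for:

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs)) {
		goto out_kfree_skb;
	} else if (segs && segs != skb) {
		/* a new list was built, the original skb is superseded */
		consume_skb(skb);
		skb = segs;
	}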
Known issues:

- I don't have a NIC whose driver supports building skb
  lists to be received by netif_receive_skb_list(). To test
  this codepath, I used a hack that builds skb lists at the
  NAPI layer (a minimal sketch of this follows the list).

- Performance measurements were done with this hack, so I
  don't know if these measurements are really meaningful.

- This is early stage work, so functional testing has only
  been done at a basic level; it might still be buggy.

- This still uses the skb->next and skb->prev pointers to
  build skb lists, so it needs to be converted to standard
  list handling at some point.
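For reference, the NAPI layer hack mentioned in the first
item above amounts to something like the following minimal
sketch, where my_driver_poll_one() stands in for a
hypothetical per-descriptor receive helper and is not part
of this patch:

	LIST_HEAD(rx_list);
	struct sk_buff *skb;

	while (budget-- && (skb = my_driver_poll_one(napi))) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		list_add_tail(&skb->list, &rx_list);
	}

	if (!list_empty(&rx_list))
		netif_receive_skb_list(&rx_list);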
Signed-off-by: Steffen Klassert <steffen.klassert@...unet.com>
---
 include/linux/skbuff.h    |   5 ++
 net/core/dev.c            |  45 +++++++++++-
 net/core/flow_dissector.c |  40 +++++++++++
 net/core/skbuff.c         |  52 ++++++++++++++
 net/ipv4/ip_input.c       | 139 ++++++++++++++++++++++++++++++++++----
 net/ipv4/ip_output.c      |   3 +-
 6 files changed, 270 insertions(+), 14 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..d070d073a1dc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -575,6 +575,8 @@ enum {
 	SKB_GSO_UDP = 1 << 16,
 
 	SKB_GSO_UDP_L4 = 1 << 17,
+
+	SKB_GSO_FRAGLIST = 1 << 18,
 };
 
 #if BITS_PER_LONG > 32
@@ -1226,6 +1228,8 @@ skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb,
 				  data, proto, nhoff, hlen, flags);
 }
 
+u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest);
+
 void
 skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 			     struct flow_dissector *flow_dissector,
@@ -3302,6 +3306,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
 void skb_scrub_packet(struct sk_buff *skb, bool xnet);
 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);
 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);
+void skb_segment_list(struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 int skb_ensure_writable(struct sk_buff *skb, int write_len);
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..147da35d7380 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2998,6 +2998,34 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 	return skb->ip_summed == CHECKSUM_NONE;
 }
 
+static void skb_segment_list_ip(struct sk_buff *skb)
+{
+	unsigned int tnl_hlen = 0;
+	struct sk_buff *nskb;
+	int id;
+
+	id = ntohs(ip_hdr(skb)->id);
+	skb_segment_list(skb);
+
+	tnl_hlen = skb_tnl_header_len(skb);
+
+	nskb = skb->next;
+
+	do {
+		skb_push(nskb, skb_network_header(nskb) - skb_mac_header(nskb));
+		skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+						 nskb->data - tnl_hlen,
+						 skb_transport_header(nskb) -
+						 skb_mac_header(nskb) +
+						 tnl_hlen);
+
+		ip_hdr(nskb)->id = htons(++id);
+		ip_send_check(ip_hdr(nskb));
+		nskb = nskb->next;
+	} while (nskb);
+}
+
 /**
  *	__skb_gso_segment - Perform segmentation on skb.
  *	@skb: buffer to segment
@@ -3016,6 +3044,21 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 {
 	struct sk_buff *segs;
 
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+		int dummy;
+
+		if (skb_network_protocol(skb, &dummy) != htons(ETH_P_IP))
+			return ERR_PTR(-EINVAL);
+
+		skb_segment_list_ip(skb);
+
+		if (skb_needs_linearize(skb, features) &&
+		    __skb_linearize(skb))
+			return ERR_PTR(-EINVAL);
+
+		return skb;
+	}
+
 	if (unlikely(skb_needs_check(skb, tx_path))) {
 		int err;
 
@@ -3289,7 +3332,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 		segs = skb_gso_segment(skb, features);
 		if (IS_ERR(segs)) {
 			goto out_kfree_skb;
-		} else if (segs) {
+		} else if (segs && segs != skb) {
 			consume_skb(skb);
 			skb = segs;
 		}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..8ca7e09dca5e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1211,6 +1211,46 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 	return __flow_hash_from_keys(keys, keyval);
 }
 
+struct _flow_keys_rx_digest_data {
+	__be16	n_proto;
+	u8	ip_proto;
+	u8	poff;
+	__be32	ports;
+	__be32	src;
+	__be32	dst;
+};
+
+u32 skb_flow_keys_rx_digest(struct sk_buff *skb, struct flow_keys_digest *digest)
+{
+	struct flow_keys keys;
+	struct _flow_keys_rx_digest_data *data =
+		(struct _flow_keys_rx_digest_data *)digest;
+	struct flow_keys_basic *bkeys;
+	u32 poff;
+
+	__flow_hash_secret_init();
+
+	skb_flow_dissect_flow_keys(skb, &keys,
+				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+	bkeys = (struct flow_keys_basic *)&keys;
+	poff = __skb_get_poff(skb, skb->data, bkeys, skb_headlen(skb));
+	if (poff > 255)
+		poff = 0;
+
+	BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));
+
+	data->n_proto = keys.basic.n_proto;
+	data->ip_proto = keys.basic.ip_proto;
+	data->ports = keys.ports.ports;
+	data->poff = poff;
+	data->src = keys.addrs.v4addrs.src;
+	data->dst = keys.addrs.v4addrs.dst;
+
+	return poff;
+}
+EXPORT_SYMBOL(skb_flow_keys_rx_digest);
+
 struct _flow_keys_digest_data {
 	__be16	n_proto;
 	u8	ip_proto;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c996c09d095f..8f725a78dc93 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3495,6 +3495,58 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
 	return head_frag;
 }
 
+void skb_segment_list(struct sk_buff *skb)
+{
+	struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
+	unsigned int delta_truesize = 0;
+	unsigned int delta_len = 0;
+	struct sk_buff *tail = NULL;
+	struct sk_buff *nskb;
+
+
+	skb_shinfo(skb)->frag_list = NULL;
+
+	do {
+		nskb = list_skb;
+		list_skb = list_skb->next;
+
+		if (!tail)
+			skb->next = nskb;
+		else
+			tail->next = nskb;
+
+		tail = nskb;
+
+		delta_len += nskb->len;
+		delta_truesize += nskb->truesize;
+
+		if (!secpath_exists(nskb))
+			nskb->sp = secpath_get(skb->sp);
+
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+
+		nskb->tstamp = skb->tstamp;
+		nskb->dev = skb->dev;
+		nskb->queue_mapping = skb->queue_mapping;
+
+		nskb->mac_len = skb->mac_len;
+		nskb->mac_header = skb->mac_header;
+		nskb->transport_header = skb->transport_header;
+		nskb->network_header = skb->network_header;
+		skb_dst_copy(nskb, skb);
+
+	} while (list_skb);
+
+	skb->truesize = skb->truesize - delta_truesize;
+	skb->data_len = skb->data_len - delta_len;
+	skb->len = skb->len - delta_len;
+
+	skb_gso_reset(skb);
+
+	skb->prev = tail;
+}
+EXPORT_SYMBOL_GPL(skb_segment_list);
+
 /**
  *	skb_segment - Perform protocol segmentation on skb.
  *	@head_skb: buffer to segment
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..bf710bf95fea 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -190,14 +190,20 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 
 static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	__skb_pull(skb, skb_network_header_len(skb));
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)
+		skb_segment_list(skb);
 
 	rcu_read_lock();
-	{
+	do {
 		int protocol = ip_hdr(skb)->protocol;
 		const struct net_protocol *ipprot;
+		struct sk_buff *nskb = skb->next;
 		int raw;
 
+		skb->next = NULL;
+
+		__skb_pull(skb, skb_network_header_len(skb));
+
 	resubmit:
 		raw = raw_local_deliver(skb, protocol);
 
@@ -208,7 +214,7 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
 		if (!ipprot->no_policy) {
 			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 				kfree_skb(skb);
-				goto out;
+				continue;
 			}
 			nf_reset(skb);
 		}
@@ -231,8 +237,8 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
 				consume_skb(skb);
 			}
 		}
-	}
- out:
+		skb = nskb;
+	} while (skb);
 	rcu_read_unlock();
 
 	return 0;
@@ -403,6 +409,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	int ret;
 
+	/* Remove any debris in the socket control block */
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	IPCB(skb)->iif = skb->skb_iif;
+
 	/* if ingress device is enslaved to an L3 master device pass the
 	 * skb to its handler for processing
 	 */
@@ -416,10 +426,108 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return ret;
 }
 
+struct dissect_skb_cb {
+	struct sk_buff *last;
+	struct flow_keys_digest keys;
+};
+
+static inline struct dissect_skb_cb *dissect_skb_cb(const struct sk_buff *skb) {
+	return (struct dissect_skb_cb *)skb->cb;
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net);
+
+static struct sk_buff *ip_flow_dissect(struct sk_buff *skb, struct list_head *rx_list)
+{
+	unsigned int maclen = skb->dev->hard_header_len;
+	const struct iphdr *iph = ip_hdr(skb);
+	unsigned int gso_type = 0;
+	struct sk_buff *p;
+	u32 poff;
+
+	if (*(u8 *)iph != 0x45)
+		goto out;
+
+	if (ip_is_fragment(iph))
+		goto out;
+
+	dissect_skb_cb(skb)->last = NULL;
+	poff = skb_flow_keys_rx_digest(skb, &dissect_skb_cb(skb)->keys);
+	if (!poff)
+		goto out;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		gso_type = SKB_GSO_TCPV4;
+		break;
+	case IPPROTO_UDP:
+		gso_type = SKB_GSO_UDP_L4;
+		break;
+	default:
+		goto out;
+	}
+
+	list_for_each_entry(p, rx_list, list) {
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= p->vlan_tci ^ skb->vlan_tci;
+		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
+		if (maclen == ETH_HLEN)
+			diffs |= compare_ether_header(skb_mac_header(p),
+						      skb_mac_header(skb));
+		else if (!diffs)
+			diffs = memcmp(skb_mac_header(p),
+				       skb_mac_header(skb),
+				       maclen);
+
+		if (diffs)
+			continue;
+
+		if (memcmp(&dissect_skb_cb(p)->keys,
+			   &dissect_skb_cb(skb)->keys,
+			   sizeof(dissect_skb_cb(skb)->keys)))
+			continue;
+
+		if (p->len != skb->len) {
+			if (!list_empty(rx_list))
+				ip_sublist_rcv(rx_list, p->dev, dev_net(p->dev));
+			INIT_LIST_HEAD(rx_list);
+			goto out;
+		}
+
+		skb->next = NULL;
+		skb->prev = NULL;
+
+		if (!dissect_skb_cb(p)->last) {
+			skb_shinfo(p)->gso_size = p->len - poff;
+			skb_shinfo(p)->gso_type |= (SKB_GSO_FRAGLIST | gso_type);
+			skb_shinfo(p)->frag_list = skb;
+			skb_shinfo(p)->gso_segs = 1;
+		} else {
+			dissect_skb_cb(p)->last->next = skb;
+		}
+
+		dissect_skb_cb(p)->last = skb;
+
+		skb_shinfo(p)->gso_segs++;
+		p->data_len += skb->len;
+		p->truesize += skb->truesize;
+		p->len += skb->len;
+
+		return NULL;
+	}
+
+out:
+	return skb;
+}
+
 /*
  *	Main IP Receive routine.
  */
-static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
+static struct sk_buff *ip_rcv_core(struct list_head *head, struct sk_buff *skb, struct net *net)
 {
 	const struct iphdr *iph;
 	u32 len;
@@ -491,13 +599,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 	skb->transport_header = skb->network_header + iph->ihl*4;
 
-	/* Remove any debris in the socket control block */
-	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-	IPCB(skb)->iif = skb->skb_iif;
-
 	/* Must drop socket now because of tproxy. */
 	skb_orphan(skb);
 
+	if (IN_DEV_FORWARD(__in_dev_get_rcu(skb->dev))) {
+		if (head)
+			return ip_flow_dissect(skb, head);
+	}
+
 	return skb;
 
 csum_error:
@@ -518,9 +627,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 {
 	struct net *net = dev_net(dev);
 
-	skb = ip_rcv_core(skb, net);
+	skb = ip_rcv_core(NULL, skb, net);
 	if (skb == NULL)
 		return NET_RX_DROP;
+
 	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
 		       net, NULL, skb, dev, NULL,
 		       ip_rcv_finish);
@@ -552,6 +662,11 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		struct dst_entry *dst;
 
 		list_del(&skb->list);
+
+		/* Remove any debris in the socket control block */
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		IPCB(skb)->iif = skb->skb_iif;
+
 		/* if ingress device is enslaved to an L3 master device pass the
 		 * skb to its handler for processing
 		 */
@@ -599,7 +714,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 		struct net *net = dev_net(dev);
 
 		list_del(&skb->list);
-		skb = ip_rcv_core(skb, net);
+		skb = ip_rcv_core(&sublist, skb, net);
 		if (skb == NULL)
 			continue;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..00d8a2576266 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -272,7 +272,8 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 		return -ENOMEM;
 	}
 
-	consume_skb(skb);
+	if (segs != skb)
+		consume_skb(skb);
 
 	do {
 		struct sk_buff *nskb = segs->next;
--
2.17.1