[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20151201180356.GD21252@oracle.com>
Date: Tue, 1 Dec 2015 13:03:56 -0500
From: Sowmini Varadhan <sowmini.varadhan@...cle.com>
To: netdev@...r.kernel.org, linux-crypto@...r.kernel.org
Cc: sowmini.varadhan@...cle.com
Subject: [PATCH RFC] Defer xfrm to be done post-GSO
Experimental patch as promised in
http://marc.info/?l=linux-netdev&m=144899280626165&w=2
Disclaimer: this patch is only an experiment to see whether xfrm can be done
post-GSO in order to benefit from GSO. I'm sharing it to get some feedback
on the general direction being pursued here.
At the moment, single-stream iperf using esp-null over a 10G link
achieves 3-3.5 Gbps vs. the baseline of 1.8-2 Gbps.
While going from 1.8 to 3 Gbps is a step in the right direction, it is still
far from the 6-9 Gbps that one can get with just GSO (6 Gbps if GRO
is disabled), so input on other ways to improve this is invited.
Major things done in this patch:
- don't disable TSO in sk_setup_caps() if a dst->header_len is found
- in xfrm4_output, if GSO is applicable, bail out without esp header
addition - that will get done after skb_segment()
- at the end of tcp_gso_segment() (when tcp segment is available),
set things up for xfrm_output_one and trigger the esp_output
A 1-bit hole in sk_buff is used to track an skb that needs xfrm (might
not need to burn that bit, but using it for now)
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@...cle.com>
---
include/linux/skbuff.h | 6 +++-
include/net/xfrm.h | 1 +
net/core/dev.c | 8 +++--
net/core/sock.c | 4 +++
net/ipv4/af_inet.c | 11 +++++++-
net/ipv4/ip_output.c | 4 +++
net/ipv4/tcp_offload.c | 56 +++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_output.c | 1 +
net/ipv4/xfrm4_mode_transport.c | 51 +++++++++++++++++++++++++++++++++++
net/ipv4/xfrm4_output.c | 9 ++++++
net/xfrm/xfrm_output.c | 3 +-
11 files changed, 147 insertions(+), 7 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 24f4dfd..242c32b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -593,8 +593,8 @@ struct sk_buff {
fclone:2,
peeked:1,
head_frag:1,
- xmit_more:1;
- /* one bit hole */
+ xmit_more:1,
+ recirc:1; /* uses one bit hole */
kmemcheck_bitfield_end(flags1);
/* fields enclosed in headers_start/headers_end are copied
@@ -3577,5 +3577,7 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
return hdr_len + skb_gso_transport_seglen(skb);
}
+#define XFRM_GSO 1 /* use this for now to quickly toggle back to baseline */
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SKBUFF_H */
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4a9c21f..c17dc79 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1508,6 +1508,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
int xfrm_output_resume(struct sk_buff *skb, int err);
int xfrm_output(struct sock *sk, struct sk_buff *skb);
+int xfrm_output_one(struct sk_buff *skb, int err);
int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb);
void xfrm_local_error(struct sk_buff *skb, int mtu);
int xfrm4_extract_header(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ce3f74..6b9f20f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2804,7 +2804,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
- struct sk_buff *next, *head = NULL, *tail;
+ struct sk_buff *next, *head = NULL, *tail = NULL;
for (; skb != NULL; skb = next) {
next = skb->next;
@@ -3086,10 +3086,12 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) &&
+ !skb->recirc) {
skb_dst_drop(skb);
- else
+ } else {
skb_dst_force(skb);
+ }
#ifdef CONFIG_NET_SWITCHDEV
/* Don't forward if offload device already forwarded */
diff --git a/net/core/sock.c b/net/core/sock.c
index 7529eb9..05c902b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1613,7 +1613,11 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
sk->sk_route_caps &= ~sk->sk_route_nocaps;
if (sk_can_gso(sk)) {
+#ifndef XFRM_GSO
if (dst->header_len) {
+#else
+ if (0) {
+#endif
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 11c4ca1..2c04a98 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1203,6 +1203,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
int nhoff;
int ihl;
int id;
+ bool need_xfrm = skb->recirc;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
@@ -1254,14 +1255,22 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
ops = rcu_dereference(inet_offloads[proto]);
- if (likely(ops && ops->callbacks.gso_segment))
+ if (likely(ops && ops->callbacks.gso_segment)) {
+ /* tcp_gso_segment gets called here. It will add the
+ * XFRM by calling xfrm_output_one->esp_output.
+ * We will move things around to make space for the
+ * esp header in xfrm4_mode_transport.c (for transport
+ * mode- this is in xfrm4_transport_output_gso()
+ */
segs = ops->callbacks.gso_segment(skb, features);
+ }
if (IS_ERR_OR_NULL(segs))
goto out;
skb = segs;
do {
+ nhoff = skb_network_header(skb) - skb_mac_header(skb);
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (udpfrag) {
iph->id = htons(id);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4233cbe..8f3f111 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -270,10 +270,14 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
+ if (sk_can_gso(sk) && skb_is_gso(skb) &&
+ sk->sk_gso_type == SKB_GSO_TCPV4)
+ goto xfrm_gso;
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(net, sk, skb);
}
#endif
+xfrm_gso:
mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9864a2d..1c0f669 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -13,6 +13,7 @@
#include <linux/skbuff.h>
#include <net/tcp.h>
#include <net/protocol.h>
+#include <net/xfrm.h>
static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
unsigned int seq, unsigned int mss)
@@ -51,6 +52,49 @@ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
return tcp_gso_segment(skb, features);
}
+#ifdef XFRM_GSO
+static int add_xfrm_post_gso(struct sk_buff *skb)
+{
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ int err;
+
+ if (!x) {
+ skb->recirc = 0;
+ return 0;
+ }
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ /* XXX sub-optimal stuff.
+ * At this point ip_summed is CHECKSUM_PARTIAL. This bit
+ * should be optimized - we should not be doing this again.
+ * For now, just use ethtool to set tx off rx off, and let
+ * the rest of the GSO logic compute the checksum efficiently.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ err = skb_checksum_help(skb);
+ /* at this point ip_summed is 0. NOTE(review): kfree_skb() below frees a segment the caller still walks via ->next - confirm */
+
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+ }
+ err = 1;
+ skb->recirc = 1;
+ err = xfrm_output_one(skb, err);
+ WARN_ON(err != 0);
+
+ /* reset all the abuse; the literal 14 is presumably ETH_HLEN - TODO confirm */
+ skb->recirc = 0;
+ skb->mac_header = skb->network_header - 14;
+ skb->transport_header += x->props.header_len;
+ __skb_push(skb, 14);
+
+ skb_dst_drop(skb);
+ return err;
+}
+#endif /* XFRM_GSO */
+
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -65,6 +109,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
struct sk_buff *gso_skb = skb;
__sum16 newcheck;
bool ooo_okay, copy_destructor;
+#ifdef XFRM_GSO
+ bool need_xfrm = (skb->recirc == 1);
+#endif
th = tcp_hdr(skb);
thlen = th->doff * 4;
@@ -113,6 +160,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
skb->ooo_okay = 0;
segs = skb_segment(skb, features);
+ skb->recirc = 0;
if (IS_ERR(segs))
goto out;
@@ -172,6 +220,14 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_PARTIAL)
th->check = gso_make_checksum(skb, ~th->check);
out:
+#ifdef XFRM_GSO
+ if (need_xfrm) {
+ struct sk_buff *nskb;
+
+ for (nskb = segs; nskb; nskb = nskb->next)
+ add_xfrm_post_gso(nskb);
+ }
+#endif
return segs;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cb7ca56..6168834 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -951,6 +951,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_reset_transport_header(skb);
skb_orphan(skb);
+ skb->recirc = 0;
skb->sk = sk;
skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
skb_set_hash_from_sk(skb, sk);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7..154c580 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -13,6 +13,41 @@
#include <net/ip.h>
#include <net/xfrm.h>
+#ifdef XFRM_GSO
+/*
+ * GSO variant of xfrm4_transport_output(). When we come here:
+ *   - mac_header points to the start of the Ethernet header, which is
+ *     also skb->data;
+ *   - ip_hdr/network_header points to the start of the IP header
+ *     (14 bytes after the mac header);
+ *   - transport_header points at ip_hdr + ihl.
+ * The mac and IP headers are moved back by x->props.header_len to leave
+ * room for the ESP header. Unfortunately, esp_output overloads mac_header
+ * as a pointer to the ip_proto field (which esp_output overwrites with
+ * IPPROTO_ESP), so mac_header is re-pointed there before returning.
+ * Ideally we should not be doing any move at all. This is a mess.
+ */
+static int xfrm4_transport_output_gso(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ int ihl = iph->ihl * 4;
+ int iph_off = (unsigned char *)iph - (unsigned char *)skb->data;
+ unsigned char *data = skb_mac_header(skb);
+
+ skb->network_header -= x->props.header_len;
+ skb->transport_header = skb->network_header + ihl;
+ skb->mac_header -= x->props.header_len;
+
+ __skb_pull(skb, ihl + iph_off);
+ memmove(skb_mac_header(skb), data, ihl + iph_off);
+
+ /* This is a mess: point mac_header at iph->protocol, as esp_output expects */
+ skb->mac_header = skb->network_header +
+ offsetof(struct iphdr, protocol);
+ return 0;
+}
+#endif /* XFRM_GSO */
+
/* Add encapsulation header.
*
* The IP header will be moved forward to make space for the encapsulation
@@ -22,12 +57,28 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
int ihl = iph->ihl * 4;
+ int iph_off = (unsigned char *)iph - (unsigned char *)skb->data;
+
+#ifdef XFRM_GSO
+ if (skb->recirc)
+ return xfrm4_transport_output_gso(x, skb);
+#endif /* XFRM_GSO */
+ /* move network/ip_hdr back by esp hdr size */
skb_set_network_header(skb, -x->props.header_len);
+ /* make mac_header point to ip_proto field in the
+ * new location of ip_hdr
+ */
skb->mac_header = skb->network_header +
offsetof(struct iphdr, protocol);
+ /* make transport_hdr point to tcp payload
+ * in the new location. This is where the esp hdr will go
+ */
skb->transport_header = skb->network_header + ihl;
+ /* move up the skb->data to go past ip hdr to tcp hdr.
+ * This reduces the len by the ip header len */
__skb_pull(skb, ihl);
+ /* copy the ip hdr over to new location */
memmove(skb_network_header(skb), iph, ihl);
return 0;
}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 7ee6518..d0c8a9a 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -98,6 +98,15 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+#ifdef XFRM_GSO
+ if (sk_can_gso(sk) && sk->sk_gso_type == SKB_GSO_TCPV4 &&
+ skb_is_gso(skb)) {
+ BUG_ON(IPCB(skb)->flags & IPSKB_REROUTED);
+ skb->recirc = 1;
+ return (ip_output(net, sk, skb));
+ }
+#endif /* XFRM_GSO */
+
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, NULL, skb_dst(skb)->dev,
__xfrm4_output,
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index cc3676e..39f7d76 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -50,7 +50,7 @@ static struct dst_entry *skb_dst_pop(struct sk_buff *skb)
return child;
}
-static int xfrm_output_one(struct sk_buff *skb, int err)
+int xfrm_output_one(struct sk_buff *skb, int err)
{
struct dst_entry *dst = skb_dst(skb);
struct xfrm_state *x = dst->xfrm;
@@ -128,6 +128,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
out:
return err;
}
+EXPORT_SYMBOL_GPL(xfrm_output_one);
int xfrm_output_resume(struct sk_buff *skb, int err)
{
--
1.7.1
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists