[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1359065793-1796-1-git-send-email-pshelar@nicira.com>
Date: Thu, 24 Jan 2013 14:16:33 -0800
From: Pravin B Shelar <pshelar@...ira.com>
To: netdev@...r.kernel.org
Cc: jesse@...ira.com, eric.dumazet@...il.com,
Pravin B Shelar <pshelar@...ira.com>
Subject: [PATCH 2/2] v2 GRE: Add segmentation offload for GRE
Signed-off-by: Pravin B Shelar <pshelar@...ira.com>
---
Fixed according to comments from Jesse and Eric.
- Factored a MAC layer handler out of skb_gso_segment().
- Eliminated copy operation from gre_gso_segment().
- Refresh header pointer after pskb_may_pull().
---
include/linux/netdevice.h | 4 +-
include/linux/skbuff.h | 7 +++
net/core/dev.c | 58 ++++++++++++++----------
net/core/skbuff.c | 7 ++-
net/ipv4/af_inet.c | 1 +
net/ipv4/gre.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/ip_gre.c | 94 ++++++++++++++++++++++++++++++--------
net/ipv4/tcp.c | 1 +
net/ipv4/udp.c | 3 +-
net/ipv6/ip6_offload.c | 1 +
net/ipv6/udp_offload.c | 3 +-
11 files changed, 243 insertions(+), 45 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 549f5ad..109d27b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2662,8 +2662,10 @@ extern int netdev_master_upper_dev_link(struct net_device *dev,
extern void netdev_upper_dev_unlink(struct net_device *dev,
struct net_device *upper_dev);
extern int skb_checksum_help(struct sk_buff *skb);
+extern struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features);
extern struct sk_buff *skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features);
+ netdev_features_t features);
#ifdef CONFIG_BUG
extern void netdev_rx_csum_fault(struct net_device *dev);
#else
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7c00664..7b98ee6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -307,6 +307,8 @@ enum {
SKB_GSO_TCPV6 = 1 << 4,
SKB_GSO_FCOE = 1 << 5,
+
+ SKB_GSO_GRE = 1 << 6,
};
#if BITS_PER_LONG > 32
@@ -2711,6 +2713,11 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
}
#endif
+struct skb_gso_cb {
+ int tnl_hoffset;
+};
+#define NAPI_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb)
+
static inline bool skb_is_gso(const struct sk_buff *skb)
{
return skb_shinfo(skb)->gso_size;
diff --git a/net/core/dev.c b/net/core/dev.c
index c69cd87..208eb8b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2293,18 +2293,8 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
-/**
- * skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
@@ -2323,18 +2313,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
vlan_depth += VLAN_HLEN;
}
- skb_reset_mac_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
__skb_pull(skb, skb->mac_len);
-
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- skb_warn_bad_offload(skb);
-
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
- return ERR_PTR(err);
- }
-
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
@@ -2356,6 +2335,39 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
return segs;
}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+/**
+ * skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ int err;
+
+ if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ skb_warn_bad_offload(skb);
+
+ if (skb_header_cloned(skb)) {
+ err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ if (err)
+ return ERR_PTR(err);
+ }
+ }
+ NAPI_GSO_CB(skb)->tnl_hoffset = skb_headroom(skb);
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = skb->network_header - skb->mac_header;
+
+ return skb_mac_gso_segment(skb, features);
+}
EXPORT_SYMBOL(skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2568c44..c12b6f3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2752,6 +2752,8 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
unsigned int doffset = skb->data - skb_mac_header(skb);
unsigned int offset = doffset;
unsigned int headroom;
+ unsigned int tnl_hlen = (skb_mac_header(skb) - skb->head) -
+ NAPI_GSO_CB(skb)->tnl_hoffset;
unsigned int len;
int sg = !!(features & NETIF_F_SG);
int nfrags = skb_shinfo(skb)->nr_frags;
@@ -2827,7 +2829,10 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
skb_set_network_header(nskb, skb->mac_len);
nskb->transport_header = (nskb->network_header +
skb_network_header_len(skb));
- skb_copy_from_linear_data(skb, nskb->data, doffset);
+
+ skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ doffset + tnl_hlen);
if (fskb != skb_shinfo(skb)->frag_list)
continue;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 4b70539..2bd9998 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1306,6 +1306,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
0)))
goto out;
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index 42a4910..1f86421 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -19,6 +19,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
@@ -26,6 +27,11 @@
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
+struct gre_base_hdr {
+ __be16 flags;
+ __be16 protocol;
+};
+#define GRE_HEADER_SECTION 4
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
@@ -112,12 +118,108 @@ static void gre_err(struct sk_buff *skb, u32 info)
rcu_read_unlock();
}
+static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ int ghl = GRE_HEADER_SECTION;
+ struct gre_base_hdr *greh;
+ int mac_len = skb->mac_len;
+ int hlen;
+ bool csum;
+
+ if (unlikely(skb_shinfo(skb)->gso_type &
+ ~(SKB_GSO_TCPV4 |
+ SKB_GSO_TCPV6 |
+ SKB_GSO_UDP |
+ SKB_GSO_DODGY |
+ SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE)))
+ goto out;
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ goto out;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+
+ if (greh->flags & GRE_KEY)
+ ghl += GRE_HEADER_SECTION;
+ if (greh->flags & GRE_CSUM) {
+ ghl += GRE_HEADER_SECTION;
+ csum = true;
+ } else
+ csum = false;
+
+ /* setup inner skb. */
+ if (greh->protocol == htons(ETH_P_TEB)) {
+ struct ethhdr *eth = eth_hdr(skb);
+ skb->protocol = eth->h_proto;
+ } else {
+ skb->protocol = greh->protocol;
+ }
+
+ hlen = mac_len + sizeof(struct iphdr);
+ skb->encapsulation = 0;
+
+ if (unlikely(!pskb_may_pull(skb, ghl)))
+ goto out;
+ __skb_pull(skb, ghl);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb->mac_len = skb_inner_network_offset(skb);
+
+ /* segment inner packet. */
+ segs = skb_mac_gso_segment(skb, 0);
+ if (!segs || IS_ERR(segs))
+ goto out;
+
+ skb = segs;
+ do {
+ __skb_push(skb, ghl + hlen);
+ if (csum) {
+ __be32 *pcsum;
+
+ greh = (struct gre_base_hdr *)(skb->data + hlen);
+ pcsum = (__be32 *)(greh + 1);
+ *pcsum = 0;
+ *(__sum16 *)pcsum = csum_fold(skb_checksum(skb, hlen,
+ skb->len - hlen, 0));
+ }
+
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb->mac_len = mac_len;
+ } while ((skb = skb->next));
+out:
+ return segs;
+}
+
+static int gre_gso_send_check(struct sk_buff *skb)
+{
+ struct gre_base_hdr *greh;
+
+ if (!skb->encapsulation)
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ if (greh->flags & GRE_SEQ)
+ return -EINVAL;
+ return 0;
+}
+
static const struct net_protocol net_gre_protocol = {
.handler = gre_rcv,
.err_handler = gre_err,
.netns_ok = 1,
};
+static const struct net_offload gre_offload = {
+ .callbacks = {
+ .gso_send_check = gre_gso_send_check,
+ .gso_segment = gre_gso_segment,
+ },
+};
+
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
@@ -127,11 +229,18 @@ static int __init gre_init(void)
return -EAGAIN;
}
+ if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
+ pr_err("can't add protocol offload\n");
+ inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+ return -EAGAIN;
+ }
+
return 0;
}
static void __exit gre_exit(void)
{
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 303012a..ac8cb5e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -735,6 +735,42 @@ drop:
return 0;
}
+static struct sk_buff *handle_offloads(struct sk_buff *skb)
+{
+ int err;
+
+ if (skb_is_gso(skb)) {
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (unlikely(err))
+ goto error;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
+ return skb;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ /* Pages aren't locked and could change at any time.
+ * If this happens after we compute the checksum, the
+ * checksum will be wrong. We linearize now to avoid
+ * this problem.
+ */
+ if (skb_is_nonlinear(skb)) {
+ err = __skb_linearize(skb);
+ if (unlikely(err))
+ goto error;
+ }
+
+ err = skb_checksum_help(skb);
+ if (unlikely(err))
+ goto error;
+ }
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ return skb;
+
+error:
+ kfree_skb(skb);
+ return ERR_PTR(err);
+}
+
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -751,10 +787,9 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
__be32 dst;
int mtu;
u8 ttl;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL &&
- skb_checksum_help(skb))
- goto tx_error;
+ int pkt_len;
+ struct pcpu_tstats *tstats;
+ int err;
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
@@ -852,13 +887,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
if (skb->protocol == htons(ETH_P_IP)) {
df |= (old_iph->frag_off&htons(IP_DF));
-
- if ((old_iph->frag_off&htons(IP_DF)) &&
- mtu < ntohs(old_iph->tot_len)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- goto tx_error;
- }
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
@@ -873,11 +901,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
}
- if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ip_rt_put(rt);
- goto tx_error;
- }
}
#endif
@@ -908,10 +931,19 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
skb_set_owner_w(new_skb, skb->sk);
dev_kfree_skb(skb);
skb = new_skb;
- old_iph = ip_hdr(skb);
/* Warning : tiph value might point to freed memory */
}
+ if (!skb->encapsulation) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
+ }
+
+ skb = handle_offloads(skb);
+ if (IS_ERR(skb))
+ return NETDEV_TX_OK;
+
+ old_iph = ip_hdr(skb);
skb_push(skb, gre_hlen);
skb_reset_network_header(skb);
skb_set_transport_header(skb, sizeof(*iph));
@@ -967,8 +999,23 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
}
}
+ pkt_len = skb->len - skb_transport_offset(skb);
+ tstats = this_cpu_ptr(dev->tstats);
+
+ nf_reset(skb);
+ ip_select_ident(iph, skb_dst(skb), NULL);
+
+ err = ip_local_out(skb);
+ if (likely(net_xmit_eval(err) == 0)) {
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += pkt_len;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ dev->stats.tx_errors++;
+ dev->stats.tx_aborted_errors++;
+ }
- iptunnel_xmit(skb, dev);
return NETDEV_TX_OK;
#if IS_ENABLED(CONFIG_IPV6)
@@ -1374,6 +1421,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
return err;
}
+ if (!(tunnel->parms.o_flags & GRE_SEQ)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
return 0;
}
@@ -1564,6 +1615,10 @@ static int ipgre_tap_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
+ if (!(tunnel->parms.o_flags & GRE_SEQ)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
return 0;
}
@@ -1587,6 +1642,9 @@ static void ipgre_tap_setup(struct net_device *dev)
dev->iflink = 0;
dev->features |= NETIF_F_NETNS_LOCAL;
+
+ dev->features |= GRE_FEATURES;
+ dev->hw_features |= GRE_FEATURES;
}
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5227194..8c47b7d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3032,6 +3032,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
SKB_GSO_TCPV6 |
+ SKB_GSO_GRE |
0) ||
!(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
goto out;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0610e4..6824272 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2304,7 +2304,8 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index f26f0da..8234c1d 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -99,6 +99,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
~(SKB_GSO_UDP |
SKB_GSO_DODGY |
SKB_GSO_TCP_ECN |
+ SKB_GSO_GRE |
SKB_GSO_TCPV6 |
0)))
goto out;
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 0c8934a..cf05cf0 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -56,7 +56,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
/* Packet is from an untrusted source, reset gso_segs. */
int type = skb_shinfo(skb)->gso_type;
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
+ if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
+ SKB_GSO_GRE) ||
!(type & (SKB_GSO_UDP))))
goto out;
--
1.7.10
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists