netdev - [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAFgPn1DX9cOpDRGj=wFwvZq_bpq6VFnEOzR1YbMuC0+=DFEWxA@mail.gmail.com>
Date:   Sun, 1 Apr 2018 20:47:28 -0400
From:   "Md. Islam" <mislam4@...t.edu>
To:     netdev@...r.kernel.org, David Miller <davem@...emloft.net>,
        David Ahern <dsahern@...il.com>, stephen@...workplumber.org,
        agaceph@...il.com, Pavel Emelyanov <xemul@...nvz.org>,
        Eric Dumazet <edumazet@...gle.com>,
        alexei.starovoitov@...il.com, brouer@...hat.com
Subject: [PATCH v15 ] net/veth/XDP: Line-rate packet forwarding in kernel

This patch implements IPv4 forwarding on xdp_buff. It adds a new
config option, XDP_ROUTER. When this option is enabled, the kernel
forwards packets through a fast path. This requires driver support,
and currently only veth is supported: I have modified veth so that
it outputs an xdp_buff. I created a testbed in Mininet; the Mininet
script (topology.py) is attached. The topology is:

h1 -----r1-----h2 (r1 acts as a router)

This patch improves the throughput from 53.8 Gb/s to 60 Gb/s on my
machine. Median RTT also improves, from around 0.055 ms to around
0.035 ms.

Then I disabled hyperthreading and CPU frequency scaling in order to
make better use of the CPU cache (DPDK also utilizes the CPU cache to
improve forwarding). This further improves per-packet forwarding latency
from around 400 ns to 200 ns. More specifically, header parsing and the
FIB lookup together take only around 82 ns. This suggests that the
approach could be used to implement line-rate packet forwarding in the kernel.

The patch was generated against 4.15.0+. Please let me know your
feedback and suggestions, and in particular whether this approach
makes sense.

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 944ec3c..8474eef 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -328,6 +328,18 @@ config VETH
       When one end receives the packet it appears on its pair and vice
       versa.

+config XDP_ROUTER
+    bool "XDP router for veth"
+    depends on IP_ADVANCED_ROUTER
+    depends on VETH
+    default y
+    ---help---
+      This option enables IPv4 forwarding on incoming xdp_buffs.
+      Currently it is only supported by veth. Say y or n.
+
+      Without this option, veth uses the slow path for packet forwarding.
+      With it, packets are forwarded as soon as they are received (as XDP generic).
+
 config VIRTIO_NET
     tristate "Virtio network driver"
     depends on VIRTIO
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a69ad39..76112f9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -111,6 +111,29 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb,
struct net_device *dev)
         goto drop;
     }

+#ifdef CONFIG_XDP_ROUTER
+
+    /* If IP forwarding is enabled on the receiver, wrap the skb in an
+     * xdp_buff and offer it to the fast path, xdp_router_forward().
+     * When the fast path does not take the packet we fall through to
+     * the regular dev_forward_skb() call below.
+     */
+    if (is_forwarding_enabled(rcv)) {
+        /* We are inside the RCU read-side section that is closed at the
+         * "success" label, so the allocation must not sleep: GFP_ATOMIC,
+         * not GFP_KERNEL.
+         */
+        struct xdp_buff *xdp = kmalloc(sizeof(*xdp), GFP_ATOMIC);
+
+        /* On allocation failure simply use the slow path. */
+        if (likely(xdp)) {
+            xdp->data = skb->data;
+            /* len - data_len: only the linear part of the skb */
+            xdp->data_end = skb->data + (skb->len - skb->data_len);
+            /* the skb travels along so it can be delivered later */
+            xdp->data_meta = skb;
+            prefetch_xdp(xdp);
+            if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS)) {
+                struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
+
+                u64_stats_update_begin(&stats->syncp);
+                stats->bytes += length;
+                stats->packets++;
+                u64_stats_update_end(&stats->syncp);
+                goto success;
+            }
+            /* Fast path declined the packet, so nobody took over the
+             * xdp_buff: release it before falling back.
+             */
+            kfree(xdp);
+        }
+    }
+#endif
     if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
         struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);

@@ -122,6 +145,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb,
struct net_device *dev)
 drop:
         atomic64_inc(&priv->dropped);
     }
+success:
     rcu_read_unlock();
     return NETDEV_TX_OK;
 }
@@ -276,6 +300,62 @@ static void veth_set_rx_headroom(struct
net_device *dev, int new_hr)
     rcu_read_unlock();
 }

+#ifdef CONFIG_XDP_ROUTER
+/* Transmit an xdp_buff over the veth pair, i.e. hand it to the peer of @dev.
+ *
+ * If IPv4 forwarding is enabled on the peer, the packet is first offered to
+ * xdp_router_forward(); on success the xdp_buff has been passed on to the
+ * next hop. Otherwise the packet is delivered to the peer with
+ * dev_forward_skb(), using the skb stored in xdp->data_meta.
+ *
+ * Ownership: @xdp is consumed. It is either handed to the next hop or
+ * kfree()d here once it is no longer needed.
+ * NOTE(review): this relies on @xdp being kmalloc()ed and on data_meta
+ * holding the originating skb, as veth_xmit() sets it up -- confirm that no
+ * other caller can reach this ndo with a differently built xdp_buff.
+ * NOTE(review): forwarding re-enters this function through
+ * xdp_router_forward() -> ndo_xdp_xmit; nothing here bounds that nesting.
+ *
+ * Always returns NETDEV_TX_OK; failed deliveries are counted in
+ * priv->dropped.
+ */
+int veth_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+{
+    struct veth_priv *priv = netdev_priv(dev);
+    /* read everything we need from xdp up front: it may be freed below */
+    struct sk_buff *skb = (struct sk_buff *)xdp->data_meta;
+    int length = xdp->data_end - xdp->data;
+    struct pcpu_vstats *stats;
+    struct net_device *rcv;
+    struct ethhdr *ethh;
+
+    rcu_read_lock();
+    rcv = rcu_dereference(priv->peer);
+    if (unlikely(!rcv)) {
+        /* No peer: release both the wrapper and the packet itself,
+         * otherwise the skb is leaked.
+         */
+        kfree(xdp);
+        kfree_skb(skb);
+        goto drop;
+    }
+
+    /* Rewrite the MAC addresses for the hop across the veth pair.
+     * (No checksum is touched: the IP header is not modified here.)
+     */
+    ethh = eth_hdr_xdp(xdp);
+    ether_addr_copy(ethh->h_source, dev->dev_addr);
+    ether_addr_copy(ethh->h_dest, rcv->dev_addr);
+
+    /* If IP forwarding is enabled on the receiver, try the fast path;
+     * on success the xdp_buff now belongs to the next hop.
+     */
+    if (is_forwarding_enabled(rcv)) {
+        prefetch_xdp(xdp);
+        if (likely(xdp_router_forward(rcv, xdp) == NET_RX_SUCCESS))
+            goto account;
+    }
+
+    /* Local deliver: only the skb is needed from here on, so the
+     * xdp_buff wrapper can be released.
+     */
+    kfree(xdp);
+    if (unlikely(dev_forward_skb(rcv, skb) != NET_RX_SUCCESS))
+        goto drop;
+
+account:
+    stats = this_cpu_ptr(dev->vstats);
+    u64_stats_update_begin(&stats->syncp);
+    stats->bytes += length;
+    stats->packets++;
+    u64_stats_update_end(&stats->syncp);
+    goto out;
+
+drop:
+    atomic64_inc(&priv->dropped);
+out:
+    rcu_read_unlock();
+    return NETDEV_TX_OK;
+}
+#endif
+
 static const struct net_device_ops veth_netdev_ops = {
     .ndo_init            = veth_dev_init,
     .ndo_open            = veth_open,
@@ -290,6 +370,9 @@ static const struct net_device_ops veth_netdev_ops = {
     .ndo_get_iflink        = veth_get_iflink,
     .ndo_features_check    = passthru_features_check,
     .ndo_set_rx_headroom    = veth_set_rx_headroom,
+#ifdef CONFIG_XDP_ROUTER
+    .ndo_xdp_xmit        = veth_xdp_xmit,
+#endif
 };

 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/ip.h b/include/linux/ip.h
index 492bc65..025a3ec 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -19,6 +19,29 @@

 #include <linux/skbuff.h>
 #include <uapi/linux/ip.h>
+#include <linux/filter.h>
+
+#ifdef CONFIG_XDP_ROUTER
+
+#define MIN_PACKET_SIZE 55
+
+/* Return the IP header of @xdp, taken to start ETH_HLEN bytes into the
+ * packet data. No length or protocol check is done here.
+ */
+static inline struct iphdr *ip_hdr_xdp(const struct xdp_buff *xdp)
+{
+    void *l3 = xdp->data + ETH_HLEN;
+
+    return l3;
+}
+
+/* Return the Ethernet header of @xdp, i.e. the very start of the packet
+ * data. No length check is done here.
+ */
+static inline struct ethhdr *eth_hdr_xdp(const struct xdp_buff *xdp)
+{
+    void *l2 = xdp->data;
+
+    return l2;
+}
+
+/* Return true when the packet data of @xdp spans at least
+ * MIN_PACKET_SIZE bytes, false otherwise.
+ */
+static inline bool is_xdp_forwardable(const struct xdp_buff *xdp)
+{
+    if (xdp->data_end - xdp->data < MIN_PACKET_SIZE)
+        return false;
+
+    return true;
+}
+
+#endif
+

 static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
 {
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4c77f39..e3bf002 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3290,6 +3290,12 @@ static inline void dev_consume_skb_any(struct
sk_buff *skb)
     __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 }

+#ifdef CONFIG_XDP_ROUTER
+bool is_forwarding_enabled(struct net_device *dev);
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp);
+void prefetch_xdp(struct xdp_buff *xdp);
+#endif
+
 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f805243..623b2de 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -369,6 +369,12 @@ int fib_sync_down_dev(struct net_device *dev,
unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);

+#ifdef CONFIG_XDP_ROUTER
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+                   u8 tos, struct net_device *dev,
+                   struct fib_result *res);
+#endif
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
                const struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b..9d92352 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4090,6 +4090,65 @@ int do_xdp_generic(struct bpf_prog *xdp_prog,
struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);

+#ifdef CONFIG_XDP_ROUTER
+
+/* Report whether IPv4 forwarding is switched on for @dev.
+ *
+ * Returns false when the device has no in_device attached or when
+ * IN_DEV_FORWARD() is not set for it, true otherwise.
+ *
+ * The in_device is fetched with __in_dev_get_rcu(); both callers in this
+ * patch invoke the function inside an RCU read-side section.
+ */
+bool is_forwarding_enabled(struct net_device *dev)
+{
+    struct in_device *idev = __in_dev_get_rcu(dev);
+
+    /* forwarding being enabled is the expected case */
+    return likely(idev && IN_DEV_FORWARD(idev));
+}
+EXPORT_SYMBOL_GPL(is_forwarding_enabled);
+
+/* Try to forward the IPv4 packet in @xdp, received on @dev, straight to
+ * the egress device chosen by a FIB lookup.
+ *
+ * Returns NET_RX_SUCCESS when the egress device's ndo_xdp_xmit accepted
+ * the packet (returned NETDEV_TX_OK). Returns NET_RX_DROP in every other
+ * case; the packet is NOT freed then, and the callers in this patch react
+ * by falling back to the regular skb path.
+ *
+ * NOTE(review): the fast path neither decrements the TTL nor updates the
+ * IP header checksum, which a router is expected to do -- to be addressed
+ * before this can replace the slow path.
+ */
+int xdp_router_forward(struct net_device *dev, struct xdp_buff *xdp)
+{
+    const struct net_device_ops *ops;
+    struct net_device *rcv;
+    struct fib_result res;
+    struct iphdr *iph;
+
+    /* too short to carry the headers we are about to read */
+    if (unlikely(!is_xdp_forwardable(xdp)))
+        return NET_RX_DROP;
+
+    iph = ip_hdr_xdp(xdp);
+
+    /* Currently only IPv4 is supported. Also leave malformed headers
+     * (ihl below the 5-word minimum) and packets whose TTL is used up
+     * to the slow path instead of fast-forwarding them.
+     */
+    if (unlikely(iph->version != 4 || iph->ihl < 5 || iph->ttl <= 1))
+        return NET_RX_DROP;
+
+    if (unlikely(ip_route_lookup(iph->daddr, iph->saddr,
+                                 iph->tos, dev, &res)))
+        return NET_RX_DROP;
+
+    rcv = FIB_RES_DEV(res);
+    if (unlikely(!rcv))
+        return NET_RX_DROP;
+
+    /* ndo_xdp_xmit is optional; calling it unconditionally would jump
+     * through a NULL pointer for any egress device that lacks it.
+     */
+    ops = rcv->netdev_ops;
+    if (unlikely(!ops->ndo_xdp_xmit))
+        return NET_RX_DROP;
+
+    if (unlikely(ops->ndo_xdp_xmit(rcv, xdp) != NETDEV_TX_OK))
+        return NET_RX_DROP;
+
+    return NET_RX_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(xdp_router_forward);
+
+/* Issue prefetches for @xdp itself and for the two spots of the IP header
+ * (ETH_HLEN and ETH_HLEN + 12 bytes into the packet data) that hold the
+ * version, tos, saddr and daddr fields.
+ */
+inline void prefetch_xdp(struct xdp_buff *xdp)
+{
+    const void *l3;
+
+    prefetch(xdp);
+
+    l3 = xdp->data + ETH_HLEN;
+    prefetch(l3);        /* start of the IP header */
+    prefetch(l3 + 12);   /* 12 bytes further into the IP header */
+}
+EXPORT_SYMBOL_GPL(prefetch_xdp);
+
+#endif
+
 static int netif_rx_internal(struct sk_buff *skb)
 {
     int ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1..2333205 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1866,6 +1866,35 @@ static int ip_mkroute_input(struct sk_buff *skb,
     return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }

+#ifdef CONFIG_XDP_ROUTER
+
+/* Look up the route for an IPv4 packet that arrived on @dev.
+ *
+ * @daddr, @saddr: destination and source address from the IP header
+ * @tos:           TOS byte from the IP header (masked with IPTOS_RT_MASK)
+ * @dev:           ingress device; supplies the namespace and the iif
+ * @res:           filled in by fib_lookup()
+ *
+ * Returns 0 when a unicast route was found, -EINVAL when the lookup failed
+ * or the result is not of type RTN_UNICAST.
+ */
+int ip_route_lookup(__be32 daddr, __be32 saddr,
+            u8 tos, struct net_device *dev,
+            struct fib_result *res)
+{
+    /* Use a designated initializer so that every member not named here
+     * (oif, mark, flags and all remaining flow fields) is zero. Setting
+     * only selected members by assignment left the rest of the key as
+     * uninitialised stack data handed to fib_lookup().
+     */
+    struct flowi4 fl4 = {
+        .flowi4_iif   = dev->ifindex,
+        .flowi4_tos   = tos & IPTOS_RT_MASK,
+        .flowi4_scope = RT_SCOPE_UNIVERSE,
+        .daddr        = daddr,
+        .saddr        = saddr,
+    };
+    int err;
+
+    err = fib_lookup(dev_net(dev), &fl4, res, 0);
+
+    /* only plain unicast routes can be handled by the fast path */
+    if (unlikely(err != 0 || res->type != RTN_UNICAST))
+        return -EINVAL;
+
+    return 0;
+}
+EXPORT_SYMBOL_GPL(ip_route_lookup);
+#endif
+
 /*
  *    NOTE. We drop all the packets that has local source
  *    addresses, because every properly looped back packet

Many thanks
Tamim

View attachment "topology.py" of type "text/x-python" (3289 bytes)