lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAFgPn1CRZobsuFxePKm6JS_zHSqD3V7M8AC5cKkqZg4QZdDwGg@mail.gmail.com>
Date:   Tue, 4 Sep 2018 01:02:30 -0400
From:   "Md. Islam" <mislam4@...t.edu>
To:     Netdev <netdev@...r.kernel.org>,
        David Miller <davem@...emloft.net>,
        David Ahern <dsahern@...il.com>,
        Alexey Kuznetsov <kuznet@....inr.ac.ru>,
        alexei.starovoitov@...il.com,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Stephen Hemminger <stephen@...workplumber.org>,
        makita.toshiaki@....ntt.co.jp, panda@...go.wide.ad.jp,
        yasuhiro.ohara@....com, Eric Dumazet <edumazet@...gle.com>,
        john fastabend <john.fastabend@...il.com>
Subject: [PATCH RFC net-next] net: Poptrie based routing table lookup

This patch implements Poptrie based routing table
lookup/insert/delete/flush. Currently many carrier routers use kernel
bypass frameworks such as DPDK and VPP to implement the data plane.
XDP along with this patch will enable Linux to work as such a router.
Currently it supports up to 255 ports. Many real word backbone routers
have up to 233 ports (to the best of my knowledge), so it seems to be
sufficient at this moment.

I also have attached a draft paper to explain it works (poptrie.pdf).
Please set CONFIG_FIB_POPTRIE=y (default n) before testing the patch.
Note that, poptrie_lookup() is not being called from anywhere. It will
be used by XDP forwarding.


>From 3dc9683298ed896dd3080733503c35d68f05370e Mon Sep 17 00:00:00 2001
From: tamimcse <tamim@...buet.org>
Date: Mon, 3 Sep 2018 23:56:43 -0400
Subject: [PATCH] Poptrie based routing table lookup

Signed-off-by: tamimcse <tamim@...buet.org>
---
 include/net/ip_fib.h   |  42 +++++
 net/ipv4/Kconfig       |   4 +
 net/ipv4/Makefile      |   1 +
 net/ipv4/fib_poptrie.c | 483 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/fib_trie.c    |  12 ++
 5 files changed, 542 insertions(+)
 create mode 100644 net/ipv4/fib_poptrie.c

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 81d0f21..76be548 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -197,6 +197,45 @@ struct fib_entry_notifier_info {
     u32 tb_id;
 };

+#if IS_ENABLED(CONFIG_FIB_POPTRIE)
+/*
+ * The router can have upto 255 ports. This limitation
+ * allows us to represent fib_index as u8
+ */
+#define NEXT_HOP_MAX 255
+
+struct next_hops {
+    struct net_device    *netdev_arr[NEXT_HOP_MAX];
+    /*Total number of next-hops*/
+    u8 count;
+};
+
+struct poptrie_node {
+    u64 vector;
+    u64 leafvec;
+    u64 nodevec;
+    struct poptrie_node *child_nodes;
+    u8 *leaves;
+    u8 *prefixes;
+    struct rcu_head        rcu;
+};
+
+struct poptrie {
+    char    def_nh;
+    struct next_hops    nhs;
+    struct poptrie_node __rcu *root;
+    spinlock_t            lock;
+};
+
+int poptrie_insert(struct poptrie *pt, u32 key,
+        u8 prefix_len, struct net_device *dev);
+int poptrie_delete(struct poptrie *pt, u32 key,
+        u8 prefix_len);
+int poptrie_flush(struct poptrie *pt);
+int poptrie_lookup(struct poptrie *pt, __be32 dest,
+        struct net_device **dev);
+#endif
+
 struct fib_nh_notifier_info {
     struct fib_notifier_info info; /* must be first */
     struct fib_nh *fib_nh;
@@ -219,6 +258,9 @@ struct fib_table {
     int            tb_num_default;
     struct rcu_head        rcu;
     unsigned long         *tb_data;
+#if IS_ENABLED(CONFIG_FIB_POPTRIE)
+    struct poptrie    pt;
+#endif
     unsigned long        __data[0];
 };

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 80dad30..75e9c9a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -52,6 +52,10 @@ config IP_ADVANCED_ROUTER

       If unsure, say N here.

+config FIB_POPTRIE
+    bool
+    default n
+
 config IP_FIB_TRIE_STATS
     bool "FIB TRIE statistics"
     depends on IP_ADVANCED_ROUTER
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520..fae4bd4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_FIB_POPTRIE) += fib_poptrie.o

 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
               xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/fib_poptrie.c b/net/ipv4/fib_poptrie.c
new file mode 100644
index 0000000..3f231e7
--- /dev/null
+++ b/net/ipv4/fib_poptrie.c
@@ -0,0 +1,483 @@
+// SPDX-License-Identifier: GPL-2.0
+/*This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ *
+ * Author: MD Iftakharul Islam (Tamim) <mislam4@...t.edu>.
+ *
+ * Asai, Hirochika, and Yasuhiro Ohara. "Poptrie: A compressed trie
+ * with population count for fast and scalable software IP routing
+ * table lookup." ACM SIGCOMM Computer Communication Review. 2015.
+ *
+ */
+
+#include <net/ip_fib.h>
+
+/*Get next-hop index from next-hop*/
+static u8 get_fib_index(struct next_hops *nhs, struct net_device *dev)
+{
+    u8 i;
+
+    for (i = 0; i < nhs->count; i++) {
+        if (nhs->netdev_arr[i] == dev)
+            return i;
+    }
+    nhs->netdev_arr[nhs->count++] = dev;
+    return nhs->count - 1;
+}
+
+/*Extracts 6 bytes from key starting from offset*/
+static u32 extract(u32 key, int offset)
+{
+    if (likely(offset < 26))
+        return (key >> (26 - offset)) & 63;
+    else
+        return (key << 4) & 63;
+}
+
+/*Set FIB index and prefix length to a leaf*/
+static void set_fib_index(struct poptrie_node *node,
+              unsigned long leaf_index,
+              char fib_index, char prefix_len)
+{
+    node->leaves[leaf_index] = fib_index;
+    node->prefixes[leaf_index] = prefix_len;
+}
+
+/*Insert a leaf at index*/
+static bool insert_leaf(struct poptrie_node *node,
+            char index, char fib_index,
+            char prefix_len)
+{
+    int i, j;
+    char *leaves;
+    char *prefixes;
+    int size = (int)hweight64(node->leafvec);
+
+    if (index > size) {
+        pr_err("Index needs to be smaller or equal to size");
+        return false;
+    }
+
+    leaves = kcalloc(size + 1, sizeof(*leaves), GFP_ATOMIC);
+    prefixes = kcalloc(size + 1, sizeof(*prefixes), GFP_ATOMIC);
+
+    for (i = 0, j = 0; i < (size + 1); i++) {
+        if (i == index) {
+            leaves[i] = fib_index;
+            prefixes[i] = prefix_len;
+        } else {
+            leaves[i] = node->leaves[j];
+            prefixes[i] = node->prefixes[j];
+            j++;
+        }
+    }
+
+    kfree(node->leaves);
+    kfree(node->prefixes);
+    node->leaves = leaves;
+    node->prefixes = prefixes;
+    return true;
+}
+
+/*Insert a new node at index*/
+static void insert_chield_node(struct poptrie_node *node,
+                   char index)
+{
+    int i, j;
+    struct poptrie_node *arr;
+    int arr_size  = (int)hweight64(node->nodevec);
+
+    arr = kcalloc(arr_size + 1, sizeof(*arr), GFP_ATOMIC);
+    for (i = 0, j = 0; i < (arr_size + 1); i++) {
+        if (i != index && j < arr_size)
+            arr[i] = node->child_nodes[j++];
+    }
+
+    kfree(node->child_nodes);
+    node->child_nodes = arr;
+}
+
+/*Delete a leaf at index*/
+static bool delete_leaf(struct poptrie_node *node,
+            char index)
+{
+    int i, j;
+    char *leaves;
+    char *prefixes;
+    int size = (int)hweight64(node->leafvec);
+
+    if (index >= size) {
+        pr_err("Index needs to be smaller or equal to size");
+        return false;
+    }
+
+    leaves = kcalloc(size - 1, sizeof(*leaves), GFP_ATOMIC);
+    prefixes = kcalloc(size - 1, sizeof(*prefixes), GFP_ATOMIC);
+
+    for (i = 0, j = 0; i < size; i++) {
+        if (i != index) {
+            leaves[j] = node->leaves[i];
+            prefixes[j] = node->prefixes[i];
+            j++;
+        }
+    }
+
+    kfree(node->leaves);
+    kfree(node->prefixes);
+    node->leaves = leaves;
+    node->prefixes = prefixes;
+    return true;
+}
+
+int poptrie_insert(struct poptrie *pt, u32 key,
+           u8 prefix_len, struct net_device *dev)
+{
+    int offset, i;
+    u32 index;
+    u8 consecutive_leafs;
+    u64 bitmap;
+    u64 bitmap_hp;
+    int arr_size;
+    unsigned long chield_index;
+    unsigned long leaf_index, prev_leaf_index;
+    unsigned long index_hp;
+    struct poptrie_node *node;
+    u8 prev_fib_index, prev_prefix_len;
+    u8 fib_index = get_fib_index(&pt->nhs, dev);
+
+    spin_lock(&pt->lock);
+
+    node = rcu_dereference(pt->root);
+    if (!node) {
+        node = kzalloc(sizeof(*node), GFP_ATOMIC);
+        rcu_assign_pointer(pt->root, node);
+    }
+
+    /* Default route */
+    if (prefix_len == 0) {
+        pt->def_nh = fib_index;
+        goto finish;
+    }
+
+    /*Iterate through the nodes*/
+    offset = 0;
+    while (prefix_len > (offset + 6)) {
+        index = extract(key, offset);
+        bitmap = 1ULL << index;
+        chield_index = hweight64(node->nodevec & (bitmap - 1));
+
+        /*No node for this index, so need to insert a node*/
+        if (!(node->nodevec & bitmap)) {
+            insert_chield_node(node, chield_index);
+            node->nodevec |= bitmap;
+        }
+        node = &node->child_nodes[chield_index];
+        offset += 6;
+    }
+
+    /*Now need to insert a leaf*/
+
+    index = extract(key, offset);
+    bitmap = 1ULL << index;
+    consecutive_leafs = 1 << (offset + 6 - prefix_len);
+
+    if (node->vector & bitmap && node->leafvec & bitmap) {
+        /*A leaf already exist for this index,
+         *so update the existing leaf
+         */
+        leaf_index = hweight64(node->leafvec & (bitmap - 1));
+        arr_size = (int)hweight64(node->leafvec);
+        if (leaf_index >= arr_size)
+            goto error;
+        /*Ignore the prefix*/
+        if (node->prefixes[leaf_index] > prefix_len) {
+            goto finish;
+        } else if (node->prefixes[leaf_index] == prefix_len) {
+            set_fib_index(node, leaf_index, fib_index, prefix_len);
+        } else {
+            /*hole punching*/
+            bitmap_hp = bitmap << consecutive_leafs;
+            if (!(node->leafvec & bitmap_hp)) {
+                index_hp = hweight64(node->leafvec &
+                             (bitmap_hp - 1)) - 1;
+                if (node->prefixes[index_hp] <= prefix_len) {
+                    insert_leaf(node, index_hp,
+                            fib_index, prefix_len);
+                    node->leafvec |= bitmap_hp;
+                }
+
+                for (i = leaf_index; i < index_hp ; i++) {
+                    if (node->prefixes[i] <= prefix_len)
+                        set_fib_index(node, i, fib_index, prefix_len);
+                }
+            } else {
+                index_hp = hweight64(node->leafvec &
+                             (bitmap_hp - 1)) - 1;
+                for (i = leaf_index; i <= index_hp ; i++) {
+                    if (node->prefixes[i] <= prefix_len)
+                        set_fib_index(node, i, fib_index, prefix_len);
+                }
+            }
+        }
+    } else if (!(node->vector & bitmap)) {
+        /*No leaf for this index, so need to insert a leaf*/
+        leaf_index = hweight64(node->leafvec & (bitmap - 1));
+        insert_leaf(node, leaf_index, fib_index, prefix_len);
+        node->leafvec |= bitmap;
+    } else if (node->vector & bitmap && !(node->leafvec & bitmap)) {
+        /*There is a leaf for this index created by another
+         *  prefix with smaller length
+         */
+        prev_leaf_index = hweight64(node->leafvec & (bitmap - 1)) - 1;
+        arr_size = (int)hweight64(node->leafvec);
+        if (prev_leaf_index >= arr_size)
+            goto error;
+        if (node->prefixes[prev_leaf_index] <= prefix_len) {
+            insert_leaf(node, prev_leaf_index + 1,
+                    fib_index, prefix_len);
+            node->leafvec |= bitmap;
+        }
+
+        /*hole punching*/
+        prev_fib_index = node->leaves[prev_leaf_index];
+        prev_prefix_len = node->prefixes[prev_leaf_index];
+
+        bitmap_hp = bitmap << consecutive_leafs;
+        if (!(node->leafvec & bitmap_hp)) {
+            index_hp = hweight64(node->leafvec &
+                         (bitmap_hp - 1)) - 1;
+            if (node->prefixes[index_hp] <= prefix_len) {
+                if (prev_leaf_index < 0)
+                    goto error;
+                insert_leaf(node, index_hp + 1,
+                        prev_fib_index, prev_prefix_len);
+                node->leafvec |= bitmap_hp;
+            }
+        }
+
+        for (i = 2; i < consecutive_leafs; i++) {
+            bitmap_hp = bitmap << (i - 1);
+            if (node->leafvec & bitmap_hp) {
+                index_hp = hweight64(node->leafvec &
+                             (bitmap_hp - 1)) - 1;
+                insert_leaf(node, index_hp + 1,
+                        fib_index, prefix_len);
+                node->leafvec |= bitmap_hp;
+            }
+        }
+    }
+
+    if (consecutive_leafs > 1)
+        node->vector |= ((1ULL << consecutive_leafs) - 1) << index;
+    else
+        node->vector |= bitmap;
+
+    goto finish;
+
+error:
+    pr_err("Something is very wrong !!!!");
+finish:
+    spin_unlock(&pt->lock);
+    return 0;
+}
+
+int poptrie_delete(struct poptrie *pt, u32 key,
+           u8 prefix_len)
+{
+    int offset, i;
+    u32 index;
+    u8 consecutive_leafs;
+    u64 bitmap;
+    int arr_size;
+    unsigned long chield_index;
+    unsigned long leaf_index;
+    struct poptrie_node *node;
+    bool update_vector = true;
+
+    spin_lock(&pt->lock);
+
+    node = rcu_dereference(pt->root);
+
+    if (!node || prefix_len == 0)
+        goto finish;
+
+    /*Iterate through the nodes*/
+    offset = 0;
+    while (prefix_len > (offset + 6)) {
+        index = extract(key, offset);
+        bitmap = 1ULL << index;
+        chield_index = hweight64(node->nodevec & (bitmap - 1));
+        /*No node for this index*/
+        if (!(node->nodevec & bitmap))
+            goto finish;
+        node = &node->child_nodes[chield_index];
+        offset += 6;
+    }
+
+    /*Now need to delete the leaf*/
+
+    index = extract(key, offset);
+    bitmap = 1ULL << index;
+    consecutive_leafs = 1 << (offset + 6 - prefix_len);
+
+    /*The prefix does not exist*/
+    if (!(node->vector & bitmap) || !(node->leafvec & bitmap))
+        goto finish;
+
+    leaf_index = hweight64(node->leafvec & (bitmap - 1));
+    arr_size = (int)hweight64(node->leafvec);
+    if (leaf_index >= arr_size)
+        goto error;
+
+    /*The prefix-length does not match*/
+    if (node->prefixes[leaf_index] != prefix_len)
+        goto finish;
+
+    /*Check if this prefix will replaced
+     * by another prefix with smaller length
+     */
+    for (i = leaf_index - 1; i >= 0; i--) {
+        if (node->prefixes[leaf_index] < prefix_len) {
+            update_vector = false;
+            break;
+        }
+    }
+
+    if (update_vector) {
+        if (consecutive_leafs > 1)
+            node->vector &=
+                 ~(((1ULL << consecutive_leafs) - 1) << index);
+        else
+            node->vector &= ~bitmap;
+    }
+
+    if (consecutive_leafs > 1) {
+        for (i = 0; i < consecutive_leafs; i++) {
+            if ((node->leafvec & bitmap) &&
+                node->prefixes[leaf_index] == prefix_len) {
+                delete_leaf(node, leaf_index);
+                node->leafvec &= ~bitmap;
+            }
+            bitmap <<= 1;
+            leaf_index = hweight64(node->leafvec & (bitmap - 1));
+        }
+    } else {
+        delete_leaf(node, leaf_index);
+        node->leafvec &= ~bitmap;
+    }
+
+    goto finish;
+error:
+    pr_err("Something is very wrong !!!!");
+finish:
+    spin_unlock(&pt->lock);
+    return 0;
+}
+
+/*This recursive function frees all
+ * the nodes in depth-first-search fashion
+ */
+static int poptrie_node_free(struct poptrie_node *node)
+{
+    int i;
+    int child_count;
+
+    if (!node)
+        return 0;
+    child_count = hweight64(node->nodevec);
+    if (node->child_nodes) {
+        for (i = 0; i < child_count; i++)
+            poptrie_node_free(&node->child_nodes[i]);
+    }
+
+    kfree(node->leaves);
+    kfree(node->prefixes);
+    kfree(node->child_nodes);
+
+    node->leaves = NULL;
+    node->prefixes = NULL;
+    node->child_nodes = NULL;
+
+    node->vector = 0;
+    node->leafvec = 0;
+    node->nodevec = 0;
+
+    return 0;
+}
+
+static void poptrie_rcu_free(struct rcu_head *rcu)
+{
+    poptrie_node_free(container_of(rcu, struct poptrie_node, rcu));
+}
+
+int poptrie_flush(struct poptrie *pt)
+{
+    struct poptrie_node *rt = rcu_dereference(pt->root);
+
+    if (!rt)
+        return 0;
+
+    RCU_INIT_POINTER(pt->root, NULL);
+    /* Wait for all references to be released */
+    call_rcu(&rt->rcu, poptrie_rcu_free);
+    return 0;
+}
+
+int poptrie_lookup(struct poptrie *pt, __be32 dest, struct net_device **dev)
+{
+    register u32 index;
+    register u64 bitmap, bitmask;
+    register unsigned long leaf_index;
+    register unsigned long node_index;
+    register struct poptrie_node *node;
+    register u8 fib_index = pt->def_nh;
+    register u8 carry = 0;
+    register u8 carry_bit = 2;
+
+    rcu_read_lock();
+
+    node = rcu_dereference(pt->root);
+
+    if (!node)
+        goto finish;
+
+    while (1) {
+        /*Extract 6 bytes from dest */
+        if (likely(carry_bit != 8)) {
+            index = ((dest & 252) >> carry_bit) | carry;
+            carry = (dest & ((1 << carry_bit) - 1))
+                    << (6 - carry_bit);
+            carry_bit = carry_bit + 2;
+            dest = dest >> 8;
+        } else {
+            index = carry;
+            carry = 0;
+            carry_bit = 2;
+        }
+
+        /*Create a bitmap based on the the extracted value*/
+        bitmap = 1ULL << index;
+        bitmask = bitmap - 1;
+
+        /*Find corresponding leaf*/
+        if (likely(node->vector & bitmap)) {
+            leaf_index = hweight64(node->leafvec & bitmask);
+            if (!(node->leafvec & bitmap))
+                leaf_index--;
+            fib_index = node->leaves[leaf_index];
+        }
+
+        /*Find corresponding node*/
+        if (likely(node->nodevec & bitmap)) {
+            node_index = hweight64(node->nodevec & bitmask);
+            node = &node->child_nodes[node_index];
+            continue;
+        }
+finish:
+        *dev = pt->nhs.netdev_arr[fib_index];
+        rcu_read_unlock();
+        return 0;
+    }
+}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3dcffd3..a34998c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1280,6 +1280,10 @@ int fib_table_insert(struct net *net, struct
fib_table *tb,
     if (err)
         goto out_fib_notif;

+#if IS_ENABLED(CONFIG_FIB_POPTRIE)
+    poptrie_insert(&tb->pt, key, plen, fi->fib_dev);
+#endif
+
     if (!plen)
         tb->tb_num_default++;

@@ -1564,6 +1568,10 @@ int fib_table_delete(struct net *net, struct
fib_table *tb,

     pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

+#if IS_ENABLED(CONFIG_FIB_POPTRIE)
+    poptrie_delete(&tb->pt, key, plen);
+#endif
+
     fa_to_delete = NULL;
     hlist_for_each_entry_from(fa, fa_list) {
         struct fib_info *fi = fa->fa_info;
@@ -1925,6 +1933,10 @@ int fib_table_flush(struct net *net, struct
fib_table *tb)
         }
     }

+#if IS_ENABLED(CONFIG_FIB_POPTRIE)
+    poptrie_flush(&tb->pt);
+#endif
+
     pr_debug("trie_flush found=%d\n", found);
     return found;
 }
-- 
2.7.4

Download attachment "poptrie.pdf" of type "application/pdf" (208517 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ