netdev - Re: [PATCH RFC net-next] net: SAIL based FIB lookup for XDP

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f427155e-685c-adb6-1a2e-c78b9f1511f6@gmail.com>
Date:   Sun, 18 Nov 2018 10:42:29 -0700
From:   David Ahern <dsahern@...il.com>
To:     "Md. Islam" <mislam4@...t.edu>, Netdev <netdev@...r.kernel.org>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        David Miller <davem@...emloft.net>,
        Stephen Hemminger <stephen@...workplumber.org>,
        john fastabend <john.fastabend@...il.com>,
        Alexey Kuznetsov <kuznet@....inr.ac.ru>
Subject: Re: [PATCH RFC net-next] net: SAIL based FIB lookup for XDP

On 11/11/18 7:25 PM, Md. Islam wrote:
> This patch implements SAIL[1] based routing table lookup for XDP. I
> however made some changes from the original proposal (details are
> described in the patch). This changes decreased the memory consumption
> from 21.94 MB to 4.97 MB for my example routing table with 400K
> routes.
> 
> This patch can perform FIB lookup in 50 CPU cycles for the example
> routing table (with 400K routes) whereas LC-trie takes around 350 CPU
> cycles.
> 
> I tried to follow all the advice I got from my last patch. Looking
> forward to your review.
> 
> 1. Yang, Tong, et al. "Guarantee IP lookup performance with FIB
> explosion." ACM SIGCOMM Computer Communication Review. Vol. 44. No. 4.
> ACM, 2014.

This work you are doing on different FIB algorithms is interesting and
probably has its use cases where it is beneficial, but you are still not
integrating it into the Linux stack correctly.

For starters, it is wrong to have 2 separate FIB data structures for the
same network namespace. If SAIL is good enough for XDP, it should be
good enough for normal routing in the namespace. There is no need to put
the same route data in 2 places. That means the FIB algorithm needs to
be selectable - either trie or sail but not both.

Further, you are not handling unexpected conditions - e.g., multipath or
lwtunnel encaps, cleanup of device references on a FIB entry delete or
device overflows ...

> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 69c91d1..cc275c7 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -197,6 +197,62 @@ struct fib_entry_notifier_info {
>      u32 tb_id;
>  };
> 
> +#if IS_ENABLED(CONFIG_FIB_SAIL_XDP)
> +/*
> + * The router can have upto 255 ports. This limitation
> + * allows us to represent netdev_index as an u8
> + */
> +#define NETDEV_COUNT_MAX 255

... 255 is high for a physical port count but not a logical device
count. In the presence of VLANs 255 is nothing and VLANs are an
essential deployment feature.

> +
> +struct chunk {

chunk is too generic. sail_chunk at least puts it in the sail symbol
namespace.

> +    /*256-bit bitmap. Here i-th bit (from LSB) is set to 1 if C24[i] > 0 */
> +    u64 bitmap[4];
> +    /*
> +     * Index to C24 where chunk is started. A chunk corresponds
> +     * to 256 elements. Instead of having just one start index for the
> +     * whole chunk, we divide the chunk into 4 parts and save start
> +     * index for each part.
> +     */
> +    u64 start_index[4];
> +};
> +
> +struct sail {
> +    /*default next-hop (Level 0)*/
> +    u8    def_nh;
> +
> +    /*Level 16*/
> +    u8 __rcu *N16;
> +    u8 __rcu *P16;
> +    u16 __rcu *C16;
> +
> +    /*Level 24*/
> +    u8 __rcu *N24;
> +    u8 __rcu *P24;
> +    struct chunk __rcu *CK24;
> +    u32 __rcu *C24;
> +    u32 cnk24_count;/*Number of chunks in level 24*/
> +
> +    /*Level 32*/
> +    u8 __rcu *N32;
> +    u8 __rcu *P32;
> +    u32 cnk32_count;/*Number of chunks in level 32*/
> +
> +    /*Index to this array is stored in N16, N24 and N32*/
> +    struct net_device    *netdevs[NETDEV_COUNT_MAX];
> +    u8 netdev_count;/*Number of netdevs*/
> +
> +    spinlock_t lock;
> +};
> +
> +int sail_insert(struct sail *s, u32 key,
> +        u8 prefix_len, struct net_device *dev);
> +int sail_delete(struct sail *s, u32 key,
> +        u8 prefix_len);
> +int sail_flush(struct sail *s);
> +int sail_lookup(const struct sail *s, const __be32 dest,
> +        struct net_device **dev);
> +#endif

Put the new FIB algorithm specific defines in a new header.

> +
>  struct fib_nh_notifier_info {
>      struct fib_notifier_info info; /* must be first */
>      struct fib_nh *fib_nh;
> @@ -219,6 +275,10 @@ struct fib_table {
>      int            tb_num_default;
>      struct rcu_head        rcu;
>      unsigned long         *tb_data;
> +#if IS_ENABLED(CONFIG_FIB_SAIL_XDP)
> +    /*Each FIB table will have its own SAIL structure.*/
> +    struct sail    sail;

Per comment above a separate sail entry is unnecessary when this
algorithm is an alternative to trie; It overlaps tb_data - see code
references for it.

> +#endif
>      unsigned long        __data[0];
>  };
> 

...

> diff --git a/net/ipv4/fib_sail_xdp.c b/net/ipv4/fib_sail_xdp.c
> new file mode 100644
> index 0000000..f3f56c5
> --- /dev/null
> +++ b/net/ipv4/fib_sail_xdp.c
> @@ -0,0 +1,939 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2018-19 MD Iftakharul Islam (Tamim) <mislam4@...t.edu>
> + *
> + *   This program is free software; you can redistribute it and/or
> + *   modify it under the terms of the GNU General Public License
> + *   as published by the Free Software Foundation; either version
> + *   2 of the License, or (at your option) any later version.
> + *
> + *
> + * This is SAIL_L based routing table lookup which was initially proposed in:
> + *
> + * Yang, Tong, Gaogang Xie, YanBiao Li, Qiaobin Fu, Alex X. Liu, Qi Li,
> + * and Laurent Mathy. "Guarantee IP lookup performance with FIB explosion."
> + * In ACM SIGCOMM Computer Communication Review, vol. 44, no. 4, pp. 39-50.
> + * ACM, 2014.
> + *
> + * It however deviates from the SAIL_L in three ways:
> + *
> + * 1. It pushes all the solid nodes in level 1~15 to level 16 whereas SAIL_L
> + * pushes them to either level 16, level 24 or level 32.
> + *
> + * 2. It pushes all the solid nodes in level 17~23 to level 24 whereas SAIL_L
> + * pushes them to either level 24 or level 32.
> + *
> + * 3. It adds a bitmap array, CK24 in addition to C24. This reduces the memory
> + * memory requirement of original C24 from 17.08 MB to 110KB for our example
> + * routing table.
> + */
> +
> +#include <net/ip_fib.h>
> +
> +/*The length of N16, P16 and C16 is 2^16*/
> +#define LEVEL16_SIZE 65536
> +
> +/*Length of C24.*/
> +#define C24_SIZE 1048576
> +
> +/*chunk size is 2^8*/
> +#define CHUNK_SIZE 256
> +
> +/*Total number of chunks preallocated for level 24 and 32*/
> +#define NUM_CHUNKS 16384
> +
> +/*Calculates the number of bits set to 1*/
> +#define POPCNT(X) (hweight64(X))
> +
> +/*POPCNT of left-most N bits of X*/
> +#define POPCNT_OFF(X, N) (hweight64(((1ULL << (N)) - 1) & (X)))
> +
> +/*Calculate index to C24 from CK26 chunk and chunk offset */
> +static u64 calc_c24_idx(struct chunk c, u32 cnk_off)
> +{
> +    u8 part_idx, part_off;
> +
> +    part_idx = cnk_off / 64;
> +    part_off = cnk_off % 64;
> +
> +    return c.start_index[part_idx] +
> +        POPCNT_OFF(c.bitmap[part_idx], part_off);
> +}
> +
> +/*Converts a net_device to corresponding netdev_index*/
> +static u8 get_netdev_index(struct sail *s, struct net_device *dev)
> +{
> +    u8 i;
> +
> +    /*checks if the net_device is already seen; if yes then return the
> +     *corresponding index
> +     */
> +    for (i = 0; i < s->netdev_count; i++) {
> +        if (s->netdevs[i] == dev)
> +            return i;

Linearly walking this array puts a hit on the insert time. Control
management (insert and delete times) are important as well. It is a
trade-off between data plane performance and control plane performance.
You should compare insert and delete times too.

> +    }
> +    /*If the net_device is not previously seen, then add it to the array*/
> +    s->netdevs[s->netdev_count++] = dev;
> +    return s->netdev_count - 1;
> +}

Nothing is preventing a route from being inserted using netdev 256; you
are just wrapping the counter and inserting the device which breaks
existing FIB entries using that netdev entry.

What about deletes? I don't see you ever reset the index or check for
holes (unused entries) in the array.

> +int sail_insert(struct sail *s, u32 key, u8 prefix_len,
> +        struct net_device *dev)

you are assuming single path routes - an assumption that needs to be
codified, not ignored.

> +{
> +    int i;
> +    u8 *n16, *p16, *n24, *p24, *n32, *p32;
> +    u16 *c16;
> +    u32 *c24;
> +    struct chunk *ck24;
> +    u16 chunk_id;
> +    u16 n16_idx;/*Index to N16, P16 and C16*/
> +    u64 n24_idx;/*Index to N24 and P24*/
> +    u64 c24_idx;/*Index to C24*/
> +    u32 ck24_idx;/*Index to CK24*/
> +    u64 ck24_off;/*offset inside a chunk*/
> +    u8 part_idx, part_off;
> +    u64 n32_idx;/*Index to N32 and P32*/
> +    u32 num_leafs;/*Number of leafs need to be inserted for this prefix*/
> +    u8 netdev_index = get_netdev_index(s, dev);
> +    int err = 0;
> +
> +    spin_lock(&s->lock);
> +
> +    /* Default route */
> +    if (prefix_len == 0) {
> +        s->def_nh = netdev_index;
> +        goto finish;
> +    }
> +
> +    /* Preallocate all the arrays at once*/
> +    if (!s->N16) {
> +        n16 = kcalloc(LEVEL16_SIZE, sizeof(*n16), GFP_ATOMIC);
> +        p16 = kcalloc(LEVEL16_SIZE, sizeof(*p16), GFP_ATOMIC);
> +        c16 = kcalloc(LEVEL16_SIZE, sizeof(*c16), GFP_ATOMIC);
> +        n24 = kcalloc(NUM_CHUNKS * CHUNK_SIZE, sizeof(*n24),
> +                  GFP_ATOMIC);
> +        p24 = kcalloc(NUM_CHUNKS * CHUNK_SIZE, sizeof(*p24),
> +                  GFP_ATOMIC);
> +        ck24 = kcalloc(NUM_CHUNKS, sizeof(*ck24), GFP_ATOMIC);
> +        c24 = kcalloc(C24_SIZE, sizeof(*c24), GFP_ATOMIC);
> +        n32 = kcalloc(NUM_CHUNKS * CHUNK_SIZE, sizeof(*n32),
> +                  GFP_ATOMIC);
> +        p32 = kcalloc(NUM_CHUNKS * CHUNK_SIZE, sizeof(*p32),
> +                  GFP_ATOMIC);
> +
> +        if (!n16 || !c16 || !p16 || !n24 || !p24 || !ck24 ||
> +            !c24 || !n32 || !p32) {
> +            kfree(n16);
> +            kfree(c16);
> +            kfree(p16);
> +            kfree(n24);
> +            kfree(p24);
> +            kfree(ck24);
> +            kfree(c24);
> +            kfree(n32);
> +            kfree(p32);
> +            pr_err("Out of memory while preallocating  SAIL");
> +            goto error;
> +        }
> +
> +        RCU_INIT_POINTER(s->N16, n16);
> +        RCU_INIT_POINTER(s->P16, p16);
> +        RCU_INIT_POINTER(s->C16, c16);
> +        RCU_INIT_POINTER(s->N24, n24);
> +        RCU_INIT_POINTER(s->P24, p24);
> +        RCU_INIT_POINTER(s->CK24, ck24);
> +        RCU_INIT_POINTER(s->C24, c24);
> +        RCU_INIT_POINTER(s->N32, n32);
> +        RCU_INIT_POINTER(s->P32, p32);
> +
> +        synchronize_rcu();

this initialization needs to be done as part of namespace init, not
first FIB insert.

You really need to spend time looking at how to make sail, poptrie and
anything else an alternative to LC trie as opposed to a duplicate algorithm.