[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <84c0c733-193a-97c7-1a68-c34f44cf2f61@nvidia.com>
Date: Tue, 17 Aug 2021 14:39:16 +0300
From: Nikolay Aleksandrov <nikolay@...dia.com>
To: Gilad Naaman <gnaaman@...venets.com>, davem@...emloft.net,
kuba@...nel.org, luwei32@...wei.com, wangxiongfeng2@...wei.com,
ap420073@...il.com
Cc: netdev@...r.kernel.org
Subject: Re: [PATCH] net: Improve perf of bond/vlans modification
On 17/08/2021 14:04, Gilad Naaman wrote:
> When a bond has a massive amount of VLANs with IPv6 addresses,
> performance of changing link state, attaching a VRF, changing an IPv6
> address, etc. goes down dramatically.
>
> The source of most of the slowdown is the `dev_addr_lists.c` module,
> which maintains a linked list of HW addresses.
> When using IPv6, this list grows for each IPv6 address added on a
> VLAN, since each IPv6 address has a multicast HW address associated with
> it.
>
> When performing any modification to the involved links, this list is
> traversed many times, often for nothing, all while holding the RTNL
> lock.
>
> Instead, this patch adds an auxiliary rbtree which cuts down
> traversal time significantly.
>
[snip]
> Cc: David S. Miller <davem@...emloft.net>
> Cc: Jakub Kicinski <kuba@...nel.org>
> Cc: Lu Wei <luwei32@...wei.com>
> Cc: Xiongfeng Wang <wangxiongfeng2@...wei.com>
> Cc: Taehee Yoo <ap420073@...il.com>
> Signed-off-by: Gilad Naaman <gnaaman@...venets.com>
> ---
Hi Gilad,
Generally I like the idea; I have a similar hacky patch for the same reason, but related to bridge
static entries, which in some cases get added to lower device addr lists, causing soft lockups due
to the list traversals.
The patch should be targeted at net-next, more comments below...
> include/linux/netdevice.h | 5 ++
> net/core/dev_addr_lists.c | 163 ++++++++++++++++++++++++++++----------
> 2 files changed, 126 insertions(+), 42 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index eaf5bb008aa9..dc343be9a845 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -47,6 +47,7 @@
> #include <uapi/linux/if_bonding.h>
> #include <uapi/linux/pkt_cls.h>
> #include <linux/hashtable.h>
> +#include <linux/rbtree.h>
>
> struct netpoll_info;
> struct device;
> @@ -218,12 +219,16 @@ struct netdev_hw_addr {
> int sync_cnt;
> int refcount;
> int synced;
> + struct rb_node node;
> struct rcu_head rcu_head;
> };
>
> struct netdev_hw_addr_list {
> struct list_head list;
> int count;
> +
> + /* Auxiliary tree for faster lookup when modifying the structure */
> + struct rb_root tree_root;
Why keep the list when now we have the rbtree ?
> };
>
> #define netdev_hw_addr_list_count(l) ((l)->count)
> diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
> index 45ae6eeb2964..2473d0f401aa 100644
> --- a/net/core/dev_addr_lists.c
> +++ b/net/core/dev_addr_lists.c
> @@ -12,6 +12,72 @@
> #include <linux/export.h>
> #include <linux/list.h>
>
> +/* Lookup for an address in the list using the rbtree.
> + * The return value is always a valid pointer.
> + * If the address exists, `*ret` is non-null and the address can be retrieved using
> + *
> + * container_of(*ret, struct netdev_hw_addr, node)
> + *
> + * Otherwise, `ret` can be used with `parent` as an insertion point
> + * when calling `insert_address_to_tree`.
> + *
> + * Must only be called when holding the netdevice's spinlock.
> + *
> + * @ignore_zero_addr_type if true and `addr_type` is zero,
> + * disregard addr_type when matching;
> + */
> +static struct rb_node **tree_address_lookup(struct netdev_hw_addr_list *list,
The function name prefixes in the file follow the __hw_addr_xxx and dev_addr patterns,
please conform to that.
> + const unsigned char *addr,
> + int addr_len,
> + unsigned char addr_type,
> + bool ignore_zero_addr_type,
> + struct rb_node **parent)
> +{
> + struct rb_node **node = &list->tree_root.rb_node, *_parent;
> +
> + while (*node)
> + {
> + struct netdev_hw_addr *data = container_of(*node, struct netdev_hw_addr, node);
> + int result;
> +
> + result = memcmp(addr, data->addr, addr_len);
> + if (!result && (ignore_zero_addr_type && !addr_type))
> + result = memcmp(&addr_type, &data->type, sizeof(addr_type));
> +
> + _parent = *node;
> + if (result < 0)
> + node = &(*node)->rb_left;
> + else if (result > 0)
> + node = &(*node)->rb_right;
> + else
> + break;
> + }
> +
> + if (parent)
> + *parent = _parent;
> + return node;
> +}
> +
> +
> +static int insert_address_to_tree(struct netdev_hw_addr_list *list,
+1 fn name pattern
> + struct netdev_hw_addr *ha,
> + int addr_len,
> + struct rb_node **insertion_point,
> + struct rb_node *parent)
> +{
> + /* Figure out where to put new node */
> + if (!insertion_point || !parent)
> + {
Kernel code-style says you should place the curly bracket on the same row as the statement.
Also you don't need brackets for single statement rows.
> + insertion_point = tree_address_lookup(list, ha->addr, addr_len, ha->type, false, &parent);
> + }
> +
> + /* Add new node and rebalance tree. */
> + rb_link_node(&ha->node, parent, insertion_point);
> + rb_insert_color(&ha->node, &list->tree_root);
> +
> + return true;
> +}
> +
> /*
> * General list handling functions
> */
> @@ -19,7 +85,9 @@
> static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
> const unsigned char *addr, int addr_len,
> unsigned char addr_type, bool global,
> - bool sync)
> + bool sync,
> + struct rb_node **insertion_point,
> + struct rb_node *parent)
> {
> struct netdev_hw_addr *ha;
> int alloc_size;
> @@ -36,6 +104,10 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
> ha->global_use = global;
> ha->synced = sync ? 1 : 0;
> ha->sync_cnt = 0;
> +
> + /* Insert node to hash table for quicker lookups during modification */
hash table?
> + insert_address_to_tree(list, ha, addr_len, insertion_point, parent);
> +
> list_add_tail_rcu(&ha->list, &list->list);
> list->count++;
>
> @@ -47,34 +119,36 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
> unsigned char addr_type, bool global, bool sync,
> int sync_count)
> {
> + struct rb_node **ha_node;
> + struct rb_node *insert_parent = NULL;
please order these in reverse xmas tree, longest to shortest
> struct netdev_hw_addr *ha;
>
> if (addr_len > MAX_ADDR_LEN)
> return -EINVAL;
>
> - list_for_each_entry(ha, &list->list, list) {
> - if (ha->type == addr_type &&
> - !memcmp(ha->addr, addr, addr_len)) {
> - if (global) {
> - /* check if addr is already used as global */
> - if (ha->global_use)
> - return 0;
> - else
> - ha->global_use = true;
> - }
> - if (sync) {
> - if (ha->synced && sync_count)
> - return -EEXIST;
> - else
> - ha->synced++;
> - }
> - ha->refcount++;
> - return 0;
> + ha_node = tree_address_lookup(list, addr, addr_len, addr_type, false, &insert_parent);
> + if (*ha_node)
> + {
+1 curly bracket style
> + ha = container_of(*ha_node, struct netdev_hw_addr, node);
> + if (global) {
> + /* check if addr is already used as global */
> + if (ha->global_use)
> + return 0;
> + else
> + ha->global_use = true;
> }
> + if (sync) {
> + if (ha->synced && sync_count)
> + return -EEXIST;
> + else
> + ha->synced++;
> + }
> + ha->refcount++;
> + return 0;
> }
>
> return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
> - sync);
> + sync, ha_node, insert_parent);
> }
>
> static int __hw_addr_add(struct netdev_hw_addr_list *list,
> @@ -103,6 +177,8 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
>
> if (--ha->refcount)
> return 0;
> +
> + rb_erase(&ha->node, &list->tree_root);
> list_del_rcu(&ha->list);
> kfree_rcu(ha, rcu_head);
> list->count--;
> @@ -113,14 +189,15 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
> const unsigned char *addr, int addr_len,
> unsigned char addr_type, bool global, bool sync)
> {
> + struct rb_node **ha_node;
> struct netdev_hw_addr *ha;
reverse xmas tree
>
> - list_for_each_entry(ha, &list->list, list) {
> - if (!memcmp(ha->addr, addr, addr_len) &&
> - (ha->type == addr_type || !addr_type))
> - return __hw_addr_del_entry(list, ha, global, sync);
> - }
> - return -ENOENT;
> + ha_node = tree_address_lookup(list, addr, addr_len, addr_type, true, NULL);
> + if (*ha_node == NULL)
> + return -ENOENT;
> +
> + ha = container_of(*ha_node, struct netdev_hw_addr, node);
> + return __hw_addr_del_entry(list, ha, global, sync);
> }
>
> static int __hw_addr_del(struct netdev_hw_addr_list *list,
> @@ -418,6 +495,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
> {
> INIT_LIST_HEAD(&list->list);
> list->count = 0;
> + list->tree_root = RB_ROOT;
> }
> EXPORT_SYMBOL(__hw_addr_init);
>
> @@ -552,19 +630,20 @@ EXPORT_SYMBOL(dev_addr_del);
> */
> int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
> {
> - struct netdev_hw_addr *ha;
> + struct rb_node *insert_parent = NULL;
> + struct rb_node **ha_node = NULL;
> int err;
>
> netif_addr_lock_bh(dev);
> - list_for_each_entry(ha, &dev->uc.list, list) {
> - if (!memcmp(ha->addr, addr, dev->addr_len) &&
> - ha->type == NETDEV_HW_ADDR_T_UNICAST) {
> - err = -EEXIST;
> - goto out;
> - }
> + ha_node = tree_address_lookup(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST, false, &insert_parent);
> + if (*ha_node)
> + {
+1 curly bracket style
> + err = -EEXIST;
> + goto out;
> }
> +
> err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
> - NETDEV_HW_ADDR_T_UNICAST, true, false);
> + NETDEV_HW_ADDR_T_UNICAST, true, false, ha_node, insert_parent);
> if (!err)
> __dev_set_rx_mode(dev);
> out:
> @@ -745,19 +824,19 @@ EXPORT_SYMBOL(dev_uc_init);
> */
> int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
> {
> - struct netdev_hw_addr *ha;
> + struct rb_node **ha_node;
> + struct rb_node *insert_parent = NULL;
reverse xmas tree
> int err;
>
> netif_addr_lock_bh(dev);
> - list_for_each_entry(ha, &dev->mc.list, list) {
> - if (!memcmp(ha->addr, addr, dev->addr_len) &&
> - ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
> - err = -EEXIST;
> - goto out;
> - }
> + ha_node = tree_address_lookup(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, false, &insert_parent);
> + if (*ha_node)
> + {
+1 curly bracket style
> + err = -EEXIST;
> + goto out;
> }
> err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
> - NETDEV_HW_ADDR_T_MULTICAST, true, false);
> + NETDEV_HW_ADDR_T_MULTICAST, true, false, ha_node, insert_parent);
> if (!err)
> __dev_set_rx_mode(dev);
> out:
>
Powered by blists - more mailing lists