[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210818062637.343839-2-gnaaman@drivenets.com>
Date: Wed, 18 Aug 2021 09:26:37 +0300
From: Gilad Naaman <gnaaman@...venets.com>
To: davem@...emloft.net, kuba@...nel.org, luwei32@...wei.com,
gnaaman@...venets.com, wangxiongfeng2@...wei.com,
ap420073@...il.com
Cc: netdev@...r.kernel.org
Subject: [PATCH net-next] net-next: Improve perf of bond/vlans modification
When a bond has a massive amount of VLANs with IPv6 addresses,
performance of changing link state, attaching a VRF, changing an IPv6
address, etc. goes down dramatically.
The source of most of the slowdown is the `dev_addr_lists.c` module,
which maintains a linked list of HW addresses.
When using IPv6, this list grows for each IPv6 address added on a
VLAN, since each IPv6 address has a multicast HW address associated with
it.
When performing any modification to the involved links, this list is
traversed many times, often for nothing, all while holding the RTNL
lock.
Instead, this patch adds an auxiliary rbtree which cuts down
traversal time significantly.
Performance can be seen with the following script:
#!/bin/bash
# Start from a clean namespace; silence and ignore the error if it does not exist.
ip netns del test 2>/dev/null || true
ip netns add test
echo 1 | ip netns exec test tee /proc/sys/net/ipv6/conf/all/keep_addr_on_down > /dev/null
set -e
# Topology: veth pair foo<->bar, bar enslaved to bond b1, plus VRF florp.
ip -n test link add foo type veth peer name bar
ip -n test link add b1 type bond
ip -n test link add florp type vrf table 10
ip -n test link set bar master b1
ip -n test link set foo up
ip -n test link set bar up
ip -n test link set b1 up
ip -n test link set florp up
VLAN_COUNT=1500
BASE_DEV=b1
# Each phase below is timed separately over all VLAN_COUNT VLANs.
echo Creating vlans
ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
do ip -n test link add link $BASE_DEV name foo.\$i type vlan id \$i; done"
echo Bringing them up
ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
do ip -n test link set foo.\$i up; done"
echo Assiging IPv6 Addresses
ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
do ip -n test address add dev foo.\$i 2000::\$i/64; done"
echo Attaching to VRF
ip netns exec test time -p bash -c "for i in \$(seq 1 $VLAN_COUNT);
do ip -n test link set foo.\$i master florp; done"
On an Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz machine, the performance
before the patch is (truncated):
Creating vlans
real 108.35
Bringing them up
real 4.96
Assiging IPv6 Addresses
real 19.22
Attaching to VRF
real 458.84
After the patch:
Creating vlans
real 5.59
Bringing them up
real 5.07
Assiging IPv6 Addresses
real 5.64
Attaching to VRF
real 25.37
Cc: David S. Miller <davem@...emloft.net>
Cc: Jakub Kicinski <kuba@...nel.org>
Cc: Lu Wei <luwei32@...wei.com>
Cc: Xiongfeng Wang <wangxiongfeng2@...wei.com>
Cc: Taehee Yoo <ap420073@...il.com>
Signed-off-by: Gilad Naaman <gnaaman@...venets.com>
---
include/linux/netdevice.h | 5 ++
net/core/dev_addr_lists.c | 161 ++++++++++++++++++++++++++++----------
2 files changed, 124 insertions(+), 42 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eaf5bb008aa9..8ae56a25661b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -47,6 +47,7 @@
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <linux/hashtable.h>
+#include <linux/rbtree.h>
struct netpoll_info;
struct device;
@@ -218,12 +219,16 @@ struct netdev_hw_addr {
int sync_cnt;
int refcount;
int synced;
+ struct rb_node node;
struct rcu_head rcu_head;
};
struct netdev_hw_addr_list {
struct list_head list;
int count;
+
+ /* Auxiliary tree for faster lookup when modifying the structure. */
+ struct rb_root tree_root;
};
#define netdev_hw_addr_list_count(l) ((l)->count)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 45ae6eeb2964..7fd73b905790 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -12,6 +12,70 @@
#include <linux/export.h>
#include <linux/list.h>
+/* Look up an address in the list using the rbtree.
+ * The return value is always a valid pointer.
+ * If the address exists, `*ret` is non-null and the address can be retrieved using
+ *
+ *     container_of(*ret, struct netdev_hw_addr, node)
+ *
+ * Otherwise, `ret` can be used with `parent` as an insertion point when calling
+ * `__hw_addr_insert_address_to_tree`; `*parent` is NULL if the tree is empty.
+ *
+ * Must only be called when holding the netdevice's spinlock.
+ *
+ * @ignore_zero_addr_type if true and `addr_type` is zero,
+ *                        disregard addr_type when matching;
+ *                        in every other case the type takes part in the comparison.
+ */
+static struct rb_node **__hw_addr_tree_address_lookup(struct netdev_hw_addr_list *list,
+						      const unsigned char *addr,
+						      int addr_len,
+						      unsigned char addr_type,
+						      bool ignore_zero_addr_type,
+						      struct rb_node **parent)
+{
+	struct rb_node **node = &list->tree_root.rb_node, *_parent = NULL;
+
+	while (*node) {
+		struct netdev_hw_addr *data = container_of(*node, struct netdev_hw_addr, node);
+		int result;
+
+		result = memcmp(addr, data->addr, addr_len);
+		/* Same address: order by type too, unless a zero type is a wildcard. */
+		if (!result && !(ignore_zero_addr_type && !addr_type))
+			result = memcmp(&addr_type, &data->type, sizeof(addr_type));
+
+		_parent = *node;
+		if (result < 0)
+			node = &(*node)->rb_left;
+		else if (result > 0)
+			node = &(*node)->rb_right;
+		else
+			break;
+	}
+
+	if (parent)
+		*parent = _parent;
+	return node;
+}
+
+
+/* Link `ha` into the list's rbtree and rebalance.
+ * `insertion_point` and `parent` are the values produced by
+ * `__hw_addr_tree_address_lookup`; if either is NULL the lookup is
+ * repeated here using the address and type stored in `ha`.
+ * Always succeeds and returns 0.
+ */
+static int __hw_addr_insert_address_to_tree(struct netdev_hw_addr_list *list,
+					    struct netdev_hw_addr *ha,
+					    int addr_len,
+					    struct rb_node **insertion_point,
+					    struct rb_node *parent)
+{
+	/* Figure out where to put new node */
+	if (!insertion_point || !parent)
+		insertion_point = __hw_addr_tree_address_lookup(list, ha->addr, addr_len, ha->type, false, &parent);
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&ha->node, parent, insertion_point);
+	rb_insert_color(&ha->node, &list->tree_root);
+
+	return 0;
+}
+
/*
* General list handling functions
*/
@@ -19,7 +83,9 @@
static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global,
- bool sync)
+ bool sync,
+ struct rb_node **insertion_point,
+ struct rb_node *parent)
{
struct netdev_hw_addr *ha;
int alloc_size;
@@ -36,6 +102,10 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
ha->global_use = global;
ha->synced = sync ? 1 : 0;
ha->sync_cnt = 0;
+
+ /* Insert node to the rbtree for quicker lookups during modification */
+ __hw_addr_insert_address_to_tree(list, ha, addr_len, insertion_point, parent);
+
list_add_tail_rcu(&ha->list, &list->list);
list->count++;
@@ -47,34 +117,36 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
unsigned char addr_type, bool global, bool sync,
int sync_count)
{
+ struct rb_node *insert_parent = NULL;
struct netdev_hw_addr *ha;
+ struct rb_node **ha_node;
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
- list_for_each_entry(ha, &list->list, list) {
- if (ha->type == addr_type &&
- !memcmp(ha->addr, addr, addr_len)) {
- if (global) {
- /* check if addr is already used as global */
- if (ha->global_use)
- return 0;
- else
- ha->global_use = true;
- }
- if (sync) {
- if (ha->synced && sync_count)
- return -EEXIST;
- else
- ha->synced++;
- }
- ha->refcount++;
- return 0;
+ ha_node = __hw_addr_tree_address_lookup(list, addr, addr_len,
+ addr_type, false, &insert_parent);
+ if (*ha_node) {
+ ha = container_of(*ha_node, struct netdev_hw_addr, node);
+ if (global) {
+ /* check if addr is already used as global */
+ if (ha->global_use)
+ return 0;
+ else
+ ha->global_use = true;
}
+ if (sync) {
+ if (ha->synced && sync_count)
+ return -EEXIST;
+ else
+ ha->synced++;
+ }
+ ha->refcount++;
+ return 0;
}
return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
- sync);
+ sync, ha_node, insert_parent);
}
static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -103,6 +175,8 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
if (--ha->refcount)
return 0;
+
+ rb_erase(&ha->node, &list->tree_root);
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
list->count--;
@@ -114,13 +188,14 @@ static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
unsigned char addr_type, bool global, bool sync)
{
struct netdev_hw_addr *ha;
+ struct rb_node **ha_node;
- list_for_each_entry(ha, &list->list, list) {
- if (!memcmp(ha->addr, addr, addr_len) &&
- (ha->type == addr_type || !addr_type))
- return __hw_addr_del_entry(list, ha, global, sync);
- }
- return -ENOENT;
+ ha_node = __hw_addr_tree_address_lookup(list, addr, addr_len, addr_type, true, NULL);
+ if (*ha_node == NULL)
+ return -ENOENT;
+
+ ha = container_of(*ha_node, struct netdev_hw_addr, node);
+ return __hw_addr_del_entry(list, ha, global, sync);
}
static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -418,6 +493,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
{
INIT_LIST_HEAD(&list->list);
list->count = 0;
+ list->tree_root = RB_ROOT;
}
EXPORT_SYMBOL(__hw_addr_init);
@@ -552,19 +628,20 @@ EXPORT_SYMBOL(dev_addr_del);
*/
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
+ struct rb_node *insert_parent = NULL;
+ struct rb_node **ha_node = NULL;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->uc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_UNICAST) {
- err = -EEXIST;
- goto out;
- }
+ ha_node = __hw_addr_tree_address_lookup(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, false, &insert_parent);
+ if (*ha_node) {
+ err = -EEXIST;
+ goto out;
}
+
err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_UNICAST, true, false);
+ NETDEV_HW_ADDR_T_UNICAST, true, false, ha_node, insert_parent);
if (!err)
__dev_set_rx_mode(dev);
out:
@@ -745,19 +822,19 @@ EXPORT_SYMBOL(dev_uc_init);
*/
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
+ struct rb_node *insert_parent = NULL;
+ struct rb_node **ha_node;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->mc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
- err = -EEXIST;
- goto out;
- }
+ ha_node = __hw_addr_tree_address_lookup(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, false, &insert_parent);
+ if (*ha_node) {
+ err = -EEXIST;
+ goto out;
}
err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ NETDEV_HW_ADDR_T_MULTICAST, true, false, ha_node, insert_parent);
if (!err)
__dev_set_rx_mode(dev);
out:
--
2.25.1
Powered by blists - more mailing lists