lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Tue, 20 Oct 2009 07:18:49 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Stephen Hemminger <shemminger@...tta.com>
CC:	David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [PATCH net-next-2.6] net: Introduce dev_get_by_index_rcu()

Stephen Hemminger a écrit :
> On Tue, 20 Oct 2009 07:03:44 +0200
> Eric Dumazet <eric.dumazet@...il.com> wrote:
> 
>> David Miller a écrit :
>>> From: Eric Dumazet <eric.dumazet@...il.com>
>>> Date: Tue, 20 Oct 2009 06:23:54 +0200
>>>
>>>> I wonder if the whole thing could use RCU somehow, since some
>>>> workloads hit this dev_base_lock rwlock pretty hard...
>>> True, but for now we'll put your fix in :-)
>> [PATCH net-next-2.6] net: Introduce dev_get_by_index_rcu()
>>
>> Some workloads hit dev_base_lock rwlock pretty hard.
>> We can use RCU lookups to avoid touching this rwlock.
>>
>> netdevices are already freed after a RCU grace period, so this patch
>> adds no penalty at device dismantle time.
>>
>> Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
> 
> All usage dev_base_lock should be replaceable by using combination of rtnl_mutex
> and RCU?

Yes probably, but I believe we should make step-by-step patches ?

1) __dev_get_by_index() is faster than dev_get_by_index_rcu()

2) I am not sure holding RTNL means we also have rcu_lock() implied.

However dev_ifname() could use rcu_lock() in the same patch, 
here is an updated version.

[PATCH net-next-2.6] net: Introduce dev_get_by_index_rcu()

Some workloads hit dev_base_lock rwlock pretty hard.
We can use RCU lookups to avoid touching this rwlock.

netdevices are already freed after a RCU grace period, so this patch
adds no penalty at device dismantle time.

dev_ifname() converted to dev_get_by_index_rcu()

Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
---
 include/linux/netdevice.h |    1
 net/core/dev.c            |   48 ++++++++++++++++++++++++++++--------
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8380009..4eda680 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1127,6 +1127,7 @@ extern void		netdev_resync_ops(struct net_device *dev);
 extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 extern struct net_device	*dev_get_by_index(struct net *net, int ifindex);
 extern struct net_device	*__dev_get_by_index(struct net *net, int ifindex);
+extern struct net_device	*dev_get_by_index_rcu(struct net *net, int ifindex);
 extern int		dev_restart(struct net_device *dev);
 #ifdef CONFIG_NETPOLL_TRAP
 extern int		netpoll_trap(void);
diff --git a/net/core/dev.c b/net/core/dev.c
index 28b0b9e..4564596 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -217,12 +217,15 @@ static int list_netdevice(struct net_device *dev)
 	write_lock_bh(&dev_base_lock);
 	list_add_tail(&dev->dev_list, &net->dev_base_head);
 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
-	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
 	return 0;
 }
 
-/* Device list removal */
+/* Device list removal
+ * caller must respect a RCU grace period before freeing/reusing dev
+ */
 static void unlist_netdevice(struct net_device *dev)
 {
 	ASSERT_RTNL();
@@ -231,7 +234,7 @@ static void unlist_netdevice(struct net_device *dev)
 	write_lock_bh(&dev_base_lock);
 	list_del(&dev->dev_list);
 	hlist_del(&dev->name_hlist);
-	hlist_del(&dev->index_hlist);
+	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
 }
 
@@ -649,6 +652,31 @@ struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 }
 EXPORT_SYMBOL(__dev_get_by_index);
 
+/**
+ *	dev_get_by_index_rcu - find a device by its ifindex
+ *	@net: the applicable net namespace
+ *	@ifindex: index of device
+ *
+ *	Search for an interface by index. Returns %NULL if the device
+ *	is not found or a pointer to the device. The device has not
+ *	had its reference counter increased so the caller must be careful
+ *	about locking. The caller must hold RCU lock.
+ */
+
+struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
+{
+	struct hlist_node *p;
+	struct net_device *dev;
+	struct hlist_head *head = dev_index_hash(net, ifindex);
+
+	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
+		if (dev->ifindex == ifindex)
+			return dev;
+
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_by_index_rcu);
+
 
 /**
  *	dev_get_by_index - find a device by its ifindex
@@ -665,11 +693,11 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifindex);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		dev_hold(dev);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 	return dev;
 }
 EXPORT_SYMBOL(dev_get_by_index);
@@ -2930,15 +2958,15 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg)
 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
 		return -EFAULT;
 
-	read_lock(&dev_base_lock);
-	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
 	if (!dev) {
-		read_unlock(&dev_base_lock);
+		rcu_read_unlock();
 		return -ENODEV;
 	}
 
 	strcpy(ifr.ifr_name, dev->name);
-	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
 
 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
 		return -EFAULT;

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ