Message-Id: <20080305205116.5DF6541255B@localhost>
Date:	Wed,  5 Mar 2008 12:51:16 -0800 (PST)
From:	therbert@...gle.com (Tom Herbert)
To:	davem@...emloft.net, netdev@...r.kernel.org
Subject: RFC [PATCH net-2.6 3/6] net: softRSS net changes

This patch adds support for software receive side scaling (RSS) in the
networking layer.  Two configuration options are introduced: NET_SOFTRSS,
where netif_rx hashes fields of the packet header to choose the CPU that
processes each packet, so that all packets of a connection are handled
on one CPU, and NET_NAPI_RSS, where the execution of each device's NAPI
poll function is spread across CPUs in a round-robin fashion.  Both
features are off by default; they are enabled at run-time through the
net.core.soft_rss and net.core.napi_rss sysctls, and the set of usable
CPUs is configured per device through sysfs.

Signed-off-by: Tom Herbert <therbert@...gle.com>

---
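
Two reviewer notes with userspace sketches follow; they are
illustrative only and not part of the patch.

First, the CPU selection that netif_cpu_for_rss() performs reduces to:
XOR the two L4 port words together, fold in the low 16 bits of the
XORed IP addresses, and take the result modulo the number of usable
CPUs, so all packets of one connection land on the same CPU.  A
minimal host-order sketch (the kernel code reads network-order header
fields in place; flow_to_cpu() is a name invented for this sketch):

#include <stdint.h>
#include <stdio.h>

/* Map a TCP/UDP four-tuple to a CPU index in [0, ncpus). */
static unsigned int flow_to_cpu(uint32_t saddr, uint32_t daddr,
				uint16_t sport, uint16_t dport,
				unsigned int ncpus)
{
	unsigned int index;

	index = sport ^ dport;			/* L4 port words */
	index ^= (saddr ^ daddr) & 0xffff;	/* fold in addresses */
	return index % ncpus;			/* reduce to a CPU */
}

int main(void)
{
	/* Example flow: 192.168.0.1:12345 -> 10.0.0.1:80, 4 CPUs. */
	printf("flow hashed to cpu %u\n",
	       flow_to_cpu(0xc0a80001, 0x0a000001, 12345, 80, 4));
	return 0;
}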

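Second, the round-robin selection in napi_rss_next_cpu() amounts to:
advance to the next set bit after the previously used CPU in the
allowed mask, wrapping to the first set bit at the end of the mask.  A
sketch with an unsigned long standing in for the kernel's cpumask_t
(rss_next_cpu() is again a name local to this sketch):

#include <stdio.h>

#define NR_CPUS 32

/* Return the next allowed CPU after 'last', wrapping around. */
static int rss_next_cpu(int last, unsigned long mask)
{
	int cpu;

	for (cpu = last + 1; cpu < NR_CPUS; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	for (cpu = 0; cpu <= last; cpu++)	/* wrap around */
		if (mask & (1UL << cpu))
			return cpu;
	return -1;				/* empty mask */
}

int main(void)
{
	unsigned long mask = 0xa;	/* CPUs 1 and 3 allowed */

	printf("next cpu after 1 is %d\n", rss_next_cpu(1, mask));
	return 0;
}
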
diff -uprN -X /tmp/donts/rss_1 net-2.6/include/linux/netdevice.h net-2.6.patch/include/linux/netdevice.h
--- net-2.6/include/linux/netdevice.h	2008-03-05 09:03:21.742957000 -0800
+++ net-2.6.patch/include/linux/netdevice.h	2008-03-05 09:25:33.526752000 -0800
@@ -308,10 +308,15 @@ struct napi_struct {
 	unsigned long		state;
 	int			weight;
 	int			(*poll)(struct napi_struct *, int);
+#ifdef CONFIG_NET_NAPI_RSS
+	int			last_rx_cpu;
+#endif
+#if defined(CONFIG_NETPOLL) || defined(CONFIG_NET_NAPI_RSS)
+	struct net_device	*dev;
+#endif
 #ifdef CONFIG_NETPOLL
 	spinlock_t		poll_lock;
 	int			poll_owner;
-	struct net_device	*dev;
 	struct list_head	dev_list;
 #endif
 };
@@ -607,6 +612,12 @@ struct net_device
 	/* ingress path synchronizer */
 	spinlock_t		ingress_lock;
 	struct Qdisc		*qdisc_ingress;
+#ifdef CONFIG_NET_SOFTRSS
+	cpumask_t		soft_rss_cpus;
+#endif
+#ifdef CONFIG_NET_NAPI_RSS
+	cpumask_t		napi_rss_cpus;
+#endif
 
 /*
  * Cache line mostly used on queue transmit path (qdisc)
@@ -767,8 +778,10 @@ static inline void netif_napi_add(struct
 	INIT_LIST_HEAD(&napi->poll_list);
 	napi->poll = poll;
 	napi->weight = weight;
-#ifdef CONFIG_NETPOLL
+#if defined(CONFIG_NETPOLL) || defined(CONFIG_NET_NAPI_RSS)
 	napi->dev = dev;
+#endif
+#ifdef CONFIG_NETPOLL
 	list_add(&napi->dev_list, &dev->napi_list);
 	spin_lock_init(&napi->poll_lock);
 	napi->poll_owner = -1;
@@ -888,6 +901,10 @@ struct softnet_data
 	struct net_device	*output_queue;
 	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+	struct list_head	rss_poll_list;
+	spinlock_t		rss_poll_list_lock;
+#endif
 	struct sk_buff		*completion_queue;
 
 	struct napi_struct	backlog;
@@ -1085,7 +1102,28 @@ extern void dev_kfree_skb_any(struct sk_
 extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
 #define HAVE_NETIF_RECEIVE_SKB 1
-extern int		netif_receive_skb(struct sk_buff *skb);
+extern int		__netif_receive_skb(struct sk_buff *skb);
+
+#ifdef CONFIG_NET_NAPI_RSS
+extern int sysctl_napi_rss;
+#endif
+
+#ifdef CONFIG_NET_SOFTRSS
+extern int sysctl_soft_rss;
+static inline int netif_receive_skb(struct sk_buff *skb)
+{
+	if (sysctl_soft_rss)
+		return (netif_rx(skb));
+	else
+		return (__netif_receive_skb(skb));
+}
+#else
+static inline int netif_receive_skb(struct sk_buff *skb)
+{
+	return (__netif_receive_skb(skb));
+}
+#endif
+
 extern int		dev_valid_name(const char *name);
 extern int		dev_ioctl(struct net *net, unsigned int cmd, void __user *);
 extern int		dev_ethtool(struct net *net, struct ifreq *);
diff -uprN -X /tmp/donts/rss_1 net-2.6/net/Kconfig net-2.6.patch/net/Kconfig
--- net-2.6/net/Kconfig	2008-03-05 09:03:27.571549000 -0800
+++ net-2.6.patch/net/Kconfig	2008-03-05 09:31:03.526132000 -0800
@@ -35,6 +35,38 @@ config NET_NS
 	  Allow user space to create what appear to be multiple instances
 	  of the network stack.
 
+config NET_NAPI_RSS
+	bool "NAPI RSS"
+	help
+	  Say Y here to enable NAPI RSS.  In this mode the execution of the
+	  NAPI poll function for each device is spread across CPUs in a
+	  round-robin fashion.  Each time the poll function runs it gets
+	  scheduled on the next CPU in the round robin.
+
+	  A mask of CPUs that can be used is set on a per-device basis
+	  in the sysfs variable /sys/class/net/<device>/napi_rss_cpus. This
+	  feature needs to be enabled at run-time by setting the
+	  net.core.napi_rss sysctl to "1".
+
+config NET_SOFTRSS
+	bool "Software RSS"
+	help
+	  Say Y here to enable a software implementation of receive side
+	  scaling (RSS).  RSS distributes the load of received
+	  packet processing across multiple CPUs.  In this software
+	  implementation of RSS, stack processing for each packet can be
+	  scheduled on a different CPU from that which handles the device
+	  interrupt or NAPI poll.  The scheduling is done by the netif_rx
+	  function, which hashes fields in the packet header to a CPU
+	  identifier.  For example, in the case of a TCP packet, the
+	  four-tuple is hashed to choose a CPU for processing all packets
+	  of that connection.
+
+	  A mask of CPUs that can be used is set on a per-device basis
+	  in the sysfs variable /sys/class/net/<device>/soft_rss_cpus. This
+	  feature needs to be enabled at run-time by setting the
+	  net.core.soft_rss sysctl to "1".
+
 source "net/packet/Kconfig"
 source "net/unix/Kconfig"
 source "net/xfrm/Kconfig"
diff -uprN -X /tmp/donts/rss_1 net-2.6/net/core/dev.c net-2.6.patch/net/core/dev.c
--- net-2.6/net/core/dev.c	2008-03-05 09:03:28.151549000 -0800
+++ net-2.6.patch/net/core/dev.c	2008-03-05 09:25:33.595757000 -0800
@@ -122,6 +122,10 @@
 
 #include "net-sysfs.h"
 
+#ifdef CONFIG_NET_SOFTRSS
+#include <net/ip.h>
+#endif
+
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -254,6 +258,16 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
 
 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 
+#ifdef CONFIG_NET_SOFTRSS
+int sysctl_soft_rss = 0;
+EXPORT_SYMBOL(sysctl_soft_rss);
+#endif
+
+#ifdef CONFIG_NET_NAPI_RSS
+int sysctl_napi_rss = 0;
+EXPORT_SYMBOL(sysctl_napi_rss);
+#endif
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * register_netdevice() inits dev->_xmit_lock and sets lockdep class
@@ -1745,6 +1759,96 @@ int weight_p __read_mostly = 64;        
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
+#ifdef CONFIG_NET_NAPI_RSS
+static inline int napi_rss_next_cpu(struct napi_struct *n)
+{
+	cpumask_t mask;
+	int cpu;
+
+	if (!n->dev)
+		return get_cpu();
+
+	cpus_and(mask, n->dev->napi_rss_cpus, cpu_online_map);
+
+	if (cpus_empty(mask))
+		cpu = get_cpu();
+	else {
+		cpu = next_cpu(n->last_rx_cpu, mask);
+		if (cpu == NR_CPUS)
+			cpu = first_cpu(mask);
+	}
+	n->last_rx_cpu = cpu;
+	return (cpu);
+}
+#endif /* CONFIG_NET_NAPI_RSS */
+
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+/*
+ * Schedule rx softirq on remote CPU.
+ */
+static inline void __napi_schedule_oncpu(struct napi_struct *n, int cpu)
+{
+	unsigned long flags;
+	struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+
+	spin_lock_irqsave(&queue->rss_poll_list_lock, flags);
+	list_add_tail(&n->poll_list, &queue->rss_poll_list);
+	spin_unlock_irqrestore(&queue->rss_poll_list_lock, flags);
+
+	raise_softirq_oncpu(cpu, NET_RX_SOFTIRQ);
+}
+#endif /* CONFIG_NET_NAPI_RSS || CONFIG_NET_SOFTRSS */
+
+/*
+ * Schedule rx softirq on local CPU.
+ */
+static inline void __napi_schedule_local(struct napi_struct *n)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_NET_SOFTRSS
+static int netif_cpu_for_rss(struct net_device *dev, struct sk_buff *skb)
+{
+	int cpu;
+
+	/*
+	 * Hash the packet header to a CPU.  Code borrowed from the
+	 * bonding driver.
+	 */
+	if (skb->protocol == __constant_htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)skb->data;
+		u16 *layer4hdr = (u16 *)((u32 *)iph + iph->ihl);
+		cpumask_t mask;
+		int index = 0, count = 0;
+
+		cpus_and(mask, dev->soft_rss_cpus, cpu_online_map);
+		if (cpus_empty(mask))
+			return (get_cpu());
+
+		if (!(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) &&
+		    (iph->protocol == IPPROTO_TCP ||
+		     iph->protocol == IPPROTO_UDP)) {
+			index = htons((*layer4hdr ^ *(layer4hdr + 1)));
+		}
+
+		index = index ^ ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff);
+		index %= cpus_weight(mask);
+
+		for_each_cpu_mask(cpu, mask) {
+			if (count++ == index)
+				break;
+		}
+		return (cpu);
+	} else
+		return (get_cpu());
+}
+#endif
 
 /**
  *	netif_rx	-	post buffer to the network code
@@ -1765,6 +1869,9 @@ int netif_rx(struct sk_buff *skb)
 {
 	struct softnet_data *queue;
 	unsigned long flags;
+#ifdef CONFIG_NET_SOFTRSS
+	int cpu;
+#endif
 
 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -1778,23 +1885,51 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
+
+#ifdef CONFIG_NET_SOFTRSS
+	cpu = sysctl_soft_rss ? netif_cpu_for_rss(skb->dev, skb) : get_cpu();
+	queue = &per_cpu(softnet_data, cpu);
+	spin_lock(&queue->input_pkt_queue.lock);
+#else
+	queue = &__get_cpu_var(softnet_data);
+#endif
+
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_NET_SOFTRSS
+			spin_unlock(&queue->input_pkt_queue.lock);
+#endif
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
 		}
 
+
+#ifdef CONFIG_NET_SOFTRSS
+		/*
+		 * Schedule backlog poll function (possibly on another CPU).
+		 */
+		if (napi_schedule_prep(&queue->backlog)) {
+			if (cpu != get_cpu())
+				__napi_schedule_oncpu(&queue->backlog, cpu);
+			else
+				__napi_schedule_local(&queue->backlog);
+		}
+#else
 		napi_schedule(&queue->backlog);
+#endif
 		goto enqueue;
 	}
 
 	__get_cpu_var(netdev_rx_stat).dropped++;
+
+#ifdef CONFIG_NET_SOFTRSS
+	spin_unlock(&queue->input_pkt_queue.lock);
+#endif
 	local_irq_restore(flags);
 
 	kfree_skb(skb);
@@ -2005,7 +2140,7 @@ out:
 #endif
 
 /**
- *	netif_receive_skb - process receive buffer from network
+ *	__netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
  *
  *	netif_receive_skb() is the main receive data processing function.
@@ -2019,7 +2154,7 @@ out:
  *	NET_RX_SUCCESS: no congestion
  *	NET_RX_DROP: packet was dropped
  */
-int netif_receive_skb(struct sk_buff *skb)
+int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
@@ -2118,13 +2253,22 @@ static int process_backlog(struct napi_s
 		struct net_device *dev;
 
 		local_irq_disable();
+#ifdef CONFIG_NET_SOFTRSS
+		spin_lock(&queue->input_pkt_queue.lock);
+#endif
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
+#ifdef CONFIG_NET_SOFTRSS
+			spin_unlock(&queue->input_pkt_queue.lock);
+#endif
 			local_irq_enable();
 			break;
 		}
 
+#ifdef CONFIG_NET_SOFTRSS
+		spin_unlock(&queue->input_pkt_queue.lock);
+#endif
 		local_irq_enable();
 
 		dev = skb->dev;
@@ -2145,25 +2289,38 @@ static int process_backlog(struct napi_s
  */
 void __napi_schedule(struct napi_struct *n)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
-	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-	local_irq_restore(flags);
+#ifdef CONFIG_NET_NAPI_RSS
+	if (sysctl_napi_rss) {
+		int cpu = napi_rss_next_cpu(n);
+		if (cpu != get_cpu()) {
+			__napi_schedule_oncpu(n, cpu);
+			return;
+		}
+	}
+#endif
+	__napi_schedule_local(n);
 }
 EXPORT_SYMBOL(__napi_schedule);
 
 
 static void net_rx_action(struct softirq_action *h)
 {
-	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
+	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct list_head *list = &queue->poll_list;
 	unsigned long start_time = jiffies;
 	int budget = netdev_budget;
 	void *have;
 
 	local_irq_disable();
 
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+	if (!list_empty(&queue->rss_poll_list)) {
+		spin_lock(&queue->rss_poll_list_lock);
+		list_splice_init(&queue->rss_poll_list, queue->poll_list.prev);
+		spin_unlock(&queue->rss_poll_list_lock);
+	}
+#endif
+
 	while (!list_empty(list)) {
 		struct napi_struct *n;
 		int work, weight;
@@ -2215,8 +2372,23 @@ static void net_rx_action(struct softirq
 		if (unlikely(work == weight)) {
 			if (unlikely(napi_disable_pending(n)))
 				__napi_complete(n);
-			else
+			else {
+#ifdef CONFIG_NET_NAPI_RSS
+				int cpu;
+				if (sysctl_napi_rss)
+					cpu = napi_rss_next_cpu(n);
+				else
+					cpu = get_cpu();
+
+				if (cpu != get_cpu()) {
+					list_del(&n->poll_list);
+					__napi_schedule_oncpu(n, cpu);
+				} else
+					list_move_tail(&n->poll_list, list);
+#else
 				list_move_tail(&n->poll_list, list);
+#endif
+			}
 		}
 
 		netpoll_poll_unlock(have);
@@ -4527,6 +4699,10 @@ static int __init net_dev_init(void)
 		skb_queue_head_init(&queue->input_pkt_queue);
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+		INIT_LIST_HEAD(&queue->rss_poll_list);
+		spin_lock_init(&queue->rss_poll_list_lock);
+#endif
 
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
@@ -4571,7 +4747,7 @@ EXPORT_SYMBOL(free_netdev);
 EXPORT_SYMBOL(netdev_boot_setup_check);
 EXPORT_SYMBOL(netdev_set_master);
 EXPORT_SYMBOL(netdev_state_change);
-EXPORT_SYMBOL(netif_receive_skb);
+EXPORT_SYMBOL(__netif_receive_skb);
 EXPORT_SYMBOL(netif_rx);
 EXPORT_SYMBOL(register_gifconf);
 EXPORT_SYMBOL(register_netdevice);