[<prev] [next>] [day] [month] [year] [list]
Message-Id: <20080305205116.5DF6541255B@localhost>
Date: Wed, 5 Mar 2008 12:51:16 -0800 (PST)
From: therbert@...gle.com (Tom Herbert)
To: davem@...emloft.net, netdev@...r.kernel.org
Subject: RFC [PATCH net-2.6 3/6] net: softRSS net changes
This patch adds support for software RSS in the networking layer.
Signed-off-by: Tom Herbert <therbert@...gle.com>
---
diff -uprN -X /tmp/donts/rss_1 net-2.6/include/linux/netdevice.h net-2.6.patch/include/linux/netdevice.h
--- net-2.6/include/linux/netdevice.h 2008-03-05 09:03:21.742957000 -0800
+++ net-2.6.patch/include/linux/netdevice.h 2008-03-05 09:25:33.526752000 -0800
@@ -308,10 +308,15 @@ struct napi_struct {
unsigned long state;
int weight;
int (*poll)(struct napi_struct *, int);
+#ifdef CONFIG_NET_NAPI_RSS
+ int last_rx_cpu;
+#endif
+#if defined(CONFIG_NETPOLL) || defined(CONFIG_NET_NAPI_RSS)
+ struct net_device *dev;
+#endif
#ifdef CONFIG_NETPOLL
spinlock_t poll_lock;
int poll_owner;
- struct net_device *dev;
struct list_head dev_list;
#endif
};
@@ -607,6 +612,12 @@ struct net_device
/* ingress path synchronizer */
spinlock_t ingress_lock;
struct Qdisc *qdisc_ingress;
+#ifdef CONFIG_NET_SOFTRSS
+ cpumask_t soft_rss_cpus;
+#endif
+#ifdef CONFIG_NET_NAPI_RSS
+ cpumask_t napi_rss_cpus;
+#endif
/*
* Cache line mostly used on queue transmit path (qdisc)
@@ -767,8 +778,10 @@ static inline void netif_napi_add(struct
INIT_LIST_HEAD(&napi->poll_list);
napi->poll = poll;
napi->weight = weight;
-#ifdef CONFIG_NETPOLL
+#if defined(CONFIG_NETPOLL) || defined(CONFIG_NET_NAPI_RSS)
napi->dev = dev;
+#endif
+#ifdef CONFIG_NETPOLL
list_add(&napi->dev_list, &dev->napi_list);
spin_lock_init(&napi->poll_lock);
napi->poll_owner = -1;
@@ -888,6 +901,10 @@ struct softnet_data
struct net_device *output_queue;
struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+ struct list_head rss_poll_list;
+ spinlock_t rss_poll_list_lock;
+#endif
struct sk_buff *completion_queue;
struct napi_struct backlog;
@@ -1085,7 +1102,28 @@ extern void dev_kfree_skb_any(struct sk_
extern int netif_rx(struct sk_buff *skb);
extern int netif_rx_ni(struct sk_buff *skb);
#define HAVE_NETIF_RECEIVE_SKB 1
-extern int netif_receive_skb(struct sk_buff *skb);
+extern int __netif_receive_skb(struct sk_buff *skb);
+
+#ifdef CONFIG_NET_NAPI_RSS
+extern int sysctl_napi_rss;
+#endif
+
+#ifdef CONFIG_NET_SOFTRSS
+extern int sysctl_soft_rss;
+static inline int netif_receive_skb(struct sk_buff *skb)
+{
+ if (sysctl_soft_rss)
+ return (netif_rx(skb));
+ else
+ return (__netif_receive_skb(skb));
+}
+#else
+static inline int netif_receive_skb(struct sk_buff *skb)
+{
+ return (__netif_receive_skb(skb));
+}
+#endif
+
extern int dev_valid_name(const char *name);
extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *);
extern int dev_ethtool(struct net *net, struct ifreq *);
diff -uprN -X /tmp/donts/rss_1 net-2.6/net/Kconfig net-2.6.patch/net/Kconfig
--- net-2.6/net/Kconfig 2008-03-05 09:03:27.571549000 -0800
+++ net-2.6.patch/net/Kconfig 2008-03-05 09:31:03.526132000 -0800
@@ -35,6 +35,38 @@ config NET_NS
Allow user space to create what appear to be multiple instances
of the network stack.
+config NET_NAPI_RSS
+ bool "NAPI RSS"
+ help
+ Say Y here to enable NAPI RSS. In this mode the execution of the
+ NAPI poll function for each device is spread across CPUs in a
+ round robin fashion. Each time the poll function runs, it gets
+ scheduled on the next CPU in the round robin.
+
+ A mask of CPUs that can be used is set on a per device basis
+ in the sysfs variable /sys/class/net/<device>/napi_rss_cpus. This
+ feature needs to be enabled at run-time by setting the
+ net.core.napi_rss sysctl to "1".
+
+config NET_SOFTRSS
+ bool "Software RSS"
+ help
+ Say Y here to enable a software implementation of receive side
+ scaling (RSS). RSS distributes the load of received
+ packet processing across multiple CPUs. In this software
+ implementation of RSS, stack processing for each packet can be
+ scheduled on a different CPU from that which handles the device
+ interrupt or NAPI poll. The scheduling is done by the netif_rx
+ function, which hashes fields in the packet header to select
+ a CPU identifier. For example, in the case of a TCP packet, the
+ four tuple is hashed to choose a CPU for processing all packets of
+ that connection.
+
+ A mask of CPUs that can be used is set on a per device basis
+ in the sysfs variable /sys/class/net/<device>/soft_rss_cpus. This
+ feature needs to be enabled at run-time by setting the
+ net.core.soft_rss sysctl to "1".
+
source "net/packet/Kconfig"
source "net/unix/Kconfig"
source "net/xfrm/Kconfig"
diff -uprN -X /tmp/donts/rss_1 net-2.6/net/core/dev.c net-2.6.patch/net/core/dev.c
--- net-2.6/net/core/dev.c 2008-03-05 09:03:28.151549000 -0800
+++ net-2.6.patch/net/core/dev.c 2008-03-05 09:25:33.595757000 -0800
@@ -122,6 +122,10 @@
#include "net-sysfs.h"
+#ifdef CONFIG_NET_SOFTRSS
+#include <net/ip.h>
+#endif
+
/*
* The list of packet types we will receive (as opposed to discard)
* and the routines to invoke.
@@ -254,6 +258,16 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
DEFINE_PER_CPU(struct softnet_data, softnet_data);
+#ifdef CONFIG_NET_SOFTRSS
+int sysctl_soft_rss = 0;
+EXPORT_SYMBOL(sysctl_soft_rss);
+#endif
+
+#ifdef CONFIG_NET_NAPI_RSS
+int sysctl_napi_rss = 0;
+EXPORT_SYMBOL(sysctl_napi_rss);
+#endif
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* register_netdevice() inits dev->_xmit_lock and sets lockdep class
@@ -1745,6 +1759,96 @@ int weight_p __read_mostly = 64;
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
+#ifdef CONFIG_NET_NAPI_RSS
+static inline int napi_rss_next_cpu(struct napi_struct *n)
+{
+ cpumask_t mask;
+ int cpu;
+
+ if (!n->dev)
+ return get_cpu();
+
+ cpus_and(mask, n->dev->napi_rss_cpus, cpu_online_map);
+
+ if (cpus_empty(mask))
+ cpu = get_cpu();
+ else {
+ cpu = next_cpu(n->last_rx_cpu, mask);
+ if (cpu == NR_CPUS)
+ cpu = first_cpu(mask);
+ }
+ n->last_rx_cpu = cpu;
+ return (cpu);
+}
+#endif /* CONFIG_NET_NAPI_RSS */
+
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+/*
+ * Schedule rx softirq on remote CPU.
+ */
+static inline void __napi_schedule_oncpu(struct napi_struct *n, int cpu)
+{
+ unsigned long flags;
+ struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+
+ spin_lock_irqsave(&queue->rss_poll_list_lock, flags);
+ list_add_tail(&n->poll_list, &queue->rss_poll_list);
+ spin_unlock_irqrestore(&queue->rss_poll_list_lock, flags);
+
+ raise_softirq_oncpu(cpu, NET_RX_SOFTIRQ);
+}
+#endif /* CONFIG_NET_NAPI_RSS || CONFIG_NET_SOFTRSS */
+
+/*
+ * Schedule rx softirq on local CPU.
+ */
+static inline void __napi_schedule_local(struct napi_struct *n)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_NET_SOFTRSS
+static int netif_cpu_for_rss(struct net_device *dev, struct sk_buff *skb)
+{
+ int cpu;
+
+ /*
+ * Hash the packet header to a CPU. Code borrowed from bonding
+ * driver.
+ */
+ if (skb->protocol == __constant_htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)skb->data;
+ u16 *layer4hdr = (u16 *)((u32 *)iph + iph->ihl);
+ cpumask_t mask;
+ int index = 0, count = 0;
+
+ cpus_and(mask, dev->soft_rss_cpus, cpu_online_map);
+ if (cpus_empty(mask))
+ return (get_cpu());
+
+ if (!(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) &&
+ (iph->protocol == IPPROTO_TCP ||
+ iph->protocol == IPPROTO_UDP)) {
+ index = htons((*layer4hdr ^ *(layer4hdr + 1)));
+ }
+
+ index = index ^ ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff);
+ index %= cpus_weight(mask);
+
+ for_each_cpu_mask(cpu, mask) {
+ if (count++ == index)
+ break;
+ }
+ return (cpu);
+ } else
+ return (get_cpu());
+}
+#endif
/**
* netif_rx - post buffer to the network code
@@ -1765,6 +1869,9 @@ int netif_rx(struct sk_buff *skb)
{
struct softnet_data *queue;
unsigned long flags;
+#ifdef CONFIG_NET_SOFTRSS
+ int cpu;
+#endif
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx(skb))
@@ -1778,23 +1885,51 @@ int netif_rx(struct sk_buff *skb)
* short when CPU is congested, but is still operating.
*/
local_irq_save(flags);
- queue = &__get_cpu_var(softnet_data);
__get_cpu_var(netdev_rx_stat).total++;
+
+#ifdef CONFIG_NET_SOFTRSS
+ cpu = sysctl_soft_rss ? netif_cpu_for_rss(skb->dev, skb) : get_cpu();
+ queue = &per_cpu(softnet_data, cpu);
+ spin_lock(&queue->input_pkt_queue.lock);
+#else
+ queue = &__get_cpu_var(softnet_data);
+#endif
+
if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
if (queue->input_pkt_queue.qlen) {
enqueue:
dev_hold(skb->dev);
__skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_NET_SOFTRSS
+ spin_unlock(&queue->input_pkt_queue.lock);
+#endif
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
+
+#ifdef CONFIG_NET_SOFTRSS
+ /*
+ * Schedule backlog poll function (possibly on another CPU).
+ */
+ if (napi_schedule_prep(&queue->backlog)) {
+ if (cpu != get_cpu())
+ __napi_schedule_oncpu(&queue->backlog, cpu);
+ else
+ __napi_schedule_local(&queue->backlog);
+ }
+#else
napi_schedule(&queue->backlog);
+#endif
goto enqueue;
}
__get_cpu_var(netdev_rx_stat).dropped++;
+
+#ifdef CONFIG_NET_SOFTRSS
+ spin_unlock(&queue->input_pkt_queue.lock);
+#endif
local_irq_restore(flags);
kfree_skb(skb);
@@ -2005,7 +2140,7 @@ out:
#endif
/**
- * netif_receive_skb - process receive buffer from network
+ * __netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
@@ -2019,7 +2154,7 @@ out:
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
-int netif_receive_skb(struct sk_buff *skb)
+int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
@@ -2118,13 +2253,22 @@ static int process_backlog(struct napi_s
struct net_device *dev;
local_irq_disable();
+#ifdef CONFIG_NET_SOFTRSS
+ spin_lock(&queue->input_pkt_queue.lock);
+#endif
skb = __skb_dequeue(&queue->input_pkt_queue);
if (!skb) {
__napi_complete(napi);
+#ifdef CONFIG_NET_SOFTRSS
+ spin_unlock(&queue->input_pkt_queue.lock);
+#endif
local_irq_enable();
break;
}
+#ifdef CONFIG_NET_SOFTRSS
+ spin_unlock(&queue->input_pkt_queue.lock);
+#endif
local_irq_enable();
dev = skb->dev;
@@ -2145,25 +2289,38 @@ static int process_backlog(struct napi_s
*/
void __napi_schedule(struct napi_struct *n)
{
- unsigned long flags;
-
- local_irq_save(flags);
- list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- local_irq_restore(flags);
+#ifdef CONFIG_NET_NAPI_RSS
+ if (sysctl_napi_rss) {
+ int cpu = napi_rss_next_cpu(n);
+ if (cpu != get_cpu()) {
+ __napi_schedule_oncpu(n, cpu);
+ return;
+ }
+ }
+#endif
+ __napi_schedule_local(n);
}
EXPORT_SYMBOL(__napi_schedule);
static void net_rx_action(struct softirq_action *h)
{
- struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
+ struct softnet_data *queue = &__get_cpu_var(softnet_data);
+ struct list_head *list = &queue->poll_list;
unsigned long start_time = jiffies;
int budget = netdev_budget;
void *have;
local_irq_disable();
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+ if (!list_empty(&queue->rss_poll_list)) {
+ spin_lock(&queue->rss_poll_list_lock);
+ list_splice_init(&queue->rss_poll_list, queue->poll_list.prev);
+ spin_unlock(&queue->rss_poll_list_lock);
+ }
+#endif
+
while (!list_empty(list)) {
struct napi_struct *n;
int work, weight;
@@ -2215,8 +2372,23 @@ static void net_rx_action(struct softirq
if (unlikely(work == weight)) {
if (unlikely(napi_disable_pending(n)))
__napi_complete(n);
- else
+ else {
+#ifdef CONFIG_NET_NAPI_RSS
+ int cpu;
+ if (sysctl_napi_rss)
+ cpu = napi_rss_next_cpu(n);
+ else
+ cpu = get_cpu();
+
+ if (cpu != get_cpu()) {
+ list_del(&n->poll_list);
+ __napi_schedule_oncpu(n, cpu);
+ } else
+ list_move_tail(&n->poll_list, list);
+#else
list_move_tail(&n->poll_list, list);
+#endif
+ }
}
netpoll_poll_unlock(have);
@@ -4527,6 +4699,10 @@ static int __init net_dev_init(void)
skb_queue_head_init(&queue->input_pkt_queue);
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
+#if defined(CONFIG_NET_NAPI_RSS) || defined(CONFIG_NET_SOFTRSS)
+ INIT_LIST_HEAD(&queue->rss_poll_list);
+ spin_lock_init(&queue->rss_poll_list_lock);
+#endif
queue->backlog.poll = process_backlog;
queue->backlog.weight = weight_p;
@@ -4571,7 +4747,7 @@ EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
-EXPORT_SYMBOL(netif_receive_skb);
+EXPORT_SYMBOL(__netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists