netdev - [PATCH] Improved network performance by balancing Rx against other work

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <87tysfu05d.wl%peterc@chubb.wattle.id.au>
Date:	Wed, 17 Mar 2010 13:55:58 +1100
From:	Peter Chubb <peter.chubb@...ta.com.au>
To:	netdev@...r.kernel.org
Subject: [PATCH] Improved network performance by balancing Rx against other work

Hi,
	I don't know how many people caught my talk at LCA, but the
slides are now up at http://www.lca2010.org.nz/slides/50036.pdf

The general approach is to restrict the work done in the Rx-side
processing to just 32 or so packets at a time then call
sys_sched_yield() to allow other system processing to get a look in.
Currently, NAPI processing happens in soft IRQ context, and much of it
with interrupts off.  Moreover, when a particular poll routine is over
budget, the softirq is reraised --- so the softirq thread keeps
running until the scheduler (which under loaded conditions sees
softirq as a CPU-intensive thread) decides to preempt it.  By this
time, more packets are queued than the upper levels can handle in a
reasonable time, killing overall system performance.  In addition,
because so much runs with interrupts disabled, real-time performance
sucks.

The results in the talk were for UDP echo, which is a bit easier to
analyse than NFS.

By running the interrupt handler as a separate thread running
SCHED_NORMAL, and getting rid of all deferred work (implicitly
coalescing interrupts) I see around 15% improvement in NFSstone.
What's more, under overload conditions, the standard Linux kernel's
performance droops markedly with increasing load; with this patch, it
remains approximately constant.

Against a 3GHz Celeron, the standard kernel gives a peak of around
8000 NFS ops/sec; as the load increases this drops a little.  With this
patch I see 10 000 ops/sec, and less drooping.

NFS over UDP
requests/sec    Completed	Completed
Generated	Std Kernel	My patch

5000		5000		5000
10000		8169		10000
15000		7938		10649
20000		7579		11084
25000		7541		10133

Median Latencies at low load are about the same, at around 300usec
(max 7ms).  At 10000 applied ops per second, with my patch the median
per-request latency is around 53ms, with the standard kernel 200ms.

CPU usage is around the same in both cases.

I've appended an updated patch (the one I used to present the results
at LCA2010 was extremely hacky, this one is just a bit hacky).

I'm not expecting this to be applied; but please try to reproduce my
results.  To do this `properly' is going to mean fairly major surgery
to the NAPI layer, and I'm not yet sure of the right way to do that.
I'd also like a sanity check on my analysis.


Signed-off-by: Peter Chubb <peter.chubb@...ta.com.au>
---
 drivers/net/e1000/e1000.h      |    2 
 drivers/net/e1000/e1000_main.c |  115 ++++++++++++++++++++++++-----------------
 kernel/fork.c                  |    2 
 kernel/sched.c                 |    1 
 4 files changed, 73 insertions(+), 47 deletions(-)

Index: linux-2.6-peterc/drivers/net/e1000/e1000.h
===================================================================
--- linux-2.6-peterc.orig/drivers/net/e1000/e1000.h
+++ linux-2.6-peterc/drivers/net/e1000/e1000.h
@@ -287,6 +287,8 @@ struct e1000_adapter {
 			     int cleaned_count);
 	struct e1000_rx_ring *rx_ring;      /* One per active queue */
 	struct napi_struct napi;
+	struct task_struct *irq_thread;
+	u32	last_icr;
 
 	int num_tx_queues;
 	int num_rx_queues;
Index: linux-2.6-peterc/drivers/net/e1000/e1000_main.c
===================================================================
--- linux-2.6-peterc.orig/drivers/net/e1000/e1000_main.c
+++ linux-2.6-peterc/drivers/net/e1000/e1000_main.c
@@ -28,6 +28,8 @@
 
 #include "e1000.h"
 #include <net/ip6_checksum.h>
+#include <linux/kthread.h>
+#include <linux/syscalls.h> /* sys_sched_yield() */
 
 char e1000_driver_name[] = "e1000";
 static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver";
@@ -131,6 +133,7 @@ static struct net_device_stats * e1000_g
 static int e1000_change_mtu(struct net_device *netdev, int new_mtu);
 static int e1000_set_mac(struct net_device *netdev, void *p);
 static irqreturn_t e1000_intr(int irq, void *data);
+static int e1000_intr_thread(void *data);
 static bool e1000_clean_tx_irq(struct e1000_adapter *adapter,
 			       struct e1000_tx_ring *tx_ring);
 static int e1000_clean(struct napi_struct *napi, int budget);
@@ -269,6 +272,12 @@ static int e1000_request_irq(struct e100
 		        "Unable to allocate interrupt Error: %d\n", err);
 	}
 
+	adapter->irq_thread = kthread_create(e1000_intr_thread,
+					     netdev,
+					     "irq/%d-%s",
+					     adapter->pdev->irq,
+					     netdev->name);
+	get_task_struct(adapter->irq_thread);
 	return err;
 }
 
@@ -277,6 +286,9 @@ static void e1000_free_irq(struct e1000_
 	struct net_device *netdev = adapter->netdev;
 
 	free_irq(adapter->pdev->irq, netdev);
+	kthread_stop(adapter->irq_thread);
+	put_task_struct(adapter->irq_thread);
+
 }
 
 /**
@@ -396,8 +408,6 @@ int e1000_up(struct e1000_adapter *adapt
 
 	clear_bit(__E1000_DOWN, &adapter->flags);
 
-	napi_enable(&adapter->napi);
-
 	e1000_irq_enable(adapter);
 
 	netif_wake_queue(adapter->netdev);
@@ -495,8 +505,6 @@ void e1000_down(struct e1000_adapter *ad
 	E1000_WRITE_FLUSH();
 	msleep(10);
 
-	napi_disable(&adapter->napi);
-
 	e1000_irq_disable(adapter);
 
 	del_timer_sync(&adapter->tx_fifo_stall_timer);
@@ -888,7 +896,6 @@ static int __devinit e1000_probe(struct 
 	netdev->netdev_ops = &e1000_netdev_ops;
 	e1000_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
-	netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);
 
 	strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);
 
@@ -1288,8 +1295,6 @@ static int e1000_open(struct net_device 
 	/* From here on the code is the same as e1000_up() */
 	clear_bit(__E1000_DOWN, &adapter->flags);
 
-	napi_enable(&adapter->napi);
-
 	e1000_irq_enable(adapter);
 
 	netif_start_queue(netdev);
@@ -3347,6 +3352,55 @@ void e1000_update_stats(struct e1000_ada
 	spin_unlock_irqrestore(&adapter->stats_lock, flags);
 }
 
+static int
+e1000_wait_for_intr(struct e1000_adapter *adapter)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (adapter->last_icr) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
+		schedule();
+	}
+	return -1;
+}
+
+static int e1000_intr_thread(void *data)
+{
+	struct net_device *netdev = data;
+	struct e1000_adapter *adapter = netdev_priv(netdev);
+	const int budget = 32; // FIXME should be auto-tuneable
+	int tx_clean_complete = 0, work_done = 0;
+
+	while(!e1000_wait_for_intr(adapter)) {
+		do {
+			work_done = 0;
+
+			tx_clean_complete = e1000_clean_tx_irq(adapter, &adapter->tx_ring[0]);
+			adapter->clean_rx(adapter, &adapter->rx_ring[0], &work_done, budget);
+			if (!tx_clean_complete)
+				work_done = budget;
+
+			if (work_done == budget) {
+				/*
+				 * Give up the rest of the timeslice to allow
+				 * userspace to make forward progress
+				 */
+				sys_sched_yield();
+			}
+		} while (work_done == budget);
+
+		/* If budget not fully consumed, wait for an interrupt */
+		adapter->last_icr = 0;
+		if (likely(adapter->itr_setting & 3))
+			e1000_set_itr(adapter);
+		if (!test_bit(__E1000_DOWN, &adapter->flags))
+			e1000_irq_enable(adapter);
+	}
+	return 0;
+}
+
 /**
  * e1000_intr - Interrupt Handler
  * @irq: interrupt number
@@ -3374,49 +3428,12 @@ static irqreturn_t e1000_intr(int irq, v
 	ew32(IMC, ~0);
 	E1000_WRITE_FLUSH();
 
-	if (likely(napi_schedule_prep(&adapter->napi))) {
-		adapter->total_tx_bytes = 0;
-		adapter->total_tx_packets = 0;
-		adapter->total_rx_bytes = 0;
-		adapter->total_rx_packets = 0;
-		__napi_schedule(&adapter->napi);
-	} else {
-		/* this really should not happen! if it does it is basically a
-		 * bug, but not a hard error, so enable ints and continue */
-		if (!test_bit(__E1000_DOWN, &adapter->flags))
-			e1000_irq_enable(adapter);
-	}
+	adapter->last_icr = icr;
+	wake_up_process(adapter->irq_thread);
 
 	return IRQ_HANDLED;
 }
 
-/**
- * e1000_clean - NAPI Rx polling callback
- * @adapter: board private structure
- **/
-static int e1000_clean(struct napi_struct *napi, int budget)
-{
-	struct e1000_adapter *adapter = container_of(napi, struct e1000_adapter, napi);
-	int tx_clean_complete = 0, work_done = 0;
-
-	tx_clean_complete = e1000_clean_tx_irq(adapter, &adapter->tx_ring[0]);
-
-	adapter->clean_rx(adapter, &adapter->rx_ring[0], &work_done, budget);
-
-	if (!tx_clean_complete)
-		work_done = budget;
-
-	/* If budget not fully consumed, exit the polling mode */
-	if (work_done < budget) {
-		if (likely(adapter->itr_setting & 3))
-			e1000_set_itr(adapter);
-		napi_complete(napi);
-		if (!test_bit(__E1000_DOWN, &adapter->flags))
-			e1000_irq_enable(adapter);
-	}
-
-	return work_done;
-}
 
 /**
  * e1000_clean_tx_irq - Reclaim resources after transmit completes
Index: linux-2.6-peterc/kernel/fork.c
===================================================================
--- linux-2.6-peterc.orig/kernel/fork.c
+++ linux-2.6-peterc/kernel/fork.c
@@ -178,6 +178,8 @@ void __put_task_struct(struct task_struc
 		free_task(tsk);
 }
 
+EXPORT_SYMBOL(__put_task_struct);
+
 /*
  * macro override instead of weak attribute alias, to workaround
  * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
Index: linux-2.6-peterc/kernel/sched.c
===================================================================
--- linux-2.6-peterc.orig/kernel/sched.c
+++ linux-2.6-peterc/kernel/sched.c
@@ -4946,6 +4946,7 @@ SYSCALL_DEFINE0(sched_yield)
 
 	return 0;
 }
+EXPORT_SYMBOL(sys_sched_yield);
 
 static inline int should_resched(void)
 {

--
Dr Peter Chubb  http://www.gelato.unsw.edu.au  peterc AT gelato.unsw.edu.au
http://www.ertos.nicta.com.au           ERTOS within National ICT Australia
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html