[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1479233715-9905-2-git-send-email-edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:11 -0800
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>
Cc: netdev <netdev@...r.kernel.org>,
Willem de Bruijn <willemb@...gle.com>,
Adam Belay <abelay@...gle.com>, Zach Brown <zach.brown@...com>,
Tariq Toukan <tariqt@...lanox.com>,
Yuval Mintz <Yuval.Mintz@...ium.com>,
Ariel Elior <ariel.elior@...ium.com>,
Eric Dumazet <edumazet@...gle.com>,
Eric Dumazet <eric.dumazet@...il.com>
Subject: [PATCH net-next 1/5] net: busy-poll: allow preemption in sk_busy_loop()
After commit 4cd13c21b207 ("softirq: Let ksoftirqd do its job"),
sk_busy_loop() needs a bit of care :
softirqs might be delayed since we do not allow preemption yet.
This patch adds preemptiom points in sk_busy_loop(),
and makes sure no unnecessary cache line dirtying
or atomic operations are done while looping.
A new flag is added into napi->state : NAPI_STATE_IN_BUSY_POLL
This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED,
so that sk_busy_loop() does not have to grab it again.
Similarly, netpoll_poll_lock() is done one time.
This gives about 10 to 20 % improvement in various busy polling
tests, especially when many threads are busy polling in
configurations with large number of NIC queues.
This should allow experimenting with bigger delays without
hurting overall latencies.
Tested:
On a 40Gb mlx4 NIC, 32 RX/TX queues.
echo 70 >/proc/sys/net/core/busy_read
for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done
Before: After:
1: 90072 92819
2: 157289 184007
3: 235772 213504
4: 344074 357513
5: 394755 458267
6: 461151 487819
7: 549116 625963
8: 544423 716219
9: 720460 738446
10: 794686 837612
11: 915998 923960
12: 937507 925107
13: 1019677 971506
14: 1046831 1113650
15: 1114154 1148902
16: 1105221 1179263
17: 1266552 1299585
18: 1258454 1383817
19: 1341453 1312194
20: 1363557 1488487
21: 1387979 1501004
22: 1417552 1601683
23: 1550049 1642002
24: 1568876 1601915
25: 1560239 1683607
26: 1640207 1745211
27: 1706540 1723574
28: 1638518 1722036
29: 1734309 1757447
30: 1782007 1855436
31: 1724806 1888539
32: 1717716 1944297
33: 1778716 1869118
34: 1805738 1983466
35: 1815694 2020758
36: 1893059 2035632
37: 1843406 2034653
38: 1888830 2086580
39: 1972827 2143567
40: 1877729 2181851
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
Cc: Willem de Bruijn <willemb@...gle.com>
Cc: Adam Belay <abelay@...gle.com>
Cc: Tariq Toukan <tariqt@...lanox.com>
Cc: Yuval Mintz <Yuval.Mintz@...ium.com>
Cc: Ariel Elior <ariel.elior@...ium.com>
---
include/linux/netdevice.h | 10 +++++
net/core/dev.c | 102 +++++++++++++++++++++++++++++++++++++---------
2 files changed, 92 insertions(+), 20 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 86bacf6a64f0..e71de66e3792 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -334,6 +334,16 @@ enum {
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+ NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+};
+
+enum {
+ NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED),
+ NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE),
+ NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC),
+ NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED),
+ NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
+ NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
};
enum gro_result {
diff --git a/net/core/dev.c b/net/core/dev.c
index 6deba68ad9e4..369dcc8efc01 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4902,6 +4902,12 @@ void __napi_complete(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ /* Some drivers call us directly, instead of calling
+ * napi_complete_done().
+ */
+ if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+ return;
+
list_del_init(&n->poll_list);
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
@@ -4913,10 +4919,13 @@ void napi_complete_done(struct napi_struct *n, int work_done)
unsigned long flags;
/*
- * don't let napi dequeue from the cpu poll list
- * just in case its running on a different cpu
+ * 1) Don't let napi dequeue from the cpu poll list
+ * just in case its running on a different cpu.
+ * 2) If we are busy polling, do nothing here, we have
+ * the guarantee we will be called later.
*/
- if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+ if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+ NAPIF_STATE_IN_BUSY_POLL)))
return;
if (n->gro_list) {
@@ -4956,13 +4965,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
+
#define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+ int rc;
+
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+
+ /* All we really want here is to re-enable device interrupts.
+ * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+ */
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ netpoll_poll_unlock(have_poll_lock);
+ if (rc == BUSY_POLL_BUDGET)
+ __napi_schedule(napi);
+ local_bh_enable();
+ if (local_softirq_pending())
+ do_softirq();
+}
+
bool sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+ int (*napi_poll)(struct napi_struct *napi, int budget);
int (*busy_poll)(struct napi_struct *dev);
+ void *have_poll_lock = NULL;
struct napi_struct *napi;
- int rc = false;
+ int rc;
+
+restart:
+ rc = false;
+ napi_poll = NULL;
rcu_read_lock();
@@ -4973,24 +5010,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
/* Note: ndo_busy_poll method is optional in linux-4.5 */
busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
- do {
+ preempt_disable();
+ for (;;) {
rc = 0;
local_bh_disable();
if (busy_poll) {
rc = busy_poll(napi);
- } else if (napi_schedule_prep(napi)) {
- void *have = netpoll_poll_lock(napi);
-
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
- if (rc == BUSY_POLL_BUDGET) {
- napi_complete_done(napi, rc);
- napi_schedule(napi);
- }
- }
- netpoll_poll_unlock(have);
+ goto count;
}
+ if (!napi_poll) {
+ unsigned long val = READ_ONCE(napi->state);
+
+ /* If multiple threads are competing for this napi,
+ * we avoid dirtying napi->state as much as we can.
+ */
+ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+ NAPIF_STATE_IN_BUSY_POLL))
+ goto count;
+ if (cmpxchg(&napi->state, val,
+ val | NAPIF_STATE_IN_BUSY_POLL |
+ NAPIF_STATE_SCHED) != val)
+ goto count;
+ have_poll_lock = netpoll_poll_lock(napi);
+ napi_poll = napi->poll;
+ }
+ rc = napi_poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
if (rc > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5045,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
if (rc == LL_FLUSH_FAILED)
break; /* permanent failure */
- cpu_relax();
- } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
- !need_resched() && !busy_loop_timeout(end_time));
+ if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+ busy_loop_timeout(end_time))
+ break;
+ if (unlikely(need_resched())) {
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
+ rcu_read_unlock();
+ cond_resched();
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+ if (rc || busy_loop_timeout(end_time))
+ return rc;
+ goto restart;
+ }
+ cpu_relax_lowlatency();
+ }
+ if (napi_poll)
+ busy_poll_stop(napi, have_poll_lock);
+ preempt_enable();
rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
rcu_read_unlock();
--
2.8.0.rc3.226.g39d4020
Powered by blists - more mailing lists