[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1488166294.9415.172.camel@edumazet-glaptop3.roam.corp.google.com>
Date: Sun, 26 Feb 2017 19:31:34 -0800
From: Eric Dumazet <eric.dumazet@...il.com>
To: David Miller <davem@...emloft.net>
Cc: netdev <netdev@...r.kernel.org>,
Tariq Toukan <tariqt@...lanox.com>,
Saeed Mahameed <saeedm@...lanox.com>
Subject: [PATCH net] net: solve a NAPI race
From: Eric Dumazet <edumazet@...gle.com>
While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by TCP stack with a ~200 ms delay...
Since the timestamp was provided by the NIC, and my probe was added
in tcp_v4_rcv() while in BH handler, I was confident it was not
a sender issue, or a drop in the network.
This would happen with a very low probability, but hurting RPC
workloads.
A NAPI driver normally arms the IRQ after the napi_complete_done(),
after NAPI_STATE_SCHED is cleared, so that the hard irq handler can grab
it.
Problem is that if another point in the stack grabs NAPI_STATE_SCHED bit
while IRQ are not disabled, we might have later an IRQ firing and
finding this bit set, right before napi_complete_done() clears it.
This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.
This patch adds a new NAPI_STATE_MISSED bit, that napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED
Then napi_complete_done() properly reschedules the napi to make sure
we do not miss something.
Since we manipulate multiple bits at once, use cmpxchg() like in
sk_busy_loop() to provide proper transactions.
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
include/linux/netdevice.h | 29 +++++++----------------
net/core/dev.c | 44 ++++++++++++++++++++++++++++++++++--
2 files changed, 51 insertions(+), 22 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f40f0ab3847a8caaf46bd4d5f224c65014f501cc..97456b2539e46d6232dda804f6a434db6fd7134f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -330,6 +330,7 @@ struct napi_struct {
enum {
NAPI_STATE_SCHED, /* Poll is scheduled */
+ NAPI_STATE_MISSED, /* reschedule a napi poll */
NAPI_STATE_DISABLE, /* Disable pending */
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@ enum {
};
enum {
- NAPIF_STATE_SCHED = (1UL << NAPI_STATE_SCHED),
- NAPIF_STATE_DISABLE = (1UL << NAPI_STATE_DISABLE),
- NAPIF_STATE_NPSVC = (1UL << NAPI_STATE_NPSVC),
- NAPIF_STATE_HASHED = (1UL << NAPI_STATE_HASHED),
- NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
- NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
+ NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
+ NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
+ NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
+ NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED),
+ NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+ NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
};
enum gro_result {
@@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n)
return test_bit(NAPI_STATE_DISABLE, &n->state);
}
-/**
- * napi_schedule_prep - check if NAPI can be scheduled
- * @n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable to
- * insure only one NAPI poll instance runs. We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
- return !napi_disable_pending(n) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);
/**
* napi_schedule - schedule NAPI poll
diff --git a/net/core/dev.c b/net/core/dev.c
index 304f2deae5f9897e60a79ed8b69d6ef208295ded..82d868049ba78260a5f28376842657b72bd31994 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4884,6 +4884,32 @@ void __napi_schedule(struct napi_struct *n)
EXPORT_SYMBOL(__napi_schedule);
/**
+ * napi_schedule_prep - check if napi can be scheduled
+ * @n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
+ * it as running. This is used as a condition variable
+ * insure only one NAPI poll instance runs. We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+ unsigned long val, new;
+
+ do {
+ val = READ_ONCE(n->state);
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ return false;
+ new = val | NAPIF_STATE_SCHED;
+ if (unlikely(val & NAPIF_STATE_SCHED))
+ new |= NAPIF_STATE_MISSED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
+/**
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
@@ -4897,7 +4923,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags;
+ unsigned long flags, val, new;
/*
* 1) Don't let napi dequeue from the cpu poll list
@@ -4927,7 +4953,21 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
- WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+
+ do {
+ val = READ_ONCE(n->state);
+
+ WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+ if (unlikely(val & NAPIF_STATE_MISSED))
+ new |= NAPIF_STATE_SCHED;
+ } while (cmpxchg(&n->state, val, new) != val);
+
+ if (unlikely(val & NAPIF_STATE_MISSED))
+ __napi_schedule(n);
+
return true;
}
EXPORT_SYMBOL(napi_complete_done);
Powered by blists - more mailing lists