[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20201028133437.212503-2-bjorn.topel@gmail.com>
Date: Wed, 28 Oct 2020 14:34:29 +0100
From: Björn Töpel <bjorn.topel@...il.com>
To: netdev@...r.kernel.org, bpf@...r.kernel.org
Cc: Björn Töpel <bjorn.topel@...el.com>,
magnus.karlsson@...el.com, ast@...nel.org, daniel@...earbox.net,
maciej.fijalkowski@...el.com, sridhar.samudrala@...el.com,
jesse.brandeburg@...el.com, qi.z.zhang@...el.com, kuba@...nel.org,
edumazet@...gle.com, intel-wired-lan@...ts.osuosl.org,
jonathan.lemon@...il.com
Subject: [RFC PATCH bpf-next 1/9] net: introduce biased busy-polling
From: Björn Töpel <bjorn.topel@...el.com>
This change adds a new NAPI mode, called biased busy-polling, which is
an extension to the existing busy-polling mode. The new mode is
enabled on the socket layer, where a socket setting this option
"promises" to busy-poll the NAPI context via a system call. When this
mode is enabled, the NAPI context will operate in a mode with
interrupts disabled. The kernel monitors that the busy-polling promise
is fulfilled by an internal watchdog. If the socket fails to perform,
or stops performing, the busy-polling, the mode will be disabled. The
watchdog timeout is currently 200 ms.
Biased busy-polling follows the same mechanism as the existing
busy-poll: the napi_id is reported to the socket via the skbuff. Later
commits will extend napi_id reporting to XDP, in order to work
correctly with XDP sockets.
Let us walk through a flow of execution:
1. A socket sets the new SO_BIAS_BUSY_POLL socket option to true. The
socket now shows an intent of doing busy-polling. No data has been
received on the socket, so the napi_id of the socket is still 0
(invalid). As usual for busy-polling, the SO_BUSY_POLL option
also has to be non-zero for biased busy-polling.
2. Data is received on the socket changing the napi_id to non-zero.
3. The socket does a system call that has the busy-polling logic wired
up, e.g. recvfrom() for UDP sockets. The NAPI context is now marked
as biased busy-poll. The kernel watchdog is armed. If the NAPI
context is already running, it will try to finish as soon as
possible and move to busy-polling. If the NAPI context is not
running, it will execute the NAPI poll function for the
corresponding napi_id.
4. Go to step 3, or wait until the watchdog times out.
Given the nature of busy-polling, this mode only makes sense for
non-blocking sockets.
When the NAPI context is in biased busy-polling mode, it will not
allow a NAPI to be scheduled using the
napi_schedule_prep()/napi_scheduleXXX() combo.
Signed-off-by: Björn Töpel <bjorn.topel@...el.com>
---
arch/alpha/include/uapi/asm/socket.h | 2 +
arch/mips/include/uapi/asm/socket.h | 2 +
arch/parisc/include/uapi/asm/socket.h | 2 +
arch/sparc/include/uapi/asm/socket.h | 2 +
include/linux/netdevice.h | 33 +++++-----
include/net/busy_poll.h | 17 ++++-
include/net/sock.h | 3 +
include/uapi/asm-generic/socket.h | 2 +
net/core/dev.c | 89 +++++++++++++++++++++++++--
net/core/sock.c | 9 +++
10 files changed, 140 insertions(+), 21 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index de6c4df61082..0f776668fb09 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -124,6 +124,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_BIAS_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index d0a9ed2ca2d6..d23984731504 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -135,6 +135,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_BIAS_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 10173c32195e..49469713ed2a 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -116,6 +116,8 @@
#define SO_DETACH_REUSEPORT_BPF 0x4042
+#define SO_BIAS_BUSY_POLL 0x4043
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 8029b681fc7c..009aba6f7a54 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -117,6 +117,8 @@
#define SO_DETACH_REUSEPORT_BPF 0x0047
+#define SO_BIAS_BUSY_POLL 0x0048
+
#if !defined(__KERNEL__)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 964b494b0e8d..9bdc84d3d6b8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -344,29 +344,32 @@ struct napi_struct {
struct list_head rx_list; /* Pending GRO_NORMAL skbs */
int rx_count; /* length of rx_list */
struct hrtimer timer;
+ struct hrtimer bp_watchdog;
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
};
enum {
- NAPI_STATE_SCHED, /* Poll is scheduled */
- NAPI_STATE_MISSED, /* reschedule a napi */
- NAPI_STATE_DISABLE, /* Disable pending */
- NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
- NAPI_STATE_LISTED, /* NAPI added to system lists */
- NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
- NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_SCHED, /* Poll is scheduled */
+ NAPI_STATE_MISSED, /* reschedule a napi */
+ NAPI_STATE_DISABLE, /* Disable pending */
+ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
+ NAPI_STATE_LISTED, /* NAPI added to system lists */
+ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
+ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_BIAS_BUSY_POLL, /* biased busy-polling */
};
enum {
- NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
- NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
- NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
- NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
- NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
- NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
- NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED),
+ NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED),
+ NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE),
+ NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC),
+ NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
+ NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+ NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_BIAS_BUSY_POLL = BIT(NAPI_STATE_BIAS_BUSY_POLL),
};
enum gro_result {
@@ -555,6 +558,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
return true;
}
+void napi_bias_busy_poll(unsigned int napi_id);
+
enum netdev_queue_state_t {
__QUEUE_STATE_DRV_XOFF,
__QUEUE_STATE_STACK_XOFF,
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b001fa91c14e..9738923ed17b 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -23,6 +23,9 @@
*/
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
+/* Biased busy-poll watchdog timeout in ms */
+#define BIASED_BUSY_POLL_TIMEOUT 200
+
#ifdef CONFIG_NET_RX_BUSY_POLL
struct napi_struct;
@@ -99,13 +102,25 @@ static inline bool sk_busy_loop_timeout(struct sock *sk,
return true;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static inline void __sk_bias_busy_poll(struct sock *sk, unsigned int napi_id)
+{
+ if (likely(!READ_ONCE(sk->sk_bias_busy_poll)))
+ return;
+
+ napi_bias_busy_poll(napi_id);
+}
+#endif
+
static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
- if (napi_id >= MIN_NAPI_ID)
+ if (napi_id >= MIN_NAPI_ID) {
+ __sk_bias_busy_poll(sk, napi_id);
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
+ }
#endif
}
diff --git a/include/net/sock.h b/include/net/sock.h
index a5c6ae78df77..cf71834fb601 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -479,6 +479,9 @@ struct sock {
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
kuid_t sk_uid;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ u8 sk_bias_busy_poll;
+#endif
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1..8a2b37ccd9d5 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -119,6 +119,8 @@
#define SO_DETACH_REUSEPORT_BPF 68
+#define SO_BIAS_BUSY_POLL 69
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/dev.c b/net/core/dev.c
index 9499a414d67e..a29e4c4a35f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6378,6 +6378,9 @@ bool napi_schedule_prep(struct napi_struct *n)
val = READ_ONCE(n->state);
if (unlikely(val & NAPIF_STATE_DISABLE))
return false;
+ if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+ return false;
+
new = val | NAPIF_STATE_SCHED;
/* Sets STATE_MISSED bit if STATE_SCHED was already set
@@ -6458,12 +6461,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
- * This C code was suggested by Alexander Duyck to help gcc.
*/
- new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
- NAPIF_STATE_SCHED;
+ if (val & NAPIF_STATE_MISSED && !(val & NAPIF_STATE_BIAS_BUSY_POLL))
+ new |= NAPIF_STATE_SCHED;
} while (cmpxchg(&n->state, val, new) != val);
+ if (unlikely(val & NAPIF_STATE_BIAS_BUSY_POLL))
+ return false;
+
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
return false;
@@ -6497,6 +6502,20 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
{
int rc;
+ clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+ local_bh_disable();
+ /* If we're biased towards busy poll, clear the sched flags,
+ * so that we can enter again.
+ */
+ if (READ_ONCE(napi->state) & NAPIF_STATE_BIAS_BUSY_POLL) {
+ netpoll_poll_unlock(have_poll_lock);
+ napi_complete(napi);
+ __kfree_skb_flush();
+ local_bh_enable();
+ return;
+ }
+
/* Busy polling means there is a high chance device driver hard irq
* could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
* set in napi_schedule_prep().
@@ -6507,9 +6526,6 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
* to perform these two clear_bit()
*/
clear_bit(NAPI_STATE_MISSED, &napi->state);
- clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
-
- local_bh_disable();
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
@@ -6569,6 +6585,11 @@ void napi_busy_loop(unsigned int napi_id,
goto count;
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
+ if (val & NAPIF_STATE_BIAS_BUSY_POLL) {
+ hrtimer_start(&napi->bp_watchdog,
+ ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+ HRTIMER_MODE_REL_PINNED);
+ }
}
work = napi_poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
@@ -6652,6 +6673,53 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static enum hrtimer_restart napi_biased_busy_poll_watchdog(struct hrtimer *timer)
+{
+ struct napi_struct *napi;
+ unsigned long val, new;
+
+ napi = container_of(timer, struct napi_struct, bp_watchdog);
+
+ do {
+ val = READ_ONCE(napi->state);
+ if (WARN_ON_ONCE(!(val & NAPIF_STATE_BIAS_BUSY_POLL)))
+ return HRTIMER_NORESTART;
+
+ new = val & ~NAPIF_STATE_BIAS_BUSY_POLL;
+ } while (cmpxchg(&napi->state, val, new) != val);
+
+ if (!napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ __napi_schedule_irqoff(napi);
+
+ return HRTIMER_NORESTART;
+}
+
+void napi_bias_busy_poll(unsigned int napi_id)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ struct napi_struct *napi;
+ unsigned long val, new;
+
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return;
+
+ do {
+ val = READ_ONCE(napi->state);
+ if (val & NAPIF_STATE_BIAS_BUSY_POLL)
+ return;
+
+ new = val | NAPIF_STATE_BIAS_BUSY_POLL;
+ } while (cmpxchg(&napi->state, val, new) != val);
+
+ hrtimer_start(&napi->bp_watchdog, ms_to_ktime(BIASED_BUSY_POLL_TIMEOUT),
+ HRTIMER_MODE_REL_PINNED);
+#endif
+}
+EXPORT_SYMBOL(napi_bias_busy_poll);
+
+
static void init_gro_hash(struct napi_struct *napi)
{
int i;
@@ -6673,6 +6741,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
+ hrtimer_init(&napi->bp_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->bp_watchdog.function = napi_biased_busy_poll_watchdog;
init_gro_hash(napi);
napi->skb = NULL;
INIT_LIST_HEAD(&napi->rx_list);
@@ -6704,7 +6774,9 @@ void napi_disable(struct napi_struct *n)
msleep(1);
hrtimer_cancel(&n->timer);
+ hrtimer_cancel(&n->bp_watchdog);
+ clear_bit(NAPI_STATE_BIAS_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6767,6 +6839,11 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
if (likely(work < weight))
goto out_unlock;
+ if (unlikely(n->state & NAPIF_STATE_BIAS_BUSY_POLL)) {
+ napi_complete(n);
+ goto out_unlock;
+ }
+
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
diff --git a/net/core/sock.c b/net/core/sock.c
index 727ea1cc633c..686eb5549b79 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1159,6 +1159,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_ll_usec = val;
}
break;
+ case SO_BIAS_BUSY_POLL:
+ if (valbool && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ sk->sk_bias_busy_poll = valbool;
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_BIAS_BUSY_POLL:
+ v.val = sk->sk_bias_busy_poll;
+ break;
#endif
case SO_MAX_PACING_RATE:
--
2.27.0
Powered by blists - more mailing lists