lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250825211832.84901-1-simon.schippers@tu-dortmund.de>
Date: Mon, 25 Aug 2025 23:16:03 +0200
From: Simon Schippers <simon.schippers@...dortmund.de>
To: willemdebruijn.kernel@...il.com, jasowang@...hat.com,
        netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Cc: Simon Schippers <simon.schippers@...dortmund.de>,
        Tim Gebauer <tim.gebauer@...dortmund.de>
Subject: [PATCH net v3] TUN/TAP: Improving throughput and latency by avoiding SKB drops

This patch is a result of our paper [1] and deals with the tun_net_xmit
function, which drops SKBs with the reason SKB_DROP_REASON_FULL_RING
whenever the tx_ring (TUN queue) is full. This behavior results in reduced
TCP performance and packet loss for VPNs and VMs. In addition, this patch
allows qdiscs to work properly (see [2]) and reduces bufferbloat when the
TUN queue length is reduced.

TUN benchmarks:
+-----------------------------------------------------------------+
| Lab setup of our paper [1]:                                     |
| TCP throughput of VPN solutions at varying RTT (values in Mbps) |
+-----------+---------------+---------------+----------+----------+
| RTT [ms]  | wireguard-go  | wireguard-go  | OpenVPN  | OpenVPN  |
|           |               | patched       |          | patched  |
+-----------+---------------+---------------+----------+----------+
| 10        | 787.3         | 679.0         | 402.4    | 416.9    |
+-----------+---------------+---------------+----------+----------+
| 20        | 765.1         | 718.8         | 401.6    | 393.18   |
+-----------+---------------+---------------+----------+----------+
| 40        | 441.5         | 529.4         | 96.9     | 411.8    |
+-----------+---------------+---------------+----------+----------+
| 80        | 218.7         | 265.7         | 57.9     | 262.7    |
+-----------+---------------+---------------+----------+----------+
| 120       | 145.4         | 181.7         | 52.8     | 178.0    |
+-----------+---------------+---------------+----------+----------+

+--------------------------------------------------------------------+
| Real-world setup of our paper [1]:                                 |
| TCP throughput of VPN solutions without and with the patch         |
| at an RTT of ~120 ms (values in Mbps)                              |
+------------------+--------------+--------------+---------+---------+
| TUN queue        | wireguard-go | wireguard-go | OpenVPN | OpenVPN |
| length [packets] |              | patched      |         | patched |
+------------------+--------------+--------------+---------+---------+
| 5000             | 185.8        | 185.6        | 184.7   | 184.8   |
+------------------+--------------+--------------+---------+---------+
| 1000             | 185.1        | 184.9        | 177.1   | 183.0   |
+------------------+--------------+--------------+---------+---------+
| 500 (default)    | 137.5        | 184.9        | 117.4   | 184.6   |
+------------------+--------------+--------------+---------+---------+
| 100              | 99.8         | 185.3        | 66.4    | 183.5   |
+------------------+--------------+--------------+---------+---------+
| 50               | 59.4         | 185.7        | 21.6    | 184.7   |
+------------------+--------------+--------------+---------+---------+
| 10               | 1.7          | 185.4        | 1.6     | 183.6   |
+------------------+--------------+--------------+---------+---------+

TAP benchmarks:
+------------------------------------------------------------------+
| Lab Setup [3]:                                                   |
| TCP throughput from host to Debian VM using TAP (values in Mbps) |
+----------------------------+------------------+------------------+
| TUN queue                  | Default          | Patched          |
| length [packets]           |                  |                  |
+----------------------------+------------------+------------------+
| 1000 (default)             | 2194.3           | 2185.0           |
+----------------------------+------------------+------------------+
| 100                        | 1986.4           | 2268.5           |
+----------------------------+------------------+------------------+
| 10                         | 625.0            | 1988.9           |
+----------------------------+------------------+------------------+
| 1                          | 2.2              | 1112.7           |
+----------------------------+------------------+------------------+
|                                                                  |
+------------------------------------------------------------------+
| Measurement with 1000 packets queue and emulated delay           |
+----------------------------+------------------+------------------+
| RTT [ms]                   | Default          | Patched          |
+----------------------------+------------------+------------------+
| 60                         | 171.8            | 341.2            |
+----------------------------+------------------+------------------+
| 120                        | 98.3             | 255.0            |
+----------------------------+------------------+------------------+

TAP+vhost_net benchmarks:
+----------------------------------------------------------------------+
| Lab Setup [3]:                                                       |
| TCP throughput from host to Debian VM using TAP+vhost_net            |
| (values in Mbps)                                                     |
+-----------------------------+--------------------+-------------------+
| TUN queue                   | Default            | Patched           |
| length [packets]            |                    |                   |
+-----------------------------+--------------------+-------------------+
| 1000 (default)              | 23403.9            | 23858.8           |
+-----------------------------+--------------------+-------------------+
| 100                         | 23372.5            | 23889.9           |
+-----------------------------+--------------------+-------------------+
| 10                          | 25837.5            | 23730.2           |
+-----------------------------+--------------------+-------------------+
| 1                           | 0.7                | 19244.8           |
+-----------------------------+--------------------+-------------------+
| Note: Default suffers from many retransmits, while patched does not. |
+----------------------------------------------------------------------+
|                                                                      |
+----------------------------------------------------------------------+
| Measurement with 1000 packets queue and emulated delay               |
+-----------------------------+--------------------+-------------------+
| RTT [ms]                    | Default            | Patched           |
+-----------------------------+--------------------+-------------------+
| 60                          | 397.1              | 397.8             |
+-----------------------------+--------------------+-------------------+
| 120                         | 200.7              | 199.9             |
+-----------------------------+--------------------+-------------------+

Implementation details:
- The netdev queue start/stop flow control is utilized.
- Compatible with multi-queue by only stopping/waking the specific
netdevice subqueue.

In the tun_net_xmit function:
- Stopping the subqueue is done when the tx_ring gets full after inserting
the SKB into the tx_ring.
- In the unlikely case when the insertion with ptr_ring_produce fails, the
old dropping behavior is used for this SKB.

In the tun_ring_recv function:
- Waking the subqueue is done after consuming an SKB from the tx_ring when
the tx_ring is empty.
- When the tx_ring is configured to be small (for example to hold 1 SKB),
queuing might be stopped in the tun_net_xmit function while, at the same
time, ptr_ring_consume is not able to grab an SKB. This prevents
tun_net_xmit from being called again and causes tun_ring_recv to wait
indefinitely for an SKB in the blocking wait queue. Therefore, the netdev
queue is woken in the wait queue.

In the tap_do_read function:
- Same behavior as in tun_ring_recv: Waking the subqueue when the tx_ring
is empty & waking the subqueue in the blocking wait queue.
- Here the netdev txq is obtained under an RCU read lock instead.

In the vhost_net_buf_produce function:
- Same behavior as in tun_ring_recv: Waking the subqueue when the tx_ring
is empty.
- Here the netdev_queue is saved in the vhost_net_virtqueue at init with
new helpers.

We are open to suggestions regarding the implementation :)
Thank you for your work!

[1] Link:
https://cni.etit.tu-dortmund.de/storages/cni-etit/r/Research/Publications/2025/Gebauer_2025_VTCFall/Gebauer_VTCFall2025_AuthorsVersion.pdf
[2] Link:
https://unix.stackexchange.com/questions/762935/traffic-shaping-ineffective-on-tun-device
[3] Link: https://github.com/tudo-cni/nodrop

Co-developed-by: Tim Gebauer <tim.gebauer@...dortmund.de>
Signed-off-by: Tim Gebauer <tim.gebauer@...dortmund.de>
Signed-off-by: Simon Schippers <simon.schippers@...dortmund.de>
---
V2 -> V3: Added support for TAP and TAP+vhost_net.
V1 -> V2: Removed NETDEV_TX_BUSY return case in tun_net_xmit and removed 
unnecessary netif_tx_wake_queue in tun_ring_recv.

 drivers/net/tap.c      | 35 +++++++++++++++++++++++++++++++++++
 drivers/net/tun.c      | 39 +++++++++++++++++++++++++++++++++++----
 drivers/vhost/net.c    | 24 ++++++++++++++++++++++--
 include/linux/if_tap.h |  5 +++++
 include/linux/if_tun.h |  6 ++++++
 5 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1197f245e873..df7e4063fb7c 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -758,6 +758,8 @@ static ssize_t tap_do_read(struct tap_queue *q,
 			   int noblock, struct sk_buff *skb)
 {
 	DEFINE_WAIT(wait);
+	struct netdev_queue *txq;
+	struct net_device *dev;
 	ssize_t ret = 0;
 
 	if (!iov_iter_count(to)) {
@@ -785,12 +787,26 @@ static ssize_t tap_do_read(struct tap_queue *q,
 			ret = -ERESTARTSYS;
 			break;
 		}
+		rcu_read_lock();
+		dev = rcu_dereference(q->tap)->dev;
+		txq = netdev_get_tx_queue(dev, q->queue_index);
+		netif_tx_wake_queue(txq);
+		rcu_read_unlock();
+
 		/* Nothing to read, let's sleep */
 		schedule();
 	}
 	if (!noblock)
 		finish_wait(sk_sleep(&q->sk), &wait);
 
+	if (ptr_ring_empty(&q->ring)) {
+		rcu_read_lock();
+		dev = rcu_dereference(q->tap)->dev;
+		txq = netdev_get_tx_queue(dev, q->queue_index);
+		netif_tx_wake_queue(txq);
+		rcu_read_unlock();
+	}
+
 put:
 	if (skb) {
 		ret = tap_put_user(q, skb, to);
@@ -1176,6 +1192,25 @@ struct socket *tap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tap_get_socket);
 
+struct netdev_queue *tap_get_netdev_queue(struct file *file)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+	struct tap_queue *q;
+
+	if (file->f_op != &tap_fops)
+		return ERR_PTR(-EINVAL);
+	q = file->private_data;
+	if (!q)
+		return ERR_PTR(-EBADFD);
+	rcu_read_lock();
+	dev = rcu_dereference(q->tap)->dev;
+	txq = netdev_get_tx_queue(dev, q->queue_index);
+	rcu_read_unlock();
+	return txq;
+}
+EXPORT_SYMBOL_GPL(tap_get_netdev_queue);
+
 struct ptr_ring *tap_get_ptr_ring(struct file *file)
 {
 	struct tap_queue *q;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cc6c50180663..30ddcd20fcd3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1060,13 +1060,16 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	nf_reset_ct(skb);
 
-	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
+	queue = netdev_get_tx_queue(dev, txq);
+	if (unlikely(ptr_ring_produce(&tfile->tx_ring, skb))) {
+		netif_tx_stop_queue(queue);
 		drop_reason = SKB_DROP_REASON_FULL_RING;
 		goto drop;
 	}
+	if (ptr_ring_full(&tfile->tx_ring))
+		netif_tx_stop_queue(queue);
 
 	/* dev->lltx requires to do our own update of trans_start */
-	queue = netdev_get_tx_queue(dev, txq);
 	txq_trans_cond_update(queue);
 
 	/* Notify and wake up reader process */
@@ -2110,9 +2113,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
-static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
+static void *tun_ring_recv(struct tun_struct *tun, struct tun_file *tfile, int noblock, int *err)
 {
 	DECLARE_WAITQUEUE(wait, current);
+	struct netdev_queue *txq;
 	void *ptr = NULL;
 	int error = 0;
 
@@ -2124,6 +2128,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 		goto out;
 	}
 
+	txq = netdev_get_tx_queue(tun->dev, tfile->queue_index);
 	add_wait_queue(&tfile->socket.wq.wait, &wait);
 
 	while (1) {
@@ -2131,6 +2136,9 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 		ptr = ptr_ring_consume(&tfile->tx_ring);
 		if (ptr)
 			break;
+
+		netif_tx_wake_queue(txq);
+
 		if (signal_pending(current)) {
 			error = -ERESTARTSYS;
 			break;
@@ -2147,6 +2155,10 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
 	remove_wait_queue(&tfile->socket.wq.wait, &wait);
 
 out:
+	if (ptr_ring_empty(&tfile->tx_ring)) {
+		txq = netdev_get_tx_queue(tun->dev, tfile->queue_index);
+		netif_tx_wake_queue(txq);
+	}
 	*err = error;
 	return ptr;
 }
@@ -2165,7 +2177,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 
 	if (!ptr) {
 		/* Read frames from ring */
-		ptr = tun_ring_recv(tfile, noblock, &err);
+		ptr = tun_ring_recv(tun, tfile, noblock, &err);
 		if (!ptr)
 			return err;
 	}
@@ -3712,6 +3724,25 @@ struct socket *tun_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
+struct netdev_queue *tun_get_netdev_queue(struct file *file)
+{
+	struct netdev_queue *txq;
+	struct net_device *dev;
+	struct tun_file *tfile;
+
+	if (file->f_op != &tun_fops)
+		return ERR_PTR(-EINVAL);
+	tfile = file->private_data;
+	if (!tfile)
+		return ERR_PTR(-EBADFD);
+	rcu_read_lock();
+	dev = rcu_dereference(tfile->tun)->dev;
+	txq = netdev_get_tx_queue(dev, tfile->queue_index);
+	rcu_read_unlock();
+	return txq;
+}
+EXPORT_SYMBOL_GPL(tun_get_netdev_queue);
+
 struct ptr_ring *tun_get_tx_ring(struct file *file)
 {
 	struct tun_file *tfile;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 6edac0c1ba9b..045fc31c59ff 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -130,6 +130,7 @@ struct vhost_net_virtqueue {
 	struct vhost_net_buf rxq;
 	/* Batched XDP buffs */
 	struct xdp_buff *xdp;
+	struct netdev_queue *netdev_queue;
 };
 
 struct vhost_net {
@@ -182,6 +183,8 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
 	rxq->head = 0;
 	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
 					      VHOST_NET_BATCH);
+	if (ptr_ring_empty(nvq->rx_ring))
+		netif_tx_wake_queue(nvq->netdev_queue);
 	return rxq->tail;
 }
 
@@ -1469,6 +1472,21 @@ static struct socket *get_raw_socket(int fd)
 	return ERR_PTR(r);
 }
 
+static struct netdev_queue *get_tap_netdev_queue(struct file *file)
+{
+	struct netdev_queue *q;
+
+	q = tun_get_netdev_queue(file);
+	if (!IS_ERR(q))
+		goto out;
+	q = tap_get_netdev_queue(file);
+	if (!IS_ERR(q))
+		goto out;
+	q = NULL;
+out:
+	return q;
+}
+
 static struct ptr_ring *get_tap_ptr_ring(struct file *file)
 {
 	struct ptr_ring *ring;
@@ -1570,10 +1588,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 		if (r)
 			goto err_used;
 		if (index == VHOST_NET_VQ_RX) {
-			if (sock)
+			if (sock) {
 				nvq->rx_ring = get_tap_ptr_ring(sock->file);
-			else
+				nvq->netdev_queue = get_tap_netdev_queue(sock->file);
+			} else {
 				nvq->rx_ring = NULL;
+			}
 		}
 
 		oldubufs = nvq->ubufs;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 553552fa635c..b15c40c86819 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -10,6 +10,7 @@ struct socket;
 
 #if IS_ENABLED(CONFIG_TAP)
 struct socket *tap_get_socket(struct file *);
+struct netdev_queue *tap_get_netdev_queue(struct file *file);
 struct ptr_ring *tap_get_ptr_ring(struct file *file);
 #else
 #include <linux/err.h>
@@ -18,6 +19,10 @@ static inline struct socket *tap_get_socket(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
+static inline struct netdev_queue *tap_get_netdev_queue(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
 static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 80166eb62f41..552eb35f0299 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -21,6 +21,7 @@ struct tun_msg_ctl {
 
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
+struct netdev_queue *tun_get_netdev_queue(struct file *file);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
 
 static inline bool tun_is_xdp_frame(void *ptr)
@@ -50,6 +51,11 @@ static inline struct socket *tun_get_socket(struct file *f)
 	return ERR_PTR(-EINVAL);
 }
 
+static inline struct netdev_queue *tun_get_netdev_queue(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
+
 static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ