Message-ID: <74b28e67-da36-4bb4-b1eb-58dd51762bab@tu-dortmund.de>
Date: Wed, 27 Aug 2025 11:34:51 +0200
From: Simon Schippers <simon.schippers@...dortmund.de>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>, jasowang@...hat.com,
        netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Cc: Tim Gebauer <tim.gebauer@...dortmund.de>
Subject: [PATCH net v3] TUN/TAP: Improving throughput and latency by avoiding
 SKB drops

Willem de Bruijn wrote:
> Target net-next
> 
> Also please have the subject line summarize the functional change
> (only). Something like "tun: replace tail drop with queue pause
> when full."
> 

Thank you very much for your detailed reply!
Yes, I will target net-next instead and change the title.

> Simon Schippers wrote:
>> This patch is a result of our paper [1] and deals with the tun_net_xmit
>> function, which drops SKBs with the reason SKB_DROP_REASON_FULL_RING
>> whenever the tx_ring (TUN queue) is full. This behavior results in reduced
>> TCP performance and packet loss for VPNs and VMs. In addition, this patch
>> also allows qdiscs to work properly (see [2]) and reduces bufferbloat
>> when the TUN queue is shortened.
>>
>> TUN benchmarks:
>> +-----------------------------------------------------------------+
>> | Lab setup of our paper [1]:                                     |
>> | TCP throughput of VPN solutions at varying RTT (values in Mbps) |
>> +-----------+---------------+---------------+----------+----------+
>> | RTT [ms]  | wireguard-go  | wireguard-go  | OpenVPN  | OpenVPN  |
>> |           |               | patched       |          | patched  |
>> +-----------+---------------+---------------+----------+----------+
>> | 10        | 787.3         | 679.0         | 402.4    | 416.9    |
>> +-----------+---------------+---------------+----------+----------+
>> | 20        | 765.1         | 718.8         | 401.6    | 393.18   |
>> +-----------+---------------+---------------+----------+----------+
>> | 40        | 441.5         | 529.4         | 96.9     | 411.8    |
>> +-----------+---------------+---------------+----------+----------+
>> | 80        | 218.7         | 265.7         | 57.9     | 262.7    |
>> +-----------+---------------+---------------+----------+----------+
>> | 120       | 145.4         | 181.7         | 52.8     | 178.0    |
>> +-----------+---------------+---------------+----------+----------+
>>
>> +--------------------------------------------------------------------+
>> | Real-world setup of our paper [1]:                                 |
>> | TCP throughput of VPN solutions without and with the patch         |
>> | at an RTT of ~120 ms (values in Mbps)                              |
>> +------------------+--------------+--------------+---------+---------+
>> | TUN queue        | wireguard-go | wireguard-go | OpenVPN | OpenVPN |
>> | length [packets] |              | patched      |         | patched |
>> +------------------+--------------+--------------+---------+---------+
>> | 5000             | 185.8        | 185.6        | 184.7   | 184.8   |
>> +------------------+--------------+--------------+---------+---------+
>> | 1000             | 185.1        | 184.9        | 177.1   | 183.0   |
>> +------------------+--------------+--------------+---------+---------+
>> | 500 (default)    | 137.5        | 184.9        | 117.4   | 184.6   |
>> +------------------+--------------+--------------+---------+---------+
>> | 100              | 99.8         | 185.3        | 66.4    | 183.5   |
>> +------------------+--------------+--------------+---------+---------+
>> | 50               | 59.4         | 185.7        | 21.6    | 184.7   |
>> +------------------+--------------+--------------+---------+---------+
>> | 10               | 1.7          | 185.4        | 1.6     | 183.6   |
>> +------------------+--------------+--------------+---------+---------+
>>
>> TAP benchmarks:
>> +------------------------------------------------------------------+
>> | Lab Setup [3]:                                                   |
>> | TCP throughput from host to Debian VM using TAP (values in Mbps) |
>> +----------------------------+------------------+------------------+
>> | TUN queue                  | Default          | Patched          |
>> | length [packets]           |                  |                  |
>> +----------------------------+------------------+------------------+
>> | 1000 (default)             | 2194.3           | 2185.0           |
>> +----------------------------+------------------+------------------+
>> | 100                        | 1986.4           | 2268.5           |
>> +----------------------------+------------------+------------------+
>> | 10                         | 625.0            | 1988.9           |
>> +----------------------------+------------------+------------------+
>> | 1                          | 2.2              | 1112.7           |
>> +----------------------------+------------------+------------------+
>> |                                                                  |
>> +------------------------------------------------------------------+
>> | Measurement with 1000 packets queue and emulated delay           |
>> +----------------------------+------------------+------------------+
>> | RTT [ms]                   | Default          | Patched          |
>> +----------------------------+------------------+------------------+
>> | 60                         | 171.8            | 341.2            |
>> +----------------------------+------------------+------------------+
>> | 120                        | 98.3             | 255.0            |
>> +----------------------------+------------------+------------------+
>>
>> TAP+vhost_net benchmarks:
>> +----------------------------------------------------------------------+
>> | Lab Setup [3]:                                                       |
>> | TCP throughput from host to Debian VM using TAP+vhost_net            |
>> | (values in Mbps)                                                     |
>> +-----------------------------+--------------------+-------------------+
>> | TUN queue                   | Default            | Patched           |
>> | length [packets]            |                    |                   |
>> +-----------------------------+--------------------+-------------------+
>> | 1000 (default)              | 23403.9            | 23858.8           |
>> +-----------------------------+--------------------+-------------------+
>> | 100                         | 23372.5            | 23889.9           |
>> +-----------------------------+--------------------+-------------------+
>> | 10                          | 25837.5            | 23730.2           |
>> +-----------------------------+--------------------+-------------------+
>> | 1                           | 0.7                | 19244.8           |
>> +-----------------------------+--------------------+-------------------+
>> | Note: Default suffers from many retransmits, while patched does not. |
>> +----------------------------------------------------------------------+
>> |                                                                      |
>> +----------------------------------------------------------------------+
>> | Measurement with 1000 packets queue and emulated delay               |
>> +-----------------------------+--------------------+-------------------+
>> | RTT [ms]                    | Default            | Patched           |
>> +-----------------------------+--------------------+-------------------+
>> | 60                          | 397.1              | 397.8             |
>> +-----------------------------+--------------------+-------------------+
>> | 120                         | 200.7              | 199.9             |
>> +-----------------------------+--------------------+-------------------+
>>
>> Implementation details:
>> - The netdev queue start/stop flow control is utilized.
>> - Compatible with multi-queue by only stopping/waking the specific
>> netdevice subqueue.
>>
>> In the tun_net_xmit function:
>> - Stopping the subqueue is done when the tx_ring gets full after inserting
>> the SKB into the tx_ring.
>> - In the unlikely case when the insertion with ptr_ring_produce fails, the
>> old dropping behavior is used for this SKB.
>>
>> In the tun_ring_recv function:
>> - Waking the subqueue is done after consuming a SKB from the tx_ring when
>> the tx_ring is empty.
>> - When the tx_ring is configured to be small (for example to hold 1 SKB),
> 
> That's an exaggerated case that hopefully we do not have to support.
> Can this be configured? Maybe we should round_up user input to a sane
> lower bound instead.
> 

I do not think that this issue will disappear with a bigger tx_ring; it 
will just become less likely.
Just waking the netdev queue in the blocking wait queue is fine in my 
opinion.
And small tx_ring sizes like 1 might be used by a possible dynamic queue 
limits (DQL) mechanism, since my benchmarks showed that performance can be 
okay with such small tx_ring sizes.
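
For illustration, this wake in the blocking wait queue looks like the 
following in the patched tun_ring_recv (abridged from the hunk further 
below; some error handling omitted):

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		ptr = ptr_ring_consume(&tfile->tx_ring);
		if (ptr)
			break;

		/* The ring is empty. If the producer has already stopped
		 * the netdev queue, tun_net_xmit() would never be called
		 * again and we would sleep forever below, so wake the
		 * queue before sleeping.
		 */
		netif_tx_wake_queue(txq);

		if (signal_pending(current)) {
			error = -ERESTARTSYS;
			break;
		}
		schedule();
	}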

>> queuing might be stopped in the tun_net_xmit function while, at the same
>> time, ptr_ring_consume is not able to grab an SKB. This prevents
>> tun_net_xmit from being called again and causes tun_ring_recv to wait
>> indefinitely for an SKB in the blocking wait queue. Therefore, the netdev
>> queue is woken in the wait queue.
>>
>> In the tap_do_read function:
>> - Same behavior as in tun_ring_recv: Waking the subqueue when the tx_ring
>> is empty & waking the subqueue in the blocking wait queue.
>> - Here the netdev txq is obtained with an RCU read lock instead.
>>
>> In the vhost_net_buf_produce function:
>> - Same behavior as in tun_ring_recv: Waking the subqueue when the tx_ring
>> is empty.
>> - Here the netdev_queue is saved in the vhost_net_virtqueue at init with
>> new helpers.
>>
>> We are open to suggestions regarding the implementation :)
>> Thank you for your work!
> 
> Similarly, in the commit message, lead with the technical explanation.
> Brief benchmark results are great, but this is not an academic paper.
> Best kept concise and below the main take-away, or in the cover letter
> for a multi-patch series. ..
> 

Okay, I will shorten the benchmarks to a minimum and lead with the 
technical explanation.

>>
>> [1] Link:
>> https://cni.etit.tu-dortmund.de/storages/cni-etit/r/Research/Publications/2025/Gebauer_2025_VTCFall/Gebauer_VTCFall2025_AuthorsVersion.pdf
>> [2] Link:
>> https://unix.stackexchange.com/questions/762935/traffic-shaping-ineffective-on-tun-device
>> [3] Link: https://github.com/tudo-cni/nodrop
>>
>> Co-developed-by: Tim Gebauer <tim.gebauer@...dortmund.de>
>> Signed-off-by: Tim Gebauer <tim.gebauer@...dortmund.de>
>> Signed-off-by: Simon Schippers <simon.schippers@...dortmund.de>
>> ---
>> V2 -> V3: Added support for TAP and TAP+vhost_net.
> 
> .. please split into a series, with separate patches for TUN, TAP and
> vhost-net.
> 
> Or, start with one and once that is merged after revisions, repeat
> for the others. That is likely less work.
> 

I will split it into a series with separate changes.
Merging them one after another will not work, since TUN, TAP and vhost-net 
share tun_net_xmit as a common method: stopping the netdev queue there 
without waking it again would break things.

>> V1 -> V2: Removed NETDEV_TX_BUSY return case in tun_net_xmit and removed 
>> unnecessary netif_tx_wake_queue in tun_ring_recv.
>>
>>  drivers/net/tap.c      | 35 +++++++++++++++++++++++++++++++++++
>>  drivers/net/tun.c      | 39 +++++++++++++++++++++++++++++++++++----
>>  drivers/vhost/net.c    | 24 ++++++++++++++++++++++--
>>  include/linux/if_tap.h |  5 +++++
>>  include/linux/if_tun.h |  6 ++++++
>>  5 files changed, 103 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>> index 1197f245e873..df7e4063fb7c 100644
>> --- a/drivers/net/tap.c
>> +++ b/drivers/net/tap.c
>> @@ -758,6 +758,8 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>  			   int noblock, struct sk_buff *skb)
>>  {
>>  	DEFINE_WAIT(wait);
>> +	struct netdev_queue *txq;
>> +	struct net_device *dev;
>>  	ssize_t ret = 0;
>>  
>>  	if (!iov_iter_count(to)) {
>> @@ -785,12 +787,26 @@ static ssize_t tap_do_read(struct tap_queue *q,
>>  			ret = -ERESTARTSYS;
>>  			break;
>>  		}
>> +		rcu_read_lock();
>> +		dev = rcu_dereference(q->tap)->dev;
>> +		txq = netdev_get_tx_queue(dev, q->queue_index);
>> +		netif_tx_wake_queue(txq);
>> +		rcu_read_unlock();
>> +
> 
> This wakes the queue only once entirely empty? That seems aggressive.
> 

This waking of the netdev queue is only for the "exaggerated" case 
described above for TUN.
However, you are right that only waking the netdev queue when the tx_ring 
is empty (which is done with the condition below) is aggressive.
In previous testing, waking the queue as soon as the tx_ring was no longer 
full (instead of completely empty) caused crashes. But I will reevaluate 
this logic.
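
For reference, that not-full variant at the end of tun_ring_recv might 
look like the following (an untested sketch of the logic that crashed in 
earlier testing, not part of this patch):

	/* Wake the producer as soon as there is room again instead of
	 * waiting until the ring is completely empty. ptr_ring_full()
	 * takes the producer lock internally.
	 */
	if (!ptr_ring_full(&tfile->tx_ring)) {
		txq = netdev_get_tx_queue(tun->dev, tfile->queue_index);
		netif_tx_wake_queue(txq);
	}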

> Where is the matching netif_tx_stop_queue? I had expected that
> around the ptr_ring_produce calls in tap_handle_frame.
> 

TAP uses tun_net_xmit as .ndo_start_xmit. However, tap_handle_frame is 
used by ipvtap and macvtap. It could also be considered in the patch 
series, I guess?

>>  		/* Nothing to read, let's sleep */
>>  		schedule();
>>  	}
>>  	if (!noblock)
>>  		finish_wait(sk_sleep(&q->sk), &wait);
>>  
>> +	if (ptr_ring_empty(&q->ring)) {
>> +		rcu_read_lock();
>> +		dev = rcu_dereference(q->tap)->dev;
>> +		txq = netdev_get_tx_queue(dev, q->queue_index);
>> +		netif_tx_wake_queue(txq);
>> +		rcu_read_unlock();
>> +	}
>> +
> 
> Why the second test for the same condition: ring empty?
> 

See previous comment.

>>  put:
>>  	if (skb) {
>>  		ret = tap_put_user(q, skb, to);
>> @@ -1176,6 +1192,25 @@ struct socket *tap_get_socket(struct file *file)
>>  }
>>  EXPORT_SYMBOL_GPL(tap_get_socket);
>>  
>> +struct netdev_queue *tap_get_netdev_queue(struct file *file)
>> +{
>> +	struct netdev_queue *txq;
>> +	struct net_device *dev;
>> +	struct tap_queue *q;
>> +
>> +	if (file->f_op != &tap_fops)
>> +		return ERR_PTR(-EINVAL);
>> +	q = file->private_data;
>> +	if (!q)
>> +		return ERR_PTR(-EBADFD);
>> +	rcu_read_lock();
>> +	dev = rcu_dereference(q->tap)->dev;
>> +	txq = netdev_get_tx_queue(dev, q->queue_index);
>> +	rcu_read_unlock();
> 
> If the dev is only safe to be accessed inside an RCU read-side critical
> section, is it safe to use txq outside of it?
> 

You are right, this might be a bad idea, as the queues might be changed 
underneath us.
However, I am not sure how else to access the txq?
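
The only alternative I can think of would be to resolve and wake the txq 
entirely inside the RCU read-side critical section, so that no 
netdev_queue pointer is held outside of it. An untested sketch 
(tap_wake_txq is a hypothetical helper name):

	static void tap_wake_txq(struct tap_queue *q)
	{
		struct tap_dev *tap;

		rcu_read_lock();
		tap = rcu_dereference(q->tap);
		if (tap)
			netif_tx_wake_queue(netdev_get_tx_queue(tap->dev,
							q->queue_index));
		rcu_read_unlock();
	}

vhost-net would then have to call such a helper instead of caching the 
netdev_queue pointer at init time.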

>> +	return txq;
>> +}
>> +EXPORT_SYMBOL_GPL(tap_get_netdev_queue);
>> +
>>  struct ptr_ring *tap_get_ptr_ring(struct file *file)
>>  {
>>  	struct tap_queue *q;
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index cc6c50180663..30ddcd20fcd3 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -1060,13 +1060,16 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>  
>>  	nf_reset_ct(skb);
>>  
>> -	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
>> +	queue = netdev_get_tx_queue(dev, txq);
>> +	if (unlikely(ptr_ring_produce(&tfile->tx_ring, skb))) {
>> +		netif_tx_stop_queue(queue);
>>  		drop_reason = SKB_DROP_REASON_FULL_RING;
> 
> Again, stopping the queue before dropping is needed. Which is what the
> new ptr_ring_full code below does, I guess. If so, when is this reached?
> 

Yes, you are right, this is what the ptr_ring_full code below does. It is 
reached when an SKB is successfully inserted into the tx_ring and the 
tx_ring thereby becomes full. The queue is then stopped, which avoids 
packet drops.
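
To spell out the ordering, this is the intended flow in tun_net_xmit with 
explanatory comments added (abridged sketch of the hunk above):

	queue = netdev_get_tx_queue(dev, txq);
	if (unlikely(ptr_ring_produce(&tfile->tx_ring, skb))) {
		/* The ring was already full when we tried to insert.
		 * With the stop-on-full logic below this should be rare;
		 * keep the old drop behavior as a fallback.
		 */
		netif_tx_stop_queue(queue);
		drop_reason = SKB_DROP_REASON_FULL_RING;
		goto drop;
	}
	/* The SKB fit but may have filled the ring: if so, stop the
	 * queue now so the next SKB is held back by the stack rather
	 * than dropped.
	 */
	if (ptr_ring_full(&tfile->tx_ring))
		netif_tx_stop_queue(queue);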

>>  		goto drop;
>>  	}
>> +	if (ptr_ring_full(&tfile->tx_ring))
>> +		netif_tx_stop_queue(queue);
>>  
>>  	/* dev->lltx requires to do our own update of trans_start */
>> -	queue = netdev_get_tx_queue(dev, txq);
>>  	txq_trans_cond_update(queue);
>>  
>>  	/* Notify and wake up reader process */
>> @@ -2110,9 +2113,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>  	return total;
>>  }
>>  
>> -static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>> +static void *tun_ring_recv(struct tun_struct *tun, struct tun_file *tfile, int noblock, int *err)
>>  {
>>  	DECLARE_WAITQUEUE(wait, current);
>> +	struct netdev_queue *txq;
>>  	void *ptr = NULL;
>>  	int error = 0;
>>  
>> @@ -2124,6 +2128,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>>  		goto out;
>>  	}
>>  
>> +	txq = netdev_get_tx_queue(tun->dev, tfile->queue_index);
>>  	add_wait_queue(&tfile->socket.wq.wait, &wait);
>>  
>>  	while (1) {
>> @@ -2131,6 +2136,9 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>>  		ptr = ptr_ring_consume(&tfile->tx_ring);
>>  		if (ptr)
>>  			break;
>> +
>> +		netif_tx_wake_queue(txq);
>> +
>>  		if (signal_pending(current)) {
>>  			error = -ERESTARTSYS;
>>  			break;
>> @@ -2147,6 +2155,10 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>>  	remove_wait_queue(&tfile->socket.wq.wait, &wait);
>>  
>>  out:
>> +	if (ptr_ring_empty(&tfile->tx_ring)) {
>> +		txq = netdev_get_tx_queue(tun->dev, tfile->queue_index);
>> +		netif_tx_wake_queue(txq);
>> +	}
>>  	*err = error;
>>  	return ptr;
>>  }
>> @@ -2165,7 +2177,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>>  
>>  	if (!ptr) {
>>  		/* Read frames from ring */
>> -		ptr = tun_ring_recv(tfile, noblock, &err);
>> +		ptr = tun_ring_recv(tun, tfile, noblock, &err);
>>  		if (!ptr)
>>  			return err;
>>  	}
>> @@ -3712,6 +3724,25 @@ struct socket *tun_get_socket(struct file *file)
>>  }
>>  EXPORT_SYMBOL_GPL(tun_get_socket);
>>  
>> +struct netdev_queue *tun_get_netdev_queue(struct file *file)
>> +{
>> +	struct netdev_queue *txq;
>> +	struct net_device *dev;
>> +	struct tun_file *tfile;
>> +
>> +	if (file->f_op != &tun_fops)
>> +		return ERR_PTR(-EINVAL);
>> +	tfile = file->private_data;
>> +	if (!tfile)
>> +		return ERR_PTR(-EBADFD);
>> +	rcu_read_lock();
>> +	dev = rcu_dereference(tfile->tun)->dev;
>> +	txq = netdev_get_tx_queue(dev, tfile->queue_index);
>> +	rcu_read_unlock();
>> +	return txq;
>> +}
>> +EXPORT_SYMBOL_GPL(tun_get_netdev_queue);
>> +
>>  struct ptr_ring *tun_get_tx_ring(struct file *file)
>>  {
>>  	struct tun_file *tfile;
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 6edac0c1ba9b..045fc31c59ff 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -130,6 +130,7 @@ struct vhost_net_virtqueue {
>>  	struct vhost_net_buf rxq;
>>  	/* Batched XDP buffs */
>>  	struct xdp_buff *xdp;
>> +	struct netdev_queue *netdev_queue;
>>  };
>>  
>>  struct vhost_net {
>> @@ -182,6 +183,8 @@ static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
>>  	rxq->head = 0;
>>  	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
>>  					      VHOST_NET_BATCH);
>> +	if (ptr_ring_empty(nvq->rx_ring))
>> +		netif_tx_wake_queue(nvq->netdev_queue);
>>  	return rxq->tail;
>>  }
>>  
>> @@ -1469,6 +1472,21 @@ static struct socket *get_raw_socket(int fd)
>>  	return ERR_PTR(r);
>>  }
>>  
>> +static struct netdev_queue *get_tap_netdev_queue(struct file *file)
>> +{
>> +	struct netdev_queue *q;
>> +
>> +	q = tun_get_netdev_queue(file);
>> +	if (!IS_ERR(q))
>> +		goto out;
>> +	q = tap_get_netdev_queue(file);
>> +	if (!IS_ERR(q))
>> +		goto out;
>> +	q = NULL;
>> +out:
>> +	return q;
>> +}
>> +
>>  static struct ptr_ring *get_tap_ptr_ring(struct file *file)
>>  {
>>  	struct ptr_ring *ring;
>> @@ -1570,10 +1588,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>>  		if (r)
>>  			goto err_used;
>>  		if (index == VHOST_NET_VQ_RX) {
>> -			if (sock)
>> +			if (sock) {
>>  				nvq->rx_ring = get_tap_ptr_ring(sock->file);
>> -			else
>> +				nvq->netdev_queue = get_tap_netdev_queue(sock->file);
>> +			} else {
>>  				nvq->rx_ring = NULL;
>> +			}
>>  		}
>>  
>>  		oldubufs = nvq->ubufs;
>> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
>> index 553552fa635c..b15c40c86819 100644
>> --- a/include/linux/if_tap.h
>> +++ b/include/linux/if_tap.h
>> @@ -10,6 +10,7 @@ struct socket;
>>  
>>  #if IS_ENABLED(CONFIG_TAP)
>>  struct socket *tap_get_socket(struct file *);
>> +struct netdev_queue *tap_get_netdev_queue(struct file *file);
>>  struct ptr_ring *tap_get_ptr_ring(struct file *file);
>>  #else
>>  #include <linux/err.h>
>> @@ -18,6 +19,10 @@ static inline struct socket *tap_get_socket(struct file *f)
>>  {
>>  	return ERR_PTR(-EINVAL);
>>  }
>> +static inline struct netdev_queue *tap_get_netdev_queue(struct file *f)
>> +{
>> +	return ERR_PTR(-EINVAL);
>> +}
>>  static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
>>  {
>>  	return ERR_PTR(-EINVAL);
>> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
>> index 80166eb62f41..552eb35f0299 100644
>> --- a/include/linux/if_tun.h
>> +++ b/include/linux/if_tun.h
>> @@ -21,6 +21,7 @@ struct tun_msg_ctl {
>>  
>>  #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
>>  struct socket *tun_get_socket(struct file *);
>> +struct netdev_queue *tun_get_netdev_queue(struct file *file);
>>  struct ptr_ring *tun_get_tx_ring(struct file *file);
>>  
>>  static inline bool tun_is_xdp_frame(void *ptr)
>> @@ -50,6 +51,11 @@ static inline struct socket *tun_get_socket(struct file *f)
>>  	return ERR_PTR(-EINVAL);
>>  }
>>  
>> +static inline struct netdev_queue *tun_get_netdev_queue(struct file *f)
>> +{
>> +	return ERR_PTR(-EINVAL);
>> +}
>> +
>>  static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
>>  {
>>  	return ERR_PTR(-EINVAL);
>> -- 
>> 2.43.0
>>
> 
> 
