netdev - Re: [PATCH V4 net-next 3/3] tun: rx batching

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170106214323-mutt-send-email-mst@kernel.org>
Date:   Fri, 6 Jan 2017 21:47:34 +0200
From:   "Michael S. Tsirkin" <mst@...hat.com>
To:     Jason Wang <jasowang@...hat.com>
Cc:     virtualization@...ts.linux-foundation.org, netdev@...r.kernel.org,
        kvm@...r.kernel.org, stephen@...workplumber.org, wexu@...hat.com,
        stefanha@...hat.com
Subject: Re: [PATCH V4 net-next 3/3] tun: rx batching

On Fri, Jan 06, 2017 at 10:13:17AM +0800, Jason Wang wrote:
> We can only process 1 packet at one time during sendmsg(). This often
> lead bad cache utilization under heavy load. So this patch tries to do
> some batching during rx before submitting them to host network
> stack. This is done through accepting MSG_MORE as a hint from
> sendmsg() caller, if it was set, batch the packet temporarily in a
> linked list and submit them all once MSG_MORE were cleared.
> 
> Tests were done by pktgen (burst=128) in guest over mlx4(noqueue) on host:
> 
>                                  Mpps  -+%
>     rx-frames = 0                0.91  +0%
>     rx-frames = 4                1.00  +9.8%
>     rx-frames = 8                1.00  +9.8%
>     rx-frames = 16               1.01  +10.9%
>     rx-frames = 32               1.07  +17.5%
>     rx-frames = 48               1.07  +17.5%
>     rx-frames = 64               1.08  +18.6%
>     rx-frames = 64 (no MSG_MORE) 0.91  +0%
> 
> User were allowed to change per device batched packets through
> ethtool -C rx-frames. NAPI_POLL_WEIGHT were used as upper limitation
> to prevent bh from being disabled too long.
> 
> Signed-off-by: Jason Wang <jasowang@...hat.com>
> ---
>  drivers/net/tun.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 70 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index cd8e02c..6c93926 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -218,6 +218,7 @@ struct tun_struct {
>  	struct list_head disabled;
>  	void *security;
>  	u32 flow_count;
> +	u32 rx_batched;
>  	struct tun_pcpu_stats __percpu *pcpu_stats;
>  };
>  
> @@ -522,6 +523,7 @@ static void tun_queue_purge(struct tun_file *tfile)
>  	while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
>  		kfree_skb(skb);
>  
> +	skb_queue_purge(&tfile->sk.sk_write_queue);
>  	skb_queue_purge(&tfile->sk.sk_error_queue);
>  }
>  
> @@ -1140,10 +1142,45 @@ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
>  	return skb;
>  }
>  
> +static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
> +			   struct sk_buff *skb, int more)
> +{
> +	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
> +	struct sk_buff_head process_queue;
> +	u32 rx_batched = tun->rx_batched;
> +	bool rcv = false;
> +
> +	if (!rx_batched || (!more && skb_queue_empty(queue))) {
> +		local_bh_disable();
> +		netif_receive_skb(skb);
> +		local_bh_enable();
> +		return;
> +	}
> +
> +	spin_lock(&queue->lock);
> +	if (!more || skb_queue_len(queue) == rx_batched) {
> +		__skb_queue_head_init(&process_queue);
> +		skb_queue_splice_tail_init(queue, &process_queue);
> +		rcv = true;
> +	} else {
> +		__skb_queue_tail(queue, skb);
> +	}
> +	spin_unlock(&queue->lock);
> +
> +	if (rcv) {
> +		struct sk_buff *nskb;
> +		local_bh_disable();
> +		while ((nskb = __skb_dequeue(&process_queue)))
> +			netif_receive_skb(nskb);
> +		netif_receive_skb(skb);
> +		local_bh_enable();
> +	}
> +}
> +
>  /* Get packet from user space buffer */
>  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  			    void *msg_control, struct iov_iter *from,
> -			    int noblock)
> +			    int noblock, bool more)
>  {
>  	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
>  	struct sk_buff *skb;
> @@ -1283,10 +1320,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>  	skb_probe_transport_header(skb, 0);
>  
>  	rxhash = skb_get_hash(skb);
> +
>  #ifndef CONFIG_4KSTACKS
> -	local_bh_disable();
> -	netif_receive_skb(skb);
> -	local_bh_enable();
> +	tun_rx_batched(tun, tfile, skb, more);
>  #else
>  	netif_rx_ni(skb);
>  #endif
> @@ -1312,7 +1348,8 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if (!tun)
>  		return -EBADFD;
>  
> -	result = tun_get_user(tun, tfile, NULL, from, file->f_flags & O_NONBLOCK);
> +	result = tun_get_user(tun, tfile, NULL, from,
> +			      file->f_flags & O_NONBLOCK, false);
>  
>  	tun_put(tun);
>  	return result;
> @@ -1570,7 +1607,8 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  		return -EBADFD;
>  
>  	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
> -			   m->msg_flags & MSG_DONTWAIT);
> +			   m->msg_flags & MSG_DONTWAIT,
> +			   m->msg_flags & MSG_MORE);
>  	tun_put(tun);
>  	return ret;
>  }
> @@ -1771,6 +1809,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  		tun->align = NET_SKB_PAD;
>  		tun->filter_attached = false;
>  		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
> +		tun->rx_batched = 0;
>  
>  		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
>  		if (!tun->pcpu_stats) {
> @@ -2439,6 +2478,29 @@ static void tun_set_msglevel(struct net_device *dev, u32 value)
>  #endif
>  }
>  
> +static int tun_get_coalesce(struct net_device *dev,
> +			    struct ethtool_coalesce *ec)
> +{
> +	struct tun_struct *tun = netdev_priv(dev);
> +
> +	ec->rx_max_coalesced_frames = tun->rx_batched;
> +
> +	return 0;
> +}
> +
> +static int tun_set_coalesce(struct net_device *dev,
> +			    struct ethtool_coalesce *ec)
> +{
> +	struct tun_struct *tun = netdev_priv(dev);
> +
> +	if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
> +		return -EINVAL;

So what should userspace do? Keep trying until it succeeds?
I think it's better to just use NAPI_POLL_WEIGHT instead and DTRT here.

> +
> +	tun->rx_batched = ec->rx_max_coalesced_frames;
> +
> +	return 0;
> +}
> +
>  static const struct ethtool_ops tun_ethtool_ops = {
>  	.get_settings	= tun_get_settings,
>  	.get_drvinfo	= tun_get_drvinfo,
> @@ -2446,6 +2508,8 @@ static const struct ethtool_ops tun_ethtool_ops = {
>  	.set_msglevel	= tun_set_msglevel,
>  	.get_link	= ethtool_op_get_link,
>  	.get_ts_info	= ethtool_op_get_ts_info,
> +	.get_coalesce   = tun_get_coalesce,
> +	.set_coalesce   = tun_set_coalesce,
>  };
>  
>  static int tun_queue_resize(struct tun_struct *tun)
> -- 
> 2.7.4