Message-ID: <aede4fe5-baac-f50e-1ece-9b3562c03a6d@mellanox.com>
Date: Wed, 10 Jan 2018 18:11:46 +0200
From: Tariq Toukan <tariqt@...lanox.com>
To: Jason Wang <jasowang@...hat.com>, netdev@...r.kernel.org,
linux-kernel@...r.kernel.org
Cc: mst@...hat.com, jbrouer@...hat.com,
Jesper Dangaard Brouer <brouer@...hat.com>
Subject: Re: [PATCH net-next V2 2/2] tuntap: XDP transmission
On 04/01/2018 5:14 AM, Jason Wang wrote:
> This patch implements XDP transmission for TAP. Since we can't create
> new queues for TAP during XDP set, the existing ptr_ring was reused for
> queuing XDP buffers. To distinguish an xdp_buff from an sk_buff,
> TUN_XDP_FLAG (0x1UL) was encoded into the lowest bit of the xdp_buff
> pointer during ptr_ring_produce, and was decoded during consumption.
> XDP metadata was stored in the headroom of the packet, which should
> work in most cases since drivers usually reserve enough headroom. Very
> minor changes were needed for vhost_net: it just needs to peek the
> length depending on the type of pointer.
>
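[For readers following along: a minimal standalone sketch of the
pointer-tagging scheme described above. The names here are illustrative
only; the real helpers are the tun_* functions added in the patch below.
The trick relies on xdp_buff/sk_buff allocations being at least 2-byte
aligned, so bit 0 of a valid pointer is always free to carry the tag.

	#include <assert.h>
	#include <stdio.h>

	#define XDP_TAG 0x1UL	/* mirrors TUN_XDP_FLAG: lowest pointer bit */

	/* Tag a pointer before producing it into the ring. */
	static void *tag_xdp(void *p)
	{
		return (void *)((unsigned long)p | XDP_TAG);
	}

	/* Consumer side: test whether the tag bit is set. */
	static int is_xdp(void *p)
	{
		return ((unsigned long)p & XDP_TAG) != 0;
	}

	/* Strip the tag to recover the original pointer. */
	static void *untag_xdp(void *p)
	{
		return (void *)((unsigned long)p & ~XDP_TAG);
	}

	int main(void)
	{
		long buf;	/* stands in for an xdp_buff; >= 2-byte aligned */
		void *tagged = tag_xdp(&buf);

		assert(is_xdp(tagged));
		assert(!is_xdp(&buf));
		assert(untag_xdp(tagged) == (void *)&buf);
		printf("tag/untag round trip ok\n");
		return 0;
	}
]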
> Tests were done on two Intel E5-2630 2.40GHz machines connected back
> to back through two 82599ES NICs. Traffic was generated/received
> through MoonGen/testpmd (rxonly). It reports a ~20% improvement when
> xdp_redirect_map is doing redirection from ixgbe to TAP (from 2.50
> Mpps to 3.05 Mpps).
>
> Cc: Jesper Dangaard Brouer <brouer@...hat.com>
> Signed-off-by: Jason Wang <jasowang@...hat.com>
> ---
> drivers/net/tun.c | 211 +++++++++++++++++++++++++++++++++++++++++--------
> drivers/vhost/net.c | 13 ++-
> include/linux/if_tun.h | 17 ++++
> 3 files changed, 208 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 2c89efe..f2e805d 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -240,6 +240,24 @@ struct tun_struct {
> struct tun_steering_prog __rcu *steering_prog;
> };
>
> +bool tun_is_xdp_buff(void *ptr)
> +{
> + return (unsigned long)ptr & TUN_XDP_FLAG;
> +}
> +EXPORT_SYMBOL(tun_is_xdp_buff);
> +
> +void *tun_xdp_to_ptr(void *ptr)
> +{
> + return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
> +}
> +EXPORT_SYMBOL(tun_xdp_to_ptr);
> +
> +void *tun_ptr_to_xdp(void *ptr)
> +{
> + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
> +}
> +EXPORT_SYMBOL(tun_ptr_to_xdp);
> +
Hi Jason,
I started getting the following compilation issues.
+ make -j24 -s
net/socket.o: In function `tun_xdp_to_ptr':
/images/autom/buildbot/worker/merge-net-next/build/./include/linux/if_tun.h:46:
multiple definition of `tun_xdp_to_ptr'
fs/compat_ioctl.o:/images/autom/buildbot/worker/merge-net-next/build/./include/linux/if_tun.h:46:
first defined here
net/socket.o: In function `tun_ptr_to_xdp':
/images/autom/buildbot/worker/merge-net-next/build/./include/linux/if_tun.h:50:
multiple definition of `tun_ptr_to_xdp'
fs/compat_ioctl.o:/images/autom/buildbot/worker/merge-net-next/build/./include/linux/if_tun.h:50:
first defined here
make: *** [vmlinux] Error 1
Seems the tun_xdp_to_ptr() and tun_ptr_to_xdp() stubs in the #else
branch of include/linux/if_tun.h are missing the "static inline"
qualifiers: every translation unit that includes the header emits its
own external definition, which the linker then reports as a multiple
definition.
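Something like this would be a minimal fix (sketch, untested):

	/* !CONFIG_TUN stubs: static inline avoids external definitions */
	static inline void *tun_xdp_to_ptr(void *ptr)
	{
		return NULL;
	}
	static inline void *tun_ptr_to_xdp(void *ptr)
	{
		return NULL;
	}

With "static inline" each translation unit gets an internal definition
that the compiler can fold away, so there is nothing left for the
linker to clash on.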
Thanks,
Tariq
> static int tun_napi_receive(struct napi_struct *napi, int budget)
> {
> struct tun_file *tfile = container_of(napi, struct tun_file, napi);
> @@ -630,12 +648,25 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
> return tun;
> }
>
> +static void tun_ptr_free(void *ptr)
> +{
> + if (!ptr)
> + return;
> + if (tun_is_xdp_buff(ptr)) {
> + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
> +
> + put_page(virt_to_head_page(xdp->data));
> + } else {
> + __skb_array_destroy_skb(ptr);
> + }
> +}
> +
> static void tun_queue_purge(struct tun_file *tfile)
> {
> - struct sk_buff *skb;
> + void *ptr;
>
> - while ((skb = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> - kfree_skb(skb);
> + while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> + tun_ptr_free(ptr);
>
> skb_queue_purge(&tfile->sk.sk_write_queue);
> skb_queue_purge(&tfile->sk.sk_error_queue);
> @@ -688,8 +719,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
> unregister_netdevice(tun->dev);
> }
> if (tun)
> - ptr_ring_cleanup(&tfile->tx_ring,
> - __skb_array_destroy_skb);
> + ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
> sock_put(&tfile->sk);
> }
> }
> @@ -1201,6 +1231,67 @@ static const struct net_device_ops tun_netdev_ops = {
> .ndo_get_stats64 = tun_net_get_stats64,
> };
>
> +static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
> +{
> + struct tun_struct *tun = netdev_priv(dev);
> + struct xdp_buff *buff = xdp->data_hard_start;
> + int headroom = xdp->data - xdp->data_hard_start;
> + struct tun_file *tfile;
> + u32 numqueues;
> + int ret = 0;
> +
> + /* Assure headroom is available and buff is properly aligned */
> + if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
> + return -ENOSPC;
> +
> + *buff = *xdp;
> +
> + rcu_read_lock();
> +
> + numqueues = READ_ONCE(tun->numqueues);
> + if (!numqueues) {
> + ret = -ENOSPC;
> + goto out;
> + }
> +
> + tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
> + numqueues]);
> + /* Encode the XDP flag into lowest bit for consumer to differ
> + * XDP buffer from sk_buff.
> + */
> + if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
> + this_cpu_inc(tun->pcpu_stats->tx_dropped);
> + ret = -ENOSPC;
> + }
> +
> +out:
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +static void tun_xdp_flush(struct net_device *dev)
> +{
> + struct tun_struct *tun = netdev_priv(dev);
> + struct tun_file *tfile;
> + u32 numqueues;
> +
> + rcu_read_lock();
> +
> + numqueues = READ_ONCE(tun->numqueues);
> + if (!numqueues)
> + goto out;
> +
> + tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
> + numqueues]);
> + /* Notify and wake up reader process */
> + if (tfile->flags & TUN_FASYNC)
> + kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> + tfile->socket.sk->sk_data_ready(tfile->socket.sk);
> +
> +out:
> + rcu_read_unlock();
> +}
> +
> static const struct net_device_ops tap_netdev_ops = {
> .ndo_uninit = tun_net_uninit,
> .ndo_open = tun_net_open,
> @@ -1218,6 +1309,8 @@ static const struct net_device_ops tap_netdev_ops = {
> .ndo_set_rx_headroom = tun_set_headroom,
> .ndo_get_stats64 = tun_net_get_stats64,
> .ndo_bpf = tun_xdp,
> + .ndo_xdp_xmit = tun_xdp_xmit,
> + .ndo_xdp_flush = tun_xdp_flush,
> };
>
> static void tun_flow_init(struct tun_struct *tun)
> @@ -1841,6 +1934,40 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
> return result;
> }
>
> +static ssize_t tun_put_user_xdp(struct tun_struct *tun,
> + struct tun_file *tfile,
> + struct xdp_buff *xdp,
> + struct iov_iter *iter)
> +{
> + int vnet_hdr_sz = 0;
> + size_t size = xdp->data_end - xdp->data;
> + struct tun_pcpu_stats *stats;
> + size_t ret;
> +
> + if (tun->flags & IFF_VNET_HDR) {
> + struct virtio_net_hdr gso = { 0 };
> +
> + vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> + if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> + return -EINVAL;
> + if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> + sizeof(gso)))
> + return -EFAULT;
> + iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> + }
> +
> + ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
> +
> + stats = get_cpu_ptr(tun->pcpu_stats);
> + u64_stats_update_begin(&stats->syncp);
> + stats->tx_packets++;
> + stats->tx_bytes += ret;
> + u64_stats_update_end(&stats->syncp);
> + put_cpu_ptr(tun->pcpu_stats);
> +
> + return ret;
> +}
> +
> /* Put packet to the user space buffer */
> static ssize_t tun_put_user(struct tun_struct *tun,
> struct tun_file *tfile,
> @@ -1938,15 +2065,14 @@ static ssize_t tun_put_user(struct tun_struct *tun,
> return total;
> }
>
> -static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
> - int *err)
> +static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
> {
> DECLARE_WAITQUEUE(wait, current);
> - struct sk_buff *skb = NULL;
> + void *ptr = NULL;
> int error = 0;
>
> - skb = ptr_ring_consume(&tfile->tx_ring);
> - if (skb)
> + ptr = ptr_ring_consume(&tfile->tx_ring);
> + if (ptr)
> goto out;
> if (noblock) {
> error = -EAGAIN;
> @@ -1957,8 +2083,8 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
> current->state = TASK_INTERRUPTIBLE;
>
> while (1) {
> - skb = ptr_ring_consume(&tfile->tx_ring);
> - if (skb)
> + ptr = ptr_ring_consume(&tfile->tx_ring);
> + if (ptr)
> break;
> if (signal_pending(current)) {
> error = -ERESTARTSYS;
> @@ -1977,12 +2103,12 @@ static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
>
> out:
> *err = error;
> - return skb;
> + return ptr;
> }
>
> static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> struct iov_iter *to,
> - int noblock, struct sk_buff *skb)
> + int noblock, void *ptr)
> {
> ssize_t ret;
> int err;
> @@ -1990,23 +2116,31 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
> tun_debug(KERN_INFO, tun, "tun_do_read\n");
>
> if (!iov_iter_count(to)) {
> - if (skb)
> - kfree_skb(skb);
> + tun_ptr_free(ptr);
> return 0;
> }
>
> - if (!skb) {
> + if (!ptr) {
> /* Read frames from ring */
> - skb = tun_ring_recv(tfile, noblock, &err);
> - if (!skb)
> + ptr = tun_ring_recv(tfile, noblock, &err);
> + if (!ptr)
> return err;
> }
>
> - ret = tun_put_user(tun, tfile, skb, to);
> - if (unlikely(ret < 0))
> - kfree_skb(skb);
> - else
> - consume_skb(skb);
> + if (tun_is_xdp_buff(ptr)) {
> + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
> +
> + ret = tun_put_user_xdp(tun, tfile, xdp, to);
> + put_page(virt_to_head_page(xdp->data));
> + } else {
> + struct sk_buff *skb = ptr;
> +
> + ret = tun_put_user(tun, tfile, skb, to);
> + if (unlikely(ret < 0))
> + kfree_skb(skb);
> + else
> + consume_skb(skb);
> + }
>
> return ret;
> }
> @@ -2143,12 +2277,12 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
> {
> struct tun_file *tfile = container_of(sock, struct tun_file, socket);
> struct tun_struct *tun = tun_get(tfile);
> - struct sk_buff *skb = m->msg_control;
> + void *ptr = m->msg_control;
> int ret;
>
> if (!tun) {
> ret = -EBADFD;
> - goto out_free_skb;
> + goto out_free;
> }
>
> if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
> @@ -2160,7 +2294,7 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
> SOL_PACKET, TUN_TX_TIMESTAMP);
> goto out;
> }
> - ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
> + ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
> if (ret > (ssize_t)total_len) {
> m->msg_flags |= MSG_TRUNC;
> ret = flags & MSG_TRUNC ? ret : total_len;
> @@ -2171,12 +2305,25 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
>
> out_put_tun:
> tun_put(tun);
> -out_free_skb:
> - if (skb)
> - kfree_skb(skb);
> +out_free:
> + tun_ptr_free(ptr);
> return ret;
> }
>
> +static int tun_ptr_peek_len(void *ptr)
> +{
> + if (likely(ptr)) {
> + if (tun_is_xdp_buff(ptr)) {
> + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
> +
> + return xdp->data_end - xdp->data;
> + }
> + return __skb_array_len_with_tag(ptr);
> + } else {
> + return 0;
> + }
> +}
> +
> static int tun_peek_len(struct socket *sock)
> {
> struct tun_file *tfile = container_of(sock, struct tun_file, socket);
> @@ -2187,7 +2334,7 @@ static int tun_peek_len(struct socket *sock)
> if (!tun)
> return 0;
>
> - ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, __skb_array_len_with_tag);
> + ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> tun_put(tun);
>
> return ret;
> @@ -3110,7 +3257,7 @@ static int tun_queue_resize(struct tun_struct *tun)
>
> ret = ptr_ring_resize_multiple(rings, n,
> dev->tx_queue_len, GFP_KERNEL,
> - __skb_array_destroy_skb);
> + tun_ptr_free);
>
> kfree(rings);
> return ret;
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c316555..a5a1db6 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -175,6 +175,17 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
> }
> }
>
> +static int vhost_net_buf_peek_len(void *ptr)
> +{
> + if (tun_is_xdp_buff(ptr)) {
> + struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
> +
> + return xdp->data_end - xdp->data;
> + }
> +
> + return __skb_array_len_with_tag(ptr);
> +}
> +
> static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
> {
> struct vhost_net_buf *rxq = &nvq->rxq;
> @@ -186,7 +197,7 @@ static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
> return 0;
>
> out:
> - return __skb_array_len_with_tag(vhost_net_buf_get_ptr(rxq));
> + return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
> }
>
> static void vhost_net_buf_init(struct vhost_net_buf *rxq)
> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
> index bdee9b8..08e6682 100644
> --- a/include/linux/if_tun.h
> +++ b/include/linux/if_tun.h
> @@ -17,9 +17,14 @@
>
> #include <uapi/linux/if_tun.h>
>
> +#define TUN_XDP_FLAG 0x1UL
> +
> #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
> struct socket *tun_get_socket(struct file *);
> struct ptr_ring *tun_get_tx_ring(struct file *file);
> +bool tun_is_xdp_buff(void *ptr);
> +void *tun_xdp_to_ptr(void *ptr);
> +void *tun_ptr_to_xdp(void *ptr);
> #else
> #include <linux/err.h>
> #include <linux/errno.h>
> @@ -33,5 +38,17 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
> {
> return ERR_PTR(-EINVAL);
> }
> +static inline bool tun_is_xdp_buff(void *ptr)
> +{
> + return false;
> +}
> +void *tun_xdp_to_ptr(void *ptr)
> +{
> + return NULL;
> +}
> +void *tun_ptr_to_xdp(void *ptr)
> +{
> + return NULL;
> +}
> #endif /* CONFIG_TUN */
> #endif /* __IF_TUN_H */
>