[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <66ead59563da7_29b98629486@willemb.c.googlers.com.notmuch>
Date: Wed, 18 Sep 2024 09:28:53 -0400
From: Willem de Bruijn <willemdebruijn.kernel@...il.com>
To: Akihiko Odaki <akihiko.odaki@...nix.com>,
Jonathan Corbet <corbet@....net>,
Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Jason Wang <jasowang@...hat.com>,
"David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>,
"Michael S. Tsirkin" <mst@...hat.com>,
Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
Shuah Khan <shuah@...nel.org>,
linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org,
netdev@...r.kernel.org,
kvm@...r.kernel.org,
virtualization@...ts.linux-foundation.org,
linux-kselftest@...r.kernel.org,
Yuri Benditovich <yuri.benditovich@...nix.com>,
Andrew Melnychenko <andrew@...nix.com>,
Akihiko Odaki <akihiko.odaki@...nix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS
Akihiko Odaki wrote:
> RSS is a receive steering algorithm that can be negotiated to use with
> virtio_net. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
>
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF steering program.
>
> Introduce the code to perform RSS to the kernel in order to overcome
> thse challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but I didn't
> opt for it because extending the current mechanism of eBPF steering
> program as is because it relies on legacy context rewriting, and
> introducing kfunc-based eBPF will result in non-UAPI dependency while
> the other relevant virtualization APIs such as KVM and vhost_net are
> UAPIs.
>
> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
> ---
> drivers/net/tun.c | 119 +++++++++++++++++++++++++++++++++++++++-----
> include/uapi/linux/if_tun.h | 27 ++++++++++
> 2 files changed, 133 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b8fcd71becac..5a429b391144 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,6 +175,9 @@ struct tun_prog {
>
> struct tun_vnet_hash_container {
> struct tun_vnet_hash common;
> + struct tun_vnet_hash_rss rss;
> + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> + u16 rss_indirection_table[];
> };
>
> /* Since the socket were moved to tun_file, to preserve the behavior of persist
> @@ -227,7 +230,7 @@ struct veth {
> };
>
> static const struct tun_vnet_hash tun_vnet_hash_cap = {
> - .flags = TUN_VNET_HASH_REPORT,
> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
> .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
> };
>
> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> return ret % numqueues;
> }
>
> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
> + struct sk_buff *skb,
> + const struct tun_vnet_hash_container *vnet_hash)
> +{
> + struct tun_vnet_hash_ext *ext;
> + struct virtio_net_hash hash;
> + u32 numqueues = READ_ONCE(tun->numqueues);
> + u16 txq, index;
> +
> + if (!numqueues)
> + return 0;
> +
> + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key,
> + &hash))
> + return vnet_hash->rss.unclassified_queue % numqueues;
> +
> + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
> + if (ext) {
> + ext->value = hash.value;
> + ext->report = hash.report;
> + }
> + }
> +
> + index = hash.value & vnet_hash->rss.indirection_table_mask;
> + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
> +
> + return txq % numqueues;
> +}
> +
> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
> struct net_device *sb_dev)
> {
> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
> } else {
> struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
>
> - ret = tun_automq_select_queue(tun, skb, vnet_hash);
> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
> + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
> + else
> + ret = tun_automq_select_queue(tun, skb, vnet_hash);
> }
> rcu_read_unlock();
>
> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
> }
>
> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
> - void __user *data)
> + int fd)
> {
> struct bpf_prog *prog;
> - int fd;
> -
> - if (copy_from_user(&fd, data, sizeof(fd)))
> - return -EFAULT;
>
> if (fd == -1) {
> prog = NULL;
> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> int ifindex;
> int sndbuf;
> int vnet_hdr_sz;
> + int fd;
> int le;
> int ret;
> bool do_notify = false;
> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> break;
>
> case TUNSETSTEERINGEBPF:
> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> + if (get_user(fd, (int __user *)argp)) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + vnet_hash = rtnl_dereference(tun->vnet_hash);
> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> + ret = -EBUSY;
> + break;
> + }
> +
> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
> break;
>
> case TUNSETFILTEREBPF:
> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> + if (get_user(fd, (int __user *)argp)) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
> break;
>
> case TUNSETCARRIER:
> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> break;
> }
>
> - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
> - if (!vnet_hash) {
> - ret = -ENOMEM;
> - break;
> + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
> + struct tun_vnet_hash_rss rss;
> + size_t indirection_table_size;
> + size_t key_size;
> + size_t size;
> +
> + if (tun->steering_prog) {
> + ret = -EBUSY;
> + break;
> + }
> +
> + if (copy_from_user(&rss, argp, sizeof(rss))) {
> + ret = -EFAULT;
> + break;
> + }
> + argp = (struct tun_vnet_hash_rss __user *)argp + 1;
> +
> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;
Why make uapi a mask rather than a length?
Also is there a upper length bounds sanity check for this input from
userspace?
> + key_size = virtio_net_hash_key_length(vnet_hash_common.types);
> + size = sizeof(*vnet_hash) + indirection_table_size + key_size;
key_size is included in sizeof(*vnet_hash), always
VIRTIO_NET_RSS_MAX_KEY_SIZE.
> +
> + vnet_hash = kmalloc(size, GFP_KERNEL);
> + if (!vnet_hash) {
> + ret = -ENOMEM;
> + break;
> + }
> +
> + if (copy_from_user(vnet_hash->rss_indirection_table,
> + argp, indirection_table_size)) {
> + kfree(vnet_hash);
> + ret = -EFAULT;
> + break;
> + }
> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
> +
> + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
> + kfree(vnet_hash);
> + ret = -EFAULT;
> + break;
> + }
> +
> + vnet_hash->rss = rss;
> + } else {
> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
> + if (!vnet_hash) {
> + ret = -ENOMEM;
> + break;
> + }
> }
>
> vnet_hash->common = vnet_hash_common;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 1561e8ce0a0a..1c130409db5d 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -75,6 +75,14 @@
> *
> * The argument is a pointer to &struct tun_vnet_hash.
> *
> + * The argument is a pointer to the compound of the following in order if
> + * %TUN_VNET_HASH_RSS is set:
> + *
> + * 1. &struct tun_vnet_hash
> + * 2. &struct tun_vnet_hash_rss
> + * 3. Indirection table
> + * 4. Key
> + *
> * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
> * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
> * %TUN_VNET_HASH_REPORT.
> @@ -144,6 +152,13 @@ struct tun_filter {
> */
> #define TUN_VNET_HASH_REPORT 0x0001
>
> +/**
> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
> + *
> + * This is mutually exclusive with eBPF steering program.
> + */
> +#define TUN_VNET_HASH_RSS 0x0002
> +
> /**
> * struct tun_vnet_hash - virtio_net hashing configuration
> * @flags:
> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
> __u32 types;
> };
>
> +/**
> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
> + * @indirection_table_mask:
> + * Bitmask to be applied to the indirection table index
> + * @unclassified_queue:
> + * The index of the queue to place unclassified packets in
> + */
> +struct tun_vnet_hash_rss {
> + __u16 indirection_table_mask;
> + __u16 unclassified_queue;
> +};
> +
> #endif /* _UAPI__IF_TUN_H */
>
> --
> 2.46.0
>
Powered by blists - more mailing lists