linux-kernel - Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <66ead59563da7_29b98629486@willemb.c.googlers.com.notmuch>
Date: Wed, 18 Sep 2024 09:28:53 -0400
From: Willem de Bruijn <willemdebruijn.kernel@...il.com>
To: Akihiko Odaki <akihiko.odaki@...nix.com>, 
 Jonathan Corbet <corbet@....net>, 
 Willem de Bruijn <willemdebruijn.kernel@...il.com>, 
 Jason Wang <jasowang@...hat.com>, 
 "David S. Miller" <davem@...emloft.net>, 
 Eric Dumazet <edumazet@...gle.com>, 
 Jakub Kicinski <kuba@...nel.org>, 
 Paolo Abeni <pabeni@...hat.com>, 
 "Michael S. Tsirkin" <mst@...hat.com>, 
 Xuan Zhuo <xuanzhuo@...ux.alibaba.com>, 
 Shuah Khan <shuah@...nel.org>, 
 linux-doc@...r.kernel.org, 
 linux-kernel@...r.kernel.org, 
 netdev@...r.kernel.org, 
 kvm@...r.kernel.org, 
 virtualization@...ts.linux-foundation.org, 
 linux-kselftest@...r.kernel.org, 
 Yuri Benditovich <yuri.benditovich@...nix.com>, 
 Andrew Melnychenko <andrew@...nix.com>, 
 Akihiko Odaki <akihiko.odaki@...nix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

Akihiko Odaki wrote:
> RSS is a receive steering algorithm that can be negotiated to use with
> virtio_net. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
> 
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF steering program.
> 
> Introduce the code to perform RSS to the kernel in order to overcome
> thse challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but I didn't
> opt for it because extending the current mechanism of eBPF steering
> program as is because it relies on legacy context rewriting, and
> introducing kfunc-based eBPF will result in non-UAPI dependency while
> the other relevant virtualization APIs such as KVM and vhost_net are
> UAPIs.
> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
> ---
>  drivers/net/tun.c           | 119 +++++++++++++++++++++++++++++++++++++++-----
>  include/uapi/linux/if_tun.h |  27 ++++++++++
>  2 files changed, 133 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b8fcd71becac..5a429b391144 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,6 +175,9 @@ struct tun_prog {
>  
>  struct tun_vnet_hash_container {
>  	struct tun_vnet_hash common;
> +	struct tun_vnet_hash_rss rss;
> +	__be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> +	u16 rss_indirection_table[];
>  };
>  
>  /* Since the socket were moved to tun_file, to preserve the behavior of persist
> @@ -227,7 +230,7 @@ struct veth {
>  };
>  
>  static const struct tun_vnet_hash tun_vnet_hash_cap = {
> -	.flags = TUN_VNET_HASH_REPORT,
> +	.flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>  	.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>  };
>  
> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>  	return ret % numqueues;
>  }
>  
> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
> +				     struct sk_buff *skb,
> +				     const struct tun_vnet_hash_container *vnet_hash)
> +{
> +	struct tun_vnet_hash_ext *ext;
> +	struct virtio_net_hash hash;
> +	u32 numqueues = READ_ONCE(tun->numqueues);
> +	u16 txq, index;
> +
> +	if (!numqueues)
> +		return 0;
> +
> +	if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key,
> +				 &hash))
> +		return vnet_hash->rss.unclassified_queue % numqueues;
> +
> +	if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
> +		ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
> +		if (ext) {
> +			ext->value = hash.value;
> +			ext->report = hash.report;
> +		}
> +	}
> +
> +	index = hash.value & vnet_hash->rss.indirection_table_mask;
> +	txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
> +
> +	return txq % numqueues;
> +}
> +
>  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>  			    struct net_device *sb_dev)
>  {
> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>  	} else {
>  		struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
>  
> -		ret = tun_automq_select_queue(tun, skb, vnet_hash);
> +		if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
> +			ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
> +		else
> +			ret = tun_automq_select_queue(tun, skb, vnet_hash);
>  	}
>  	rcu_read_unlock();
>  
> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>  }
>  
>  static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
> -			void __user *data)
> +			int fd)
>  {
>  	struct bpf_prog *prog;
> -	int fd;
> -
> -	if (copy_from_user(&fd, data, sizeof(fd)))
> -		return -EFAULT;
>  
>  	if (fd == -1) {
>  		prog = NULL;
> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  	int ifindex;
>  	int sndbuf;
>  	int vnet_hdr_sz;
> +	int fd;
>  	int le;
>  	int ret;
>  	bool do_notify = false;
> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		break;
>  
>  	case TUNSETSTEERINGEBPF:
> -		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +		if (get_user(fd, (int __user *)argp)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		vnet_hash = rtnl_dereference(tun->vnet_hash);
> +		if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> +			ret = -EBUSY;
> +			break;
> +		}
> +
> +		ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>  		break;
>  
>  	case TUNSETFILTEREBPF:
> -		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +		if (get_user(fd, (int __user *)argp)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>  		break;
>  
>  	case TUNSETCARRIER:
> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			break;
>  		}
>  
> -		vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
> -		if (!vnet_hash) {
> -			ret = -ENOMEM;
> -			break;
> +		if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
> +			struct tun_vnet_hash_rss rss;
> +			size_t indirection_table_size;
> +			size_t key_size;
> +			size_t size;
> +
> +			if (tun->steering_prog) {
> +				ret = -EBUSY;
> +				break;
> +			}
> +
> +			if (copy_from_user(&rss, argp, sizeof(rss))) {
> +				ret = -EFAULT;
> +				break;
> +			}
> +			argp = (struct tun_vnet_hash_rss __user *)argp + 1;
> +
> +			indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;

Why make uapi a mask rather than a length?

Also is there a upper length bounds sanity check for this input from
userspace?

> +			key_size = virtio_net_hash_key_length(vnet_hash_common.types);
> +			size = sizeof(*vnet_hash) + indirection_table_size + key_size;

key_size is included in sizeof(*vnet_hash), always
VIRTIO_NET_RSS_MAX_KEY_SIZE.
> +
> +			vnet_hash = kmalloc(size, GFP_KERNEL);
> +			if (!vnet_hash) {
> +				ret = -ENOMEM;
> +				break;
> +			}
> +
> +			if (copy_from_user(vnet_hash->rss_indirection_table,
> +					   argp, indirection_table_size)) {
> +				kfree(vnet_hash);
> +				ret = -EFAULT;
> +				break;
> +			}
> +			argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
> +
> +			if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
> +				kfree(vnet_hash);
> +				ret = -EFAULT;
> +				break;
> +			}
> +
> +			vnet_hash->rss = rss;
> +		} else {
> +			vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
> +			if (!vnet_hash) {
> +				ret = -ENOMEM;
> +				break;
> +			}
>  		}
>  
>  		vnet_hash->common = vnet_hash_common;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 1561e8ce0a0a..1c130409db5d 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -75,6 +75,14 @@
>   *
>   * The argument is a pointer to &struct tun_vnet_hash.
>   *
> + * The argument is a pointer to the compound of the following in order if
> + * %TUN_VNET_HASH_RSS is set:
> + *
> + * 1. &struct tun_vnet_hash
> + * 2. &struct tun_vnet_hash_rss
> + * 3. Indirection table
> + * 4. Key
> + *
>   * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
>   * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
>   * %TUN_VNET_HASH_REPORT.
> @@ -144,6 +152,13 @@ struct tun_filter {
>   */
>  #define TUN_VNET_HASH_REPORT	0x0001
>  
> +/**
> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
> + *
> + * This is mutually exclusive with eBPF steering program.
> + */
> +#define TUN_VNET_HASH_RSS	0x0002
> +
>  /**
>   * struct tun_vnet_hash - virtio_net hashing configuration
>   * @flags:
> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>  	__u32 types;
>  };
>  
> +/**
> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
> + * @indirection_table_mask:
> + *		Bitmask to be applied to the indirection table index
> + * @unclassified_queue:
> + *		The index of the queue to place unclassified packets in
> + */
> +struct tun_vnet_hash_rss {
> +	__u16 indirection_table_mask;
> +	__u16 unclassified_queue;
> +};
> +
>  #endif /* _UAPI__IF_TUN_H */
> 
> -- 
> 2.46.0
>