netdev - Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <694a8f81-616e-47d0-8185-5b73626c4109@daynix.com>
Date: Tue, 24 Sep 2024 10:56:52 +0200
From: Akihiko Odaki <akihiko.odaki@...nix.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>,
 Jonathan Corbet <corbet@....net>, Jason Wang <jasowang@...hat.com>,
 "David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
 Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
 "Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo
 <xuanzhuo@...ux.alibaba.com>, Shuah Khan <shuah@...nel.org>,
 linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
 netdev@...r.kernel.org, kvm@...r.kernel.org,
 virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org,
 Yuri Benditovich <yuri.benditovich@...nix.com>,
 Andrew Melnychenko <andrew@...nix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

On 2024/09/18 15:28, Willem de Bruijn wrote:
> Akihiko Odaki wrote:
>> RSS is a receive steering algorithm that can be negotiated to use with
>> virtio_net. Conventionally the hash calculation was done by the VMM.
>> However, computing the hash after the queue was chosen defeats the
>> purpose of RSS.
>>
>> Another approach is to use eBPF steering program. This approach has
>> another downside: it cannot report the calculated hash due to the
>> restrictive nature of eBPF steering program.
>>
>> Introduce the code to perform RSS to the kernel in order to overcome
>> thse challenges. An alternative solution is to extend the eBPF steering
>> program so that it will be able to report to the userspace, but I didn't
>> opt for it because extending the current mechanism of eBPF steering
>> program as is because it relies on legacy context rewriting, and
>> introducing kfunc-based eBPF will result in non-UAPI dependency while
>> the other relevant virtualization APIs such as KVM and vhost_net are
>> UAPIs.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
>> ---
>>   drivers/net/tun.c           | 119 +++++++++++++++++++++++++++++++++++++++-----
>>   include/uapi/linux/if_tun.h |  27 ++++++++++
>>   2 files changed, 133 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index b8fcd71becac..5a429b391144 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -175,6 +175,9 @@ struct tun_prog {
>>   
>>   struct tun_vnet_hash_container {
>>   	struct tun_vnet_hash common;
>> +	struct tun_vnet_hash_rss rss;
>> +	__be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>> +	u16 rss_indirection_table[];
>>   };
>>   
>>   /* Since the socket were moved to tun_file, to preserve the behavior of persist
>> @@ -227,7 +230,7 @@ struct veth {
>>   };
>>   
>>   static const struct tun_vnet_hash tun_vnet_hash_cap = {
>> -	.flags = TUN_VNET_HASH_REPORT,
>> +	.flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>>   	.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>>   };
>>   
>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>>   	return ret % numqueues;
>>   }
>>   
>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
>> +				     struct sk_buff *skb,
>> +				     const struct tun_vnet_hash_container *vnet_hash)
>> +{
>> +	struct tun_vnet_hash_ext *ext;
>> +	struct virtio_net_hash hash;
>> +	u32 numqueues = READ_ONCE(tun->numqueues);
>> +	u16 txq, index;
>> +
>> +	if (!numqueues)
>> +		return 0;
>> +
>> +	if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key,
>> +				 &hash))
>> +		return vnet_hash->rss.unclassified_queue % numqueues;
>> +
>> +	if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
>> +		ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>> +		if (ext) {
>> +			ext->value = hash.value;
>> +			ext->report = hash.report;
>> +		}
>> +	}
>> +
>> +	index = hash.value & vnet_hash->rss.indirection_table_mask;
>> +	txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
>> +
>> +	return txq % numqueues;
>> +}
>> +
>>   static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>>   			    struct net_device *sb_dev)
>>   {
>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>>   	} else {
>>   		struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
>>   
>> -		ret = tun_automq_select_queue(tun, skb, vnet_hash);
>> +		if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>> +			ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
>> +		else
>> +			ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>   	}
>>   	rcu_read_unlock();
>>   
>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>>   }
>>   
>>   static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
>> -			void __user *data)
>> +			int fd)
>>   {
>>   	struct bpf_prog *prog;
>> -	int fd;
>> -
>> -	if (copy_from_user(&fd, data, sizeof(fd)))
>> -		return -EFAULT;
>>   
>>   	if (fd == -1) {
>>   		prog = NULL;
>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   	int ifindex;
>>   	int sndbuf;
>>   	int vnet_hdr_sz;
>> +	int fd;
>>   	int le;
>>   	int ret;
>>   	bool do_notify = false;
>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   		break;
>>   
>>   	case TUNSETSTEERINGEBPF:
>> -		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>> +		if (get_user(fd, (int __user *)argp)) {
>> +			ret = -EFAULT;
>> +			break;
>> +		}
>> +
>> +		vnet_hash = rtnl_dereference(tun->vnet_hash);
>> +		if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
>> +			ret = -EBUSY;
>> +			break;
>> +		}
>> +
>> +		ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>>   		break;
>>   
>>   	case TUNSETFILTEREBPF:
>> -		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>> +		if (get_user(fd, (int __user *)argp)) {
>> +			ret = -EFAULT;
>> +			break;
>> +		}
>> +
>> +		ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>>   		break;
>>   
>>   	case TUNSETCARRIER:
>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   			break;
>>   		}
>>   
>> -		vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>> -		if (!vnet_hash) {
>> -			ret = -ENOMEM;
>> -			break;
>> +		if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
>> +			struct tun_vnet_hash_rss rss;
>> +			size_t indirection_table_size;
>> +			size_t key_size;
>> +			size_t size;
>> +
>> +			if (tun->steering_prog) {
>> +				ret = -EBUSY;
>> +				break;
>> +			}
>> +
>> +			if (copy_from_user(&rss, argp, sizeof(rss))) {
>> +				ret = -EFAULT;
>> +				break;
>> +			}
>> +			argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>> +
>> +			indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;
> 
> Why make uapi a mask rather than a length?

It follows the virtio specification. It is actually used as a mask in 
tun_vnet_rss_select_queue().

> 
> Also is there a upper length bounds sanity check for this input from
> userspace?

No, but the maximum size is limited to 128 bytes because the 
indirection_table_mask is 16-bit and it indexes an array of 16-bit integers.

> 
>> +			key_size = virtio_net_hash_key_length(vnet_hash_common.types);
>> +			size = sizeof(*vnet_hash) + indirection_table_size + key_size;
> 
> key_size is included in sizeof(*vnet_hash), always
> VIRTIO_NET_RSS_MAX_KEY_SIZE.

I will fix this by replacing it with:
struct_size(vnet_hash, rss_indirection_table,
             (size_t)rss.indirection_table_mask + 1)

Regards,
Akihiko Odaki

>> +
>> +			vnet_hash = kmalloc(size, GFP_KERNEL);
>> +			if (!vnet_hash) {
>> +				ret = -ENOMEM;
>> +				break;
>> +			}
>> +
>> +			if (copy_from_user(vnet_hash->rss_indirection_table,
>> +					   argp, indirection_table_size)) {
>> +				kfree(vnet_hash);
>> +				ret = -EFAULT;
>> +				break;
>> +			}
>> +			argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>> +
>> +			if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
>> +				kfree(vnet_hash);
>> +				ret = -EFAULT;
>> +				break;
>> +			}
>> +
>> +			vnet_hash->rss = rss;
>> +		} else {
>> +			vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>> +			if (!vnet_hash) {
>> +				ret = -ENOMEM;
>> +				break;
>> +			}
>>   		}
>>   
>>   		vnet_hash->common = vnet_hash_common;
>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>> index 1561e8ce0a0a..1c130409db5d 100644
>> --- a/include/uapi/linux/if_tun.h
>> +++ b/include/uapi/linux/if_tun.h
>> @@ -75,6 +75,14 @@
>>    *
>>    * The argument is a pointer to &struct tun_vnet_hash.
>>    *
>> + * The argument is a pointer to the compound of the following in order if
>> + * %TUN_VNET_HASH_RSS is set:
>> + *
>> + * 1. &struct tun_vnet_hash
>> + * 2. &struct tun_vnet_hash_rss
>> + * 3. Indirection table
>> + * 4. Key
>> + *
>>    * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
>>    * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
>>    * %TUN_VNET_HASH_REPORT.
>> @@ -144,6 +152,13 @@ struct tun_filter {
>>    */
>>   #define TUN_VNET_HASH_REPORT	0x0001
>>   
>> +/**
>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>> + *
>> + * This is mutually exclusive with eBPF steering program.
>> + */
>> +#define TUN_VNET_HASH_RSS	0x0002
>> +
>>   /**
>>    * struct tun_vnet_hash - virtio_net hashing configuration
>>    * @flags:
>> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>>   	__u32 types;
>>   };
>>   
>> +/**
>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>> + * @indirection_table_mask:
>> + *		Bitmask to be applied to the indirection table index
>> + * @unclassified_queue:
>> + *		The index of the queue to place unclassified packets in
>> + */
>> +struct tun_vnet_hash_rss {
>> +	__u16 indirection_table_mask;
>> +	__u16 unclassified_queue;
>> +};
>> +
>>   #endif /* _UAPI__IF_TUN_H */
>>
>> -- 
>> 2.46.0
>>
> 
>