[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <694a8f81-616e-47d0-8185-5b73626c4109@daynix.com>
Date: Tue, 24 Sep 2024 10:56:52 +0200
From: Akihiko Odaki <akihiko.odaki@...nix.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Jonathan Corbet <corbet@....net>, Jason Wang <jasowang@...hat.com>,
"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
"Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo
<xuanzhuo@...ux.alibaba.com>, Shuah Khan <shuah@...nel.org>,
linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
netdev@...r.kernel.org, kvm@...r.kernel.org,
virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org,
Yuri Benditovich <yuri.benditovich@...nix.com>,
Andrew Melnychenko <andrew@...nix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS
On 2024/09/18 15:28, Willem de Bruijn wrote:
> Akihiko Odaki wrote:
>> RSS is a receive steering algorithm that can be negotiated to use with
>> virtio_net. Conventionally the hash calculation was done by the VMM.
>> However, computing the hash after the queue was chosen defeats the
>> purpose of RSS.
>>
>> Another approach is to use eBPF steering program. This approach has
>> another downside: it cannot report the calculated hash due to the
>> restrictive nature of eBPF steering program.
>>
>> Introduce the code to perform RSS to the kernel in order to overcome
>> thse challenges. An alternative solution is to extend the eBPF steering
>> program so that it will be able to report to the userspace, but I didn't
>> opt for it because extending the current mechanism of eBPF steering
>> program as is because it relies on legacy context rewriting, and
>> introducing kfunc-based eBPF will result in non-UAPI dependency while
>> the other relevant virtualization APIs such as KVM and vhost_net are
>> UAPIs.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
>> ---
>> drivers/net/tun.c | 119 +++++++++++++++++++++++++++++++++++++++-----
>> include/uapi/linux/if_tun.h | 27 ++++++++++
>> 2 files changed, 133 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index b8fcd71becac..5a429b391144 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -175,6 +175,9 @@ struct tun_prog {
>>
>> struct tun_vnet_hash_container {
>> struct tun_vnet_hash common;
>> + struct tun_vnet_hash_rss rss;
>> + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>> + u16 rss_indirection_table[];
>> };
>>
>> /* Since the socket were moved to tun_file, to preserve the behavior of persist
>> @@ -227,7 +230,7 @@ struct veth {
>> };
>>
>> static const struct tun_vnet_hash tun_vnet_hash_cap = {
>> - .flags = TUN_VNET_HASH_REPORT,
>> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>> .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>> };
>>
>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>> return ret % numqueues;
>> }
>>
>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
>> + struct sk_buff *skb,
>> + const struct tun_vnet_hash_container *vnet_hash)
>> +{
>> + struct tun_vnet_hash_ext *ext;
>> + struct virtio_net_hash hash;
>> + u32 numqueues = READ_ONCE(tun->numqueues);
>> + u16 txq, index;
>> +
>> + if (!numqueues)
>> + return 0;
>> +
>> + if (!virtio_net_hash_rss(skb, vnet_hash->common.types, vnet_hash->rss_key,
>> + &hash))
>> + return vnet_hash->rss.unclassified_queue % numqueues;
>> +
>> + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
>> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>> + if (ext) {
>> + ext->value = hash.value;
>> + ext->report = hash.report;
>> + }
>> + }
>> +
>> + index = hash.value & vnet_hash->rss.indirection_table_mask;
>> + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
>> +
>> + return txq % numqueues;
>> +}
>> +
>> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>> struct net_device *sb_dev)
>> {
>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>> } else {
>> struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
>>
>> - ret = tun_automq_select_queue(tun, skb, vnet_hash);
>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>> + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
>> + else
>> + ret = tun_automq_select_queue(tun, skb, vnet_hash);
>> }
>> rcu_read_unlock();
>>
>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>> }
>>
>> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
>> - void __user *data)
>> + int fd)
>> {
>> struct bpf_prog *prog;
>> - int fd;
>> -
>> - if (copy_from_user(&fd, data, sizeof(fd)))
>> - return -EFAULT;
>>
>> if (fd == -1) {
>> prog = NULL;
>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> int ifindex;
>> int sndbuf;
>> int vnet_hdr_sz;
>> + int fd;
>> int le;
>> int ret;
>> bool do_notify = false;
>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> break;
>>
>> case TUNSETSTEERINGEBPF:
>> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>> + if (get_user(fd, (int __user *)argp)) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + vnet_hash = rtnl_dereference(tun->vnet_hash);
>> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
>> + ret = -EBUSY;
>> + break;
>> + }
>> +
>> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>> break;
>>
>> case TUNSETFILTEREBPF:
>> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>> + if (get_user(fd, (int __user *)argp)) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>> break;
>>
>> case TUNSETCARRIER:
>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> break;
>> }
>>
>> - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>> - if (!vnet_hash) {
>> - ret = -ENOMEM;
>> - break;
>> + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
>> + struct tun_vnet_hash_rss rss;
>> + size_t indirection_table_size;
>> + size_t key_size;
>> + size_t size;
>> +
>> + if (tun->steering_prog) {
>> + ret = -EBUSY;
>> + break;
>> + }
>> +
>> + if (copy_from_user(&rss, argp, sizeof(rss))) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> + argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>> +
>> + indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;
>
> Why make uapi a mask rather than a length?
It follows the virtio specification. It is actually used as a mask in
tun_vnet_rss_select_queue().
>
> Also is there a upper length bounds sanity check for this input from
> userspace?
No, but the maximum size is limited to 128 bytes because the
indirection_table_mask is 16-bit and it indexes an array of 16-bit integers.
>
>> + key_size = virtio_net_hash_key_length(vnet_hash_common.types);
>> + size = sizeof(*vnet_hash) + indirection_table_size + key_size;
>
> key_size is included in sizeof(*vnet_hash), always
> VIRTIO_NET_RSS_MAX_KEY_SIZE.
I will fix this by replacing it with:
struct_size(vnet_hash, rss_indirection_table,
(size_t)rss.indirection_table_mask + 1)
Regards,
Akihiko Odaki
>> +
>> + vnet_hash = kmalloc(size, GFP_KERNEL);
>> + if (!vnet_hash) {
>> + ret = -ENOMEM;
>> + break;
>> + }
>> +
>> + if (copy_from_user(vnet_hash->rss_indirection_table,
>> + argp, indirection_table_size)) {
>> + kfree(vnet_hash);
>> + ret = -EFAULT;
>> + break;
>> + }
>> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>> +
>> + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
>> + kfree(vnet_hash);
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + vnet_hash->rss = rss;
>> + } else {
>> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>> + if (!vnet_hash) {
>> + ret = -ENOMEM;
>> + break;
>> + }
>> }
>>
>> vnet_hash->common = vnet_hash_common;
>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>> index 1561e8ce0a0a..1c130409db5d 100644
>> --- a/include/uapi/linux/if_tun.h
>> +++ b/include/uapi/linux/if_tun.h
>> @@ -75,6 +75,14 @@
>> *
>> * The argument is a pointer to &struct tun_vnet_hash.
>> *
>> + * The argument is a pointer to the compound of the following in order if
>> + * %TUN_VNET_HASH_RSS is set:
>> + *
>> + * 1. &struct tun_vnet_hash
>> + * 2. &struct tun_vnet_hash_rss
>> + * 3. Indirection table
>> + * 4. Key
>> + *
>> * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
>> * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
>> * %TUN_VNET_HASH_REPORT.
>> @@ -144,6 +152,13 @@ struct tun_filter {
>> */
>> #define TUN_VNET_HASH_REPORT 0x0001
>>
>> +/**
>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>> + *
>> + * This is mutually exclusive with eBPF steering program.
>> + */
>> +#define TUN_VNET_HASH_RSS 0x0002
>> +
>> /**
>> * struct tun_vnet_hash - virtio_net hashing configuration
>> * @flags:
>> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>> __u32 types;
>> };
>>
>> +/**
>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>> + * @indirection_table_mask:
>> + * Bitmask to be applied to the indirection table index
>> + * @unclassified_queue:
>> + * The index of the queue to place unclassified packets in
>> + */
>> +struct tun_vnet_hash_rss {
>> + __u16 indirection_table_mask;
>> + __u16 unclassified_queue;
>> +};
>> +
>> #endif /* _UAPI__IF_TUN_H */
>>
>> --
>> 2.46.0
>>
>
>
Powered by blists - more mailing lists