[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <cc2bc65b-d75a-4232-a0bd-9c759aba8aba@daynix.com>
Date: Tue, 24 Sep 2024 10:57:49 +0200
From: Akihiko Odaki <akihiko.odaki@...nix.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Jonathan Corbet <corbet@....net>, Jason Wang <jasowang@...hat.com>,
"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
"Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo
<xuanzhuo@...ux.alibaba.com>, Shuah Khan <shuah@...nel.org>,
linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
netdev@...r.kernel.org, kvm@...r.kernel.org,
virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org,
Yuri Benditovich <yuri.benditovich@...nix.com>,
Andrew Melnychenko <andrew@...nix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS
On 2024/09/24 10:56, Akihiko Odaki wrote:
> On 2024/09/18 15:28, Willem de Bruijn wrote:
>> Akihiko Odaki wrote:
>>> RSS is a receive steering algorithm that can be negotiated to use with
>>> virtio_net. Conventionally the hash calculation was done by the VMM.
>>> However, computing the hash after the queue was chosen defeats the
>>> purpose of RSS.
>>>
>>> Another approach is to use eBPF steering program. This approach has
>>> another downside: it cannot report the calculated hash due to the
>>> restrictive nature of eBPF steering program.
>>>
>>> Introduce the code to perform RSS to the kernel in order to overcome
>>> thse challenges. An alternative solution is to extend the eBPF steering
>>> program so that it will be able to report to the userspace, but I didn't
>>> opt for it because extending the current mechanism of eBPF steering
>>> program as is because it relies on legacy context rewriting, and
>>> introducing kfunc-based eBPF will result in non-UAPI dependency while
>>> the other relevant virtualization APIs such as KVM and vhost_net are
>>> UAPIs.
>>>
>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
>>> ---
>>> drivers/net/tun.c | 119
>>> +++++++++++++++++++++++++++++++++++++++-----
>>> include/uapi/linux/if_tun.h | 27 ++++++++++
>>> 2 files changed, 133 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index b8fcd71becac..5a429b391144 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -175,6 +175,9 @@ struct tun_prog {
>>> struct tun_vnet_hash_container {
>>> struct tun_vnet_hash common;
>>> + struct tun_vnet_hash_rss rss;
>>> + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>>> + u16 rss_indirection_table[];
>>> };
>>> /* Since the socket were moved to tun_file, to preserve the
>>> behavior of persist
>>> @@ -227,7 +230,7 @@ struct veth {
>>> };
>>> static const struct tun_vnet_hash tun_vnet_hash_cap = {
>>> - .flags = TUN_VNET_HASH_REPORT,
>>> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>>> .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>>> };
>>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct
>>> tun_struct *tun, struct sk_buff *skb)
>>> return ret % numqueues;
>>> }
>>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
>>> + struct sk_buff *skb,
>>> + const struct tun_vnet_hash_container *vnet_hash)
>>> +{
>>> + struct tun_vnet_hash_ext *ext;
>>> + struct virtio_net_hash hash;
>>> + u32 numqueues = READ_ONCE(tun->numqueues);
>>> + u16 txq, index;
>>> +
>>> + if (!numqueues)
>>> + return 0;
>>> +
>>> + if (!virtio_net_hash_rss(skb, vnet_hash->common.types,
>>> vnet_hash->rss_key,
>>> + &hash))
>>> + return vnet_hash->rss.unclassified_queue % numqueues;
>>> +
>>> + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
>>> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>>> + if (ext) {
>>> + ext->value = hash.value;
>>> + ext->report = hash.report;
>>> + }
>>> + }
>>> +
>>> + index = hash.value & vnet_hash->rss.indirection_table_mask;
>>> + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
>>> +
>>> + return txq % numqueues;
>>> +}
>>> +
>>> static u16 tun_select_queue(struct net_device *dev, struct sk_buff
>>> *skb,
>>> struct net_device *sb_dev)
>>> {
>>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device
>>> *dev, struct sk_buff *skb,
>>> } else {
>>> struct tun_vnet_hash_container *vnet_hash =
>>> rcu_dereference(tun->vnet_hash);
>>> - ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>>> + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
>>> + else
>>> + ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>> }
>>> rcu_read_unlock();
>>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file,
>>> struct ifreq *ifr)
>>> }
>>> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog
>>> __rcu **prog_p,
>>> - void __user *data)
>>> + int fd)
>>> {
>>> struct bpf_prog *prog;
>>> - int fd;
>>> -
>>> - if (copy_from_user(&fd, data, sizeof(fd)))
>>> - return -EFAULT;
>>> if (fd == -1) {
>>> prog = NULL;
>>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file,
>>> unsigned int cmd,
>>> int ifindex;
>>> int sndbuf;
>>> int vnet_hdr_sz;
>>> + int fd;
>>> int le;
>>> int ret;
>>> bool do_notify = false;
>>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file
>>> *file, unsigned int cmd,
>>> break;
>>> case TUNSETSTEERINGEBPF:
>>> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>>> + if (get_user(fd, (int __user *)argp)) {
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> +
>>> + vnet_hash = rtnl_dereference(tun->vnet_hash);
>>> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags &
>>> TUN_VNET_HASH_RSS)) {
>>> + ret = -EBUSY;
>>> + break;
>>> + }
>>> +
>>> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>>> break;
>>> case TUNSETFILTEREBPF:
>>> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>>> + if (get_user(fd, (int __user *)argp)) {
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> +
>>> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>>> break;
>>> case TUNSETCARRIER:
>>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file
>>> *file, unsigned int cmd,
>>> break;
>>> }
>>> - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> - if (!vnet_hash) {
>>> - ret = -ENOMEM;
>>> - break;
>>> + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
>>> + struct tun_vnet_hash_rss rss;
>>> + size_t indirection_table_size;
>>> + size_t key_size;
>>> + size_t size;
>>> +
>>> + if (tun->steering_prog) {
>>> + ret = -EBUSY;
>>> + break;
>>> + }
>>> +
>>> + if (copy_from_user(&rss, argp, sizeof(rss))) {
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> + argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>>> +
>>> + indirection_table_size =
>>> ((size_t)rss.indirection_table_mask + 1) * 2;
>>
>> Why make uapi a mask rather than a length?
>
> It follows the virtio specification. It is actually used as a mask in
> tun_vnet_rss_select_queue().
>
>>
>> Also is there a upper length bounds sanity check for this input from
>> userspace?
>
> No, but the maximum size is limited to 128 bytes because the
> indirection_table_mask is 16-bit and it indexes an array of 16-bit
> integers.
Not 128 bytes but 128 KiB.
>
>>
>>> + key_size =
>>> virtio_net_hash_key_length(vnet_hash_common.types);
>>> + size = sizeof(*vnet_hash) + indirection_table_size +
>>> key_size;
>>
>> key_size is included in sizeof(*vnet_hash), always
>> VIRTIO_NET_RSS_MAX_KEY_SIZE.
>
> I will fix this by replacing it with:
> struct_size(vnet_hash, rss_indirection_table,
> (size_t)rss.indirection_table_mask + 1)
>
> Regards,
> Akihiko Odaki
>
>>> +
>>> + vnet_hash = kmalloc(size, GFP_KERNEL);
>>> + if (!vnet_hash) {
>>> + ret = -ENOMEM;
>>> + break;
>>> + }
>>> +
>>> + if (copy_from_user(vnet_hash->rss_indirection_table,
>>> + argp, indirection_table_size)) {
>>> + kfree(vnet_hash);
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>>> +
>>> + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
>>> + kfree(vnet_hash);
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> +
>>> + vnet_hash->rss = rss;
>>> + } else {
>>> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> + if (!vnet_hash) {
>>> + ret = -ENOMEM;
>>> + break;
>>> + }
>>> }
>>> vnet_hash->common = vnet_hash_common;
>>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>>> index 1561e8ce0a0a..1c130409db5d 100644
>>> --- a/include/uapi/linux/if_tun.h
>>> +++ b/include/uapi/linux/if_tun.h
>>> @@ -75,6 +75,14 @@
>>> *
>>> * The argument is a pointer to &struct tun_vnet_hash.
>>> *
>>> + * The argument is a pointer to the compound of the following in
>>> order if
>>> + * %TUN_VNET_HASH_RSS is set:
>>> + *
>>> + * 1. &struct tun_vnet_hash
>>> + * 2. &struct tun_vnet_hash_rss
>>> + * 3. Indirection table
>>> + * 4. Key
>>> + *
>>> * %TUNSETVNETHDRSZ ioctl must be called with a number greater than
>>> or equal to
>>> * the size of &struct virtio_net_hdr_v1_hash before calling this
>>> ioctl with
>>> * %TUN_VNET_HASH_REPORT.
>>> @@ -144,6 +152,13 @@ struct tun_filter {
>>> */
>>> #define TUN_VNET_HASH_REPORT 0x0001
>>> +/**
>>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>>> + *
>>> + * This is mutually exclusive with eBPF steering program.
>>> + */
>>> +#define TUN_VNET_HASH_RSS 0x0002
>>> +
>>> /**
>>> * struct tun_vnet_hash - virtio_net hashing configuration
>>> * @flags:
>>> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>>> __u32 types;
>>> };
>>> +/**
>>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>>> + * @indirection_table_mask:
>>> + * Bitmask to be applied to the indirection table index
>>> + * @unclassified_queue:
>>> + * The index of the queue to place unclassified packets in
>>> + */
>>> +struct tun_vnet_hash_rss {
>>> + __u16 indirection_table_mask;
>>> + __u16 unclassified_queue;
>>> +};
>>> +
>>> #endif /* _UAPI__IF_TUN_H */
>>>
>>> --
>>> 2.46.0
>>>
>>
>>
Powered by blists - more mailing lists