lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <cc2bc65b-d75a-4232-a0bd-9c759aba8aba@daynix.com>
Date: Tue, 24 Sep 2024 10:57:49 +0200
From: Akihiko Odaki <akihiko.odaki@...nix.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>,
 Jonathan Corbet <corbet@....net>, Jason Wang <jasowang@...hat.com>,
 "David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
 Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
 "Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo
 <xuanzhuo@...ux.alibaba.com>, Shuah Khan <shuah@...nel.org>,
 linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
 netdev@...r.kernel.org, kvm@...r.kernel.org,
 virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org,
 Yuri Benditovich <yuri.benditovich@...nix.com>,
 Andrew Melnychenko <andrew@...nix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS



On 2024/09/24 10:56, Akihiko Odaki wrote:
> On 2024/09/18 15:28, Willem de Bruijn wrote:
>> Akihiko Odaki wrote:
>>> RSS is a receive steering algorithm that can be negotiated to use with
>>> virtio_net. Conventionally the hash calculation was done by the VMM.
>>> However, computing the hash after the queue was chosen defeats the
>>> purpose of RSS.
>>>
>>> Another approach is to use eBPF steering program. This approach has
>>> another downside: it cannot report the calculated hash due to the
>>> restrictive nature of eBPF steering program.
>>>
>>> Introduce the code to perform RSS to the kernel in order to overcome
>>> thse challenges. An alternative solution is to extend the eBPF steering
>>> program so that it will be able to report to the userspace, but I didn't
>>> opt for it because extending the current mechanism of eBPF steering
>>> program as is because it relies on legacy context rewriting, and
>>> introducing kfunc-based eBPF will result in non-UAPI dependency while
>>> the other relevant virtualization APIs such as KVM and vhost_net are
>>> UAPIs.
>>>
>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
>>> ---
>>>   drivers/net/tun.c           | 119 
>>> +++++++++++++++++++++++++++++++++++++++-----
>>>   include/uapi/linux/if_tun.h |  27 ++++++++++
>>>   2 files changed, 133 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index b8fcd71becac..5a429b391144 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -175,6 +175,9 @@ struct tun_prog {
>>>   struct tun_vnet_hash_container {
>>>       struct tun_vnet_hash common;
>>> +    struct tun_vnet_hash_rss rss;
>>> +    __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>>> +    u16 rss_indirection_table[];
>>>   };
>>>   /* Since the socket were moved to tun_file, to preserve the 
>>> behavior of persist
>>> @@ -227,7 +230,7 @@ struct veth {
>>>   };
>>>   static const struct tun_vnet_hash tun_vnet_hash_cap = {
>>> -    .flags = TUN_VNET_HASH_REPORT,
>>> +    .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>>>       .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>>>   };
>>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct 
>>> tun_struct *tun, struct sk_buff *skb)
>>>       return ret % numqueues;
>>>   }
>>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
>>> +                     struct sk_buff *skb,
>>> +                     const struct tun_vnet_hash_container *vnet_hash)
>>> +{
>>> +    struct tun_vnet_hash_ext *ext;
>>> +    struct virtio_net_hash hash;
>>> +    u32 numqueues = READ_ONCE(tun->numqueues);
>>> +    u16 txq, index;
>>> +
>>> +    if (!numqueues)
>>> +        return 0;
>>> +
>>> +    if (!virtio_net_hash_rss(skb, vnet_hash->common.types, 
>>> vnet_hash->rss_key,
>>> +                 &hash))
>>> +        return vnet_hash->rss.unclassified_queue % numqueues;
>>> +
>>> +    if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
>>> +        ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>>> +        if (ext) {
>>> +            ext->value = hash.value;
>>> +            ext->report = hash.report;
>>> +        }
>>> +    }
>>> +
>>> +    index = hash.value & vnet_hash->rss.indirection_table_mask;
>>> +    txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
>>> +
>>> +    return txq % numqueues;
>>> +}
>>> +
>>>   static u16 tun_select_queue(struct net_device *dev, struct sk_buff 
>>> *skb,
>>>                   struct net_device *sb_dev)
>>>   {
>>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device 
>>> *dev, struct sk_buff *skb,
>>>       } else {
>>>           struct tun_vnet_hash_container *vnet_hash = 
>>> rcu_dereference(tun->vnet_hash);
>>> -        ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>> +        if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>>> +            ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
>>> +        else
>>> +            ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>>       }
>>>       rcu_read_unlock();
>>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, 
>>> struct ifreq *ifr)
>>>   }
>>>   static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog 
>>> __rcu **prog_p,
>>> -            void __user *data)
>>> +            int fd)
>>>   {
>>>       struct bpf_prog *prog;
>>> -    int fd;
>>> -
>>> -    if (copy_from_user(&fd, data, sizeof(fd)))
>>> -        return -EFAULT;
>>>       if (fd == -1) {
>>>           prog = NULL;
>>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, 
>>> unsigned int cmd,
>>>       int ifindex;
>>>       int sndbuf;
>>>       int vnet_hdr_sz;
>>> +    int fd;
>>>       int le;
>>>       int ret;
>>>       bool do_notify = false;
>>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file 
>>> *file, unsigned int cmd,
>>>           break;
>>>       case TUNSETSTEERINGEBPF:
>>> -        ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>>> +        if (get_user(fd, (int __user *)argp)) {
>>> +            ret = -EFAULT;
>>> +            break;
>>> +        }
>>> +
>>> +        vnet_hash = rtnl_dereference(tun->vnet_hash);
>>> +        if (fd != -1 && vnet_hash && (vnet_hash->common.flags & 
>>> TUN_VNET_HASH_RSS)) {
>>> +            ret = -EBUSY;
>>> +            break;
>>> +        }
>>> +
>>> +        ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>>>           break;
>>>       case TUNSETFILTEREBPF:
>>> -        ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>>> +        if (get_user(fd, (int __user *)argp)) {
>>> +            ret = -EFAULT;
>>> +            break;
>>> +        }
>>> +
>>> +        ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>>>           break;
>>>       case TUNSETCARRIER:
>>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file 
>>> *file, unsigned int cmd,
>>>               break;
>>>           }
>>> -        vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> -        if (!vnet_hash) {
>>> -            ret = -ENOMEM;
>>> -            break;
>>> +        if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
>>> +            struct tun_vnet_hash_rss rss;
>>> +            size_t indirection_table_size;
>>> +            size_t key_size;
>>> +            size_t size;
>>> +
>>> +            if (tun->steering_prog) {
>>> +                ret = -EBUSY;
>>> +                break;
>>> +            }
>>> +
>>> +            if (copy_from_user(&rss, argp, sizeof(rss))) {
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +            argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>>> +
>>> +            indirection_table_size = 
>>> ((size_t)rss.indirection_table_mask + 1) * 2;
>>
>> Why make uapi a mask rather than a length?
> 
> It follows the virtio specification. It is actually used as a mask in 
> tun_vnet_rss_select_queue().
> 
>>
>> Also is there a upper length bounds sanity check for this input from
>> userspace?
> 
> No, but the maximum size is limited to 128 bytes because the 
> indirection_table_mask is 16-bit and it indexes an array of 16-bit 
> integers.

Not 128 bytes but 128 KiB.

> 
>>
>>> +            key_size = 
>>> virtio_net_hash_key_length(vnet_hash_common.types);
>>> +            size = sizeof(*vnet_hash) + indirection_table_size + 
>>> key_size;
>>
>> key_size is included in sizeof(*vnet_hash), always
>> VIRTIO_NET_RSS_MAX_KEY_SIZE.
> 
> I will fix this by replacing it with:
> struct_size(vnet_hash, rss_indirection_table,
>              (size_t)rss.indirection_table_mask + 1)
> 
> Regards,
> Akihiko Odaki
> 
>>> +
>>> +            vnet_hash = kmalloc(size, GFP_KERNEL);
>>> +            if (!vnet_hash) {
>>> +                ret = -ENOMEM;
>>> +                break;
>>> +            }
>>> +
>>> +            if (copy_from_user(vnet_hash->rss_indirection_table,
>>> +                       argp, indirection_table_size)) {
>>> +                kfree(vnet_hash);
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +            argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>>> +
>>> +            if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
>>> +                kfree(vnet_hash);
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +
>>> +            vnet_hash->rss = rss;
>>> +        } else {
>>> +            vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> +            if (!vnet_hash) {
>>> +                ret = -ENOMEM;
>>> +                break;
>>> +            }
>>>           }
>>>           vnet_hash->common = vnet_hash_common;
>>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>>> index 1561e8ce0a0a..1c130409db5d 100644
>>> --- a/include/uapi/linux/if_tun.h
>>> +++ b/include/uapi/linux/if_tun.h
>>> @@ -75,6 +75,14 @@
>>>    *
>>>    * The argument is a pointer to &struct tun_vnet_hash.
>>>    *
>>> + * The argument is a pointer to the compound of the following in 
>>> order if
>>> + * %TUN_VNET_HASH_RSS is set:
>>> + *
>>> + * 1. &struct tun_vnet_hash
>>> + * 2. &struct tun_vnet_hash_rss
>>> + * 3. Indirection table
>>> + * 4. Key
>>> + *
>>>    * %TUNSETVNETHDRSZ ioctl must be called with a number greater than 
>>> or equal to
>>>    * the size of &struct virtio_net_hdr_v1_hash before calling this 
>>> ioctl with
>>>    * %TUN_VNET_HASH_REPORT.
>>> @@ -144,6 +152,13 @@ struct tun_filter {
>>>    */
>>>   #define TUN_VNET_HASH_REPORT    0x0001
>>> +/**
>>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>>> + *
>>> + * This is mutually exclusive with eBPF steering program.
>>> + */
>>> +#define TUN_VNET_HASH_RSS    0x0002
>>> +
>>>   /**
>>>    * struct tun_vnet_hash - virtio_net hashing configuration
>>>    * @flags:
>>> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>>>       __u32 types;
>>>   };
>>> +/**
>>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>>> + * @indirection_table_mask:
>>> + *        Bitmask to be applied to the indirection table index
>>> + * @unclassified_queue:
>>> + *        The index of the queue to place unclassified packets in
>>> + */
>>> +struct tun_vnet_hash_rss {
>>> +    __u16 indirection_table_mask;
>>> +    __u16 unclassified_queue;
>>> +};
>>> +
>>>   #endif /* _UAPI__IF_TUN_H */
>>>
>>> -- 
>>> 2.46.0
>>>
>>
>>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ