[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <ddd35697-bf7b-4778-a6af-8997f4df031b@daynix.com>
Date: Mon, 23 Sep 2024 20:35:57 +0200
From: Akihiko Odaki <akihiko.odaki@...nix.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>,
Jonathan Corbet <corbet@....net>, Jason Wang <jasowang@...hat.com>,
"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
"Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo
<xuanzhuo@...ux.alibaba.com>, Shuah Khan <shuah@...nel.org>,
linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
netdev@...r.kernel.org, kvm@...r.kernel.org,
virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org,
Yuri Benditovich <yuri.benditovich@...nix.com>,
Andrew Melnychenko <andrew@...nix.com>
Subject: Re: [PATCH RFC v3 6/9] tun: Introduce virtio-net hash reporting
feature
On 2024/09/18 15:17, Willem de Bruijn wrote:
> Akihiko Odaki wrote:
>> Allow the guest to reuse the hash value to make receive steering
>> consistent between the host and guest, and to save hash computation.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
>> ---
>> Documentation/networking/tuntap.rst | 7 ++
>> drivers/net/Kconfig | 1 +
>> drivers/net/tun.c | 146 +++++++++++++++++++++++++++++++-----
>> include/uapi/linux/if_tun.h | 44 +++++++++++
>> 4 files changed, 180 insertions(+), 18 deletions(-)
>>
>> diff --git a/Documentation/networking/tuntap.rst b/Documentation/networking/tuntap.rst
>> index 4d7087f727be..86b4ae8caa8a 100644
>> --- a/Documentation/networking/tuntap.rst
>> +++ b/Documentation/networking/tuntap.rst
>> @@ -206,6 +206,13 @@ enable is true we enable it, otherwise we disable it::
>> return ioctl(fd, TUNSETQUEUE, (void *)&ifr);
>> }
>>
>> +3.4 Reference
>> +-------------
>> +
>> +``linux/if_tun.h`` defines the interface described below:
>> +
>> +.. kernel-doc:: include/uapi/linux/if_tun.h
>> +
>> Universal TUN/TAP device driver Frequently Asked Question
>> =========================================================
>>
>> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
>> index 9920b3a68ed1..e2a7bd703550 100644
>> --- a/drivers/net/Kconfig
>> +++ b/drivers/net/Kconfig
>> @@ -395,6 +395,7 @@ config TUN
>> tristate "Universal TUN/TAP device driver support"
>> depends on INET
>> select CRC32
>> + select SKB_EXTENSIONS
>> help
>> TUN/TAP provides packet reception and transmission for user space
>> programs. It can be viewed as a simple Point-to-Point or Ethernet
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 9d93ab9ee58f..b8fcd71becac 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -173,6 +173,10 @@ struct tun_prog {
>> struct bpf_prog *prog;
>> };
>>
>> +struct tun_vnet_hash_container {
>> + struct tun_vnet_hash common;
>> +};
>> +
>> /* Since the socket were moved to tun_file, to preserve the behavior of persist
>> * device, socket filter, sndbuf and vnet header size were restore when the
>> * file were attached to a persist device.
>> @@ -210,6 +214,7 @@ struct tun_struct {
>> struct bpf_prog __rcu *xdp_prog;
>> struct tun_prog __rcu *steering_prog;
>> struct tun_prog __rcu *filter_prog;
>> + struct tun_vnet_hash_container __rcu *vnet_hash;
>
> This is just
>
> +struct tun_vnet_hash {
> + u32 value;
> + u16 report;
> +};
>
> Can just be fields in the struct directly.
I will change to store struct tun_vnet_hash directly.
>
> Also, only one bit really used for report, so probably can be
> condensed further.
It is more than one bit; the report types are defined as follows:
#define VIRTIO_NET_HASH_REPORT_NONE 0
#define VIRTIO_NET_HASH_REPORT_IPv4 1
#define VIRTIO_NET_HASH_REPORT_TCPv4 2
#define VIRTIO_NET_HASH_REPORT_UDPv4 3
#define VIRTIO_NET_HASH_REPORT_IPv6 4
#define VIRTIO_NET_HASH_REPORT_TCPv6 5
#define VIRTIO_NET_HASH_REPORT_UDPv6 6
#define VIRTIO_NET_HASH_REPORT_IPv6_EX 7
#define VIRTIO_NET_HASH_REPORT_TCPv6_EX 8
#define VIRTIO_NET_HASH_REPORT_UDPv6_EX 9
>
>> struct ethtool_link_ksettings link_ksettings;
>> /* init args */
>> struct file *file;
>> @@ -221,6 +226,11 @@ struct veth {
>> __be16 h_vlan_TCI;
>> };
>>
>> +static const struct tun_vnet_hash tun_vnet_hash_cap = {
>> + .flags = TUN_VNET_HASH_REPORT,
>> + .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>> +};
>> +
>> static void tun_flow_init(struct tun_struct *tun);
>> static void tun_flow_uninit(struct tun_struct *tun);
>>
>> @@ -322,10 +332,17 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
>> if (get_user(be, argp))
>> return -EFAULT;
>>
>> - if (be)
>> + if (be) {
>> + struct tun_vnet_hash_container *vnet_hash = rtnl_dereference(tun->vnet_hash);
>> +
>> + if (!(tun->flags & TUN_VNET_LE) &&
>> + vnet_hash && (vnet_hash->flags & TUN_VNET_HASH_REPORT))
>> + return -EBUSY;
>> +
>
> Doesn't be here imply !tun->flags & TUN_VNET_LE? Same again below.
Unfortunately no. TUN_VNET_LE and TUN_VNET_BE can be set at the same
time, and TUN_VNET_LE is enforced in such a case.
>
>> tun->flags |= TUN_VNET_BE;
>> - else
>> + } else {
>> tun->flags &= ~TUN_VNET_BE;
>> + }
>>
>> return 0;
>> }
>> @@ -522,14 +539,20 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
>> * the userspace application move between processors, we may get a
>> * different rxq no. here.
>> */
>> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>> +static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb,
>> + const struct tun_vnet_hash_container *vnet_hash)
>> {
>> + struct tun_vnet_hash_ext *ext;
>> + struct flow_keys keys;
>> struct tun_flow_entry *e;
>> u32 txq, numqueues;
>>
>> numqueues = READ_ONCE(tun->numqueues);
>>
>> - txq = __skb_get_hash_symmetric(skb);
>> + memset(&keys, 0, sizeof(keys));
>> + skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys, 0);
>> +
>> + txq = flow_hash_from_keys(&keys);
>> e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
>> if (e) {
>> tun_flow_save_rps_rxhash(e, txq);
>> @@ -538,6 +561,16 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>> txq = reciprocal_scale(txq, numqueues);
>> }
>>
>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT)) {
>> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>> + if (ext) {
>> + u32 types = vnet_hash->common.types;
>> +
>> + ext->report = virtio_net_hash_report(types, keys.basic);
>> + ext->value = skb->l4_hash ? skb->hash : txq;
>> + }
>> + }
>> +
>> return txq;
>> }
>>
>> @@ -565,10 +598,13 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>> u16 ret;
>>
>> rcu_read_lock();
>> - if (rcu_dereference(tun->steering_prog))
>> + if (rcu_dereference(tun->steering_prog)) {
>> ret = tun_ebpf_select_queue(tun, skb);
>> - else
>> - ret = tun_automq_select_queue(tun, skb);
>> + } else {
>> + struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
>> +
>> + ret = tun_automq_select_queue(tun, skb, vnet_hash);
>
> Already passing tun, no need to pass tun->vnet_hash separately.
I will remove the parameter with v4.
>> + }
>> rcu_read_unlock();
>>
>> return ret;
>> @@ -2120,33 +2156,63 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>> }
>>
>> if (vnet_hdr_sz) {
>> - struct virtio_net_hdr gso;
>> + struct tun_vnet_hash_ext *ext;
>> + size_t vnet_hdr_content_sz = sizeof(struct virtio_net_hdr);
>> + union {
>> + struct virtio_net_hdr hdr;
>> + struct virtio_net_hdr_v1_hash hdr_v1_hash;
>> + } vnet_hdr;
>> + int ret;
>>
>> if (iov_iter_count(iter) < vnet_hdr_sz)
>> return -EINVAL;
>>
>> - if (virtio_net_hdr_from_skb(skb, &gso,
>> - tun_is_little_endian(tun), true,
>> - vlan_hlen)) {
>> + ext = vnet_hdr_sz < sizeof(vnet_hdr.hdr_v1_hash) ?
>> + NULL : skb_ext_find(skb, SKB_EXT_TUN_VNET_HASH);
>> +
>> + if (ext) {
>> + struct virtio_net_hash hash = {
>> + .value = ext->value,
>> + .report = ext->report,
>> + };
>> +
>> + vnet_hdr_content_sz = sizeof(vnet_hdr.hdr_v1_hash);
>> + ret = virtio_net_hdr_v1_hash_from_skb(skb,
>> + &vnet_hdr.hdr_v1_hash,
>> + true,
>> + vlan_hlen,
>> + &hash);
>> + } else {
>> + vnet_hdr_content_sz = sizeof(struct virtio_net_hdr);
>> + ret = virtio_net_hdr_from_skb(skb,
>> + &vnet_hdr.hdr,
>> + tun_is_little_endian(tun),
>> + true,
>> + vlan_hlen);
>> + }
>> +
>
> This is why just setting the fields directly rather than adding
> virtio_net_hdr_v1_hash_from_skb is actually simpler.
I'll make a change accordingly in v4.
>
>> + if (ret) {
>> struct skb_shared_info *sinfo = skb_shinfo(skb);
>>
>> if (net_ratelimit()) {
>> netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
>> - sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
>> - tun16_to_cpu(tun, gso.hdr_len));
>> + sinfo->gso_type,
>> + tun16_to_cpu(tun, vnet_hdr.hdr.gso_size),
>> + tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len));
>> print_hex_dump(KERN_ERR, "tun: ",
>> DUMP_PREFIX_NONE,
>> 16, 1, skb->head,
>> - min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
>> + min(tun16_to_cpu(tun, vnet_hdr.hdr.hdr_len), 64),
>> + true);
>> }
>> WARN_ON_ONCE(1);
>> return -EINVAL;
>> }
>>
>> - if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
>> + if (copy_to_iter(&vnet_hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz)
>> return -EFAULT;
>>
>> - iov_iter_zero(vnet_hdr_sz - sizeof(gso), iter);
>> + iov_iter_zero(vnet_hdr_sz - vnet_hdr_content_sz, iter);
>> }
>>
>> if (vlan_hlen) {
>> @@ -3094,6 +3160,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> int le;
>> int ret;
>> bool do_notify = false;
>> + struct tun_vnet_hash vnet_hash_common;
>> + struct tun_vnet_hash_container *vnet_hash;
>>
>> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
>> (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
>> @@ -3115,6 +3183,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
>> return -EPERM;
>> return open_related_ns(&net->ns, get_net_ns);
>> + } else if (cmd == TUNGETVNETHASHCAP) {
>> + return copy_to_user(argp, &tun_vnet_hash_cap, sizeof(tun_vnet_hash_cap)) ?
>> + -EFAULT : 0;
>> }
>>
>> rtnl_lock();
>> @@ -3314,6 +3385,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> break;
>> }
>>
>> + vnet_hash = rtnl_dereference(tun->vnet_hash);
>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) &&
>> + vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr_v1_hash)) {
>> + ret = -EBUSY;
>> + break;
>> + }
>> +
>> tun->vnet_hdr_sz = vnet_hdr_sz;
>> break;
>>
>> @@ -3328,10 +3406,18 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> ret = -EFAULT;
>> break;
>> }
>> - if (le)
>> + if (le) {
>> tun->flags |= TUN_VNET_LE;
>> - else
>> + } else {
>> + vnet_hash = rtnl_dereference(tun->vnet_hash);
>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) &&
>> + !tun_legacy_is_little_endian(tun)) {
>> + ret = -EBUSY;
>> + break;
>> + }
>> +
>> tun->flags &= ~TUN_VNET_LE;
>> + }
>> break;
>>
>> case TUNGETVNETBE:
>> @@ -3396,6 +3482,30 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>> ret = open_related_ns(&net->ns, get_net_ns);
>> break;
>>
>> + case TUNSETVNETHASH:
>> + if (copy_from_user(&vnet_hash_common, argp, sizeof(vnet_hash_common))) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> + argp = (struct tun_vnet_hash __user *)argp + 1;
>> +
>> + if ((vnet_hash_common.flags & TUN_VNET_HASH_REPORT) &&
>> + (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) ||
>> + !tun_is_little_endian(tun))) {
>> + ret = -EBUSY;
>> + break;
>> + }
>> +
>> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>> + if (!vnet_hash) {
>> + ret = -ENOMEM;
>> + break;
>> + }
>> +
>> + vnet_hash->common = vnet_hash_common;
>> + kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(tun->vnet_hash, vnet_hash));
>> + break;
>> +
>> default:
>> ret = -EINVAL;
>> break;
>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>> index 287cdc81c939..1561e8ce0a0a 100644
>> --- a/include/uapi/linux/if_tun.h
>> +++ b/include/uapi/linux/if_tun.h
>> @@ -62,6 +62,30 @@
>> #define TUNSETCARRIER _IOW('T', 226, int)
>> #define TUNGETDEVNETNS _IO('T', 227)
>>
>> +/**
>> + * define TUNGETVNETHASHCAP - ioctl to get virtio_net hashing capability.
>> + *
>> + * The argument is a pointer to &struct tun_vnet_hash which will store the
>> + * maximal virtio_net hashing configuration.
>> + */
>> +#define TUNGETVNETHASHCAP _IOR('T', 228, struct tun_vnet_hash)
>> +
>> +/**
>> + * define TUNSETVNETHASH - ioctl to configure virtio_net hashing
>> + *
>> + * The argument is a pointer to &struct tun_vnet_hash.
>> + *
>> + * %TUNSETVNETHDRSZ ioctl must be called with a number greater than or equal to
>> + * the size of &struct virtio_net_hdr_v1_hash before calling this ioctl with
>> + * %TUN_VNET_HASH_REPORT.
>> + *
>> + * The virtio_net header must be configured as little-endian before calling this
>> + * ioctl with %TUN_VNET_HASH_REPORT.
>> + *
>> + * This ioctl currently has no effect on XDP packets.
>> + */
>> +#define TUNSETVNETHASH _IOW('T', 229, struct tun_vnet_hash)
>> +
>> /* TUNSETIFF ifr flags */
>> #define IFF_TUN 0x0001
>> #define IFF_TAP 0x0002
>> @@ -115,4 +139,24 @@ struct tun_filter {
>> __u8 addr[][ETH_ALEN];
>> };
>>
>> +/**
>> + * define TUN_VNET_HASH_REPORT - Request virtio_net hash reporting for vhost
>> + */
>> +#define TUN_VNET_HASH_REPORT 0x0001
>> +
>> +/**
>> + * struct tun_vnet_hash - virtio_net hashing configuration
>> + * @flags:
>> + * Bitmask consists of %TUN_VNET_HASH_REPORT and %TUN_VNET_HASH_RSS
>> + * @pad:
>> + * Should be filled with zero before passing to %TUNSETVNETHASH
>> + * @types:
>> + * Bitmask of allowed hash types
>> + */
>> +struct tun_vnet_hash {
>> + __u16 flags;
>> + __u8 pad[2];
>> + __u32 types;
>> +};
>> +
>
> The values for flags and types should probably be defined here.
I put TUN_VNET_HASH_REPORT before struct tun_vnet_hash following the
examples of TUN_PKT_STRIP/struct tun_pi and TUN_FLT_ALLMULTI/struct
tun_filter. The types are defined in: include/uapi/linux/virtio_net.h
Regards,
Akihiko Odaki
>
>> #endif /* _UAPI__IF_TUN_H */
>>
>> --
>> 2.46.0
>>
>
>
Powered by blists - more mailing lists