lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4bc7dfaa-a7cd-41f4-a917-e71b5c7241f7@daynix.com>
Date: Sat, 12 Oct 2024 19:29:01 +0900
From: Akihiko Odaki <akihiko.odaki@...nix.com>
To: Jason Wang <jasowang@...hat.com>
Cc: Jonathan Corbet <corbet@....net>,
 Willem de Bruijn <willemdebruijn.kernel@...il.com>,
 "David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
 Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
 "Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo
 <xuanzhuo@...ux.alibaba.com>, Shuah Khan <shuah@...nel.org>,
 linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
 netdev@...r.kernel.org, kvm@...r.kernel.org,
 virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org,
 Yuri Benditovich <yuri.benditovich@...nix.com>,
 Andrew Melnychenko <andrew@...nix.com>,
 Stephen Hemminger <stephen@...workplumber.org>, gur.stavi@...wei.com
Subject: Re: [PATCH RFC v5 07/10] tun: Introduce virtio-net RSS

On 2024/10/09 17:14, Jason Wang wrote:
> On Tue, Oct 8, 2024 at 2:55 PM Akihiko Odaki <akihiko.odaki@...nix.com> wrote:
>>
>> RSS is a receive steering algorithm that can be negotiated to use with
>> virtio_net. Conventionally the hash calculation was done by the VMM.
>> However, computing the hash after the queue was chosen defeats the
>> purpose of RSS.
>>
>> Another approach is to use eBPF steering program. This approach has
>> another downside: it cannot report the calculated hash due to the
>> restrictive nature of eBPF steering program.
>>
>> Introduce the code to perform RSS to the kernel in order to overcome
>> thse challenges. An alternative solution is to extend the eBPF steering
>> program so that it will be able to report to the userspace, but I didn't
>> opt for it because extending the current mechanism of eBPF steering
>> program as is because it relies on legacy context rewriting, and
>> introducing kfunc-based eBPF will result in non-UAPI dependency while
>> the other relevant virtualization APIs such as KVM and vhost_net are
>> UAPIs.
>>
>> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
>> ---
>>   drivers/net/tap.c           | 11 +++++-
>>   drivers/net/tun.c           | 57 ++++++++++++++++++++-------
>>   drivers/net/tun_vnet.h      | 96 +++++++++++++++++++++++++++++++++++++++++----
>>   include/linux/if_tap.h      |  4 +-
>>   include/uapi/linux/if_tun.h | 27 +++++++++++++
>>   5 files changed, 169 insertions(+), 26 deletions(-)
>>
>> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
>> index 5e2fbe63ca47..a58b83285af4 100644
>> --- a/drivers/net/tap.c
>> +++ b/drivers/net/tap.c
>> @@ -207,6 +207,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>>           * racing against queue removal.
>>           */
>>          int numvtaps = READ_ONCE(tap->numvtaps);
>> +       struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash);
>>          __u32 rxq;
>>
>>          *tap_add_hash(skb) = (struct virtio_net_hash) { .report = VIRTIO_NET_HASH_REPORT_NONE };
>> @@ -217,6 +218,12 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>>          if (numvtaps == 1)
>>                  goto single;
>>
>> +       if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
>> +               rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash);
>> +               queue = rcu_dereference(tap->taps[rxq]);
>> +               goto out;
>> +       }
>> +
>>          if (!skb->l4_hash && !skb->sw_hash) {
>>                  struct flow_keys keys;
>>
>> @@ -234,7 +241,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>>
>>          /* Check if we can use flow to select a queue */
>>          if (rxq) {
>> -               tun_vnet_hash_report(&tap->vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
>> +               tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
>>                  queue = rcu_dereference(tap->taps[rxq % numvtaps]);
>>                  goto out;
>>          }
>> @@ -1058,7 +1065,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
>>                  tap = rtnl_dereference(q->tap);
>>                  ret = tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags,
>>                                       tap ? &tap->vnet_hash : NULL, -EINVAL,
>> -                                    cmd, sp);
>> +                                    true, cmd, sp);
>>                  rtnl_unlock();
>>                  return ret;
>>          }
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 27308417b834..18528568aed7 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -209,7 +209,7 @@ struct tun_struct {
>>          struct bpf_prog __rcu *xdp_prog;
>>          struct tun_prog __rcu *steering_prog;
>>          struct tun_prog __rcu *filter_prog;
>> -       struct tun_vnet_hash vnet_hash;
>> +       struct tun_vnet_hash_container __rcu *vnet_hash;
>>          struct ethtool_link_ksettings link_ksettings;
>>          /* init args */
>>          struct file *file;
>> @@ -468,7 +468,9 @@ static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb)
>>    * the userspace application move between processors, we may get a
>>    * different rxq no. here.
>>    */
>> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>> +static u16 tun_automq_select_queue(struct tun_struct *tun,
>> +                                  const struct tun_vnet_hash_container *vnet_hash,
>> +                                  struct sk_buff *skb)
>>   {
>>          struct flow_keys keys;
>>          struct flow_keys_basic keys_basic;
>> @@ -493,7 +495,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>>                  .control = keys.control,
>>                  .basic = keys.basic
>>          };
>> -       tun_vnet_hash_report(&tun->vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
>> +       tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
>>                               tun_add_hash);
>>
>>          return txq;
>> @@ -523,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>>          u16 ret;
>>
>>          rcu_read_lock();
>> -       if (rcu_dereference(tun->steering_prog))
>> +       if (rcu_dereference(tun->steering_prog)) {
>>                  ret = tun_ebpf_select_queue(tun, skb);
>> -       else
>> -               ret = tun_automq_select_queue(tun, skb);
>> +       } else {
>> +               struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
>> +
>> +               if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>> +                       ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash,
>> +                                                       skb, tun_add_hash);
>> +               else
>> +                       ret = tun_automq_select_queue(tun, vnet_hash, skb);
>> +       }
>>          rcu_read_unlock();
>>
>>          return ret;
>> @@ -2248,6 +2257,9 @@ static void tun_free_netdev(struct net_device *dev)
>>          security_tun_dev_free_security(tun->security);
>>          __tun_set_ebpf(tun, &tun->steering_prog, NULL);
>>          __tun_set_ebpf(tun, &tun->filter_prog, NULL);
>> +       rtnl_lock();
>> +       kfree_rcu_mightsleep(rtnl_dereference(tun->vnet_hash));
>> +       rtnl_unlock();
>>   }
>>
>>   static void tun_setup(struct net_device *dev)
>> @@ -2946,13 +2958,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>>   }
>>
>>   static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
>> -                       void __user *data)
>> +                       int fd)
>>   {
>>          struct bpf_prog *prog;
>> -       int fd;
>> -
>> -       if (copy_from_user(&fd, data, sizeof(fd)))
>> -               return -EFAULT;
>>
>>          if (fd == -1) {
>>                  prog = NULL;
>> @@ -3019,6 +3027,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>          int sndbuf;
>>          int ret;
>>          bool do_notify = false;
>> +       struct tun_vnet_hash_container *vnet_hash;
>>
>>          if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
>>              (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
>> @@ -3078,7 +3087,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>          }
>>
>>          if (!tun) {
>> -               ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, cmd, argp);
>> +               ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, true, cmd, argp);
>>                  goto unlock;
>>          }
>>
>> @@ -3256,11 +3265,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>                  break;
>>
>>          case TUNSETSTEERINGEBPF:
>> -               ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>> +               if (get_user(ret, (int __user *)argp)) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               vnet_hash = rtnl_dereference(tun->vnet_hash);
>> +               if (ret != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
>> +                       ret = -EBUSY;
>> +                       break;
>> +               }
>> +
>> +               ret = tun_set_ebpf(tun, &tun->steering_prog, ret);
>>                  break;
>>
>>          case TUNSETFILTEREBPF:
>> -               ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>> +               if (get_user(ret, (int __user *)argp)) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               ret = tun_set_ebpf(tun, &tun->filter_prog, ret);
>>                  break;
>>
>>          case TUNSETCARRIER:
>> @@ -3280,7 +3305,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>
>>          default:
>>                  ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags,
>> -                                    &tun->vnet_hash, -EINVAL, cmd, argp);
>> +                                    &tun->vnet_hash, -EINVAL,
>> +                                    !rtnl_dereference(tun->steering_prog),
>> +                                    cmd, argp);
>>          }
>>
>>          if (do_notify)
>> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
>> index 589a97dd7d02..f5de4fe9d14e 100644
>> --- a/drivers/net/tun_vnet.h
>> +++ b/drivers/net/tun_vnet.h
>> @@ -9,6 +9,13 @@
>>   typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *);
>>   typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *);
>>
>> +struct tun_vnet_hash_container {
>> +       struct tun_vnet_hash common;
>> +       struct tun_vnet_hash_rss rss;
>> +       u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>> +       u16 rss_indirection_table[];
>> +};
>> +
>>   static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
>>   {
>>          return !(IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE)) &&
>> @@ -62,14 +69,16 @@ static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val)
>>   }
>>
>>   static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
>> -                                 struct tun_vnet_hash *hash, long fallback,
>> +                                 struct tun_vnet_hash_container __rcu **hashp,
>> +                                 long fallback, bool can_rss,
>>                                    unsigned int cmd, void __user *argp)
>>   {
>>          static const struct tun_vnet_hash cap = {
>> -               .flags = TUN_VNET_HASH_REPORT,
>> +               .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>>                  .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>>          };
>>          struct tun_vnet_hash hash_buf;
>> +       struct tun_vnet_hash_container *hash;
>>          int __user *sp = argp;
>>          int s;
>>
>> @@ -132,13 +141,57 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
>>                  return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0;
>>
>>          case TUNSETVNETHASH:
>> -               if (!hash)
>> +               if (!hashp)
>>                          return -EBADFD;
>>
>>                  if (copy_from_user(&hash_buf, argp, sizeof(hash_buf)))
>>                          return -EFAULT;
>> +               argp = (struct tun_vnet_hash __user *)argp + 1;
>> +
>> +               if (hash_buf.flags & TUN_VNET_HASH_RSS) {
>> +                       struct tun_vnet_hash_rss rss;
>> +                       size_t indirection_table_size;
>> +                       size_t key_size;
>> +                       size_t size;
>> +
>> +                       if (!can_rss)
>> +                               return -EBUSY;
>> +
>> +                       if (copy_from_user(&rss, argp, sizeof(rss)))
> 
> This seems to be a change of the uAPI of TUNSETVNETHASH.
> 
>> +                               return -EFAULT;
>> +                       argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>> +
>> +                       indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;
>> +                       key_size = virtio_net_hash_key_length(hash_buf.types);
>> +                       size = struct_size(hash, rss_indirection_table,
>> +                                          (size_t)rss.indirection_table_mask + 1);
>> +
>> +                       hash = kmalloc(size, GFP_KERNEL);
>> +                       if (!hash)
>> +                               return -ENOMEM;
>> +
>> +                       if (copy_from_user(hash->rss_indirection_table,
>> +                                          argp, indirection_table_size)) {
>> +                               kfree(hash);
>> +                               return -EFAULT;
>> +                       }
>> +                       argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>> +
>> +                       if (copy_from_user(hash->rss_key, argp, key_size)) {
>> +                               kfree(hash);
>> +                               return -EFAULT;
>> +                       }
>> +
>> +                       virtio_net_toeplitz_convert_key(hash->rss_key, key_size);
>> +                       hash->rss = rss;
>> +               } else {
>> +                       hash = kmalloc(sizeof(hash->common), GFP_KERNEL);
>> +                       if (!hash)
>> +                               return -ENOMEM;
>> +               }
>>
>> -               *hash = hash_buf;
>> +               hash->common = hash_buf;
>> +               kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash));
>>                  return 0;
>>
>>          default:
>> @@ -146,7 +199,7 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
>>          }
>>   }
>>
>> -static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
>> +static inline void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash,
>>                                          struct sk_buff *skb,
>>                                          const struct flow_keys_basic *keys,
>>                                          u32 value,
>> @@ -154,7 +207,7 @@ static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
>>   {
>>          struct virtio_net_hash *report;
>>
>> -       if (!(hash->flags & TUN_VNET_HASH_REPORT))
>> +       if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT))
>>                  return;
>>
>>          report = vnet_hash_add(skb);
>> @@ -162,11 +215,40 @@ static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
>>                  return;
>>
>>          *report = (struct virtio_net_hash) {
>> -               .report = virtio_net_hash_report(hash->types, keys),
>> +               .report = virtio_net_hash_report(hash->common.types, keys),
>>                  .value = value
>>          };
>>   }
>>
>> +static inline u16 tun_vnet_rss_select_queue(u32 numqueues,
>> +                                           const struct tun_vnet_hash_container *hash,
>> +                                           struct sk_buff *skb,
>> +                                           tun_vnet_hash_add vnet_hash_add)
>> +{
>> +       struct virtio_net_hash *report;
>> +       struct virtio_net_hash ret;
>> +       u16 txq, index;
>> +
>> +       if (!numqueues)
>> +               return 0;
>> +
>> +       virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret);
>> +
>> +       if (!ret.report)
>> +               return hash->rss.unclassified_queue % numqueues;
>> +
>> +       if (hash->common.flags & TUN_VNET_HASH_REPORT) {
>> +               report = vnet_hash_add(skb);
>> +               if (report)
>> +                       *report = ret;
>> +       }
>> +
>> +       index = ret.value & hash->rss.indirection_table_mask;
>> +       txq = READ_ONCE(hash->rss_indirection_table[index]);
>> +
>> +       return txq % numqueues;
>> +}
>> +
>>   static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
>>                                     struct iov_iter *from,
>>                                     struct virtio_net_hdr *hdr)
>> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
>> index 5bbb343a6dba..7334c46a3f10 100644
>> --- a/include/linux/if_tap.h
>> +++ b/include/linux/if_tap.h
>> @@ -4,7 +4,6 @@
>>
>>   #include <net/sock.h>
>>   #include <linux/skb_array.h>
>> -#include <uapi/linux/if_tun.h>
>>
>>   struct file;
>>   struct socket;
>> @@ -32,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
>>   #define MAX_TAP_QUEUES 256
>>
>>   struct tap_queue;
>> +struct tun_vnet_hash_container;
>>
>>   struct tap_dev {
>>          struct net_device       *dev;
>> @@ -44,7 +44,7 @@ struct tap_dev {
>>          int                     numqueues;
>>          netdev_features_t       tap_features;
>>          int                     minor;
>> -       struct tun_vnet_hash    vnet_hash;
>> +       struct tun_vnet_hash_container __rcu *vnet_hash;
>>
>>          void (*update_features)(struct tap_dev *tap, netdev_features_t features);
>>          void (*count_tx_dropped)(struct tap_dev *tap);
>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>> index d11e79b4e0dc..4887f97500a8 100644
>> --- a/include/uapi/linux/if_tun.h
>> +++ b/include/uapi/linux/if_tun.h
>> @@ -75,6 +75,14 @@
>>    *
>>    * The argument is a pointer to &struct tun_vnet_hash.
>>    *
>> + * The argument is a pointer to the compound of the following in order if
>> + * %TUN_VNET_HASH_RSS is set:
>> + *
>> + * 1. &struct tun_vnet_hash
>> + * 2. &struct tun_vnet_hash_rss
>> + * 3. Indirection table
>> + * 4. Key
>> + *
> 
> Let's try not modify uAPI. We can introduce new ioctl if necessary.

2, 3, and 4 are new additions. Adding a separate ioctl for them means we 
need to call two ioctls to configure RSS and it is hard to design the 
interactions with them.

For example, if we set TUN_VNET_HASH_RSS with TUNSETVNETHASH before 
setting struct tun_vnet_hash_rss with another ioctl, tuntap will enable 
RSS with undefined parameters. Setting struct tun_vnet_hash_rss with 
TUN_VNET_HASH_RSS unset also sounds unreasnoable.

Letting the new ioctl set TUN_VNET_HASH_RSS does not help either. 
TUNSETVNETHASH still sets the bitmask of allowed hash types so RSS will 
depend on two ioctls.

> 
>>    * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
>>    * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal
>>    * to the size of &struct virtio_net_hdr_v1_hash.
>> @@ -148,6 +156,13 @@ struct tun_filter {
>>    */
>>   #define TUN_VNET_HASH_REPORT   0x0001
>>
>> +/**
>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>> + *
>> + * This is mutually exclusive with eBPF steering program.
>> + */
>> +#define TUN_VNET_HASH_RSS      0x0002
>> +
>>   /**
>>    * struct tun_vnet_hash - virtio_net hashing configuration
>>    * @flags:
>> @@ -163,4 +178,16 @@ struct tun_vnet_hash {
>>          __u32 types;
>>   };
>>
>> +/**
>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>> + * @indirection_table_mask:
>> + *             Bitmask to be applied to the indirection table index
>> + * @unclassified_queue:
>> + *             The index of the queue to place unclassified packets in
>> + */
>> +struct tun_vnet_hash_rss {
>> +       __u16 indirection_table_mask;
>> +       __u16 unclassified_queue;
>> +};
>> +
>>   #endif /* _UAPI__IF_TUN_H */
>>
>> --
>> 2.46.2
>>
> 
> Thanks
> 


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ