lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACGkMEsPNTr3zcstsQGoOiQdCFQ+6EG6cSGiZzNxONsH9Xm=Aw@mail.gmail.com>
Date: Wed, 9 Oct 2024 16:14:41 +0800
From: Jason Wang <jasowang@...hat.com>
To: Akihiko Odaki <akihiko.odaki@...nix.com>
Cc: Jonathan Corbet <corbet@....net>, Willem de Bruijn <willemdebruijn.kernel@...il.com>, 
	"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, 
	Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, 
	"Michael S. Tsirkin" <mst@...hat.com>, Xuan Zhuo <xuanzhuo@...ux.alibaba.com>, 
	Shuah Khan <shuah@...nel.org>, linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org, 
	netdev@...r.kernel.org, kvm@...r.kernel.org, 
	virtualization@...ts.linux-foundation.org, linux-kselftest@...r.kernel.org, 
	Yuri Benditovich <yuri.benditovich@...nix.com>, Andrew Melnychenko <andrew@...nix.com>, 
	Stephen Hemminger <stephen@...workplumber.org>, gur.stavi@...wei.com
Subject: Re: [PATCH RFC v5 07/10] tun: Introduce virtio-net RSS

On Tue, Oct 8, 2024 at 2:55 PM Akihiko Odaki <akihiko.odaki@...nix.com> wrote:
>
> RSS is a receive steering algorithm that can be negotiated to use with
> virtio_net. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
>
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF steering program.
>
> Introduce the code to perform RSS to the kernel in order to overcome
> thse challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but I didn't
> opt for it because extending the current mechanism of eBPF steering
> program as is because it relies on legacy context rewriting, and
> introducing kfunc-based eBPF will result in non-UAPI dependency while
> the other relevant virtualization APIs such as KVM and vhost_net are
> UAPIs.
>
> Signed-off-by: Akihiko Odaki <akihiko.odaki@...nix.com>
> ---
>  drivers/net/tap.c           | 11 +++++-
>  drivers/net/tun.c           | 57 ++++++++++++++++++++-------
>  drivers/net/tun_vnet.h      | 96 +++++++++++++++++++++++++++++++++++++++++----
>  include/linux/if_tap.h      |  4 +-
>  include/uapi/linux/if_tun.h | 27 +++++++++++++
>  5 files changed, 169 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 5e2fbe63ca47..a58b83285af4 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -207,6 +207,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>          * racing against queue removal.
>          */
>         int numvtaps = READ_ONCE(tap->numvtaps);
> +       struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tap->vnet_hash);
>         __u32 rxq;
>
>         *tap_add_hash(skb) = (struct virtio_net_hash) { .report = VIRTIO_NET_HASH_REPORT_NONE };
> @@ -217,6 +218,12 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>         if (numvtaps == 1)
>                 goto single;
>
> +       if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> +               rxq = tun_vnet_rss_select_queue(numvtaps, vnet_hash, skb, tap_add_hash);
> +               queue = rcu_dereference(tap->taps[rxq]);
> +               goto out;
> +       }
> +
>         if (!skb->l4_hash && !skb->sw_hash) {
>                 struct flow_keys keys;
>
> @@ -234,7 +241,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
>
>         /* Check if we can use flow to select a queue */
>         if (rxq) {
> -               tun_vnet_hash_report(&tap->vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
> +               tun_vnet_hash_report(vnet_hash, skb, &keys_basic, rxq, tap_add_hash);
>                 queue = rcu_dereference(tap->taps[rxq % numvtaps]);
>                 goto out;
>         }
> @@ -1058,7 +1065,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
>                 tap = rtnl_dereference(q->tap);
>                 ret = tun_vnet_ioctl(&q->vnet_hdr_sz, &q->flags,
>                                      tap ? &tap->vnet_hash : NULL, -EINVAL,
> -                                    cmd, sp);
> +                                    true, cmd, sp);
>                 rtnl_unlock();
>                 return ret;
>         }
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 27308417b834..18528568aed7 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -209,7 +209,7 @@ struct tun_struct {
>         struct bpf_prog __rcu *xdp_prog;
>         struct tun_prog __rcu *steering_prog;
>         struct tun_prog __rcu *filter_prog;
> -       struct tun_vnet_hash vnet_hash;
> +       struct tun_vnet_hash_container __rcu *vnet_hash;
>         struct ethtool_link_ksettings link_ksettings;
>         /* init args */
>         struct file *file;
> @@ -468,7 +468,9 @@ static const struct virtio_net_hash *tun_find_hash(const struct sk_buff *skb)
>   * the userspace application move between processors, we may get a
>   * different rxq no. here.
>   */
> -static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> +static u16 tun_automq_select_queue(struct tun_struct *tun,
> +                                  const struct tun_vnet_hash_container *vnet_hash,
> +                                  struct sk_buff *skb)
>  {
>         struct flow_keys keys;
>         struct flow_keys_basic keys_basic;
> @@ -493,7 +495,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>                 .control = keys.control,
>                 .basic = keys.basic
>         };
> -       tun_vnet_hash_report(&tun->vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
> +       tun_vnet_hash_report(vnet_hash, skb, &keys_basic, skb->l4_hash ? skb->hash : txq,
>                              tun_add_hash);
>
>         return txq;
> @@ -523,10 +525,17 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>         u16 ret;
>
>         rcu_read_lock();
> -       if (rcu_dereference(tun->steering_prog))
> +       if (rcu_dereference(tun->steering_prog)) {
>                 ret = tun_ebpf_select_queue(tun, skb);
> -       else
> -               ret = tun_automq_select_queue(tun, skb);
> +       } else {
> +               struct tun_vnet_hash_container *vnet_hash = rcu_dereference(tun->vnet_hash);
> +
> +               if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
> +                       ret = tun_vnet_rss_select_queue(READ_ONCE(tun->numqueues), vnet_hash,
> +                                                       skb, tun_add_hash);
> +               else
> +                       ret = tun_automq_select_queue(tun, vnet_hash, skb);
> +       }
>         rcu_read_unlock();
>
>         return ret;
> @@ -2248,6 +2257,9 @@ static void tun_free_netdev(struct net_device *dev)
>         security_tun_dev_free_security(tun->security);
>         __tun_set_ebpf(tun, &tun->steering_prog, NULL);
>         __tun_set_ebpf(tun, &tun->filter_prog, NULL);
> +       rtnl_lock();
> +       kfree_rcu_mightsleep(rtnl_dereference(tun->vnet_hash));
> +       rtnl_unlock();
>  }
>
>  static void tun_setup(struct net_device *dev)
> @@ -2946,13 +2958,9 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>  }
>
>  static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
> -                       void __user *data)
> +                       int fd)
>  {
>         struct bpf_prog *prog;
> -       int fd;
> -
> -       if (copy_from_user(&fd, data, sizeof(fd)))
> -               return -EFAULT;
>
>         if (fd == -1) {
>                 prog = NULL;
> @@ -3019,6 +3027,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>         int sndbuf;
>         int ret;
>         bool do_notify = false;
> +       struct tun_vnet_hash_container *vnet_hash;
>
>         if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
>             (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
> @@ -3078,7 +3087,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>         }
>
>         if (!tun) {
> -               ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, cmd, argp);
> +               ret = tun_vnet_ioctl(NULL, NULL, NULL, -EBADFD, true, cmd, argp);
>                 goto unlock;
>         }
>
> @@ -3256,11 +3265,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>                 break;
>
>         case TUNSETSTEERINGEBPF:
> -               ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +               if (get_user(ret, (int __user *)argp)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               vnet_hash = rtnl_dereference(tun->vnet_hash);
> +               if (ret != -1 && vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS)) {
> +                       ret = -EBUSY;
> +                       break;
> +               }
> +
> +               ret = tun_set_ebpf(tun, &tun->steering_prog, ret);
>                 break;
>
>         case TUNSETFILTEREBPF:
> -               ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +               if (get_user(ret, (int __user *)argp)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               ret = tun_set_ebpf(tun, &tun->filter_prog, ret);
>                 break;
>
>         case TUNSETCARRIER:
> @@ -3280,7 +3305,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>
>         default:
>                 ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags,
> -                                    &tun->vnet_hash, -EINVAL, cmd, argp);
> +                                    &tun->vnet_hash, -EINVAL,
> +                                    !rtnl_dereference(tun->steering_prog),
> +                                    cmd, argp);
>         }
>
>         if (do_notify)
> diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
> index 589a97dd7d02..f5de4fe9d14e 100644
> --- a/drivers/net/tun_vnet.h
> +++ b/drivers/net/tun_vnet.h
> @@ -9,6 +9,13 @@
>  typedef struct virtio_net_hash *(*tun_vnet_hash_add)(struct sk_buff *);
>  typedef const struct virtio_net_hash *(*tun_vnet_hash_find)(const struct sk_buff *);
>
> +struct tun_vnet_hash_container {
> +       struct tun_vnet_hash common;
> +       struct tun_vnet_hash_rss rss;
> +       u32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> +       u16 rss_indirection_table[];
> +};
> +
>  static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
>  {
>         return !(IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE)) &&
> @@ -62,14 +69,16 @@ static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val)
>  }
>
>  static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
> -                                 struct tun_vnet_hash *hash, long fallback,
> +                                 struct tun_vnet_hash_container __rcu **hashp,
> +                                 long fallback, bool can_rss,
>                                   unsigned int cmd, void __user *argp)
>  {
>         static const struct tun_vnet_hash cap = {
> -               .flags = TUN_VNET_HASH_REPORT,
> +               .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>                 .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>         };
>         struct tun_vnet_hash hash_buf;
> +       struct tun_vnet_hash_container *hash;
>         int __user *sp = argp;
>         int s;
>
> @@ -132,13 +141,57 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
>                 return copy_to_user(argp, &cap, sizeof(cap)) ? -EFAULT : 0;
>
>         case TUNSETVNETHASH:
> -               if (!hash)
> +               if (!hashp)
>                         return -EBADFD;
>
>                 if (copy_from_user(&hash_buf, argp, sizeof(hash_buf)))
>                         return -EFAULT;
> +               argp = (struct tun_vnet_hash __user *)argp + 1;
> +
> +               if (hash_buf.flags & TUN_VNET_HASH_RSS) {
> +                       struct tun_vnet_hash_rss rss;
> +                       size_t indirection_table_size;
> +                       size_t key_size;
> +                       size_t size;
> +
> +                       if (!can_rss)
> +                               return -EBUSY;
> +
> +                       if (copy_from_user(&rss, argp, sizeof(rss)))

This seems to be a change of the uAPI of TUNSETVNETHASH.

> +                               return -EFAULT;
> +                       argp = (struct tun_vnet_hash_rss __user *)argp + 1;
> +
> +                       indirection_table_size = ((size_t)rss.indirection_table_mask + 1) * 2;
> +                       key_size = virtio_net_hash_key_length(hash_buf.types);
> +                       size = struct_size(hash, rss_indirection_table,
> +                                          (size_t)rss.indirection_table_mask + 1);
> +
> +                       hash = kmalloc(size, GFP_KERNEL);
> +                       if (!hash)
> +                               return -ENOMEM;
> +
> +                       if (copy_from_user(hash->rss_indirection_table,
> +                                          argp, indirection_table_size)) {
> +                               kfree(hash);
> +                               return -EFAULT;
> +                       }
> +                       argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
> +
> +                       if (copy_from_user(hash->rss_key, argp, key_size)) {
> +                               kfree(hash);
> +                               return -EFAULT;
> +                       }
> +
> +                       virtio_net_toeplitz_convert_key(hash->rss_key, key_size);
> +                       hash->rss = rss;
> +               } else {
> +                       hash = kmalloc(sizeof(hash->common), GFP_KERNEL);
> +                       if (!hash)
> +                               return -ENOMEM;
> +               }
>
> -               *hash = hash_buf;
> +               hash->common = hash_buf;
> +               kfree_rcu_mightsleep(rcu_replace_pointer_rtnl(*hashp, hash));
>                 return 0;
>
>         default:
> @@ -146,7 +199,7 @@ static inline long tun_vnet_ioctl(int *sz, unsigned int *flags,
>         }
>  }
>
> -static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
> +static inline void tun_vnet_hash_report(const struct tun_vnet_hash_container *hash,
>                                         struct sk_buff *skb,
>                                         const struct flow_keys_basic *keys,
>                                         u32 value,
> @@ -154,7 +207,7 @@ static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
>  {
>         struct virtio_net_hash *report;
>
> -       if (!(hash->flags & TUN_VNET_HASH_REPORT))
> +       if (!hash || !(hash->common.flags & TUN_VNET_HASH_REPORT))
>                 return;
>
>         report = vnet_hash_add(skb);
> @@ -162,11 +215,40 @@ static inline void tun_vnet_hash_report(const struct tun_vnet_hash *hash,
>                 return;
>
>         *report = (struct virtio_net_hash) {
> -               .report = virtio_net_hash_report(hash->types, keys),
> +               .report = virtio_net_hash_report(hash->common.types, keys),
>                 .value = value
>         };
>  }
>
> +static inline u16 tun_vnet_rss_select_queue(u32 numqueues,
> +                                           const struct tun_vnet_hash_container *hash,
> +                                           struct sk_buff *skb,
> +                                           tun_vnet_hash_add vnet_hash_add)
> +{
> +       struct virtio_net_hash *report;
> +       struct virtio_net_hash ret;
> +       u16 txq, index;
> +
> +       if (!numqueues)
> +               return 0;
> +
> +       virtio_net_hash_rss(skb, hash->common.types, hash->rss_key, &ret);
> +
> +       if (!ret.report)
> +               return hash->rss.unclassified_queue % numqueues;
> +
> +       if (hash->common.flags & TUN_VNET_HASH_REPORT) {
> +               report = vnet_hash_add(skb);
> +               if (report)
> +                       *report = ret;
> +       }
> +
> +       index = ret.value & hash->rss.indirection_table_mask;
> +       txq = READ_ONCE(hash->rss_indirection_table[index]);
> +
> +       return txq % numqueues;
> +}
> +
>  static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
>                                    struct iov_iter *from,
>                                    struct virtio_net_hdr *hdr)
> diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
> index 5bbb343a6dba..7334c46a3f10 100644
> --- a/include/linux/if_tap.h
> +++ b/include/linux/if_tap.h
> @@ -4,7 +4,6 @@
>
>  #include <net/sock.h>
>  #include <linux/skb_array.h>
> -#include <uapi/linux/if_tun.h>
>
>  struct file;
>  struct socket;
> @@ -32,6 +31,7 @@ static inline struct ptr_ring *tap_get_ptr_ring(struct file *f)
>  #define MAX_TAP_QUEUES 256
>
>  struct tap_queue;
> +struct tun_vnet_hash_container;
>
>  struct tap_dev {
>         struct net_device       *dev;
> @@ -44,7 +44,7 @@ struct tap_dev {
>         int                     numqueues;
>         netdev_features_t       tap_features;
>         int                     minor;
> -       struct tun_vnet_hash    vnet_hash;
> +       struct tun_vnet_hash_container __rcu *vnet_hash;
>
>         void (*update_features)(struct tap_dev *tap, netdev_features_t features);
>         void (*count_tx_dropped)(struct tap_dev *tap);
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index d11e79b4e0dc..4887f97500a8 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -75,6 +75,14 @@
>   *
>   * The argument is a pointer to &struct tun_vnet_hash.
>   *
> + * The argument is a pointer to the compound of the following in order if
> + * %TUN_VNET_HASH_RSS is set:
> + *
> + * 1. &struct tun_vnet_hash
> + * 2. &struct tun_vnet_hash_rss
> + * 3. Indirection table
> + * 4. Key
> + *

Let's try not modify uAPI. We can introduce new ioctl if necessary.

>   * The %TUN_VNET_HASH_REPORT flag set with this ioctl will be effective only
>   * after calling the %TUNSETVNETHDRSZ ioctl with a number greater than or equal
>   * to the size of &struct virtio_net_hdr_v1_hash.
> @@ -148,6 +156,13 @@ struct tun_filter {
>   */
>  #define TUN_VNET_HASH_REPORT   0x0001
>
> +/**
> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
> + *
> + * This is mutually exclusive with eBPF steering program.
> + */
> +#define TUN_VNET_HASH_RSS      0x0002
> +
>  /**
>   * struct tun_vnet_hash - virtio_net hashing configuration
>   * @flags:
> @@ -163,4 +178,16 @@ struct tun_vnet_hash {
>         __u32 types;
>  };
>
> +/**
> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
> + * @indirection_table_mask:
> + *             Bitmask to be applied to the indirection table index
> + * @unclassified_queue:
> + *             The index of the queue to place unclassified packets in
> + */
> +struct tun_vnet_hash_rss {
> +       __u16 indirection_table_mask;
> +       __u16 unclassified_queue;
> +};
> +
>  #endif /* _UAPI__IF_TUN_H */
>
> --
> 2.46.2
>

Thanks


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ