lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAF2d9jjQ16NycS8dRJqLm55K5rd1w6Zfni7JA29pPoZnvf_pUg@mail.gmail.com>
Date:   Mon, 25 Sep 2017 09:27:10 -0700
From:   Mahesh Bandewar (महेश बंडेवार) 
        <maheshb@...gle.com>
To:     Petar Penkov <peterpenkov96@...il.com>
Cc:     linux-netdev <netdev@...r.kernel.org>,
        Eric Dumazet <edumazet@...gle.com>,
        Willem Bruijn <willemb@...gle.com>,
        David Miller <davem@...emloft.net>,
        Petar Bozhidarov Penkov <ppenkov@...nford.edu>
Subject: Re: [PATCH,v3,net-next 2/2] tun: enable napi_gro_frags() for TUN/TAP driver

On Fri, Sep 22, 2017 at 1:49 PM, Petar Penkov <peterpenkov96@...il.com> wrote:
> Add a TUN/TAP receive mode that exercises the napi_gro_frags()
> interface. This mode is available only in TAP mode, as the interface
> expects packets with Ethernet headers.
>
> Furthermore, packets follow the layout of the iovec_iter that was
> received. The first iovec is the linear data, and every one after the
> first is a fragment. If there are more fragments than the max number,
> drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
> dissector code and to verify that the header resides in the linear data.
>
> The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
> This is imposed because this mode is intended for testing via tools like
> syzkaller and packetdrill, and the increased flexibility it provides can
> introduce security vulnerabilities. This flag is accepted only if the
> device is in TAP mode and has the IFF_NAPI flag set as well. This is
> done because both of these are explicit requirements for correct
> operation in this mode.
>
> Signed-off-by: Petar Penkov <peterpenkov96@...il.com>

Thank you Petar.

Acked-by: Mahesh Bandewar <maheshb@...gle,com>
> Cc: Eric Dumazet <edumazet@...gle.com>
> Cc: Mahesh Bandewar <maheshb@...gle.com>
> Cc: Willem de Bruijn <willemb@...gle.com>
> Cc: davem@...emloft.net
> Cc: ppenkov@...nford.edu
> ---
>  drivers/net/tun.c           | 134 ++++++++++++++++++++++++++++++++++++++++++--
>  include/uapi/linux/if_tun.h |   1 +
>  2 files changed, 129 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index f16407242b18..9880b3bc8fa5 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -75,6 +75,7 @@
>  #include <linux/skb_array.h>
>  #include <linux/bpf.h>
>  #include <linux/bpf_trace.h>
> +#include <linux/mutex.h>
>
>  #include <linux/uaccess.h>
>
> @@ -121,7 +122,8 @@ do {                                                                \
>  #define TUN_VNET_BE     0x40000000
>
>  #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
> -                     IFF_MULTI_QUEUE | IFF_NAPI)
> +                     IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
> +
>  #define GOODCOPY_LEN 128
>
>  #define FLT_EXACT_COUNT 8
> @@ -173,6 +175,7 @@ struct tun_file {
>                 unsigned int ifindex;
>         };
>         struct napi_struct napi;
> +       struct mutex napi_mutex;        /* Protects access to the above napi */
>         struct list_head next;
>         struct tun_struct *detached;
>         struct skb_array tx_array;
> @@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
>                 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
>                                NAPI_POLL_WEIGHT);
>                 napi_enable(&tfile->napi);
> +               mutex_init(&tfile->napi_mutex);
>         }
>  }
>
> @@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
>                 netif_napi_del(&tfile->napi);
>  }
>
> +static bool tun_napi_frags_enabled(const struct tun_struct *tun)
> +{
> +       return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
> +}
> +
>  #ifdef CONFIG_TUN_VNET_CROSS_LE
>  static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
>  {
> @@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev)
>          * supports polling, which enables bridge devices in virt setups to
>          * still use netconsole
>          * If NAPI is enabled, however, we need to schedule polling for all
> -        * queues.
> +        * queues unless we are using napi_gro_frags(), which we call in
> +        * process context and not in NAPI context.
>          */
>         struct tun_struct *tun = netdev_priv(dev);
>
> @@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev)
>                 struct tun_file *tfile;
>                 int i;
>
> +               if (tun_napi_frags_enabled(tun))
> +                       return;
> +
>                 rcu_read_lock();
>                 for (i = 0; i < tun->numqueues; i++) {
>                         tfile = rcu_dereference(tun->tfiles[i]);
> @@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
>         return mask;
>  }
>
> +static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
> +                                           size_t len,
> +                                           const struct iov_iter *it)
> +{
> +       struct sk_buff *skb;
> +       size_t linear;
> +       int err;
> +       int i;
> +
> +       if (it->nr_segs > MAX_SKB_FRAGS + 1)
> +               return ERR_PTR(-ENOMEM);
> +
> +       local_bh_disable();
> +       skb = napi_get_frags(&tfile->napi);
> +       local_bh_enable();
> +       if (!skb)
> +               return ERR_PTR(-ENOMEM);
> +
> +       linear = iov_iter_single_seg_count(it);
> +       err = __skb_grow(skb, linear);
> +       if (err)
> +               goto free;
> +
> +       skb->len = len;
> +       skb->data_len = len - linear;
> +       skb->truesize += skb->data_len;
> +
> +       for (i = 1; i < it->nr_segs; i++) {
> +               size_t fragsz = it->iov[i].iov_len;
> +               unsigned long offset;
> +               struct page *page;
> +               void *data;
> +
> +               if (fragsz == 0 || fragsz > PAGE_SIZE) {
> +                       err = -EINVAL;
> +                       goto free;
> +               }
> +
> +               local_bh_disable();
> +               data = napi_alloc_frag(fragsz);
> +               local_bh_enable();
> +               if (!data) {
> +                       err = -ENOMEM;
> +                       goto free;
> +               }
> +
> +               page = virt_to_head_page(data);
> +               offset = data - page_address(page);
> +               skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
> +       }
> +
> +       return skb;
> +free:
> +       /* frees skb and all frags allocated with napi_alloc_frag() */
> +       napi_free_frags(&tfile->napi);
> +       return ERR_PTR(err);
> +}
> +
>  /* prepad is the amount to reserve at front.  len is length after that.
>   * linear is a hint as to how much to copy (usually headers). */
>  static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
> @@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>         int err;
>         u32 rxhash;
>         int skb_xdp = 1;
> +       bool frags = tun_napi_frags_enabled(tun);
>
>         if (!(tun->dev->flags & IFF_UP))
>                 return -EIO;
> @@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                         zerocopy = true;
>         }
>
> -       if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
> +       if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
>                 /* For the packet that is not easy to be processed
>                  * (e.g gso or jumbo packet), we will do it at after
>                  * skb was created with generic XDP routine.
> @@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                                 linear = tun16_to_cpu(tun, gso.hdr_len);
>                 }
>
> -               skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
> +               if (frags) {
> +                       mutex_lock(&tfile->napi_mutex);
> +                       skb = tun_napi_alloc_frags(tfile, copylen, from);
> +                       /* tun_napi_alloc_frags() enforces a layout for the skb.
> +                        * If zerocopy is enabled, then this layout will be
> +                        * overwritten by zerocopy_sg_from_iter().
> +                        */
> +                       zerocopy = false;
> +               } else {
> +                       skb = tun_alloc_skb(tfile, align, copylen, linear,
> +                                           noblock);
> +               }
> +
>                 if (IS_ERR(skb)) {
>                         if (PTR_ERR(skb) != -EAGAIN)
>                                 this_cpu_inc(tun->pcpu_stats->rx_dropped);
> +                       if (frags)
> +                               mutex_unlock(&tfile->napi_mutex);
>                         return PTR_ERR(skb);
>                 }
>
> @@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                 if (err) {
>                         this_cpu_inc(tun->pcpu_stats->rx_dropped);
>                         kfree_skb(skb);
> +                       if (frags) {
> +                               tfile->napi.skb = NULL;
> +                               mutex_unlock(&tfile->napi_mutex);
> +                       }
> +
>                         return -EFAULT;
>                 }
>         }
> @@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>         if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
>                 this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
>                 kfree_skb(skb);
> +               if (frags) {
> +                       tfile->napi.skb = NULL;
> +                       mutex_unlock(&tfile->napi_mutex);
> +               }
> +
>                 return -EINVAL;
>         }
>
> @@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                 skb->dev = tun->dev;
>                 break;
>         case IFF_TAP:
> -               skb->protocol = eth_type_trans(skb, tun->dev);
> +               if (!frags)
> +                       skb->protocol = eth_type_trans(skb, tun->dev);
>                 break;
>         }
>
> @@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>
>         rxhash = __skb_get_hash_symmetric(skb);
>
> -       if (tun->flags & IFF_NAPI) {
> +       if (frags) {
> +               /* Exercise flow dissector code path. */
> +               u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
> +
> +               if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
> +                       this_cpu_inc(tun->pcpu_stats->rx_dropped);
> +                       napi_free_frags(&tfile->napi);
> +                       mutex_unlock(&tfile->napi_mutex);
> +                       WARN_ON(1);
> +                       return -ENOMEM;
> +               }
> +
> +               local_bh_disable();
> +               napi_gro_frags(&tfile->napi);
> +               local_bh_enable();
> +               mutex_unlock(&tfile->napi_mutex);
> +       } else if (tun->flags & IFF_NAPI) {
>                 struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
>                 int queue_len;
>
> @@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>         if (tfile->detached)
>                 return -EINVAL;
>
> +       if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
> +               if (!capable(CAP_NET_ADMIN))
> +                       return -EPERM;
> +
> +               if (!(ifr->ifr_flags & IFF_NAPI) ||
> +                   (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
> +                       return -EINVAL;
> +       }
> +
>         dev = __dev_get_by_name(net, ifr->ifr_name);
>         if (dev) {
>                 if (ifr->ifr_flags & IFF_TUN_EXCL)
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 30b6184884eb..365ade5685c9 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -61,6 +61,7 @@
>  #define IFF_TUN                0x0001
>  #define IFF_TAP                0x0002
>  #define IFF_NAPI       0x0010
> +#define IFF_NAPI_FRAGS 0x0020
>  #define IFF_NO_PI      0x1000
>  /* This flag has no real effect */
>  #define IFF_ONE_QUEUE  0x2000
> --
> 2.11.0
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ