Message-Id: <20170919073402.2292-3-peterpenkov96@gmail.com>
Date: Tue, 19 Sep 2017 00:34:02 -0700
From: Petar Penkov <peterpenkov96@...il.com>
To: netdev@...r.kernel.org
Cc: Petar Penkov <peterpenkov96@...il.com>,
Eric Dumazet <edumazet@...gle.com>,
Mahesh Bandewar <maheshb@...gle.com>,
Willem de Bruijn <willemb@...gle.com>, davem@...emloft.net,
ppenkov@...nford.edu
Subject: [PATCH net-next 2/2] tun: enable napi_gro_frags() for TUN/TAP driver
Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iov_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than MAX_SKB_FRAGS
allows, the packet is dropped. Additionally, invoke eth_get_headlen()
to exercise flow dissector code and to verify that the header resides
in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools
like syzkaller and packetdrill, and the increased flexibility it
provides can introduce security vulnerabilities.
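
For illustration, a minimal userspace sketch of entering this mode (a
hypothetical helper, not part of this patch; the TUNSETIFF sequence is
the standard one, with only IFF_NAPI_FRAGS added by this series):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Open a TAP device in napi_gro_frags() mode. IFF_NAPI_FRAGS is the
 * flag introduced below; error handling is trimmed for brevity. */
static int open_tap_napi_frags(const char *name)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI_FRAGS;
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

	if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
		close(fd);
		return -1;
	}

	return fd;
}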
Signed-off-by: Petar Penkov <peterpenkov96@...il.com>
Cc: Eric Dumazet <edumazet@...gle.com>
Cc: Mahesh Bandewar <maheshb@...gle.com>
Cc: Willem de Bruijn <willemb@...gle.com>
Cc: davem@...emloft.net
Cc: ppenkov@...nford.edu
---
drivers/net/tun.c | 135 ++++++++++++++++++++++++++++++++++++++++++--
include/uapi/linux/if_tun.h | 1 +
2 files changed, 130 insertions(+), 6 deletions(-)
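
A note on the expected write layout (the sketch below is hypothetical
and not part of the patch): every segment after the first becomes a
page fragment, so a test program controls the resulting skb layout
directly with writev(). The receive path rejects packets with more
than MAX_SKB_FRAGS + 1 segments or with any fragment larger than
PAGE_SIZE.

#include <sys/uio.h>

/* Send one frame so the Ethernet header lands in the linear area
 * (iov[0]) and the payload becomes a single page fragment (iov[1]). */
static ssize_t write_hdr_plus_frag(int fd, void *hdr, size_t hdrlen,
				   void *payload, size_t paylen)
{
	struct iovec iov[2] = {
		{ .iov_base = hdr,     .iov_len = hdrlen },
		{ .iov_base = payload, .iov_len = paylen },
	};

	return writev(fd, iov, 2);
}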
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 46cca1094c91..ebe0d7dc7de6 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
#include <linux/skb_array.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/mutex.h>
#include <linux/uaccess.h>
@@ -120,8 +121,15 @@ do { \
#define TUN_VNET_LE 0x80000000
#define TUN_VNET_BE 0x40000000
+#if IS_ENABLED(CONFIG_TUN_NAPI)
+#define TUN_FEATURES_EXTRA IFF_NAPI_FRAGS
+#else
+#define TUN_FEATURES_EXTRA 0
+#endif
+
#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
- IFF_MULTI_QUEUE)
+ IFF_MULTI_QUEUE | TUN_FEATURES_EXTRA)
+
#define GOODCOPY_LEN 128
#define FLT_EXACT_COUNT 8
@@ -173,6 +181,7 @@ struct tun_file {
unsigned int ifindex;
};
struct napi_struct napi;
+ struct mutex napi_mutex; /* Protects access to the above napi */
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
@@ -276,6 +285,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile)
netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
NAPI_POLL_WEIGHT);
napi_enable(&tfile->napi);
+ mutex_init(&tfile->napi_mutex);
}
}
@@ -291,6 +301,11 @@ static void tun_napi_del(struct tun_file *tfile)
netif_napi_del(&tfile->napi);
}
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+ return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
#ifdef CONFIG_TUN_VNET_CROSS_LE
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
@@ -1034,7 +1049,8 @@ static void tun_poll_controller(struct net_device *dev)
* supports polling, which enables bridge devices in virt setups to
* still use netconsole
* If NAPI is enabled, however, we need to schedule polling for all
- * queues.
+ * queues unless we are using napi_gro_frags(), which we call in
+ * process context and not in NAPI context.
*/
if (IS_ENABLED(CONFIG_TUN_NAPI)) {
@@ -1042,6 +1058,9 @@ static void tun_poll_controller(struct net_device *dev)
struct tun_file *tfile;
int i;
+ if (tun_napi_frags_enabled(tun))
+ return;
+
rcu_read_lock();
for (i = 0; i < tun->numqueues; i++) {
tfile = rcu_dereference(tun->tfiles[i]);
@@ -1264,6 +1283,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
return mask;
}
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+ size_t len,
+ const struct iov_iter *it)
+{
+ struct sk_buff *skb;
+ size_t linear;
+ int err;
+ int i;
+
+ if (it->nr_segs > MAX_SKB_FRAGS + 1)
+ return ERR_PTR(-ENOMEM);
+
+ local_bh_disable();
+ skb = napi_get_frags(&tfile->napi);
+ local_bh_enable();
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ linear = iov_iter_single_seg_count(it);
+ err = __skb_grow(skb, linear);
+ if (err)
+ goto free;
+
+ skb->len = len;
+ skb->data_len = len - linear;
+ skb->truesize += skb->data_len;
+
+ for (i = 1; i < it->nr_segs; i++) {
+ size_t fragsz = it->iov[i].iov_len;
+ unsigned long offset;
+ struct page *page;
+ void *data;
+
+ if (fragsz == 0 || fragsz > PAGE_SIZE) {
+ err = -EINVAL;
+ goto free;
+ }
+
+ local_bh_disable();
+ data = napi_alloc_frag(fragsz);
+ local_bh_enable();
+ if (!data) {
+ err = -ENOMEM;
+ goto free;
+ }
+
+ page = virt_to_head_page(data);
+ offset = data - page_address(page);
+ skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+ }
+
+ return skb;
+free:
+ /* frees skb and all frags allocated with napi_alloc_frag() */
+ napi_free_frags(&tfile->napi);
+ return ERR_PTR(err);
+}
+
/* prepad is the amount to reserve at front. len is length after that.
* linear is a hint as to how much to copy (usually headers). */
static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1476,6 +1553,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
int err;
u32 rxhash;
int skb_xdp = 1;
+ bool frags = tun_napi_frags_enabled(tun);
if (!(tun->dev->flags & IFF_UP))
return -EIO;
@@ -1533,7 +1611,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
zerocopy = true;
}
- if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+ if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
/* For the packet that is not easy to be processed
* (e.g gso or jumbo packet), we will do it at after
* skb was created with generic XDP routine.
@@ -1554,10 +1632,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
linear = tun16_to_cpu(tun, gso.hdr_len);
}
- skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+ if (frags) {
+ mutex_lock(&tfile->napi_mutex);
+ skb = tun_napi_alloc_frags(tfile, copylen, from);
+ /* tun_napi_alloc_frags() enforces a layout for the skb.
+ * If zerocopy is enabled, then this layout will be
+ * overwritten by zerocopy_sg_from_iter().
+ */
+ zerocopy = false;
+ } else {
+ skb = tun_alloc_skb(tfile, align, copylen, linear,
+ noblock);
+ }
+
if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN)
this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ if (frags)
+ mutex_unlock(&tfile->napi_mutex);
return PTR_ERR(skb);
}
@@ -1569,6 +1661,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
if (err) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
kfree_skb(skb);
+ if (frags) {
+ tfile->napi.skb = NULL;
+ mutex_unlock(&tfile->napi_mutex);
+ }
+
return -EFAULT;
}
}
@@ -1576,6 +1673,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
kfree_skb(skb);
+ if (frags) {
+ tfile->napi.skb = NULL;
+ mutex_unlock(&tfile->napi_mutex);
+ }
+
return -EINVAL;
}
@@ -1601,7 +1703,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb->dev = tun->dev;
break;
case IFF_TAP:
- skb->protocol = eth_type_trans(skb, tun->dev);
+ if (!frags)
+ skb->protocol = eth_type_trans(skb, tun->dev);
break;
}
@@ -1636,7 +1739,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
rxhash = __skb_get_hash_symmetric(skb);
- if (IS_ENABLED(CONFIG_TUN_NAPI)) {
+ if (frags) {
+ /* Exercise flow dissector code path. */
+ u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+ if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ napi_free_frags(&tfile->napi);
+ mutex_unlock(&tfile->napi_mutex);
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ local_bh_disable();
+ napi_gro_frags(&tfile->napi);
+ local_bh_enable();
+ mutex_unlock(&tfile->napi_mutex);
+ } else if (IS_ENABLED(CONFIG_TUN_NAPI)) {
struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
int queue_len;
@@ -2182,6 +2301,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->flags = (tun->flags & ~TUN_FEATURES) |
(ifr->ifr_flags & TUN_FEATURES);
+ if (!IS_ENABLED(CONFIG_TUN_NAPI) ||
+ (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+ tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
+
/* Make sure persistent devices do not get stuck in
* xoff state.
*/
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..1eb1eb42f151 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
#define IFF_TAP 0x0002
+#define IFF_NAPI_FRAGS 0x0010
#define IFF_NO_PI 0x1000
/* This flag has no real effect */
#define IFF_ONE_QUEUE 0x2000
--
2.11.0