netdev - Re: vxlan/veth performance issues on net.git + latest kernels

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1386120216.30495.59.camel@edumazet-glaptop2.roam.corp.google.com>
Date:	Tue, 03 Dec 2013 17:23:36 -0800
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Alexei Starovoitov <ast@...mgrid.com>
Cc:	Or Gerlitz <or.gerlitz@...il.com>,
	David Miller <davem@...emloft.net>,
	Joseph Gasparakis <joseph.gasparakis@...el.com>,
	Jerry Chu <hkchu@...gle.com>,
	Or Gerlitz <ogerlitz@...lanox.com>,
	Eric Dumazet <edumazet@...gle.com>,
	Pravin B Shelar <pshelar@...ira.com>,
	"netdev@...r.kernel.org" <netdev@...r.kernel.org>
Subject: Re: vxlan/veth performance issues on net.git + latest kernels

On Tue, 2013-12-03 at 16:55 -0800, Alexei Starovoitov wrote:
> On Tue, Dec 3, 2013 at 4:36 PM, Eric Dumazet <eric.dumazet@...il.com> wrote:
> > On Tue, 2013-12-03 at 16:26 -0800, Alexei Starovoitov wrote:
> >
> >>
> >> Could you use some other way to mark skb ?
> >
> > I could ;)
> >
> >> In tracing we might want to examine skb more carefully and not being
> >> able to see the device
> >> will limit the usability of this tracepoint.
> >
> > Unfortunately, using skb->dev as a pointer to device would be buggy or
> > expensive (you would need to take a reference on device in order not
> > letting it disappear, as we escape RCU protection)
> 
> well, yes, you might have an skb around when device is already freed
> when skb_dst_noref.
> but I'm not suggesting anything expensive. Tracing definitely should
> not add overhead by doing rcu_lock() or dev_hold(). Instead it can go
> through skb, skb->dev, skb->dev->xxx via probe_kernel_read(). If dev
> is gone, it's still safe.

Its certainly not safe to 'probe'.

Its not about faulting inexistent memory, that is the least of the
problem.

Any kind of information fetched from skb->dev might have been
overwritten.

You could for example fetch security sensitive data and expose it.


> 
> > Anyway, this magic is pretty easy to change, I am open to suggestions.
> 
> you're the expert :) use skb->mark field, since it's unused during
> freeing path... ?

cache line miss ;)

skb->dev is in the first cache line, where we access skb->next anyway.

I could use skb->cb[] like the following patch :

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index ec96130533cc..8b8c2171b187 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -209,9 +209,9 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata,
 	if (likely(skb)) {
 		(*pkts_compl)++;
 		(*bytes_compl) += skb->len;
+		dev_consume_skb_any(skb);
 	}
 
-	dev_kfree_skb_any(skb);
 	tx_buf->first_bd = 0;
 	tx_buf->skb = NULL;
 
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 8d3945ab7334..03081932e519 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -1058,7 +1058,7 @@ static void e1000_put_txbuf(struct e1000_ring *tx_ring,
 		buffer_info->dma = 0;
 	}
 	if (buffer_info->skb) {
-		dev_kfree_skb_any(buffer_info->skb);
+		dev_consume_skb_any(buffer_info->skb);
 		buffer_info->skb = NULL;
 	}
 	buffer_info->time_stamp = 0;
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 43aa7acd84a6..294825efb248 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2037,7 +2037,7 @@ static void sky2_tx_complete(struct sky2_port *sky2, u16 done)
 			bytes_compl += skb->len;
 
 			re->skb = NULL;
-			dev_kfree_skb_any(skb);
+			dev_consume_skb_any(skb);
 
 			sky2->tx_next = RING_NEXT(idx, sky2->tx_ring_size);
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index f54ebd5a1702..653484bfae98 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -317,7 +317,7 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 			}
 		}
 	}
-	dev_kfree_skb_any(skb);
+	dev_consume_skb_any(skb);
 	return tx_info->nr_txbb;
 }
 
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 2d045be4b5cf..d44f7b69a6a0 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -2557,7 +2557,7 @@ static int nv_tx_done(struct net_device *dev, int limit)
 					u64_stats_update_end(&np->swstats_tx_syncp);
 				}
 				bytes_compl += np->get_tx_ctx->skb->len;
-				dev_kfree_skb_any(np->get_tx_ctx->skb);
+				dev_consume_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
 				tx_work++;
 			}
@@ -2574,7 +2574,7 @@ static int nv_tx_done(struct net_device *dev, int limit)
 					u64_stats_update_end(&np->swstats_tx_syncp);
 				}
 				bytes_compl += np->get_tx_ctx->skb->len;
-				dev_kfree_skb_any(np->get_tx_ctx->skb);
+				dev_consume_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
 				tx_work++;
 			}
@@ -2625,7 +2625,7 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
 			}
 
 			bytes_cleaned += np->get_tx_ctx->skb->len;
-			dev_kfree_skb_any(np->get_tx_ctx->skb);
+			dev_consume_skb_any(np->get_tx_ctx->skb);
 			np->get_tx_ctx->skb = NULL;
 			tx_work++;
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 916241d16c67..ab8970693ff3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -755,7 +755,7 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 		stats->tx_packets++;
 		u64_stats_update_end(&stats->tx_syncp);
 
-		dev_kfree_skb_any(skb);
+		dev_consume_skb_any(skb);
 	}
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7f0ed423a360..8b80a58ec1ac 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2374,11 +2374,38 @@ int netif_get_num_default_rss_queues(void);
  */
 void dev_kfree_skb_irq(struct sk_buff *skb);
 
+void __dev_kfree_skb_any(struct sk_buff *skb);
+
+struct __dev_kfree_skb_cb {
+	unsigned int reason;
+};
+
+static inline struct __dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
+{
+	return (struct __dev_kfree_skb_cb *)skb->cb;
+}
+
+enum {
+	SKB_REASON_CONSUMED,
+	SKB_REASON_DROPPED,
+};
+
 /* Use this variant in places where it could be invoked
  * from either hardware interrupt or other context, with hardware interrupts
  * either disabled or enabled.
+ * Note that TX completion should use dev_consume_skb_any()
  */
-void dev_kfree_skb_any(struct sk_buff *skb);
+static inline void dev_kfree_skb_any(struct sk_buff *skb)
+{
+	get_kfree_skb_cb(skb)->reason = SKB_REASON_DROPPED;
+	__dev_kfree_skb_any(skb);
+}
+
+static inline void dev_consume_skb_any(struct sk_buff *skb)
+{
+	get_kfree_skb_cb(skb)->reason = SKB_REASON_CONSUMED;
+	__dev_kfree_skb_any(skb);
+}
 
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index ba3b7ea5ebb3..3170776e53da 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2161,14 +2161,14 @@ void dev_kfree_skb_irq(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_kfree_skb_irq);
 
-void dev_kfree_skb_any(struct sk_buff *skb)
+void __dev_kfree_skb_any(struct sk_buff *skb)
 {
 	if (in_irq() || irqs_disabled())
 		dev_kfree_skb_irq(skb);
 	else
 		dev_kfree_skb(skb);
 }
-EXPORT_SYMBOL(dev_kfree_skb_any);
+EXPORT_SYMBOL(__dev_kfree_skb_any);
 
 
 /**
@@ -3306,7 +3306,10 @@ static void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(atomic_read(&skb->users));
-			trace_kfree_skb(skb, net_tx_action);
+			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+				trace_consume_skb(skb);
+			else
+				trace_kfree_skb(skb, net_tx_action);
 			__kfree_skb(skb);
 		}
 	}




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html