netdev - Re: [PATCH] loopback: better handling of packet drops

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <49E85AFD.6080407@cosmosbay.com>
Date:	Fri, 17 Apr 2009 12:33:33 +0200
From:	Eric Dumazet <dada1@...mosbay.com>
To:	David Miller <davem@...emloft.net>
CC:	netdev@...r.kernel.org
Subject: Re: [PATCH] loopback: better handling of packet drops

Eric Dumazet a écrit :
> Eric Dumazet a écrit :
>> David Miller a écrit :
>>> From: Eric Dumazet <dada1@...mosbay.com>
>>> Date: Fri, 17 Apr 2009 10:56:57 +0200
>>>
>>>> We can in some situations drop packets in netif_rx()
>>>>
>>>> loopback driver does not report these (unlikely) drops to its stats,
>>>> and incorrectly change packets/bytes counts.
>>>>
>>>> After this patch applied, "ifconfig lo" can reports these drops as in :
>>>>
>>>> # ifconfig lo
>>>> lo        Link encap:Local Loopback
>>>>           inet addr:127.0.0.1  Mask:255.0.0.0
>>>>           UP LOOPBACK RUNNING  MTU:16436  Metric:1
>>>>           RX packets:692562900 errors:0 dropped:0 overruns:0 frame:0
>>>>           TX packets:692562900 errors:3228 dropped:3228 overruns:0 carrier:0
>>>>           collisions:0 txqueuelen:0
>>>>           RX bytes:2865674174 (2.6 GiB)  TX bytes:2865674174 (2.6 GiB)
>>>>
>>>> I chose to reflect those errors only in tx_dropped/tx_errors, and not mirror
>>>> these errors in rx_dropped/rx_errors.
>>>>
>>>> Signed-off-by: Eric Dumazet <dada1@...mosbay.com>
>>> Well, logically the receive is what failed, not the transmit.
>>>
>>> I think it's therefore misleading to count it as a TX drop.
>>>
>>> Do you feel strongly about this?
>> Not at all, but my plan was to go a litle bit further, ie being able to 
>> return from loopback_xmit() with a non null value.
>>
> 
> Something like this :

I just noticed NETDEV_TX_BUSY & NETDEV_TX_OK, so here is an updated version
using these macros instead of 0 & 1

[PATCH] loopback: better handling of packet drops

We can in some situations drop packets in netif_rx()

loopback driver does not report these (unlikely) drops to its stats,
and incorrectly change packets/bytes counts. Also upper layers are
not warned of these transmit failures.

After this patch applied, "ifconfig lo" can reports these drops as in :

# ifconfig lo
lo        Link encap:Local Loopback
          inet addr:127.0.0.1  Mask:255.0.0.0
          UP LOOPBACK RUNNING  MTU:16436  Metric:1
          RX packets:692562900 errors:0 dropped:0 overruns:0 frame:0
          TX packets:692562900 errors:3228 dropped:3228 overruns:0 carrier:0
          collisions:0 txqueuelen:0
          RX bytes:2865674174 (2.6 GiB)  TX bytes:2865674174 (2.6 GiB)

More over, loopback_xmit() can now return to its caller the indication that
packet was not transmitted for better queue management and error handling.

I chose to reflect those errors only in tx_dropped/tx_errors, and not mirror
them in rx_dropped/rx_errors.

Splitting netif_rx() with a helper function boosts tbench performance by 1%,
because we can avoid two tests (about netpoll and timestamping)

Tested with /proc/sys/net/core/netdev_max_backlog set to 0, tbench
can run at full speed even with some 'losses' on loopback. No more
tcp stalls...

Signed-off-by: Eric Dumazet <dada1@...mosbay.com>
---
 drivers/net/loopback.c    |   24 +++++++++---
 include/linux/netdevice.h |    1
 net/core/dev.c            |   68 +++++++++++++++++++++++-------------
 3 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index b7d438a..101a3bc 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -62,6 +62,7 @@
 struct pcpu_lstats {
 	unsigned long packets;
 	unsigned long bytes;
+	unsigned long drops;
 };
 
 /*
@@ -71,20 +72,25 @@ struct pcpu_lstats {
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+	int len;
 
 	skb_orphan(skb);
 
-	skb->protocol = eth_type_trans(skb,dev);
+	skb->protocol = eth_type_trans(skb, dev);
 
 	/* it's OK to use per_cpu_ptr() because BHs are off */
 	pcpu_lstats = dev->ml_priv;
 	lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
-	lb_stats->bytes += skb->len;
-	lb_stats->packets++;
 
-	netif_rx(skb);
+	len = skb->len;
+	if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
+		lb_stats->bytes += len;
+		lb_stats->packets++;
+		return NETDEV_TX_OK;
+	}
+	lb_stats->drops++;
 
-	return 0;
+	return NETDEV_TX_BUSY;
 }
 
 static struct net_device_stats *loopback_get_stats(struct net_device *dev)
@@ -93,6 +99,7 @@ static struct net_device_stats *loopback_get_stats(struct net_device *dev)
 	struct net_device_stats *stats = &dev->stats;
 	unsigned long bytes = 0;
 	unsigned long packets = 0;
+	unsigned long drops = 0;
 	int i;
 
 	pcpu_lstats = dev->ml_priv;
@@ -102,11 +109,14 @@ static struct net_device_stats *loopback_get_stats(struct net_device *dev)
 		lb_stats = per_cpu_ptr(pcpu_lstats, i);
 		bytes   += lb_stats->bytes;
 		packets += lb_stats->packets;
+		drops   += lb_stats->drops;
 	}
 	stats->rx_packets = packets;
 	stats->tx_packets = packets;
-	stats->rx_bytes = bytes;
-	stats->tx_bytes = bytes;
+	stats->tx_dropped = drops;
+	stats->tx_errors  = drops;
+	stats->rx_bytes   = bytes;
+	stats->tx_bytes   = bytes;
 	return stats;
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f..c60e250 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1430,6 +1430,7 @@ extern void dev_kfree_skb_irq(struct sk_buff *skb);
 extern void dev_kfree_skb_any(struct sk_buff *skb);
 
 #define HAVE_NETIF_RX 1
+extern int		__netif_rx(struct sk_buff *skb);
 extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
 #define HAVE_NETIF_RECEIVE_SKB 1
diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f..8ae3f19 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1909,6 +1909,44 @@ int weight_p __read_mostly = 64;            /* old backlog weight */
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 
+/*
+ * helper function called from netif_rx() or loopback_xmit()
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+	struct softnet_data *queue;
+	unsigned long flags;
+
+	/*
+	 * The code is rearranged so that the path is the most
+	 * short when CPU is congested, but is still operating.
+	 */
+	local_irq_save(flags);
+	queue = &__get_cpu_var(softnet_data);
+
+	__get_cpu_var(netdev_rx_stat).total++;
+	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+		if (queue->input_pkt_queue.qlen) {
+enqueue:
+			__skb_queue_tail(&queue->input_pkt_queue, skb);
+			local_irq_restore(flags);
+			return NET_RX_SUCCESS;
+		}
+
+		napi_schedule(&queue->backlog);
+		goto enqueue;
+	}
+
+	__get_cpu_var(netdev_rx_stat).dropped++;
+	local_irq_restore(flags);
+	/*
+	 * Dont free skb here.
+	 * netif_rx() will call kfree_skb(skb)
+	 * loopback_xmit() will not free it but return an error to its caller
+	 */
+	return NET_RX_DROP;
+}
+
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1928,6 +1966,7 @@ int netif_rx(struct sk_buff *skb)
 {
 	struct softnet_data *queue;
 	unsigned long flags;
+	int ret;
 
 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -1936,32 +1975,14 @@ int netif_rx(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
 
-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	queue = &__get_cpu_var(softnet_data);
-
-	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-enqueue:
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-			local_irq_restore(flags);
-			return NET_RX_SUCCESS;
-		}
-
-		napi_schedule(&queue->backlog);
-		goto enqueue;
-	}
+	ret = __netif_rx(skb);
 
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
+	if (unlikely(ret == NET_RX_DROP))
+		kfree_skb(skb);
 
-	kfree_skb(skb);
-	return NET_RX_DROP;
+	return ret;
 }
+EXPORT_SYMBOL(netif_rx);
 
 int netif_rx_ni(struct sk_buff *skb)
 {
@@ -5307,7 +5328,6 @@ EXPORT_SYMBOL(netdev_boot_setup_check);
 EXPORT_SYMBOL(netdev_set_master);
 EXPORT_SYMBOL(netdev_state_change);
 EXPORT_SYMBOL(netif_receive_skb);
-EXPORT_SYMBOL(netif_rx);
 EXPORT_SYMBOL(register_gifconf);
 EXPORT_SYMBOL(register_netdevice);
 EXPORT_SYMBOL(register_netdevice_notifier);

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html