lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1442844715.7367.26.camel@infradead.org>
Date:	Mon, 21 Sep 2015 15:11:55 +0100
From:	David Woodhouse <dwmw2@...radead.org>
To:	David Miller <davem@...emloft.net>
Cc:	netdev@...r.kernel.org, romieu@...zoreil.com
Subject: Re: [PATCH 2/2] 8139cp: Call __cp_set_rx_mode() from cp_tx_timeout()

On Mon, 2015-09-21 at 14:59 +0100, David Woodhouse wrote:

> After which, I think we might be able to turn on TX checksumming by
> default and I also have a way to implement early detection of the TX
> stall I've been seeing.

This is a patch I've been playing with to catch the TX stall.

When it happens we get a TxEmpty interrupt *without* TxDone.

After loading the driver, we never get TxEmpty without TxDone until the
problem has happened. I've got a minimal cp_tx_timeout() which just
clears the TxOn bit in the Cmd register and turns it back on again,
after moving the descriptors all down to the start of the TX ring.

Strangely, *after* this has happened we do see a lot of instances of
TxEmpty without TxDone. But then the TxDone usually comes immediately
afterwards. Until the driver is reloaded, at which point the next
instance of TxEmpty without TxDone *is* the hardware stalling again.

I'm very confused about what's going on there. I think I possibly need
to set a timer when the TxEmpty/!TxDone interrupt happens, and do a
preemptive reset of the TX queue (as shown below) if the timer manages
to expire before the TxDone actually happens.

diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 67a4fcf..64b44ec 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -592,8 +592,8 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance)
 	if (!status || (status == 0xFFFF))
 		goto out_unlock;
 
-	netif_dbg(cp, intr, dev, "intr, status %04x cmd %02x cpcmd %04x\n",
-		  status, cpr8(Cmd), cpr16(CpCmd));
+	netif_dbg(cp, intr, dev, "intr, status %04x cmd %02x cpcmd %04x desc %04x\n",
+		  status, cpr8(Cmd), cpr16(CpCmd), cpr16(TxDmaOkLowDesc));
 
 	cpw16(IntrStatus, status & ~cp_rx_intr_mask);
 
@@ -623,6 +623,10 @@ static irqreturn_t cp_interrupt (int irq, void *dev_instance)
 			   Let it stop. */
 			cp->tx_running = 0;
 		} else {
+			if (!(status & (TxOK | TxErr)))
+				netdev_warn(dev, "TxEmpty without TxDone. h %d t %d d %04x\n",
+					    cp->tx_head, cp->tx_tail, cpr16(TxDmaOkLowDesc));
+
 			/* The hardware raced with us adding a new descriptor,
 			   and we didn't get the TxEmpty IRQ in time so we
 			   didn't prod it. Prod it now to restart. */
@@ -1307,12 +1311,49 @@ static void cp_tx_timeout(struct net_device *dev)
 			  le64_to_cpu(cp->tx_ring[i].addr),
 			  cp->tx_skb[i]);
 	}
-
+#if 1
+	/* Stop the TX (which is already stopped), move the
+	   descriptors down, and start it again */
+	cpw8_f(Cmd, RxOn);
+	if (cp->tx_tail == 0) {
+		/* Do nothing */
+	} else if (cp->tx_head > cp->tx_tail) {
+		for (i = 0; i < cp->tx_head - cp->tx_tail; i++) {
+			int from = i + cp->tx_tail;
+			cp->tx_ring[i].addr = cp->tx_ring[from].addr;
+			cp->tx_ring[i].opts2 = cp->tx_ring[from].opts2;
+			cp->tx_ring[i].opts1 = cp->tx_ring[from].opts1;
+			cp->tx_ring[from].opts1 &= ~cpu_to_le32(DescOwn);
+			cp->tx_opts[i] = cp->tx_opts[from];
+			cp->tx_skb[i] = cp->tx_skb[from];
+			cp->tx_skb[from] = NULL;
+			printk("Ring move %d->%d\n", from, i);
+		}
+	} else {
+		for (i = cp->tx_tail - cp->tx_head - 1; i >= 0; i--) {
+			int from = (i + cp->tx_tail) & (CP_TX_RING_SIZE - 1);
+			cp->tx_ring[i].addr = cp->tx_ring[from].addr;
+			cp->tx_ring[i].opts2 = cp->tx_ring[from].opts2;
+			cp->tx_ring[i].opts1 = cp->tx_ring[from].opts1;
+			cp->tx_ring[from].opts1 &= ~cpu_to_le32(DescOwn);
+			cp->tx_opts[i] = cp->tx_opts[from];
+			cp->tx_skb[i] = cp->tx_skb[from];
+			cp->tx_skb[from] = NULL;
+			printk("Ring move %d->%d\n", from, i);
+		}
+	}
+	cpw8_f(Cmd, RxOn|TxOn);
+	cp->tx_head = (cp->tx_head - cp->tx_tail) & (CP_TX_RING_SIZE - 1);
+	cp->tx_tail = 0;
+	printk("head %d tail %d\n", cp->tx_head, cp->tx_tail);
+	cpw8(TxPoll, NormalTxPoll);
+#else
 	cp_stop_hw(cp);
 	cp_clean_rings(cp);
 	rc = cp_init_rings(cp);
 	cp_start_hw(cp);
 	__cp_set_rx_mode(dev);
+#endif
 	cpw16_f(IntrMask, cp_norx_intr_mask);
 
 	netif_wake_queue(dev);

-- 
dwmw2


Download attachment "smime.p7s" of type "application/x-pkcs7-signature" (5691 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ