netdev - Re: [PATCH net-next 2/2] net: macb: use NAPI for TX completion path

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d75679b0-d9a7-619f-bc35-a63704c255ee@vaisala.com>
Date:   Tue, 3 May 2022 12:57:34 +0300
From:   Tomas Melin <tomas.melin@...sala.com>
To:     Robert Hancock <robert.hancock@...ian.com>, netdev@...r.kernel.org
Cc:     nicolas.ferre@...rochip.com, claudiu.beznea@...rochip.com,
        davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
        pabeni@...hat.com
Subject: Re: [PATCH net-next 2/2] net: macb: use NAPI for TX completion path

Hi,


On 30/04/2022 01:31, Robert Hancock wrote:
> This driver was using the TX IRQ handler to perform all TX completion
> tasks. Under heavy TX network load, this can cause significant irqs-off
> latencies (found to be in the hundreds of microseconds using ftrace).
> This can cause other issues, such as overrunning serial UART FIFOs when
> using high baud rates with limited UART FIFO sizes.
> 
> Switch to using the NAPI poll handler to perform the TX completion work
> to get this out of hard IRQ context and avoid the IRQ latency impact.
> 
> The TX Used Bit Read interrupt (TXUBR) handling also needs to be moved
> into the NAPI poll handler to maintain the proper order of operations. A
> flag is used to notify the poll handler that a UBR condition needs to be
> handled. The macb_tx_restart handler has had some locking added for global
> register access, since this could now potentially happen concurrently on
> different queues.
> 
> Signed-off-by: Robert Hancock <robert.hancock@...ian.com>
> ---
>  drivers/net/ethernet/cadence/macb.h      |   1 +
>  drivers/net/ethernet/cadence/macb_main.c | 138 +++++++++++++----------
>  2 files changed, 80 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
> index f0a7d8396a4a..5355cef95a9b 100644
> --- a/drivers/net/ethernet/cadence/macb.h
> +++ b/drivers/net/ethernet/cadence/macb.h
> @@ -1209,6 +1209,7 @@ struct macb_queue {
>  	struct macb_tx_skb	*tx_skb;
>  	dma_addr_t		tx_ring_dma;
>  	struct work_struct	tx_error_task;
> +	bool			txubr_pending;
>  
>  	dma_addr_t		rx_ring_dma;
>  	dma_addr_t		rx_buffers_dma;
> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
> index 160dc5ad84ae..1cb8afb8ef0d 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -959,7 +959,7 @@ static int macb_halt_tx(struct macb *bp)
>  	return -ETIMEDOUT;
>  }
>  
> -static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
> +static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb, int budget)
>  {
>  	if (tx_skb->mapping) {
>  		if (tx_skb->mapped_as_page)
> @@ -972,7 +972,7 @@ static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
>  	}
>  
>  	if (tx_skb->skb) {
> -		dev_kfree_skb_any(tx_skb->skb);
> +		napi_consume_skb(tx_skb->skb, budget);
>  		tx_skb->skb = NULL;
>  	}
>  }
> @@ -1025,12 +1025,13 @@ static void macb_tx_error_task(struct work_struct *work)
>  		    (unsigned int)(queue - bp->queues),
>  		    queue->tx_tail, queue->tx_head);
>  
> -	/* Prevent the queue IRQ handlers from running: each of them may call
> -	 * macb_tx_interrupt(), which in turn may call netif_wake_subqueue().
> +	/* Prevent the queue NAPI poll from running, as it calls
> +	 * macb_tx_complete(), which in turn may call netif_wake_subqueue().
>  	 * As explained below, we have to halt the transmission before updating
>  	 * TBQP registers so we call netif_tx_stop_all_queues() to notify the
>  	 * network engine about the macb/gem being halted.
>  	 */
> +	napi_disable(&queue->napi);
>  	spin_lock_irqsave(&bp->lock, flags);
>  
>  	/* Make sure nobody is trying to queue up new packets */
> @@ -1058,7 +1059,7 @@ static void macb_tx_error_task(struct work_struct *work)
>  		if (ctrl & MACB_BIT(TX_USED)) {
>  			/* skb is set for the last buffer of the frame */
>  			while (!skb) {
> -				macb_tx_unmap(bp, tx_skb);
> +				macb_tx_unmap(bp, tx_skb, 0);
>  				tail++;
>  				tx_skb = macb_tx_skb(queue, tail);
>  				skb = tx_skb->skb;
> @@ -1088,7 +1089,7 @@ static void macb_tx_error_task(struct work_struct *work)
>  			desc->ctrl = ctrl | MACB_BIT(TX_USED);
>  		}
>  
> -		macb_tx_unmap(bp, tx_skb);
> +		macb_tx_unmap(bp, tx_skb, 0);
>  	}
>  
>  	/* Set end of TX queue */
> @@ -1118,25 +1119,28 @@ static void macb_tx_error_task(struct work_struct *work)
>  	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
>  
>  	spin_unlock_irqrestore(&bp->lock, flags);
> +	napi_enable(&queue->napi);
>  }
>  
> -static void macb_tx_interrupt(struct macb_queue *queue)
> +static bool macb_tx_complete_pending(struct macb_queue *queue)

This might be somewhat problematic approach as TX_USED bit could
potentially also be set if a descriptor with TX_USED is found mid-frame
(for whatever reason).
Not sure how it should be done, but it would be nice if TCOMP
would rather be known on per queue basis.

Related note, rx side function seems to be named as macb_rx_pending(),
should be similary macb_tx_pending()?


Thanks,
Tomas


> +{
> +	if (queue->tx_head != queue->tx_tail) {
> +		/* Make hw descriptor updates visible to CPU */
> +		rmb();
> +
> +		if (macb_tx_desc(queue, queue->tx_tail)->ctrl & MACB_BIT(TX_USED))
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static void macb_tx_complete(struct macb_queue *queue, int budget)
>  {
>  	unsigned int tail;
>  	unsigned int head;
> -	u32 status;
>  	struct macb *bp = queue->bp;
>  	u16 queue_index = queue - bp->queues;
>  
> -	status = macb_readl(bp, TSR);
> -	macb_writel(bp, TSR, status);
> -
> -	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> -		queue_writel(queue, ISR, MACB_BIT(TCOMP));
> -
> -	netdev_vdbg(bp->dev, "macb_tx_interrupt status = 0x%03lx\n",
> -		    (unsigned long)status);
> -
>  	head = queue->tx_head;
>  	for (tail = queue->tx_tail; tail != head; tail++) {
>  		struct macb_tx_skb	*tx_skb;
> @@ -1182,7 +1186,7 @@ static void macb_tx_interrupt(struct macb_queue *queue)
>  			}
>  
>  			/* Now we can safely release resources */
> -			macb_tx_unmap(bp, tx_skb);
> +			macb_tx_unmap(bp, tx_skb, budget);
>  
>  			/* skb is set only for the last buffer of the frame.
>  			 * WARNING: at this point skb has been freed by
> @@ -1569,23 +1573,55 @@ static bool macb_rx_pending(struct macb_queue *queue)
>  	return (desc->addr & MACB_BIT(RX_USED)) != 0;
>  }
>  
> +static void macb_tx_restart(struct macb_queue *queue)
> +{
> +	unsigned int head = queue->tx_head;
> +	unsigned int tail = queue->tx_tail;
> +	struct macb *bp = queue->bp;
> +	unsigned int head_idx, tbqp;
> +
> +	if (head == tail)
> +		return;
> +
> +	tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
> +	tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
> +	head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head));
> +
> +	if (tbqp == head_idx)
> +		return;
> +
> +	spin_lock_irq(&bp->lock);
> +	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
> +	spin_unlock_irq(&bp->lock);
> +}
> +
>  static int macb_poll(struct napi_struct *napi, int budget)
>  {
>  	struct macb_queue *queue = container_of(napi, struct macb_queue, napi);
>  	struct macb *bp = queue->bp;
>  	int work_done;
>  
> +	macb_tx_complete(queue, budget);
> +
> +	rmb(); // ensure txubr_pending is up to date
> +	if (queue->txubr_pending) {
> +		queue->txubr_pending = false;
> +		netdev_vdbg(bp->dev, "poll: tx restart\n");
> +		macb_tx_restart(queue);
> +	}
> +
>  	work_done = bp->macbgem_ops.mog_rx(queue, napi, budget);
>  
>  	netdev_vdbg(bp->dev, "poll: queue = %u, work_done = %d, budget = %d\n",
>  		    (unsigned int)(queue - bp->queues), work_done, budget);
>  
>  	if (work_done < budget && napi_complete_done(napi, work_done)) {
> -		queue_writel(queue, IER, bp->rx_intr_mask);
> +		queue_writel(queue, IER, bp->rx_intr_mask |
> +					 MACB_BIT(TCOMP));
>  
>  		/* Packet completions only seem to propagate to raise
>  		 * interrupts when interrupts are enabled at the time, so if
> -		 * packets were received while interrupts were disabled,
> +		 * packets were sent/received while interrupts were disabled,
>  		 * they will not cause another interrupt to be generated when
>  		 * interrupts are re-enabled.
>  		 * Check for this case here to avoid losing a wakeup. This can
> @@ -1593,10 +1629,13 @@ static int macb_poll(struct napi_struct *napi, int budget)
>  		 * actions if an interrupt is raised just after enabling them,
>  		 * but this should be harmless.
>  		 */
> -		if (macb_rx_pending(queue)) {
> -			queue_writel(queue, IDR, bp->rx_intr_mask);
> +		if (macb_rx_pending(queue) ||
> +		    macb_tx_complete_pending(queue)) {
> +			queue_writel(queue, IDR, bp->rx_intr_mask |
> +						 MACB_BIT(TCOMP));
>  			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> -				queue_writel(queue, ISR, MACB_BIT(RCOMP));
> +				queue_writel(queue, ISR, MACB_BIT(RCOMP) |
> +							 MACB_BIT(TCOMP));
>  			netdev_vdbg(bp->dev, "poll: packets pending, reschedule\n");
>  			napi_schedule(napi);
>  		}
> @@ -1646,29 +1685,6 @@ static void macb_hresp_error_task(struct tasklet_struct *t)
>  	netif_tx_start_all_queues(dev);
>  }
>  
> -static void macb_tx_restart(struct macb_queue *queue)
> -{
> -	unsigned int head = queue->tx_head;
> -	unsigned int tail = queue->tx_tail;
> -	struct macb *bp = queue->bp;
> -	unsigned int head_idx, tbqp;
> -
> -	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> -		queue_writel(queue, ISR, MACB_BIT(TXUBR));
> -
> -	if (head == tail)
> -		return;
> -
> -	tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
> -	tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
> -	head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head));
> -
> -	if (tbqp == head_idx)
> -		return;
> -
> -	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
> -}
> -
>  static irqreturn_t macb_wol_interrupt(int irq, void *dev_id)
>  {
>  	struct macb_queue *queue = dev_id;
> @@ -1754,19 +1770,29 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
>  			    (unsigned int)(queue - bp->queues),
>  			    (unsigned long)status);
>  
> -		if (status & bp->rx_intr_mask) {
> -			/* There's no point taking any more interrupts
> -			 * until we have processed the buffers. The
> +		if (status & (bp->rx_intr_mask |
> +			      MACB_BIT(TCOMP) |
> +			      MACB_BIT(TXUBR))) {
> +			/* There's no point taking any more RX/TX completion
> +			 * interrupts until we have processed the buffers. The
>  			 * scheduling call may fail if the poll routine
>  			 * is already scheduled, so disable interrupts
>  			 * now.
>  			 */
> -			queue_writel(queue, IDR, bp->rx_intr_mask);
> +			queue_writel(queue, IDR, bp->rx_intr_mask |
> +						 MACB_BIT(TCOMP));
>  			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> -				queue_writel(queue, ISR, MACB_BIT(RCOMP));
> +				queue_writel(queue, ISR, MACB_BIT(RCOMP) |
> +							 MACB_BIT(TCOMP) |> +							 MACB_BIT(TXUBR))> +
> +			if (status & MACB_BIT(TXUBR)) {
> +				queue->txubr_pending = true;
> +				wmb(); // ensure softirq can see update
> +			}
>  
>  			if (napi_schedule_prep(&queue->napi)) {
> -				netdev_vdbg(bp->dev, "scheduling RX softirq\n");
> +				netdev_vdbg(bp->dev, "scheduling NAPI softirq\n");
>  				__napi_schedule(&queue->napi);
>  			}
>  		}
> @@ -1781,12 +1807,6 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
>  			break;
>  		}
>  
> -		if (status & MACB_BIT(TCOMP))
> -			macb_tx_interrupt(queue);
> -
> -		if (status & MACB_BIT(TXUBR))
> -			macb_tx_restart(queue);
> -
>  		/* Link change detection isn't possible with RMII, so we'll
>  		 * add that if/when we get our hands on a full-blown MII PHY.
>  		 */
> @@ -2019,7 +2039,7 @@ static unsigned int macb_tx_map(struct macb *bp,
>  	for (i = queue->tx_head; i != tx_head; i++) {
>  		tx_skb = macb_tx_skb(queue, i);
>  
> -		macb_tx_unmap(bp, tx_skb);
> +		macb_tx_unmap(bp, tx_skb, 0);
>  	}
>  
>  	return 0;