lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Mon, 9 May 2022 18:00:21 +0000
From:   Robert Hancock <robert.hancock@...ian.com>
To:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "tomas.melin@...sala.com" <tomas.melin@...sala.com>
CC:     "nicolas.ferre@...rochip.com" <nicolas.ferre@...rochip.com>,
        "davem@...emloft.net" <davem@...emloft.net>,
        "claudiu.beznea@...rochip.com" <claudiu.beznea@...rochip.com>,
        "kuba@...nel.org" <kuba@...nel.org>,
        "pabeni@...hat.com" <pabeni@...hat.com>,
        "edumazet@...gle.com" <edumazet@...gle.com>
Subject: Re: [PATCH net-next 2/2] net: macb: use NAPI for TX completion path

On Mon, 2022-05-09 at 08:49 +0300, Tomas Melin wrote:
> Hi Robert,
> 
> On 03/05/2022 20:07, Robert Hancock wrote:
> > On Tue, 2022-05-03 at 12:57 +0300, Tomas Melin wrote:
> > > Hi,
> > > 
> > > 
> > > On 30/04/2022 01:31, Robert Hancock wrote:
> > > > This driver was using the TX IRQ handler to perform all TX completion
> > > > tasks. Under heavy TX network load, this can cause significant irqs-off
> > > > latencies (found to be in the hundreds of microseconds using ftrace).
> > > > This can cause other issues, such as overrunning serial UART FIFOs when
> > > > using high baud rates with limited UART FIFO sizes.
> > > > 
> > > > Switch to using the NAPI poll handler to perform the TX completion work
> > > > to get this out of hard IRQ context and avoid the IRQ latency impact.
> > > > 
> > > > The TX Used Bit Read interrupt (TXUBR) handling also needs to be moved
> > > > into the NAPI poll handler to maintain the proper order of operations.
> > > > A
> > > > flag is used to notify the poll handler that a UBR condition needs to
> > > > be
> > > > handled. The macb_tx_restart handler has had some locking added for
> > > > global
> > > > register access, since this could now potentially happen concurrently
> > > > on
> > > > different queues.
> > > > 
> > > > Signed-off-by: Robert Hancock <robert.hancock@...ian.com>
> > > > ---
> > > >  drivers/net/ethernet/cadence/macb.h      |   1 +
> > > >  drivers/net/ethernet/cadence/macb_main.c | 138 +++++++++++++----------
> > > >  2 files changed, 80 insertions(+), 59 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/cadence/macb.h
> > > > b/drivers/net/ethernet/cadence/macb.h
> > > > index f0a7d8396a4a..5355cef95a9b 100644
> > > > --- a/drivers/net/ethernet/cadence/macb.h
> > > > +++ b/drivers/net/ethernet/cadence/macb.h
> > > > @@ -1209,6 +1209,7 @@ struct macb_queue {
> > > >  	struct macb_tx_skb	*tx_skb;
> > > >  	dma_addr_t		tx_ring_dma;
> > > >  	struct work_struct	tx_error_task;
> > > > +	bool			txubr_pending;
> > > >  
> > > >  	dma_addr_t		rx_ring_dma;
> > > >  	dma_addr_t		rx_buffers_dma;
> > > > diff --git a/drivers/net/ethernet/cadence/macb_main.c
> > > > b/drivers/net/ethernet/cadence/macb_main.c
> > > > index 160dc5ad84ae..1cb8afb8ef0d 100644
> > > > --- a/drivers/net/ethernet/cadence/macb_main.c
> > > > +++ b/drivers/net/ethernet/cadence/macb_main.c
> > > > @@ -959,7 +959,7 @@ static int macb_halt_tx(struct macb *bp)
> > > >  	return -ETIMEDOUT;
> > > >  }
> > > >  
> > > > -static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
> > > > +static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb,
> > > > int
> > > > budget)
> > > >  {
> > > >  	if (tx_skb->mapping) {
> > > >  		if (tx_skb->mapped_as_page)
> > > > @@ -972,7 +972,7 @@ static void macb_tx_unmap(struct macb *bp, struct
> > > > macb_tx_skb *tx_skb)
> > > >  	}
> > > >  
> > > >  	if (tx_skb->skb) {
> > > > -		dev_kfree_skb_any(tx_skb->skb);
> > > > +		napi_consume_skb(tx_skb->skb, budget);
> > > >  		tx_skb->skb = NULL;
> > > >  	}
> > > >  }
> > > > @@ -1025,12 +1025,13 @@ static void macb_tx_error_task(struct
> > > > work_struct
> > > > *work)
> > > >  		    (unsigned int)(queue - bp->queues),
> > > >  		    queue->tx_tail, queue->tx_head);
> > > >  
> > > > -	/* Prevent the queue IRQ handlers from running: each of them
> > > > may call
> > > > -	 * macb_tx_interrupt(), which in turn may call
> > > > netif_wake_subqueue().
> > > > +	/* Prevent the queue NAPI poll from running, as it calls
> > > > +	 * macb_tx_complete(), which in turn may call
> > > > netif_wake_subqueue().
> > > >  	 * As explained below, we have to halt the transmission before
> > > > updating
> > > >  	 * TBQP registers so we call netif_tx_stop_all_queues() to
> > > > notify the
> > > >  	 * network engine about the macb/gem being halted.
> > > >  	 */
> > > > +	napi_disable(&queue->napi);
> > > >  	spin_lock_irqsave(&bp->lock, flags);
> > > >  
> > > >  	/* Make sure nobody is trying to queue up new packets */
> > > > @@ -1058,7 +1059,7 @@ static void macb_tx_error_task(struct work_struct
> > > > *work)
> > > >  		if (ctrl & MACB_BIT(TX_USED)) {
> > > >  			/* skb is set for the last buffer of the frame
> > > > */
> > > >  			while (!skb) {
> > > > -				macb_tx_unmap(bp, tx_skb);
> > > > +				macb_tx_unmap(bp, tx_skb, 0);
> > > >  				tail++;
> > > >  				tx_skb = macb_tx_skb(queue, tail);
> > > >  				skb = tx_skb->skb;
> > > > @@ -1088,7 +1089,7 @@ static void macb_tx_error_task(struct work_struct
> > > > *work)
> > > >  			desc->ctrl = ctrl | MACB_BIT(TX_USED);
> > > >  		}
> > > >  
> > > > -		macb_tx_unmap(bp, tx_skb);
> > > > +		macb_tx_unmap(bp, tx_skb, 0);
> > > >  	}
> > > >  
> > > >  	/* Set end of TX queue */
> > > > @@ -1118,25 +1119,28 @@ static void macb_tx_error_task(struct
> > > > work_struct
> > > > *work)
> > > >  	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
> > > >  
> > > >  	spin_unlock_irqrestore(&bp->lock, flags);
> > > > +	napi_enable(&queue->napi);
> > > >  }
> > > >  
> > > > -static void macb_tx_interrupt(struct macb_queue *queue)
> > > > +static bool macb_tx_complete_pending(struct macb_queue *queue)
> > > 
> > > This might be somewhat problematic approach as TX_USED bit could
> > > potentially also be set if a descriptor with TX_USED is found mid-frame
> > > (for whatever reason).
> > > Not sure how it should be done, but it would be nice if TCOMP
> > > would rather be known on per queue basis.
> > 
> > I think this scenario would already have been a problem if it were possible
> > -
> > i.e. when we detect the TCOMP interrupt was asserted, it has likely already
> > started sending the next frame in the ring when we are processing the ring
> > entries, so if a TX_USED bit was set in the middle of a frame we would
> > potentially already have been acting on it and likely breaking? So it seems
> > like a safe assumption that this doesn't happen, as that would make it
> > very
> > difficult for the driver to tell when a frame was actually done or not..
> The consideration was that TCOMP and TX_USED are as such separate
> interrupts, so TX_USED can be asserted even though there is no TCOMP.
> 
> Mixing terminology could become confusing here. I.e. IMHO
> tx_complete_pending should check if there is TCOMP pending and
> txubr_pending TX_USED.
> 
> > > Related note, rx side function seems to be named as macb_rx_pending(),
> > > should be similarly macb_tx_pending()?
> > 
> > I thought that naming might be a bit misleading, as in the TX case we are
> > not
> > only interested in whether frames are pending (i.e. in the TX ring to be
> > sent)
> > but also there are completed frames in the ring that have not been
> > processed
> > yet.
> 
> Wouldn't tx_pending() describe that better then? :) (That there is tx
> side operations pending to be processed.)

I think my explanation was unclear: in this case we only care about whether TX
packets have been completed. We don't care about incomplete TX packets that are
still in the ring which could also be considered "pending".

> 
> Thanks,
> Tomas
> 
> 
> 
> > > 
> > > Thanks,
> > > Tomas
> > > 
> > > 
> > > > +{
> > > > +	if (queue->tx_head != queue->tx_tail) {
> > > > +		/* Make hw descriptor updates visible to CPU */
> > > > +		rmb();
> > > > +
> > > > +		if (macb_tx_desc(queue, queue->tx_tail)->ctrl &
> > > > MACB_BIT(TX_USED))
> > > > +			return true;
> > > > +	}
> > > > +	return false;
> > > > +}
> > > > +
> > > > +static void macb_tx_complete(struct macb_queue *queue, int budget)
> > > >  {
> > > >  	unsigned int tail;
> > > >  	unsigned int head;
> > > > -	u32 status;
> > > >  	struct macb *bp = queue->bp;
> > > >  	u16 queue_index = queue - bp->queues;
> > > >  
> > > > -	status = macb_readl(bp, TSR);
> > > > -	macb_writel(bp, TSR, status);
> > > > -
> > > > -	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> > > > -		queue_writel(queue, ISR, MACB_BIT(TCOMP));
> > > > -
> > > > -	netdev_vdbg(bp->dev, "macb_tx_interrupt status = 0x%03lx\n",
> > > > -		    (unsigned long)status);
> > > > -
> > > >  	head = queue->tx_head;
> > > >  	for (tail = queue->tx_tail; tail != head; tail++) {
> > > >  		struct macb_tx_skb	*tx_skb;
> > > > @@ -1182,7 +1186,7 @@ static void macb_tx_interrupt(struct macb_queue
> > > > *queue)
> > > >  			}
> > > >  
> > > >  			/* Now we can safely release resources */
> > > > -			macb_tx_unmap(bp, tx_skb);
> > > > +			macb_tx_unmap(bp, tx_skb, budget);
> > > >  
> > > >  			/* skb is set only for the last buffer of the
> > > > frame.
> > > >  			 * WARNING: at this point skb has been freed by
> > > > @@ -1569,23 +1573,55 @@ static bool macb_rx_pending(struct macb_queue
> > > > *queue)
> > > >  	return (desc->addr & MACB_BIT(RX_USED)) != 0;
> > > >  }
> > > >  
> > > > +static void macb_tx_restart(struct macb_queue *queue)
> > > > +{
> > > > +	unsigned int head = queue->tx_head;
> > > > +	unsigned int tail = queue->tx_tail;
> > > > +	struct macb *bp = queue->bp;
> > > > +	unsigned int head_idx, tbqp;
> > > > +
> > > > +	if (head == tail)
> > > > +		return;
> > > > +
> > > > +	tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
> > > > +	tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
> > > > +	head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp,
> > > > head));
> > > > +
> > > > +	if (tbqp == head_idx)
> > > > +		return;
> > > > +
> > > > +	spin_lock_irq(&bp->lock);
> > > > +	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
> > > > +	spin_unlock_irq(&bp->lock);
> > > > +}
> > > > +
> > > >  static int macb_poll(struct napi_struct *napi, int budget)
> > > >  {
> > > >  	struct macb_queue *queue = container_of(napi, struct
> > > > macb_queue, napi);
> > > >  	struct macb *bp = queue->bp;
> > > >  	int work_done;
> > > >  
> > > > +	macb_tx_complete(queue, budget);
> > > > +
> > > > +	rmb(); // ensure txubr_pending is up to date
> > > > +	if (queue->txubr_pending) {
> > > > +		queue->txubr_pending = false;
> > > > +		netdev_vdbg(bp->dev, "poll: tx restart\n");
> > > > +		macb_tx_restart(queue);
> > > > +	}
> > > > +
> > > >  	work_done = bp->macbgem_ops.mog_rx(queue, napi, budget);
> > > >  
> > > >  	netdev_vdbg(bp->dev, "poll: queue = %u, work_done = %d, budget
> > > > = %d\n",
> > > >  		    (unsigned int)(queue - bp->queues), work_done,
> > > > budget);
> > > >  
> > > >  	if (work_done < budget && napi_complete_done(napi, work_done))
> > > > {
> > > > -		queue_writel(queue, IER, bp->rx_intr_mask);
> > > > +		queue_writel(queue, IER, bp->rx_intr_mask |
> > > > +					 MACB_BIT(TCOMP));
> > > >  
> > > >  		/* Packet completions only seem to propagate to raise
> > > >  		 * interrupts when interrupts are enabled at the time,
> > > > so if
> > > > -		 * packets were received while interrupts were
> > > > disabled,
> > > > +		 * packets were sent/received while interrupts were
> > > > disabled,
> > > >  		 * they will not cause another interrupt to be
> > > > generated when
> > > >  		 * interrupts are re-enabled.
> > > >  		 * Check for this case here to avoid losing a wakeup.
> > > > This can
> > > > @@ -1593,10 +1629,13 @@ static int macb_poll(struct napi_struct *napi,
> > > > int
> > > > budget)
> > > >  		 * actions if an interrupt is raised just after
> > > > enabling them,
> > > >  		 * but this should be harmless.
> > > >  		 */
> > > > -		if (macb_rx_pending(queue)) {
> > > > -			queue_writel(queue, IDR, bp->rx_intr_mask);
> > > > +		if (macb_rx_pending(queue) ||
> > > > +		    macb_tx_complete_pending(queue)) {
> > > > +			queue_writel(queue, IDR, bp->rx_intr_mask |
> > > > +						 MACB_BIT(TCOMP));
> > > >  			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> > > > -				queue_writel(queue, ISR,
> > > > MACB_BIT(RCOMP));
> > > > +				queue_writel(queue, ISR,
> > > > MACB_BIT(RCOMP) |
> > > > +							 MACB_BIT(TCOMP
> > > > ));
> > > >  			netdev_vdbg(bp->dev, "poll: packets pending,
> > > > reschedule\n");
> > > >  			napi_schedule(napi);
> > > >  		}
> > > > @@ -1646,29 +1685,6 @@ static void macb_hresp_error_task(struct
> > > > tasklet_struct *t)
> > > >  	netif_tx_start_all_queues(dev);
> > > >  }
> > > >  
> > > > -static void macb_tx_restart(struct macb_queue *queue)
> > > > -{
> > > > -	unsigned int head = queue->tx_head;
> > > > -	unsigned int tail = queue->tx_tail;
> > > > -	struct macb *bp = queue->bp;
> > > > -	unsigned int head_idx, tbqp;
> > > > -
> > > > -	if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> > > > -		queue_writel(queue, ISR, MACB_BIT(TXUBR));
> > > > -
> > > > -	if (head == tail)
> > > > -		return;
> > > > -
> > > > -	tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
> > > > -	tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
> > > > -	head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp,
> > > > head));
> > > > -
> > > > -	if (tbqp == head_idx)
> > > > -		return;
> > > > -
> > > > -	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
> > > > -}
> > > > -
> > > >  static irqreturn_t macb_wol_interrupt(int irq, void *dev_id)
> > > >  {
> > > >  	struct macb_queue *queue = dev_id;
> > > > @@ -1754,19 +1770,29 @@ static irqreturn_t macb_interrupt(int irq, void
> > > > *dev_id)
> > > >  			    (unsigned int)(queue - bp->queues),
> > > >  			    (unsigned long)status);
> > > >  
> > > > -		if (status & bp->rx_intr_mask) {
> > > > -			/* There's no point taking any more interrupts
> > > > -			 * until we have processed the buffers. The
> > > > +		if (status & (bp->rx_intr_mask |
> > > > +			      MACB_BIT(TCOMP) |
> > > > +			      MACB_BIT(TXUBR))) {
> > > > +			/* There's no point taking any more RX/TX
> > > > completion
> > > > +			 * interrupts until we have processed the
> > > > buffers. The
> > > >  			 * scheduling call may fail if the poll routine
> > > >  			 * is already scheduled, so disable interrupts
> > > >  			 * now.
> > > >  			 */
> > > > -			queue_writel(queue, IDR, bp->rx_intr_mask);
> > > > +			queue_writel(queue, IDR, bp->rx_intr_mask |
> > > > +						 MACB_BIT(TCOMP));
> > > >  			if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
> > > > -				queue_writel(queue, ISR,
> > > > MACB_BIT(RCOMP));
> > > > +				queue_writel(queue, ISR,
> > > > MACB_BIT(RCOMP) |
> > > > +							 MACB_BIT(TCOMP
> > > > ) |
> > > > +							 MACB_BIT(TXUBR));
> > > > +
> > > > +			if (status & MACB_BIT(TXUBR)) {
> > > > +				queue->txubr_pending = true;
> > > > +				wmb(); // ensure softirq can see update
> > > > +			}
> > > >  
> > > >  			if (napi_schedule_prep(&queue->napi)) {
> > > > -				netdev_vdbg(bp->dev, "scheduling RX
> > > > softirq\n");
> > > > +				netdev_vdbg(bp->dev, "scheduling NAPI
> > > > softirq\n");
> > > >  				__napi_schedule(&queue->napi);
> > > >  			}
> > > >  		}
> > > > @@ -1781,12 +1807,6 @@ static irqreturn_t macb_interrupt(int irq, void
> > > > *dev_id)
> > > >  			break;
> > > >  		}
> > > >  
> > > > -		if (status & MACB_BIT(TCOMP))
> > > > -			macb_tx_interrupt(queue);
> > > > -
> > > > -		if (status & MACB_BIT(TXUBR))
> > > > -			macb_tx_restart(queue);
> > > > -
> > > >  		/* Link change detection isn't possible with RMII, so
> > > > we'll
> > > >  		 * add that if/when we get our hands on a full-blown
> > > > MII PHY.
> > > >  		 */
> > > > @@ -2019,7 +2039,7 @@ static unsigned int macb_tx_map(struct macb *bp,
> > > >  	for (i = queue->tx_head; i != tx_head; i++) {
> > > >  		tx_skb = macb_tx_skb(queue, i);
> > > >  
> > > > -		macb_tx_unmap(bp, tx_skb);
> > > > +		macb_tx_unmap(bp, tx_skb, 0);
> > > >  	}
> > > >  
> > > >  	return 0;
-- 
Robert Hancock
Senior Hardware Designer, Calian Advanced Technologies
www.calian.com

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ