Message-ID: <cc63933a-24e8-bf83-a214-57e3810d640f@vaisala.com>
Date: Mon, 9 May 2022 08:49:46 +0300
From: Tomas Melin <tomas.melin@...sala.com>
To: Robert Hancock <robert.hancock@...ian.com>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>
Cc: "nicolas.ferre@...rochip.com" <nicolas.ferre@...rochip.com>,
"davem@...emloft.net" <davem@...emloft.net>,
"claudiu.beznea@...rochip.com" <claudiu.beznea@...rochip.com>,
"kuba@...nel.org" <kuba@...nel.org>,
"pabeni@...hat.com" <pabeni@...hat.com>,
"edumazet@...gle.com" <edumazet@...gle.com>
Subject: Re: [PATCH net-next 2/2] net: macb: use NAPI for TX completion path
Hi Robert,
On 03/05/2022 20:07, Robert Hancock wrote:
> On Tue, 2022-05-03 at 12:57 +0300, Tomas Melin wrote:
>> Hi,
>>
>>
>> On 30/04/2022 01:31, Robert Hancock wrote:
>>> This driver was using the TX IRQ handler to perform all TX completion
>>> tasks. Under heavy TX network load, this can cause significant irqs-off
>>> latencies (found to be in the hundreds of microseconds using ftrace).
>>> This can cause other issues, such as overrunning serial UART FIFOs when
>>> using high baud rates with limited UART FIFO sizes.
>>>
>>> Switch to using the NAPI poll handler to perform the TX completion work
>>> to get this out of hard IRQ context and avoid the IRQ latency impact.
>>>
>>> The TX Used Bit Read interrupt (TXUBR) handling also needs to be moved
>>> into the NAPI poll handler to maintain the proper order of operations. A
>>> flag is used to notify the poll handler that a UBR condition needs to be
>>> handled. The macb_tx_restart handler has had some locking added for global
>>> register access, since this could now potentially happen concurrently on
>>> different queues.
>>>
>>> Signed-off-by: Robert Hancock <robert.hancock@...ian.com>
>>> ---
>>> drivers/net/ethernet/cadence/macb.h | 1 +
>>> drivers/net/ethernet/cadence/macb_main.c | 138 +++++++++++++----------
>>> 2 files changed, 80 insertions(+), 59 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
>>> index f0a7d8396a4a..5355cef95a9b 100644
>>> --- a/drivers/net/ethernet/cadence/macb.h
>>> +++ b/drivers/net/ethernet/cadence/macb.h
>>> @@ -1209,6 +1209,7 @@ struct macb_queue {
>>> struct macb_tx_skb *tx_skb;
>>> dma_addr_t tx_ring_dma;
>>> struct work_struct tx_error_task;
>>> + bool txubr_pending;
>>>
>>> dma_addr_t rx_ring_dma;
>>> dma_addr_t rx_buffers_dma;
>>> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
>>> index 160dc5ad84ae..1cb8afb8ef0d 100644
>>> --- a/drivers/net/ethernet/cadence/macb_main.c
>>> +++ b/drivers/net/ethernet/cadence/macb_main.c
>>> @@ -959,7 +959,7 @@ static int macb_halt_tx(struct macb *bp)
>>> return -ETIMEDOUT;
>>> }
>>>
>>> -static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
>>> +static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb, int budget)
>>> {
>>> if (tx_skb->mapping) {
>>> if (tx_skb->mapped_as_page)
>>> @@ -972,7 +972,7 @@ static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb)
>>> }
>>>
>>> if (tx_skb->skb) {
>>> - dev_kfree_skb_any(tx_skb->skb);
>>> + napi_consume_skb(tx_skb->skb, budget);
>>> tx_skb->skb = NULL;
>>> }
>>> }
>>> @@ -1025,12 +1025,13 @@ static void macb_tx_error_task(struct work_struct *work)
>>> (unsigned int)(queue - bp->queues),
>>> queue->tx_tail, queue->tx_head);
>>>
>>> - /* Prevent the queue IRQ handlers from running: each of them may call
>>> - * macb_tx_interrupt(), which in turn may call netif_wake_subqueue().
>>> + /* Prevent the queue NAPI poll from running, as it calls
>>> + * macb_tx_complete(), which in turn may call netif_wake_subqueue().
>>> * As explained below, we have to halt the transmission before updating
>>> * TBQP registers so we call netif_tx_stop_all_queues() to notify the
>>> * network engine about the macb/gem being halted.
>>> */
>>> + napi_disable(&queue->napi);
>>> spin_lock_irqsave(&bp->lock, flags);
>>>
>>> /* Make sure nobody is trying to queue up new packets */
>>> @@ -1058,7 +1059,7 @@ static void macb_tx_error_task(struct work_struct *work)
>>> if (ctrl & MACB_BIT(TX_USED)) {
>>> /* skb is set for the last buffer of the frame */
>>> while (!skb) {
>>> - macb_tx_unmap(bp, tx_skb);
>>> + macb_tx_unmap(bp, tx_skb, 0);
>>> tail++;
>>> tx_skb = macb_tx_skb(queue, tail);
>>> skb = tx_skb->skb;
>>> @@ -1088,7 +1089,7 @@ static void macb_tx_error_task(struct work_struct *work)
>>> desc->ctrl = ctrl | MACB_BIT(TX_USED);
>>> }
>>>
>>> - macb_tx_unmap(bp, tx_skb);
>>> + macb_tx_unmap(bp, tx_skb, 0);
>>> }
>>>
>>> /* Set end of TX queue */
>>> @@ -1118,25 +1119,28 @@ static void macb_tx_error_task(struct work_struct *work)
>>> macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
>>>
>>> spin_unlock_irqrestore(&bp->lock, flags);
>>> + napi_enable(&queue->napi);
>>> }
>>>
>>> -static void macb_tx_interrupt(struct macb_queue *queue)
>>> +static bool macb_tx_complete_pending(struct macb_queue *queue)
>>
>> This might be a somewhat problematic approach, as the TX_USED bit
>> could potentially also appear on a descriptor mid-frame (for whatever
>> reason). Not sure how it should be done, but it would be nice if
>> TCOMP were known on a per-queue basis.
>
> I think this scenario would already have been a problem if it were
> possible - i.e. by the time we detect that the TCOMP interrupt was
> asserted and start processing the ring entries, the hardware has
> likely already started sending the next frame in the ring, so if a
> TX_USED bit were set in the middle of a frame we would potentially
> already be acting on it and likely breaking. So it seems like a safe
> assumption that this doesn't happen, as that would make it very
> difficult for the driver to tell when a frame was actually done.
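>
> To illustrate (rough sketch only, not the exact code in this patch):
> the completion walk is essentially
>
>     for (tail = queue->tx_tail; tail != head; tail++) {
>         struct macb_dma_desc *desc = macb_tx_desc(queue, tail);
>
>         /* Make hw descriptor updates visible to CPU */
>         rmb();
>
>         /* HW is expected to set TX_USED only on the first
>          * descriptor of a fully transmitted frame
>          */
>         if (!(desc->ctrl & MACB_BIT(TX_USED)))
>             break;
>
>         /* ... unmap this frame's buffers and free the skb stored
>          * on its last tx_skb entry ...
>          */
>     }
>
> so a spurious mid-frame TX_USED would make this walk reclaim buffers
> the controller may still own, regardless of how the NAPI scheduling
> check is written.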
The consideration was that TCOMP and TX_USED are separate interrupts,
so TX_USED can be asserted even though there is no TCOMP. Mixing the
terminology could become confusing here. I.e. IMHO tx_complete_pending
should check whether TCOMP is pending, and txubr_pending whether
TX_USED is.
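
For illustration only (and this is exactly the hard part): a name-true
TCOMP check would have to sample the per-queue interrupt status,
something like

    /* hypothetical, not in the patch */
    static bool macb_tcomp_pending(struct macb_queue *queue)
    {
        return !!(queue_readl(queue, ISR) & MACB_BIT(TCOMP));
    }

but ISR is clear-on-read on the variants without
MACB_CAPS_ISR_CLEAR_ON_WRITE, so sampling it like this could consume
events. The check in the patch is really a descriptor TX_USED test,
hence the naming concern.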
>
>>
>> On a related note, the RX-side function is named macb_rx_pending();
>> should this similarly be named macb_tx_pending()?
>
> I thought that naming might be a bit misleading, as in the TX case we
> are not only interested in whether frames are pending (i.e. in the TX
> ring waiting to be sent) but also in whether there are completed
> frames in the ring that have not been processed yet.
Wouldn't tx_pending() describe that better, then? :) (That there are
TX-side operations pending to be processed.)
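
Something like the below, i.e. the same body as in the patch, just
renamed (sketch for illustration):

    static bool macb_tx_pending(struct macb_queue *queue)
    {
        if (queue->tx_head != queue->tx_tail) {
            /* Make hw descriptor updates visible to CPU */
            rmb();

            if (macb_tx_desc(queue, queue->tx_tail)->ctrl &
                MACB_BIT(TX_USED))
                return true;
        }
        return false;
    }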
Thanks,
Tomas
>
>>
>>
>> Thanks,
>> Tomas
>>
>>
>>> +{
>>> + if (queue->tx_head != queue->tx_tail) {
>>> + /* Make hw descriptor updates visible to CPU */
>>> + rmb();
>>> +
>>> + if (macb_tx_desc(queue, queue->tx_tail)->ctrl & MACB_BIT(TX_USED))
>>> + return true;
>>> + }
>>> + return false;
>>> +}
>>> +
>>> +static void macb_tx_complete(struct macb_queue *queue, int budget)
>>> {
>>> unsigned int tail;
>>> unsigned int head;
>>> - u32 status;
>>> struct macb *bp = queue->bp;
>>> u16 queue_index = queue - bp->queues;
>>>
>>> - status = macb_readl(bp, TSR);
>>> - macb_writel(bp, TSR, status);
>>> -
>>> - if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
>>> - queue_writel(queue, ISR, MACB_BIT(TCOMP));
>>> -
>>> - netdev_vdbg(bp->dev, "macb_tx_interrupt status = 0x%03lx\n",
>>> - (unsigned long)status);
>>> -
>>> head = queue->tx_head;
>>> for (tail = queue->tx_tail; tail != head; tail++) {
>>> struct macb_tx_skb *tx_skb;
>>> @@ -1182,7 +1186,7 @@ static void macb_tx_interrupt(struct macb_queue *queue)
>>> }
>>>
>>> /* Now we can safely release resources */
>>> - macb_tx_unmap(bp, tx_skb);
>>> + macb_tx_unmap(bp, tx_skb, budget);
>>>
>>> /* skb is set only for the last buffer of the frame.
>>> * WARNING: at this point skb has been freed by
>>> @@ -1569,23 +1573,55 @@ static bool macb_rx_pending(struct macb_queue *queue)
>>> return (desc->addr & MACB_BIT(RX_USED)) != 0;
>>> }
>>>
>>> +static void macb_tx_restart(struct macb_queue *queue)
>>> +{
>>> + unsigned int head = queue->tx_head;
>>> + unsigned int tail = queue->tx_tail;
>>> + struct macb *bp = queue->bp;
>>> + unsigned int head_idx, tbqp;
>>> +
>>> + if (head == tail)
>>> + return;
>>> +
>>> + tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
>>> + tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
>>> + head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head));
>>> +
>>> + if (tbqp == head_idx)
>>> + return;
>>> +
>>> + spin_lock_irq(&bp->lock);
>>> + macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
>>> + spin_unlock_irq(&bp->lock);
>>> +}
>>> +
>>> static int macb_poll(struct napi_struct *napi, int budget)
>>> {
>>> struct macb_queue *queue = container_of(napi, struct macb_queue, napi);
>>> struct macb *bp = queue->bp;
>>> int work_done;
>>>
>>> + macb_tx_complete(queue, budget);
>>> +
>>> + rmb(); // ensure txubr_pending is up to date
>>> + if (queue->txubr_pending) {
>>> + queue->txubr_pending = false;
>>> + netdev_vdbg(bp->dev, "poll: tx restart\n");
>>> + macb_tx_restart(queue);
>>> + }
>>> +
>>> work_done = bp->macbgem_ops.mog_rx(queue, napi, budget);
>>>
>>> netdev_vdbg(bp->dev, "poll: queue = %u, work_done = %d, budget = %d\n",
>>> (unsigned int)(queue - bp->queues), work_done, budget);
>>>
>>> if (work_done < budget && napi_complete_done(napi, work_done)) {
>>> - queue_writel(queue, IER, bp->rx_intr_mask);
>>> + queue_writel(queue, IER, bp->rx_intr_mask |
>>> + MACB_BIT(TCOMP));
>>>
>>> /* Packet completions only seem to propagate to raise
>>> * interrupts when interrupts are enabled at the time, so if
>>> - * packets were received while interrupts were disabled,
>>> + * packets were sent/received while interrupts were disabled,
>>> * they will not cause another interrupt to be generated when
>>> * interrupts are re-enabled.
>>> * Check for this case here to avoid losing a wakeup. This can
>>> @@ -1593,10 +1629,13 @@ static int macb_poll(struct napi_struct *napi, int budget)
>>> * actions if an interrupt is raised just after enabling them,
>>> * but this should be harmless.
>>> */
>>> - if (macb_rx_pending(queue)) {
>>> - queue_writel(queue, IDR, bp->rx_intr_mask);
>>> + if (macb_rx_pending(queue) ||
>>> + macb_tx_complete_pending(queue)) {
>>> + queue_writel(queue, IDR, bp->rx_intr_mask |
>>> + MACB_BIT(TCOMP));
>>> if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
>>> - queue_writel(queue, ISR, MACB_BIT(RCOMP));
>>> + queue_writel(queue, ISR, MACB_BIT(RCOMP) |
>>> + MACB_BIT(TCOMP));
>>> netdev_vdbg(bp->dev, "poll: packets pending,
>>> reschedule\n");
>>> napi_schedule(napi);
>>> }
>>> @@ -1646,29 +1685,6 @@ static void macb_hresp_error_task(struct tasklet_struct *t)
>>> netif_tx_start_all_queues(dev);
>>> }
>>>
>>> -static void macb_tx_restart(struct macb_queue *queue)
>>> -{
>>> - unsigned int head = queue->tx_head;
>>> - unsigned int tail = queue->tx_tail;
>>> - struct macb *bp = queue->bp;
>>> - unsigned int head_idx, tbqp;
>>> -
>>> - if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
>>> - queue_writel(queue, ISR, MACB_BIT(TXUBR));
>>> -
>>> - if (head == tail)
>>> - return;
>>> -
>>> - tbqp = queue_readl(queue, TBQP) / macb_dma_desc_get_size(bp);
>>> - tbqp = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, tbqp));
>>> - head_idx = macb_adj_dma_desc_idx(bp, macb_tx_ring_wrap(bp, head));
>>> -
>>> - if (tbqp == head_idx)
>>> - return;
>>> -
>>> - macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
>>> -}
>>> -
>>> static irqreturn_t macb_wol_interrupt(int irq, void *dev_id)
>>> {
>>> struct macb_queue *queue = dev_id;
>>> @@ -1754,19 +1770,29 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
>>> (unsigned int)(queue - bp->queues),
>>> (unsigned long)status);
>>>
>>> - if (status & bp->rx_intr_mask) {
>>> - /* There's no point taking any more interrupts
>>> - * until we have processed the buffers. The
>>> + if (status & (bp->rx_intr_mask |
>>> + MACB_BIT(TCOMP) |
>>> + MACB_BIT(TXUBR))) {
>>> + /* There's no point taking any more RX/TX completion
>>> + * interrupts until we have processed the buffers. The
>>> * scheduling call may fail if the poll routine
>>> * is already scheduled, so disable interrupts
>>> * now.
>>> */
>>> - queue_writel(queue, IDR, bp->rx_intr_mask);
>>> + queue_writel(queue, IDR, bp->rx_intr_mask |
>>> + MACB_BIT(TCOMP));
>>> if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE)
>>> - queue_writel(queue, ISR, MACB_BIT(RCOMP));
>>> + queue_writel(queue, ISR, MACB_BIT(RCOMP) |
>>> + MACB_BIT(TCOMP) |
>>> + MACB_BIT(TXUBR));
>>> +
>>> + if (status & MACB_BIT(TXUBR)) {
>>> + queue->txubr_pending = true;
>>> + wmb(); // ensure softirq can see update
>>> + }
>>>
>>> if (napi_schedule_prep(&queue->napi)) {
>>> - netdev_vdbg(bp->dev, "scheduling RX
>>> softirq\n");
>>> + netdev_vdbg(bp->dev, "scheduling NAPI
>>> softirq\n");
>>> __napi_schedule(&queue->napi);
>>> }
>>> }
>>> @@ -1781,12 +1807,6 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
>>> break;
>>> }
>>>
>>> - if (status & MACB_BIT(TCOMP))
>>> - macb_tx_interrupt(queue);
>>> -
>>> - if (status & MACB_BIT(TXUBR))
>>> - macb_tx_restart(queue);
>>> -
>>> /* Link change detection isn't possible with RMII, so we'll
>>> * add that if/when we get our hands on a full-blown MII PHY.
>>> */
>>> @@ -2019,7 +2039,7 @@ static unsigned int macb_tx_map(struct macb *bp,
>>> for (i = queue->tx_head; i != tx_head; i++) {
>>> tx_skb = macb_tx_skb(queue, i);
>>>
>>> - macb_tx_unmap(bp, tx_skb);
>>> + macb_tx_unmap(bp, tx_skb, 0);
>>> }
>>>
>>> return 0;