[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAL+tcoACfOd=TL4xpg114O+feBf1=dd2r5v_WT+6MB8QUDmJ+Q@mail.gmail.com>
Date: Tue, 22 Jul 2025 07:58:14 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Jeroen de Borst <jeroendb@...gle.com>
Cc: netdev@...r.kernel.org, hramamurthy@...gle.com, davem@...emloft.net,
edumazet@...gle.com, kuba@...nel.org, willemb@...gle.com, pabeni@...hat.com,
Joshua Washington <joshwash@...gle.com>, Praveen Kaligineedi <pkaligineedi@...gle.com>
Subject: Re: [PATCH net-next v2 4/5] gve: implement DQO TX datapath for AF_XDP zero-copy
On Tue, Jul 22, 2025 at 7:37 AM Jason Xing <kerneljasonxing@...il.com> wrote:
>
> Hi Jeroen,
>
> On Thu, Jul 17, 2025 at 11:29 PM Jeroen de Borst <jeroendb@...gle.com> wrote:
> >
> > From: Joshua Washington <joshwash@...gle.com>
> >
> > In the descriptor clean path, a number of changes need to be made to
> > accommodate out of order completions and double completions.
> >
> > The XSK stack can only handle completions being processed in order, as a
> > single counter is incremented in xsk_tx_completed to sigify how many XSK
> > descriptors have been completed. Because completions can come back out
> > of order in DQ, a separate queue of XSK descriptors must be maintained.
> > This queue keeps the pending packets in the order that they were written
> > so that the descriptors can be counted in xsk_tx_completed in the same
> > order.
> >
> > For double completions, a new pending packet state and type are
> > introduced. The new type, GVE_TX_PENDING_PACKET_DQO_XSK, plays an
> > anlogous role to pre-existing _SKB and _XDP_FRAME pending packet types
> > for XSK descriptors. The new state, GVE_PACKET_STATE_XSK_COMPLETE,
> > represents packets for which no more completions are expected. This
> > includes packets which have received a packet completion or reinjection
> > completion, as well as packets whose reinjection completion timer have
> > timed out. At this point, such packets can be counted as part of
> > xsk_tx_completed() and freed.
> >
> > Reviewed-by: Willem de Bruijn <willemb@...gle.com>
> > Signed-off-by: Praveen Kaligineedi <pkaligineedi@...gle.com>
> > Signed-off-by: Joshua Washington <joshwash@...gle.com>
> > Signed-off-by: Jeroen de Borst <jeroendb@...gle.com>
> > ---
> > drivers/net/ethernet/google/gve/gve.h | 19 ++-
> > drivers/net/ethernet/google/gve/gve_dqo.h | 1 +
> > drivers/net/ethernet/google/gve/gve_main.c | 6 +
> > drivers/net/ethernet/google/gve/gve_tx_dqo.c | 148 +++++++++++++++++++
> > 4 files changed, 171 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
> > index 9925c08e595e..ff7dc06e7fa4 100644
> > --- a/drivers/net/ethernet/google/gve/gve.h
> > +++ b/drivers/net/ethernet/google/gve/gve.h
> > @@ -399,11 +399,17 @@ enum gve_packet_state {
> > GVE_PACKET_STATE_PENDING_REINJECT_COMPL,
> > /* No valid completion received within the specified timeout. */
> > GVE_PACKET_STATE_TIMED_OUT_COMPL,
> > + /* XSK pending packet has received a packet/reinjection completion, or
> > + * has timed out. At this point, the pending packet can be counted by
> > + * xsk_tx_complete and freed.
> > + */
> > + GVE_PACKET_STATE_XSK_COMPLETE,
> > };
> >
> > enum gve_tx_pending_packet_dqo_type {
> > GVE_TX_PENDING_PACKET_DQO_SKB,
> > - GVE_TX_PENDING_PACKET_DQO_XDP_FRAME
> > + GVE_TX_PENDING_PACKET_DQO_XDP_FRAME,
> > + GVE_TX_PENDING_PACKET_DQO_XSK,
> > };
> >
> > struct gve_tx_pending_packet_dqo {
> > @@ -440,10 +446,10 @@ struct gve_tx_pending_packet_dqo {
> > /* Identifies the current state of the packet as defined in
> > * `enum gve_packet_state`.
> > */
> > - u8 state : 2;
> > + u8 state : 3;
> >
> > /* gve_tx_pending_packet_dqo_type */
> > - u8 type : 1;
> > + u8 type : 2;
> >
> > /* If packet is an outstanding miss completion, then the packet is
> > * freed if the corresponding re-injection completion is not received
> > @@ -512,6 +518,8 @@ struct gve_tx_ring {
> > /* Cached value of `dqo_compl.free_tx_qpl_buf_cnt` */
> > u32 free_tx_qpl_buf_cnt;
> > };
> > +
> > + atomic_t xsk_reorder_queue_tail;
> > } dqo_tx;
> > };
> >
> > @@ -545,6 +553,9 @@ struct gve_tx_ring {
> > /* Last TX ring index fetched by HW */
> > atomic_t hw_tx_head;
> >
> > + u16 xsk_reorder_queue_head;
> > + u16 xsk_reorder_queue_tail;
> > +
> > /* List to track pending packets which received a miss
> > * completion but not a corresponding reinjection.
> > */
> > @@ -598,6 +609,8 @@ struct gve_tx_ring {
> > struct gve_tx_pending_packet_dqo *pending_packets;
> > s16 num_pending_packets;
> >
> > + u16 *xsk_reorder_queue;
> > +
> > u32 complq_mask; /* complq size is complq_mask + 1 */
> >
> > /* QPL fields */
> > diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h
> > index bb278727f4d9..6eb442096e02 100644
> > --- a/drivers/net/ethernet/google/gve/gve_dqo.h
> > +++ b/drivers/net/ethernet/google/gve/gve_dqo.h
> > @@ -38,6 +38,7 @@ netdev_features_t gve_features_check_dqo(struct sk_buff *skb,
> > netdev_features_t features);
> > bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean);
> > bool gve_xdp_poll_dqo(struct gve_notify_block *block);
> > +bool gve_xsk_tx_poll_dqo(struct gve_notify_block *block, int budget);
> > int gve_rx_poll_dqo(struct gve_notify_block *block, int budget);
> > int gve_tx_alloc_rings_dqo(struct gve_priv *priv,
> > struct gve_tx_alloc_rings_cfg *cfg);
> > diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
> > index d5953f5d1895..c6ccc0bb40c9 100644
> > --- a/drivers/net/ethernet/google/gve/gve_main.c
> > +++ b/drivers/net/ethernet/google/gve/gve_main.c
> > @@ -427,6 +427,12 @@ int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
> >
> > if (block->rx) {
> > work_done = gve_rx_poll_dqo(block, budget);
> > +
> > + /* Poll XSK TX as part of RX NAPI. Setup re-poll based on if
> > + * either datapath has more work to do.
> > + */
> > + if (priv->xdp_prog)
> > + reschedule |= gve_xsk_tx_poll_dqo(block, budget);
> > reschedule |= work_done == budget;
> > }
> >
> > diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
> > index ce5370b741ec..6f1d515673d2 100644
> > --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c
> > +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
> > @@ -13,6 +13,7 @@
> > #include <linux/tcp.h>
> > #include <linux/slab.h>
> > #include <linux/skbuff.h>
> > +#include <net/xdp_sock_drv.h>
> >
> > /* Returns true if tx_bufs are available. */
> > static bool gve_has_free_tx_qpl_bufs(struct gve_tx_ring *tx, int count)
> > @@ -241,6 +242,9 @@ static void gve_tx_free_ring_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
> > tx->dqo.tx_ring = NULL;
> > }
> >
> > + kvfree(tx->dqo.xsk_reorder_queue);
> > + tx->dqo.xsk_reorder_queue = NULL;
> > +
> > kvfree(tx->dqo.pending_packets);
> > tx->dqo.pending_packets = NULL;
> >
> > @@ -345,6 +349,17 @@ static int gve_tx_alloc_ring_dqo(struct gve_priv *priv,
> >
> > tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
> > atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
> > +
> > + /* Only alloc xsk pool for XDP queues */
> > + if (idx >= cfg->qcfg->num_queues && cfg->num_xdp_rings) {
> > + tx->dqo.xsk_reorder_queue =
> > + kvcalloc(tx->dqo.complq_mask + 1,
> > + sizeof(tx->dqo.xsk_reorder_queue[0]),
> > + GFP_KERNEL);
> > + if (!tx->dqo.xsk_reorder_queue)
> > + goto err;
> > + }
> > +
> > tx->dqo_compl.miss_completions.head = -1;
> > tx->dqo_compl.miss_completions.tail = -1;
> > tx->dqo_compl.timed_out_completions.head = -1;
> > @@ -992,6 +1007,38 @@ static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
> > return 0;
> > }
> >
> > +static void gve_xsk_reorder_queue_push_dqo(struct gve_tx_ring *tx,
> > + u16 completion_tag)
> > +{
> > + u32 tail = atomic_read(&tx->dqo_tx.xsk_reorder_queue_tail);
> > +
> > + tx->dqo.xsk_reorder_queue[tail] = completion_tag;
> > + tail = (tail + 1) & tx->dqo.complq_mask;
> > + atomic_set_release(&tx->dqo_tx.xsk_reorder_queue_tail, tail);
> > +}
> > +
> > +static struct gve_tx_pending_packet_dqo *
> > +gve_xsk_reorder_queue_head(struct gve_tx_ring *tx)
> > +{
> > + u32 head = tx->dqo_compl.xsk_reorder_queue_head;
> > +
> > + if (head == tx->dqo_compl.xsk_reorder_queue_tail) {
> > + tx->dqo_compl.xsk_reorder_queue_tail =
> > + atomic_read_acquire(&tx->dqo_tx.xsk_reorder_queue_tail);
> > +
> > + if (head == tx->dqo_compl.xsk_reorder_queue_tail)
> > + return NULL;
> > + }
> > +
> > + return &tx->dqo.pending_packets[tx->dqo.xsk_reorder_queue[head]];
> > +}
> > +
> > +static void gve_xsk_reorder_queue_pop_dqo(struct gve_tx_ring *tx)
> > +{
> > + tx->dqo_compl.xsk_reorder_queue_head++;
> > + tx->dqo_compl.xsk_reorder_queue_head &= tx->dqo.complq_mask;
> > +}
> > +
> > /* Transmit a given skb and ring the doorbell. */
> > netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
> > {
> > @@ -1015,6 +1062,62 @@ netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
> > return NETDEV_TX_OK;
> > }
> >
> > +static bool gve_xsk_tx_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
> > + int budget)
> > +{
> > + struct xsk_buff_pool *pool = tx->xsk_pool;
> > + struct xdp_desc desc;
> > + bool repoll = false;
> > + int sent = 0;
> > +
> > + spin_lock(&tx->dqo_tx.xdp_lock);
> > + for (; sent < budget; sent++) {
> > + struct gve_tx_pending_packet_dqo *pkt;
> > + s16 completion_tag;
> > + dma_addr_t addr;
> > + u32 desc_idx;
> > +
> > + if (unlikely(!gve_has_avail_slots_tx_dqo(tx, 1, 1))) {
> > + repoll = true;
> > + break;
>
> If the whole 'for' loop breaks here, the sent should not be increased
> by one, right?
Oops, I was wrong here because 'sent' starts at zero and will be
counted later. So the sent++ is always necessary even when it might
break sometimes.
Thanks,
Jason
Powered by blists - more mailing lists