Message-ID: <CAL+tcoBu0ZQzLA0MvwAzsYYpz=Z=xR7LiLmFwJUXcHFDvFZVPg@mail.gmail.com>
Date: Tue, 22 Jul 2025 07:37:51 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Jeroen de Borst <jeroendb@...gle.com>
Cc: netdev@...r.kernel.org, hramamurthy@...gle.com, davem@...emloft.net, 
	edumazet@...gle.com, kuba@...nel.org, willemb@...gle.com, pabeni@...hat.com, 
	Joshua Washington <joshwash@...gle.com>, Praveen Kaligineedi <pkaligineedi@...gle.com>
Subject: Re: [PATCH net-next v2 4/5] gve: implement DQO TX datapath for AF_XDP zero-copy

Hi Jeroen,

On Thu, Jul 17, 2025 at 11:29 PM Jeroen de Borst <jeroendb@...gle.com> wrote:
>
> From: Joshua Washington <joshwash@...gle.com>
>
> In the descriptor clean path, a number of changes need to be made to
> accommodate out of order completions and double completions.
>
> The XSK stack can only handle completions being processed in order, as a
> single counter is incremented in xsk_tx_completed to signify how many XSK
> descriptors have been completed. Because completions can come back out
> of order in DQ, a separate queue of XSK descriptors must be maintained.
> This queue keeps the pending packets in the order that they were written
> so that the descriptors can be counted in xsk_tx_completed in the same
> order.
>
> For double completions, a new pending packet state and type are
> introduced. The new type, GVE_TX_PENDING_PACKET_DQO_XSK, plays an
> analogous role to the pre-existing _SKB and _XDP_FRAME pending packet types
> for XSK descriptors. The new state, GVE_PACKET_STATE_XSK_COMPLETE,
> represents packets for which no more completions are expected. This
> includes packets which have received a packet completion or reinjection
> completion, as well as packets whose reinjection completion timer has
> timed out. At this point, such packets can be counted as part of
> xsk_tx_completed() and freed.
>
> Reviewed-by: Willem de Bruijn <willemb@...gle.com>
> Signed-off-by: Praveen Kaligineedi <pkaligineedi@...gle.com>
> Signed-off-by: Joshua Washington <joshwash@...gle.com>
> Signed-off-by: Jeroen de Borst <jeroendb@...gle.com>
> ---
>  drivers/net/ethernet/google/gve/gve.h        |  19 ++-
>  drivers/net/ethernet/google/gve/gve_dqo.h    |   1 +
>  drivers/net/ethernet/google/gve/gve_main.c   |   6 +
>  drivers/net/ethernet/google/gve/gve_tx_dqo.c | 148 +++++++++++++++++++
>  4 files changed, 171 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
> index 9925c08e595e..ff7dc06e7fa4 100644
> --- a/drivers/net/ethernet/google/gve/gve.h
> +++ b/drivers/net/ethernet/google/gve/gve.h
> @@ -399,11 +399,17 @@ enum gve_packet_state {
>         GVE_PACKET_STATE_PENDING_REINJECT_COMPL,
>         /* No valid completion received within the specified timeout. */
>         GVE_PACKET_STATE_TIMED_OUT_COMPL,
> +       /* XSK pending packet has received a packet/reinjection completion, or
> +        * has timed out. At this point, the pending packet can be counted by
> +        * xsk_tx_completed() and freed.
> +        */
> +       GVE_PACKET_STATE_XSK_COMPLETE,
>  };
>
>  enum gve_tx_pending_packet_dqo_type {
>         GVE_TX_PENDING_PACKET_DQO_SKB,
> -       GVE_TX_PENDING_PACKET_DQO_XDP_FRAME
> +       GVE_TX_PENDING_PACKET_DQO_XDP_FRAME,
> +       GVE_TX_PENDING_PACKET_DQO_XSK,
>  };
>
>  struct gve_tx_pending_packet_dqo {
> @@ -440,10 +446,10 @@ struct gve_tx_pending_packet_dqo {
>         /* Identifies the current state of the packet as defined in
>          * `enum gve_packet_state`.
>          */
> -       u8 state : 2;
> +       u8 state : 3;
>
>         /* gve_tx_pending_packet_dqo_type */
> -       u8 type : 1;
> +       u8 type : 2;
>
>         /* If packet is an outstanding miss completion, then the packet is
>          * freed if the corresponding re-injection completion is not received
> @@ -512,6 +518,8 @@ struct gve_tx_ring {
>                                 /* Cached value of `dqo_compl.free_tx_qpl_buf_cnt` */
>                                 u32 free_tx_qpl_buf_cnt;
>                         };
> +
> +                       atomic_t xsk_reorder_queue_tail;
>                 } dqo_tx;
>         };
>
> @@ -545,6 +553,9 @@ struct gve_tx_ring {
>                         /* Last TX ring index fetched by HW */
>                         atomic_t hw_tx_head;
>
> +                       u16 xsk_reorder_queue_head;
> +                       u16 xsk_reorder_queue_tail;
> +
>                         /* List to track pending packets which received a miss
>                          * completion but not a corresponding reinjection.
>                          */
> @@ -598,6 +609,8 @@ struct gve_tx_ring {
>                         struct gve_tx_pending_packet_dqo *pending_packets;
>                         s16 num_pending_packets;
>
> +                       u16 *xsk_reorder_queue;
> +
>                         u32 complq_mask; /* complq size is complq_mask + 1 */
>
>                         /* QPL fields */
> diff --git a/drivers/net/ethernet/google/gve/gve_dqo.h b/drivers/net/ethernet/google/gve/gve_dqo.h
> index bb278727f4d9..6eb442096e02 100644
> --- a/drivers/net/ethernet/google/gve/gve_dqo.h
> +++ b/drivers/net/ethernet/google/gve/gve_dqo.h
> @@ -38,6 +38,7 @@ netdev_features_t gve_features_check_dqo(struct sk_buff *skb,
>                                          netdev_features_t features);
>  bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean);
>  bool gve_xdp_poll_dqo(struct gve_notify_block *block);
> +bool gve_xsk_tx_poll_dqo(struct gve_notify_block *block, int budget);
>  int gve_rx_poll_dqo(struct gve_notify_block *block, int budget);
>  int gve_tx_alloc_rings_dqo(struct gve_priv *priv,
>                            struct gve_tx_alloc_rings_cfg *cfg);
> diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
> index d5953f5d1895..c6ccc0bb40c9 100644
> --- a/drivers/net/ethernet/google/gve/gve_main.c
> +++ b/drivers/net/ethernet/google/gve/gve_main.c
> @@ -427,6 +427,12 @@ int gve_napi_poll_dqo(struct napi_struct *napi, int budget)
>
>         if (block->rx) {
>                 work_done = gve_rx_poll_dqo(block, budget);
> +
> +               /* Poll XSK TX as part of RX NAPI. Set up re-poll based on
> +                * whether either datapath has more work to do.
> +                */
> +               if (priv->xdp_prog)
> +                       reschedule |= gve_xsk_tx_poll_dqo(block, budget);
>                 reschedule |= work_done == budget;
>         }
>
> diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
> index ce5370b741ec..6f1d515673d2 100644
> --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c
> +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c
> @@ -13,6 +13,7 @@
>  #include <linux/tcp.h>
>  #include <linux/slab.h>
>  #include <linux/skbuff.h>
> +#include <net/xdp_sock_drv.h>
>
>  /* Returns true if tx_bufs are available. */
>  static bool gve_has_free_tx_qpl_bufs(struct gve_tx_ring *tx, int count)
> @@ -241,6 +242,9 @@ static void gve_tx_free_ring_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
>                 tx->dqo.tx_ring = NULL;
>         }
>
> +       kvfree(tx->dqo.xsk_reorder_queue);
> +       tx->dqo.xsk_reorder_queue = NULL;
> +
>         kvfree(tx->dqo.pending_packets);
>         tx->dqo.pending_packets = NULL;
>
> @@ -345,6 +349,17 @@ static int gve_tx_alloc_ring_dqo(struct gve_priv *priv,
>
>         tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
>         atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
> +
> +       /* Only alloc the XSK reorder queue for XDP queues */
> +       if (idx >= cfg->qcfg->num_queues && cfg->num_xdp_rings) {
> +               tx->dqo.xsk_reorder_queue =
> +                       kvcalloc(tx->dqo.complq_mask + 1,
> +                                sizeof(tx->dqo.xsk_reorder_queue[0]),
> +                                GFP_KERNEL);
> +               if (!tx->dqo.xsk_reorder_queue)
> +                       goto err;
> +       }
> +
>         tx->dqo_compl.miss_completions.head = -1;
>         tx->dqo_compl.miss_completions.tail = -1;
>         tx->dqo_compl.timed_out_completions.head = -1;
> @@ -992,6 +1007,38 @@ static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
>         return 0;
>  }
>
> +static void gve_xsk_reorder_queue_push_dqo(struct gve_tx_ring *tx,
> +                                          u16 completion_tag)
> +{
> +       u32 tail = atomic_read(&tx->dqo_tx.xsk_reorder_queue_tail);
> +
> +       tx->dqo.xsk_reorder_queue[tail] = completion_tag;
> +       tail = (tail + 1) & tx->dqo.complq_mask;
> +       atomic_set_release(&tx->dqo_tx.xsk_reorder_queue_tail, tail);
> +}
> +
> +static struct gve_tx_pending_packet_dqo *
> +gve_xsk_reorder_queue_head(struct gve_tx_ring *tx)
> +{
> +       u32 head = tx->dqo_compl.xsk_reorder_queue_head;
> +
> +       if (head == tx->dqo_compl.xsk_reorder_queue_tail) {
> +               tx->dqo_compl.xsk_reorder_queue_tail =
> +                       atomic_read_acquire(&tx->dqo_tx.xsk_reorder_queue_tail);
> +
> +               if (head == tx->dqo_compl.xsk_reorder_queue_tail)
> +                       return NULL;
> +       }
> +
> +       return &tx->dqo.pending_packets[tx->dqo.xsk_reorder_queue[head]];
> +}
> +
> +static void gve_xsk_reorder_queue_pop_dqo(struct gve_tx_ring *tx)
> +{
> +       tx->dqo_compl.xsk_reorder_queue_head++;
> +       tx->dqo_compl.xsk_reorder_queue_head &= tx->dqo.complq_mask;
> +}
> +
>  /* Transmit a given skb and ring the doorbell. */
>  netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
>  {
> @@ -1015,6 +1062,62 @@ netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
>         return NETDEV_TX_OK;
>  }
>
> +static bool gve_xsk_tx_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
> +                          int budget)
> +{
> +       struct xsk_buff_pool *pool = tx->xsk_pool;
> +       struct xdp_desc desc;
> +       bool repoll = false;
> +       int sent = 0;
> +
> +       spin_lock(&tx->dqo_tx.xdp_lock);
> +       for (; sent < budget; sent++) {
> +               struct gve_tx_pending_packet_dqo *pkt;
> +               s16 completion_tag;
> +               dma_addr_t addr;
> +               u32 desc_idx;
> +
> +               if (unlikely(!gve_has_avail_slots_tx_dqo(tx, 1, 1))) {
> +                       repoll = true;
> +                       break;

If the whole 'for' loop breaks here, 'sent' should not be incremented
by one, right?

> +               }
> +
> +               if (!xsk_tx_peek_desc(pool, &desc))
> +                       break;

The same concern applies here.

I would use a while() or do..while() loop, similar to how
i40e_clean_tx_irq() manipulates the budget.
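
Roughly what I have in mind (an untested sketch on top of this patch,
keeping your descriptor-fill code unchanged; only the accounting is
made explicit):

	spin_lock(&tx->dqo_tx.xdp_lock);
	while (sent < budget) {
		struct gve_tx_pending_packet_dqo *pkt;
		s16 completion_tag;
		dma_addr_t addr;
		u32 desc_idx;

		if (unlikely(!gve_has_avail_slots_tx_dqo(tx, 1, 1))) {
			repoll = true;
			break;
		}

		if (!xsk_tx_peek_desc(pool, &desc))
			break;

		/* ... allocate the pending packet and fill/post the
		 * descriptor exactly as in this patch ...
		 */

		/* Count the packet only once it has actually been posted. */
		sent++;
	}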

> +
> +               pkt = gve_alloc_pending_packet(tx);

I checked gve_alloc_pending_packet() and noticed there is one slight
chance to return NULL? If so, it will trigger a NULL dereference
issue.
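
Perhaps a simple check along these lines would do (again, just an
untested sketch; whether to set repoll in that case is your call):

		pkt = gve_alloc_pending_packet(tx);
		if (unlikely(!pkt)) {
			/* No free pending packet slot: stop and retry later. */
			repoll = true;
			break;
		}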

Thanks,
Jason

> +               pkt->type = GVE_TX_PENDING_PACKET_DQO_XSK;
> +               pkt->num_bufs = 0;
> +               completion_tag = pkt - tx->dqo.pending_packets;
> +
> +               addr = xsk_buff_raw_get_dma(pool, desc.addr);
> +               xsk_buff_raw_dma_sync_for_device(pool, addr, desc.len);
> +
> +               desc_idx = tx->dqo_tx.tail;
> +               gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
> +                                        true, desc.len,
> +                                        addr, completion_tag, true,
> +                                        false);
> +               ++pkt->num_bufs;
> +               gve_tx_update_tail(tx, desc_idx);
> +               tx->dqo_tx.posted_packet_desc_cnt += pkt->num_bufs;
> +               gve_xsk_reorder_queue_push_dqo(tx, completion_tag);
> +       }
> +
> +       if (sent) {
> +               gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
> +               xsk_tx_release(pool);
> +       }
> +
> +       spin_unlock(&tx->dqo_tx.xdp_lock);
> +
> +       u64_stats_update_begin(&tx->statss);
> +       tx->xdp_xsk_sent += sent;
> +       u64_stats_update_end(&tx->statss);
> +
> +       return (sent == budget) || repoll;
> +}
> +
>  static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
>                         struct gve_tx_pending_packet_dqo *pending_packet)
>  {
> @@ -1152,6 +1255,9 @@ static void gve_handle_packet_completion(struct gve_priv *priv,
>                 pending_packet->xdpf = NULL;
>                 gve_free_pending_packet(tx, pending_packet);
>                 break;
> +       case GVE_TX_PENDING_PACKET_DQO_XSK:
> +               pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE;
> +               break;
>         default:
>                 WARN_ON_ONCE(1);
>         }
> @@ -1251,8 +1357,34 @@ static void remove_timed_out_completions(struct gve_priv *priv,
>
>                 remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
>                                  pending_packet);
> +
> +               /* Need to count XSK packets in xsk_tx_completed. */
> +               if (pending_packet->type == GVE_TX_PENDING_PACKET_DQO_XSK)
> +                       pending_packet->state = GVE_PACKET_STATE_XSK_COMPLETE;
> +               else
> +                       gve_free_pending_packet(tx, pending_packet);
> +       }
> +}
> +
> +static void gve_tx_process_xsk_completions(struct gve_tx_ring *tx)
> +{
> +       u32 num_xsks = 0;
> +
> +       while (true) {
> +               struct gve_tx_pending_packet_dqo *pending_packet =
> +                       gve_xsk_reorder_queue_head(tx);
> +
> +               if (!pending_packet ||
> +                   pending_packet->state != GVE_PACKET_STATE_XSK_COMPLETE)
> +                       break;
> +
> +               num_xsks++;
> +               gve_xsk_reorder_queue_pop_dqo(tx);
>                 gve_free_pending_packet(tx, pending_packet);
>         }
> +
> +       if (num_xsks)
> +               xsk_tx_completed(tx->xsk_pool, num_xsks);
>  }
>
>  int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
> @@ -1333,6 +1465,9 @@ int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
>         remove_miss_completions(priv, tx);
>         remove_timed_out_completions(priv, tx);
>
> +       if (tx->xsk_pool)
> +               gve_tx_process_xsk_completions(tx);
> +
>         u64_stats_update_begin(&tx->statss);
>         tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
>         tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
> @@ -1365,6 +1500,19 @@ bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
>         return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
>  }
>
> +bool gve_xsk_tx_poll_dqo(struct gve_notify_block *rx_block, int budget)
> +{
> +       struct gve_rx_ring *rx = rx_block->rx;
> +       struct gve_priv *priv = rx->gve;
> +       struct gve_tx_ring *tx;
> +
> +       tx = &priv->tx[gve_xdp_tx_queue_id(priv, rx->q_num)];
> +       if (tx->xsk_pool)
> +               return gve_xsk_tx_dqo(priv, tx, budget);
> +
> +       return 0;
> +}
> +
>  bool gve_xdp_poll_dqo(struct gve_notify_block *block)
>  {
>         struct gve_tx_compl_desc *compl_desc;
> --
> 2.50.0.727.gbf7dc18ff4-goog
>
>
