Date:   Wed, 18 Nov 2020 15:15:28 -0800
From:   David Awogbemila <awogbemila@...gle.com>
To:     Alexander Duyck <alexander.duyck@...il.com>
Cc:     Netdev <netdev@...r.kernel.org>,
        Catherine Sullivan <csully@...gle.com>,
        Yangchun Fu <yangchun@...gle.com>
Subject: Re: [PATCH net-next v6 2/4] gve: Add support for raw addressing to
 the rx path

On Wed, Nov 11, 2020 at 9:20 AM Alexander Duyck
<alexander.duyck@...il.com> wrote:
>
> On Mon, Nov 9, 2020 at 3:39 PM David Awogbemila <awogbemila@...gle.com> wrote:
> >
> > From: Catherine Sullivan <csully@...gle.com>
> >
> > Add support to use raw dma addresses in the rx path. Due to this new
> > support we can alloc a new buffer instead of making a copy.
> >
> > RX buffers are handed to the networking stack and are
> > re-allocated as needed, avoiding the need to use
> > skb_copy_to_linear_data() as in "qpl" mode.
> >
> > Reviewed-by: Yangchun Fu <yangchun@...gle.com>
> > Signed-off-by: Catherine Sullivan <csully@...gle.com>
> > Signed-off-by: David Awogbemila <awogbemila@...gle.com>
> > ---
> >  drivers/net/ethernet/google/gve/gve.h        |   6 +-
> >  drivers/net/ethernet/google/gve/gve_adminq.c |  14 +-
> >  drivers/net/ethernet/google/gve/gve_desc.h   |  10 +-
> >  drivers/net/ethernet/google/gve/gve_main.c   |   3 +-
> >  drivers/net/ethernet/google/gve/gve_rx.c     | 224 +++++++++++++++----
> >  5 files changed, 200 insertions(+), 57 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
> > index 80cdae06ee39..a8c589dd14e4 100644
> > --- a/drivers/net/ethernet/google/gve/gve.h
> > +++ b/drivers/net/ethernet/google/gve/gve.h
> > @@ -68,6 +68,7 @@ struct gve_rx_data_queue {
> >         dma_addr_t data_bus; /* dma mapping of the slots */
> >         struct gve_rx_slot_page_info *page_info; /* page info of the buffers */
> >         struct gve_queue_page_list *qpl; /* qpl assigned to this queue */
> > +       bool raw_addressing; /* use raw_addressing? */
> >  };
>
> Again, if you care about alignment at all in this structure you should
> probably use u8 instead of bool.

Thanks, I'll adjust this.
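I.e. something like this, I take it (sketch):

        u8 raw_addressing; /* use raw_addressing? (u8 instead of bool
                            * so the field size in the struct layout
                            * is explicit)
                            */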

>
> >
> >  struct gve_priv;
> > @@ -82,6 +83,7 @@ struct gve_rx_ring {
> >         u32 cnt; /* free-running total number of completed packets */
> >         u32 fill_cnt; /* free-running total number of descs and buffs posted */
> >         u32 mask; /* masks the cnt and fill_cnt to the size of the ring */
> > +       u32 db_threshold; /* threshold for posting new buffs and descs */
> >         u64 rx_copybreak_pkt; /* free-running count of copybreak packets */
> >         u64 rx_copied_pkt; /* free-running total number of copied packets */
> >         u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */
> > @@ -194,7 +196,7 @@ struct gve_priv {
> >         u16 tx_desc_cnt; /* num desc per ring */
> >         u16 rx_desc_cnt; /* num desc per ring */
> >         u16 tx_pages_per_qpl; /* tx buffer length */
> > -       u16 rx_pages_per_qpl; /* rx buffer length */
> > +       u16 rx_data_slot_cnt; /* rx buffer length */
> >         u64 max_registered_pages;
> >         u64 num_registered_pages; /* num pages registered with NIC */
> >         u32 rx_copybreak; /* copy packets smaller than this */
> > @@ -444,7 +446,7 @@ static inline u32 gve_num_tx_qpls(struct gve_priv *priv)
> >   */
> >  static inline u32 gve_num_rx_qpls(struct gve_priv *priv)
> >  {
> > -       return priv->rx_cfg.num_queues;
> > +       return priv->raw_addressing ? 0 : priv->rx_cfg.num_queues;
> >  }
> >
> >  /* Returns a pointer to the next available tx qpl in the list of qpls
> > diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
> > index 3e6de659b274..711d135c6b1a 100644
> > --- a/drivers/net/ethernet/google/gve/gve_adminq.c
> > +++ b/drivers/net/ethernet/google/gve/gve_adminq.c
> > @@ -369,8 +369,10 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
> >  {
> >         struct gve_rx_ring *rx = &priv->rx[queue_index];
> >         union gve_adminq_command cmd;
> > +       u32 qpl_id;
> >         int err;
> >
> > +       qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id;
> >         memset(&cmd, 0, sizeof(cmd));
> >         cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);
> >         cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
> > @@ -381,7 +383,7 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
> >                 .queue_resources_addr = cpu_to_be64(rx->q_resources_bus),
> >                 .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus),
> >                 .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus),
> > -               .queue_page_list_id = cpu_to_be32(rx->data.qpl->id),
> > +               .queue_page_list_id = cpu_to_be32(qpl_id),
> >         };
> >
> >         err = gve_adminq_issue_cmd(priv, &cmd);
> > @@ -526,11 +528,11 @@ int gve_adminq_describe_device(struct gve_priv *priv)
> >         mac = descriptor->mac;
> >         dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac);
> >         priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl);
> > -       priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl);
> > -       if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) {
> > -               dev_err(&priv->pdev->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
> > -                       priv->rx_pages_per_qpl);
> > -               priv->rx_desc_cnt = priv->rx_pages_per_qpl;
> > +       priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl);
> > +       if (priv->rx_data_slot_cnt < priv->rx_desc_cnt) {
> > +               dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
> > +                       priv->rx_data_slot_cnt);
> > +               priv->rx_desc_cnt = priv->rx_data_slot_cnt;
> >         }
> >         priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
> >         dev_opt = (void *)(descriptor + 1);
> > diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h
> > index 54779871d52e..0aad314aefaf 100644
> > --- a/drivers/net/ethernet/google/gve/gve_desc.h
> > +++ b/drivers/net/ethernet/google/gve/gve_desc.h
> > @@ -72,12 +72,14 @@ struct gve_rx_desc {
> >  } __packed;
> >  static_assert(sizeof(struct gve_rx_desc) == 64);
> >
> > -/* As with the Tx ring format, the qpl_offset entries below are offsets into an
> > - * ordered list of registered pages.
> > +/* If the device supports raw dma addressing then the addr in data slot is
> > + * the dma address of the buffer.
> > + * If the device only supports registered segments than the addr is a byte
>
> s/than/then/

I'll correct this.

>
> > + * offset into the registered segment (an ordered list of pages) where the
> > + * buffer is.
> >   */
> >  struct gve_rx_data_slot {
> > -       /* byte offset into the rx registered segment of this slot */
> > -       __be64 qpl_offset;
> > +       __be64 addr;
> >  };
>
> So this is either the qpl_offset or an address correct? In such a case
> shouldn't this be a union? I'm just wanting to make sure that this
> isn't something where we are just calling it all addr now even though
> there are still cases where it is the qpl_offset.

Yeah, a union might be the better option here since, as you point out,
it is still a qpl_offset in some instances. I'll adopt a union.
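Roughly this shape (untested sketch):

        /* Same 8 bytes either way; which member is valid depends on
         * whether the queue uses raw addressing.
         */
        union gve_rx_data_slot {
                /* byte offset into the rx registered segment of this slot */
                __be64 qpl_offset;
                /* raw dma address of the buffer */
                __be64 addr;
        };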

>
> >  /* GVE Recive Packet Descriptor Seq No */
> > diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
> > index 70685c10db0e..225e17dd1ae5 100644
> > --- a/drivers/net/ethernet/google/gve/gve_main.c
> > +++ b/drivers/net/ethernet/google/gve/gve_main.c
> > @@ -596,6 +596,7 @@ int gve_alloc_page(struct gve_priv *priv, struct device *dev,
> >         if (dma_mapping_error(dev, *dma)) {
> >                 priv->dma_mapping_error++;
> >                 put_page(*page);
> > +               *page = NULL;
> >                 return -ENOMEM;
> >         }
> >         return 0;
>
> This piece seems like a bug fix for the exception handling path.
> Should it be pulled out into a separate patch so that it could be
> backported if needed.

Ok, I'll split it out of this patch into a separate one.

>
> > @@ -694,7 +695,7 @@ static int gve_alloc_qpls(struct gve_priv *priv)
> >         }
> >         for (; i < num_qpls; i++) {
> >                 err = gve_alloc_queue_page_list(priv, i,
> > -                                               priv->rx_pages_per_qpl);
> > +                                               priv->rx_data_slot_cnt);
> >                 if (err)
> >                         goto free_qpls;
> >         }
> > diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
> > index 008fa897a3e6..49646caf930c 100644
> > --- a/drivers/net/ethernet/google/gve/gve_rx.c
> > +++ b/drivers/net/ethernet/google/gve/gve_rx.c
> > @@ -16,12 +16,39 @@ static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx)
> >         block->rx = NULL;
> >  }
> >
> > +static void gve_rx_free_buffer(struct device *dev,
> > +                              struct gve_rx_slot_page_info *page_info,
> > +                              struct gve_rx_data_slot *data_slot)
> > +{
> > +       dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) -
> > +                                     page_info->page_offset);
> > +
>
> Why bother with subtraction when you are adding the page offset via an
> xor? It seems like you should be able to just mask off the page offset
> and not need to bother to store it anywhere. Either the DMA address is
> page aligned in which case you can xor in and out the value and just
> use an &= operator to strip it, or it isn't in which case you should
> be using addition and subtraction everywhere and only bitflip page
> offset as a single bit somewhere.

Ok, I'll adjust this so that it just uses an & to obtain the page
address, and turn page_offset into a u8 that keeps track of the
flipped state.
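I.e. something along these lines (untested, and assuming the buffer
dma addresses stay page aligned):

        /* strip the half-page flip bit to recover the mapped address */
        dma_addr_t dma = be64_to_cpu(data_slot->addr) &
                         ~(dma_addr_t)(PAGE_SIZE - 1);

with page_info->page_offset reduced to a u8 that only records whether
we are currently on the second half of the page.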

>
> > +       gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
> > +}
> > +
> > +static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
> > +{
> > +       u32 slots = rx->mask + 1;
> > +       int i;
> > +
> > +       if (rx->data.raw_addressing) {
>
> The declarations of slots and i could be moved here since they aren't
> used anywhere else in this function.

Ok.

>
> > +               for (i = 0; i < slots; i++)
> > +                       gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
> > +                                          &rx->data.data_ring[i]);
> > +       } else {
> > +               gve_unassign_qpl(priv, rx->data.qpl->id);
> > +               rx->data.qpl = NULL;
> > +       }
> > +       kvfree(rx->data.page_info);
> > +       rx->data.page_info = NULL;
> > +}
> > +
> >  static void gve_rx_free_ring(struct gve_priv *priv, int idx)
> >  {
> >         struct gve_rx_ring *rx = &priv->rx[idx];
> >         struct device *dev = &priv->pdev->dev;
> > +       u32 slots = rx->mask + 1;
> >         size_t bytes;
> > -       u32 slots;
> >
> >         gve_rx_remove_from_block(priv, idx);
> >
> > @@ -33,11 +60,8 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx)
> >                           rx->q_resources, rx->q_resources_bus);
> >         rx->q_resources = NULL;
> >
> > -       gve_unassign_qpl(priv, rx->data.qpl->id);
> > -       rx->data.qpl = NULL;
> > -       kvfree(rx->data.page_info);
> > +       gve_rx_unfill_pages(priv, rx);
> >
> > -       slots = rx->mask + 1;
> >         bytes = sizeof(*rx->data.data_ring) * slots;
> >         dma_free_coherent(dev, bytes, rx->data.data_ring,
> >                           rx->data.data_bus);
> > @@ -52,14 +76,36 @@ static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
> >         page_info->page = page;
> >         page_info->page_offset = 0;
> >         page_info->page_address = page_address(page);
> > -       slot->qpl_offset = cpu_to_be64(addr);
> > +       slot->addr = cpu_to_be64(addr);
> > +}
> > +
> > +static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
> > +                              struct gve_rx_slot_page_info *page_info,
> > +                              struct gve_rx_data_slot *data_slot,
> > +                              struct gve_rx_ring *rx)
> > +{
> > +       struct page *page;
> > +       dma_addr_t dma;
> > +       int err;
> > +
> > +       err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE);
> > +       if (err) {
> > +               u64_stats_update_begin(&rx->statss);
> > +               rx->rx_buf_alloc_fail++;
> > +               u64_stats_update_end(&rx->statss);
> > +               return err;
> > +       }
>
> This just feels really clumsy to me. Do the stats really need to be
> invoked in all cases? It seems like you could pull this code out and
> put it in an exception path somewhere rather than in the middle of the
> function. That way you don't need to carry around the rx ring which
> isn't used for anything else. So for example when you are prefilling
> the buffers it looks like you are already returning an error to the
> user. In such a case the stats update might be redundant as the
> interface wasn't allowed to come up in the first place.

Ok, avoiding the stats update during a failed attempt to bring the
interface up makes sense. I'll separate the code so we don't bother
with the stats update while the interface is coming up.
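Something like this on the refill side, with gve_rx_alloc_buffer()
itself left stats-free and the now-unused rx argument dropped from it
(untested sketch):

        if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot)) {
                /* only count failures once the interface is up;
                 * the prefill path just returns the error instead
                 */
                u64_stats_update_begin(&rx->statss);
                rx->rx_buf_alloc_fail++;
                u64_stats_update_end(&rx->statss);
                break;
        }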

>
> > +
> > +       gve_setup_rx_buffer(page_info, data_slot, dma, page);
> > +       return 0;
> >  }
> >
> >  static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
> >  {
> >         struct gve_priv *priv = rx->gve;
> >         u32 slots;
> > -       int i;
> > +       int err;
> > +       int i, j;
> >
> >         /* Allocate one page per Rx queue slot. Each page is split into two
> >          * packet buffers, when possible we "page flip" between the two.
> > @@ -71,17 +117,32 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
> >         if (!rx->data.page_info)
> >                 return -ENOMEM;
> >
> > -       rx->data.qpl = gve_assign_rx_qpl(priv);
> > -
> > +       if (!rx->data.raw_addressing)
> > +               rx->data.qpl = gve_assign_rx_qpl(priv);
> >         for (i = 0; i < slots; i++) {
> > -               struct page *page = rx->data.qpl->pages[i];
> > -               dma_addr_t addr = i * PAGE_SIZE;
> > -
> > -               gve_setup_rx_buffer(&rx->data.page_info[i],
> > -                                   &rx->data.data_ring[i], addr, page);
> > +               struct page *page;
> > +               dma_addr_t addr;
> > +
> > +               if (!rx->data.raw_addressing) {
> > +                       page = rx->data.qpl->pages[i];
> > +                       addr = i * PAGE_SIZE;
> > +                       gve_setup_rx_buffer(&rx->data.page_info[i],
> > +                                           &rx->data.data_ring[i], addr, page);
> > +                       continue;
> > +               }
> > +               err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
> > +                                         &rx->data.data_ring[i], rx);
> > +               if (err)
> > +                       goto alloc_err;
> >         }
> >
> >         return slots;
> > +alloc_err:
> > +       for (j = 0; j < i; j++)
> > +               gve_rx_free_buffer(&priv->pdev->dev,
> > +                                  &rx->data.page_info[j],
> > +                                  &rx->data.data_ring[j]);
>
> Instead of adding another variable you could just change this loop to
> be based on "while(i--)" and then use i as you work your way backwards
> through buffers you previously allocated.

Ok.
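So the unwind would become (untested):

        return slots;

alloc_err:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;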

>
> > +       return err;
> >  }
> >
> >  static void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx)
> > @@ -110,8 +171,9 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
> >         rx->gve = priv;
> >         rx->q_num = idx;
> >
> > -       slots = priv->rx_pages_per_qpl;
> > +       slots = priv->rx_data_slot_cnt;
> >         rx->mask = slots - 1;
> > +       rx->data.raw_addressing = priv->raw_addressing;
> >
> >         /* alloc rx data ring */
> >         bytes = sizeof(*rx->data.data_ring) * slots;
> > @@ -156,8 +218,8 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
> >                 err = -ENOMEM;
> >                 goto abort_with_q_resources;
> >         }
> > -       rx->mask = slots - 1;
> >         rx->cnt = 0;
> > +       rx->db_threshold = priv->rx_desc_cnt / 2;
> >         rx->desc.seqno = 1;
> >         gve_rx_add_to_block(priv, idx);
> >
> > @@ -168,7 +230,7 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
> >                           rx->q_resources, rx->q_resources_bus);
> >         rx->q_resources = NULL;
> >  abort_filled:
> > -       kvfree(rx->data.page_info);
> > +       gve_rx_unfill_pages(priv, rx);
> >  abort_with_slots:
> >         bytes = sizeof(*rx->data.data_ring) * slots;
> >         dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
> > @@ -251,8 +313,7 @@ static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
> >         return skb;
> >  }
> >
> > -static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
> > -                                       struct napi_struct *napi,
> > +static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
> >                                         struct gve_rx_slot_page_info *page_info,
> >                                         u16 len)
> >  {
> > @@ -271,11 +332,25 @@ static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
> >  static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
> >                              struct gve_rx_data_slot *data_ring)
> >  {
> > -       u64 addr = be64_to_cpu(data_ring->qpl_offset);
> > +       u64 addr = be64_to_cpu(data_ring->addr);
> >
> >         page_info->page_offset ^= PAGE_SIZE / 2;
> >         addr ^= PAGE_SIZE / 2;
> > -       data_ring->qpl_offset = cpu_to_be64(addr);
> > +       data_ring->addr = cpu_to_be64(addr);
>
> This code is far more inefficient than it needs to be. Instead of byte
> swapping addr it should be byte swapping PAGE_SIZE / 2 since it is a
> constant. A bitwise operation should work fine as long as you are only
> performing it on two be64 values.

Ok, I'll byte swap the constant instead. Thanks for pointing this out.
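I.e. roughly (untested, and ignoring the separate page_offset rework
for the moment), relying on the byte swap distributing over xor so the
constant is swapped at compile time:

        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        page_info->page_offset ^= PAGE_SIZE / 2;
        data_ring->addr ^= offset;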

>
> Also as I mentioned before if you are just using the xor on the
> address directly then you don't need the page offset as you can use a
> mask to pull it out of the addr.
>
> > +}
> > +
> > +static struct sk_buff *
> > +gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
> > +                     struct gve_rx_slot_page_info *page_info, u16 len,
> > +                     struct napi_struct *napi,
> > +                     struct gve_rx_data_slot *data_slot)
> > +{
> > +       struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);
> > +
> > +       if (!skb)
> > +               return NULL;
> > +
> > +       return skb;
> >  }
> >
>
> I'm not sure I see the point of this function. It seems like you
> should just replace all references to it with gve_rx_add_frags until
> you actually have something else going on here. I am assuming this is
> addressed in a later patch.

Yes, a later patch adds more to this function. I think you're right,
though: for this patch we can simply use gve_rx_add_frags directly, so
I'll adjust accordingly.
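So the call sites would just be (sketch):

        if (rx->data.raw_addressing) {
                skb = gve_rx_add_frags(napi, page_info, len);
                goto have_skb;
        }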

>
>
> >  static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > @@ -285,7 +360,9 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> >         struct gve_priv *priv = rx->gve;
> >         struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
> >         struct net_device *dev = priv->dev;
> > -       struct sk_buff *skb;
> > +       struct gve_rx_data_slot *data_slot;
> > +       struct sk_buff *skb = NULL;
> > +       dma_addr_t page_bus;
> >         int pagecount;
> >         u16 len;
> >
> > @@ -294,18 +371,18 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> >                 u64_stats_update_begin(&rx->statss);
> >                 rx->rx_desc_err_dropped_pkt++;
> >                 u64_stats_update_end(&rx->statss);
> > -               return true;
> > +               return false;
> >         }
> >
> >         len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD;
> >         page_info = &rx->data.page_info[idx];
> > -       dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx],
> > -                               PAGE_SIZE, DMA_FROM_DEVICE);
> >
> > -       /* gvnic can only receive into registered segments. If the buffer
> > -        * can't be recycled, our only choice is to copy the data out of
> > -        * it so that we can return it to the device.
> > -        */
> > +       data_slot = &rx->data.data_ring[idx];
> > +       page_bus = (rx->data.raw_addressing) ?
> > +                                       be64_to_cpu(data_slot->addr) - page_info->page_offset :
> > +                                       rx->data.qpl->page_buses[idx];
> > +       dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
> > +                               PAGE_SIZE, DMA_FROM_DEVICE);
> >
> >         if (PAGE_SIZE == 4096) {
> >                 if (len <= priv->rx_copybreak) {
> > @@ -316,6 +393,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> >                         u64_stats_update_end(&rx->statss);
> >                         goto have_skb;
> >                 }
> > +               if (rx->data.raw_addressing) {
> > +                       skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
> > +                                                   page_info, len, napi,
> > +                                                    data_slot);
> > +                       goto have_skb;
> > +               }
> >                 if (unlikely(!gve_can_recycle_pages(dev))) {
> >                         skb = gve_rx_copy(rx, dev, napi, page_info, len);
> >                         goto have_skb;
> > @@ -326,12 +409,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> >                          * the page fragment to a new SKB and pass it up the
> >                          * stack.
> >                          */
> > -                       skb = gve_rx_add_frags(dev, napi, page_info, len);
> > +                       skb = gve_rx_add_frags(napi, page_info, len);
> >                         if (!skb) {
> >                                 u64_stats_update_begin(&rx->statss);
> >                                 rx->rx_skb_alloc_fail++;
> >                                 u64_stats_update_end(&rx->statss);
> > -                               return true;
> > +                               return false;
> >                         }
> >                         /* Make sure the kernel stack can't release the page */
> >                         get_page(page_info->page);
> > @@ -347,7 +430,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> >                         return false;
> >                 }
> >         } else {
> > -               skb = gve_rx_copy(rx, dev, napi, page_info, len);
> > +               if (rx->data.raw_addressing)
> > +                       skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
> > +                                                   page_info, len, napi,
> > +                                                   data_slot);
> > +               else
> > +                       skb = gve_rx_copy(rx, dev, napi, page_info, len);
> >         }
> >
> >  have_skb:
> > @@ -358,7 +446,7 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> >                 u64_stats_update_begin(&rx->statss);
> >                 rx->rx_skb_alloc_fail++;
> >                 u64_stats_update_end(&rx->statss);
> > -               return true;
> > +               return false;
> >         }
> >
> >         if (likely(feat & NETIF_F_RXCSUM)) {
> > @@ -399,19 +487,45 @@ static bool gve_rx_work_pending(struct gve_rx_ring *rx)
> >         return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
> >  }
> >
> > +static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
> > +{
> > +       bool empty = rx->fill_cnt == rx->cnt;
> > +       u32 fill_cnt = rx->fill_cnt;
> > +
> > +       while (empty || ((fill_cnt & rx->mask) != (rx->cnt & rx->mask))) {
>
> So one question I would have is why do you need to mask fill_cnt and
> cnt here, but not above? Something doesn't match up.

fill_cnt and cnt are both free-running uints, with fill_cnt generally
greater than cnt: fill_cnt tracks freed/available buffers while cnt
tracks used buffers. The difference between "fill_cnt == cnt" and
"(fill_cnt & rx->mask) == (cnt & rx->mask)" matters when all the
buffers are completely used up. If fill_cnt == cnt when we attempt to
refill, the right-hand side of the while loop's OR condition,
"(fill_cnt & rx->mask) != (rx->cnt & rx->mask)", is false, and without
the explicit empty check we would never get to refill the queue's
buffers.
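As a concrete example, with 256 slots (mask = 255) and both counters
at 512, the ring is completely used up, yet (512 & 255) == (512 & 255),
so the masked comparison alone would never let the refill loop run;
the explicit empty check covers exactly that case.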

>
> > +               struct gve_rx_slot_page_info *page_info;
> > +               struct device *dev = &priv->pdev->dev;
> > +               struct gve_rx_data_slot *data_slot;
> > +               u32 idx = fill_cnt & rx->mask;
> > +
> > +               page_info = &rx->data.page_info[idx];
> > +               data_slot = &rx->data.data_ring[idx];
> > +               gve_rx_free_buffer(dev, page_info, data_slot);
> > +               page_info->page = NULL;
> > +               if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot, rx))
> > +                       break;
> > +               empty = false;
> > +               fill_cnt++;
> > +       }
> > +       rx->fill_cnt = fill_cnt;
> > +       return true;
> > +}
> > +
> >  bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
> >                        netdev_features_t feat)
> >  {
> >         struct gve_priv *priv = rx->gve;
> > +       u32 work_done = 0, packets = 0;
> >         struct gve_rx_desc *desc;
> >         u32 cnt = rx->cnt;
> >         u32 idx = cnt & rx->mask;
> > -       u32 work_done = 0;
> >         u64 bytes = 0;
> >
> >         desc = rx->desc.desc_ring + idx;
> >         while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
> >                work_done < budget) {
> > +               bool dropped;
> > +
> >                 netif_info(priv, rx_status, priv->dev,
> >                            "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
> >                            rx->q_num, idx, desc, desc->flags_seq);
> > @@ -419,9 +533,11 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
> >                            "[%d] seqno=%d rx->desc.seqno=%d\n",
> >                            rx->q_num, GVE_SEQNO(desc->flags_seq),
> >                            rx->desc.seqno);
> > -               bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
> > -               if (!gve_rx(rx, desc, feat, idx))
> > -                       gve_schedule_reset(priv);
> > +               dropped = !gve_rx(rx, desc, feat, idx);
> > +               if (!dropped) {
> > +                       bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
> > +                       packets++;
> > +               }
> >                 cnt++;
> >                 idx = cnt & rx->mask;
> >                 desc = rx->desc.desc_ring + idx;
> > @@ -429,15 +545,35 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
> >                 work_done++;
> >         }
> >
> > -       if (!work_done)
> > +       if (!work_done && rx->fill_cnt - cnt > rx->db_threshold) {
> >                 return false;
>
> Since you are returning here you don't really need the else below. You
> can just drop the braces and use an if instead. However since
> everything you are doing here can be done even if work_done is 0 I
> would say to just drop the elseif entirely and instead leave the code
> as it was.
>
> > +       } else if (work_done) {
> > +               u64_stats_update_begin(&rx->statss);
> > +               rx->rpackets += packets;
> > +               rx->rbytes += bytes;
> > +               u64_stats_update_end(&rx->statss);
> > +               rx->cnt = cnt;
> > +       }
>
> The block below seems much better than optimizing for an unlikely case
> where there is no work done.

Ok, I'll use just an if and remove the elseif.
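So roughly (untested):

        if (work_done) {
                u64_stats_update_begin(&rx->statss);
                rx->rpackets += packets;
                rx->rbytes += bytes;
                u64_stats_update_end(&rx->statss);
                rx->cnt = cnt;
        }

leaving the restocking logic below to run either way.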

>
>
> > -       u64_stats_update_begin(&rx->statss);
> > -       rx->rpackets += work_done;
> > -       rx->rbytes += bytes;
> > -       u64_stats_update_end(&rx->statss);
> > -       rx->cnt = cnt;
> > -       rx->fill_cnt += work_done;
> > +       /* restock ring slots */
> > +       if (!rx->data.raw_addressing) {
> > +               /* In QPL mode buffs are refilled as the desc are processed */
> > +               rx->fill_cnt += work_done;
> > +       } else if (rx->fill_cnt - cnt <= rx->db_threshold) {
> > +               /* In raw addressing mode buffs are only refilled if the avail
> > +                * falls below a threshold.
> > +                */
> > +               if (!gve_rx_refill_buffers(priv, rx))
> > +                       return false;
> > +
> > +               /* If we were not able to completely refill buffers, we'll want
> > +                * to schedule this queue for work again to refill buffers.
> > +                */
> > +               if (rx->fill_cnt - cnt <= rx->db_threshold) {
> > +                       gve_rx_write_doorbell(priv, rx);
> > +                       return true;
> > +               }
> > +       }
> >
> >         gve_rx_write_doorbell(priv, rx);
> >         return gve_rx_work_pending(rx);
> > --
> > 2.29.2.222.g5d2a92d10f8-goog
> >
