Message-ID: <CAL9ddJdL0KNp69J6tVn_1Bp8xxwo2JxKRVajHfAriY=pUH0r1g@mail.gmail.com>
Date: Wed, 18 Nov 2020 15:15:28 -0800
From: David Awogbemila <awogbemila@...gle.com>
To: Alexander Duyck <alexander.duyck@...il.com>
Cc: Netdev <netdev@...r.kernel.org>,
Catherine Sullivan <csully@...gle.com>,
Yangchun Fu <yangchun@...gle.com>
Subject: Re: [PATCH net-next v6 2/4] gve: Add support for raw addressing to
the rx path
On Wed, Nov 11, 2020 at 9:20 AM Alexander Duyck
<alexander.duyck@...il.com> wrote:
>
> On Mon, Nov 9, 2020 at 3:39 PM David Awogbemila <awogbemila@...gle.com> wrote:
> >
> > From: Catherine Sullivan <csully@...gle.com>
> >
> > Add support to use raw dma addresses in the rx path. Due to this new
> > support we can alloc a new buffer instead of making a copy.
> >
> > RX buffers are handed to the networking stack and are
> > re-allocated as needed, avoiding the need to use
> > skb_copy_to_linear_data() as in "qpl" mode.
> >
> > Reviewed-by: Yangchun Fu <yangchun@...gle.com>
> > Signed-off-by: Catherine Sullivan <csully@...gle.com>
> > Signed-off-by: David Awogbemila <awogbemila@...gle.com>
> > ---
> > drivers/net/ethernet/google/gve/gve.h | 6 +-
> > drivers/net/ethernet/google/gve/gve_adminq.c | 14 +-
> > drivers/net/ethernet/google/gve/gve_desc.h | 10 +-
> > drivers/net/ethernet/google/gve/gve_main.c | 3 +-
> > drivers/net/ethernet/google/gve/gve_rx.c | 224 +++++++++++++++----
> > 5 files changed, 200 insertions(+), 57 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
> > index 80cdae06ee39..a8c589dd14e4 100644
> > --- a/drivers/net/ethernet/google/gve/gve.h
> > +++ b/drivers/net/ethernet/google/gve/gve.h
> > @@ -68,6 +68,7 @@ struct gve_rx_data_queue {
> > dma_addr_t data_bus; /* dma mapping of the slots */
> > struct gve_rx_slot_page_info *page_info; /* page info of the buffers */
> > struct gve_queue_page_list *qpl; /* qpl assigned to this queue */
> > + bool raw_addressing; /* use raw_addressing? */
> > };
>
> Again, if you care about alignment at all in this structure you should
> probably use u8 instead of bool.
Thanks, I'll adjust this.
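Something like this (untested sketch, field order unchanged):

struct gve_rx_data_queue {
        ...
        struct gve_queue_page_list *qpl; /* qpl assigned to this queue */
        u8 raw_addressing; /* use raw_addressing? (0 or 1) */
};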
>
> >
> > struct gve_priv;
> > @@ -82,6 +83,7 @@ struct gve_rx_ring {
> > u32 cnt; /* free-running total number of completed packets */
> > u32 fill_cnt; /* free-running total number of descs and buffs posted */
> > u32 mask; /* masks the cnt and fill_cnt to the size of the ring */
> > + u32 db_threshold; /* threshold for posting new buffs and descs */
> > u64 rx_copybreak_pkt; /* free-running count of copybreak packets */
> > u64 rx_copied_pkt; /* free-running total number of copied packets */
> > u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */
> > @@ -194,7 +196,7 @@ struct gve_priv {
> > u16 tx_desc_cnt; /* num desc per ring */
> > u16 rx_desc_cnt; /* num desc per ring */
> > u16 tx_pages_per_qpl; /* tx buffer length */
> > - u16 rx_pages_per_qpl; /* rx buffer length */
> > + u16 rx_data_slot_cnt; /* rx buffer length */
> > u64 max_registered_pages;
> > u64 num_registered_pages; /* num pages registered with NIC */
> > u32 rx_copybreak; /* copy packets smaller than this */
> > @@ -444,7 +446,7 @@ static inline u32 gve_num_tx_qpls(struct gve_priv *priv)
> > */
> > static inline u32 gve_num_rx_qpls(struct gve_priv *priv)
> > {
> > - return priv->rx_cfg.num_queues;
> > + return priv->raw_addressing ? 0 : priv->rx_cfg.num_queues;
> > }
> >
> > /* Returns a pointer to the next available tx qpl in the list of qpls
> > diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c
> > index 3e6de659b274..711d135c6b1a 100644
> > --- a/drivers/net/ethernet/google/gve/gve_adminq.c
> > +++ b/drivers/net/ethernet/google/gve/gve_adminq.c
> > @@ -369,8 +369,10 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
> > {
> > struct gve_rx_ring *rx = &priv->rx[queue_index];
> > union gve_adminq_command cmd;
> > + u32 qpl_id;
> > int err;
> >
> > + qpl_id = priv->raw_addressing ? GVE_RAW_ADDRESSING_QPL_ID : rx->data.qpl->id;
> > memset(&cmd, 0, sizeof(cmd));
> > cmd.opcode = cpu_to_be32(GVE_ADMINQ_CREATE_RX_QUEUE);
> > cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) {
> > @@ -381,7 +383,7 @@ static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
> > .queue_resources_addr = cpu_to_be64(rx->q_resources_bus),
> > .rx_desc_ring_addr = cpu_to_be64(rx->desc.bus),
> > .rx_data_ring_addr = cpu_to_be64(rx->data.data_bus),
> > - .queue_page_list_id = cpu_to_be32(rx->data.qpl->id),
> > + .queue_page_list_id = cpu_to_be32(qpl_id),
> > };
> >
> > err = gve_adminq_issue_cmd(priv, &cmd);
> > @@ -526,11 +528,11 @@ int gve_adminq_describe_device(struct gve_priv *priv)
> > mac = descriptor->mac;
> > dev_info(&priv->pdev->dev, "MAC addr: %pM\n", mac);
> > priv->tx_pages_per_qpl = be16_to_cpu(descriptor->tx_pages_per_qpl);
> > - priv->rx_pages_per_qpl = be16_to_cpu(descriptor->rx_pages_per_qpl);
> > - if (priv->rx_pages_per_qpl < priv->rx_desc_cnt) {
> > - dev_err(&priv->pdev->dev, "rx_pages_per_qpl cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
> > - priv->rx_pages_per_qpl);
> > - priv->rx_desc_cnt = priv->rx_pages_per_qpl;
> > + priv->rx_data_slot_cnt = be16_to_cpu(descriptor->rx_pages_per_qpl);
> > + if (priv->rx_data_slot_cnt < priv->rx_desc_cnt) {
> > + dev_err(&priv->pdev->dev, "rx_data_slot_cnt cannot be smaller than rx_desc_cnt, setting rx_desc_cnt down to %d.\n",
> > + priv->rx_data_slot_cnt);
> > + priv->rx_desc_cnt = priv->rx_data_slot_cnt;
> > }
> > priv->default_num_queues = be16_to_cpu(descriptor->default_num_queues);
> > dev_opt = (void *)(descriptor + 1);
> > diff --git a/drivers/net/ethernet/google/gve/gve_desc.h b/drivers/net/ethernet/google/gve/gve_desc.h
> > index 54779871d52e..0aad314aefaf 100644
> > --- a/drivers/net/ethernet/google/gve/gve_desc.h
> > +++ b/drivers/net/ethernet/google/gve/gve_desc.h
> > @@ -72,12 +72,14 @@ struct gve_rx_desc {
> > } __packed;
> > static_assert(sizeof(struct gve_rx_desc) == 64);
> >
> > -/* As with the Tx ring format, the qpl_offset entries below are offsets into an
> > - * ordered list of registered pages.
> > +/* If the device supports raw dma addressing then the addr in data slot is
> > + * the dma address of the buffer.
> > + * If the device only supports registered segments than the addr is a byte
>
> s/than/then/
I'll correct this.
>
> > + * offset into the registered segment (an ordered list of pages) where the
> > + * buffer is.
> > */
> > struct gve_rx_data_slot {
> > - /* byte offset into the rx registered segment of this slot */
> > - __be64 qpl_offset;
> > + __be64 addr;
> > };
>
> So this is either the qpl_offset or an address correct? In such a case
> shouldn't this be a union? I'm just wanting to make sure that this
> isn't something where we are just calling it all addr now even though
> there are still cases where it is the qpl_offset.
Yeah, a union might be a better option here, as it is still a qpl_offset
in some instances as you point out. I'll adopt a union.
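Roughly (untested sketch):

struct gve_rx_data_slot {
        /* byte offset into the rx registered segment of this slot in
         * qpl mode, or the dma address of the buffer in raw
         * addressing mode
         */
        union {
                __be64 qpl_offset;
                __be64 addr;
        };
};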
>
> > /* GVE Recive Packet Descriptor Seq No */
> > diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
> > index 70685c10db0e..225e17dd1ae5 100644
> > --- a/drivers/net/ethernet/google/gve/gve_main.c
> > +++ b/drivers/net/ethernet/google/gve/gve_main.c
> > @@ -596,6 +596,7 @@ int gve_alloc_page(struct gve_priv *priv, struct device *dev,
> > if (dma_mapping_error(dev, *dma)) {
> > priv->dma_mapping_error++;
> > put_page(*page);
> > + *page = NULL;
> > return -ENOMEM;
> > }
> > return 0;
>
> This piece seems like a bug fix for the exception handling path.
> Should it be pulled out into a separate patch so that it could be
> backported if needed.
Ok, I'll pull it out of this patch into a separate one.
>
> > @@ -694,7 +695,7 @@ static int gve_alloc_qpls(struct gve_priv *priv)
> > }
> > for (; i < num_qpls; i++) {
> > err = gve_alloc_queue_page_list(priv, i,
> > - priv->rx_pages_per_qpl);
> > + priv->rx_data_slot_cnt);
> > if (err)
> > goto free_qpls;
> > }
> > diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
> > index 008fa897a3e6..49646caf930c 100644
> > --- a/drivers/net/ethernet/google/gve/gve_rx.c
> > +++ b/drivers/net/ethernet/google/gve/gve_rx.c
> > @@ -16,12 +16,39 @@ static void gve_rx_remove_from_block(struct gve_priv *priv, int queue_idx)
> > block->rx = NULL;
> > }
> >
> > +static void gve_rx_free_buffer(struct device *dev,
> > + struct gve_rx_slot_page_info *page_info,
> > + struct gve_rx_data_slot *data_slot)
> > +{
> > + dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) -
> > + page_info->page_offset);
> > +
>
> Why bother with subtraction when you are adding the page offset via an
> xor? It seems like you should be able to just mask off the page offset
> and not need to bother to store it anywhere. Either the DMA address is
> page aligned in which case you can xor in and out the value and just
> use an &= operator to strip it, or it isn't in which case you should
> be using addition and subtraction everywhere and only bitflip page
> offset as a single bit somewhere.
Ok, I'll adjust this so that it just uses an & to obtain the page
address, and turn page_offset into a u8 which keeps track of the
flipped state.
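gve_rx_free_buffer would then look something like this (untested
sketch, assuming the buffer dma addresses are page aligned):

static void gve_rx_free_buffer(struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               struct gve_rx_data_slot *data_slot)
{
        /* mask off the half-page offset instead of subtracting it */
        dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
                                      ~(u64)(PAGE_SIZE - 1));

        gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}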
>
> > + gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
> > +}
> > +
> > +static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
> > +{
> > + u32 slots = rx->mask + 1;
> > + int i;
> > +
> > + if (rx->data.raw_addressing) {
>
> The declaration of slots and i could be moved here since they aren't
> used anywhere else in this function.
Ok.
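i.e. (untested sketch):

static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        if (rx->data.raw_addressing) {
                u32 slots = rx->mask + 1;
                int i;

                for (i = 0; i < slots; i++)
                        gve_rx_free_buffer(&priv->pdev->dev,
                                           &rx->data.page_info[i],
                                           &rx->data.data_ring[i]);
        } else {
                gve_unassign_qpl(priv, rx->data.qpl->id);
                rx->data.qpl = NULL;
        }
        kvfree(rx->data.page_info);
        rx->data.page_info = NULL;
}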
>
> > + for (i = 0; i < slots; i++)
> > + gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
> > + &rx->data.data_ring[i]);
> > + } else {
> > + gve_unassign_qpl(priv, rx->data.qpl->id);
> > + rx->data.qpl = NULL;
> > + }
> > + kvfree(rx->data.page_info);
> > + rx->data.page_info = NULL;
> > +}
> > +
> > static void gve_rx_free_ring(struct gve_priv *priv, int idx)
> > {
> > struct gve_rx_ring *rx = &priv->rx[idx];
> > struct device *dev = &priv->pdev->dev;
> > + u32 slots = rx->mask + 1;
> > size_t bytes;
> > - u32 slots;
> >
> > gve_rx_remove_from_block(priv, idx);
> >
> > @@ -33,11 +60,8 @@ static void gve_rx_free_ring(struct gve_priv *priv, int idx)
> > rx->q_resources, rx->q_resources_bus);
> > rx->q_resources = NULL;
> >
> > - gve_unassign_qpl(priv, rx->data.qpl->id);
> > - rx->data.qpl = NULL;
> > - kvfree(rx->data.page_info);
> > + gve_rx_unfill_pages(priv, rx);
> >
> > - slots = rx->mask + 1;
> > bytes = sizeof(*rx->data.data_ring) * slots;
> > dma_free_coherent(dev, bytes, rx->data.data_ring,
> > rx->data.data_bus);
> > @@ -52,14 +76,36 @@ static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
> > page_info->page = page;
> > page_info->page_offset = 0;
> > page_info->page_address = page_address(page);
> > - slot->qpl_offset = cpu_to_be64(addr);
> > + slot->addr = cpu_to_be64(addr);
> > +}
> > +
> > +static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
> > + struct gve_rx_slot_page_info *page_info,
> > + struct gve_rx_data_slot *data_slot,
> > + struct gve_rx_ring *rx)
> > +{
> > + struct page *page;
> > + dma_addr_t dma;
> > + int err;
> > +
> > + err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE);
> > + if (err) {
> > + u64_stats_update_begin(&rx->statss);
> > + rx->rx_buf_alloc_fail++;
> > + u64_stats_update_end(&rx->statss);
> > + return err;
> > + }
>
> This just feels really clumsy to me. Do the stats really need to be
> invoked in all cases? It seems like you could pull this code out and
> put it in an exception path somewhere rather than in the middle of the
> function. That way you don't need to carry around the rx ring which
> isn't used for anything else. So for example when you are prefilling
> the buffers it looks like you are already returning an error to the
> user. In such a case the stats update might be redundant as the
> interface wasn't allowed to come up in the first place.
Ok, avoiding the stats update during a failed attempt to bring the
interface up might be better.
I'll separate the code so we don't bother with the stats update when
the interface is coming up.
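Something along these lines (untested sketch): gve_rx_alloc_buffer
drops the rx argument and the stats update, and the refill path
(which only runs once the interface is up) accounts for the failure:

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               struct gve_rx_data_slot *data_slot)
{
        struct page *page;
        dma_addr_t dma;
        int err;

        err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE);
        if (err)
                return err;

        gve_setup_rx_buffer(page_info, data_slot, dma, page);
        return 0;
}

with the caller in gve_rx_refill_buffers doing:

        if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot)) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_buf_alloc_fail++;
                u64_stats_update_end(&rx->statss);
                break;
        }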
>
> > +
> > + gve_setup_rx_buffer(page_info, data_slot, dma, page);
> > + return 0;
> > }
> >
> > static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
> > {
> > struct gve_priv *priv = rx->gve;
> > u32 slots;
> > - int i;
> > + int err;
> > + int i, j;
> >
> > /* Allocate one page per Rx queue slot. Each page is split into two
> > * packet buffers, when possible we "page flip" between the two.
> > @@ -71,17 +117,32 @@ static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
> > if (!rx->data.page_info)
> > return -ENOMEM;
> >
> > - rx->data.qpl = gve_assign_rx_qpl(priv);
> > -
> > + if (!rx->data.raw_addressing)
> > + rx->data.qpl = gve_assign_rx_qpl(priv);
> > for (i = 0; i < slots; i++) {
> > - struct page *page = rx->data.qpl->pages[i];
> > - dma_addr_t addr = i * PAGE_SIZE;
> > -
> > - gve_setup_rx_buffer(&rx->data.page_info[i],
> > - &rx->data.data_ring[i], addr, page);
> > + struct page *page;
> > + dma_addr_t addr;
> > +
> > + if (!rx->data.raw_addressing) {
> > + page = rx->data.qpl->pages[i];
> > + addr = i * PAGE_SIZE;
> > + gve_setup_rx_buffer(&rx->data.page_info[i],
> > + &rx->data.data_ring[i], addr, page);
> > + continue;
> > + }
> > + err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
> > + &rx->data.data_ring[i], rx);
> > + if (err)
> > + goto alloc_err;
> > }
> >
> > return slots;
> > +alloc_err:
> > + for (j = 0; j < i; j++)
> > + gve_rx_free_buffer(&priv->pdev->dev,
> > + &rx->data.page_info[j],
> > + &rx->data.data_ring[j]);
>
> Instead of adding another variable you could just change this loop to
> be based on "while(i--)" and then use i as you work your way backwards
> through buffers you previously allocated.
Ok.
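i.e. (untested sketch):

alloc_err:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;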
>
> > + return err;
> > }
> >
> > static void gve_rx_add_to_block(struct gve_priv *priv, int queue_idx)
> > @@ -110,8 +171,9 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
> > rx->gve = priv;
> > rx->q_num = idx;
> >
> > - slots = priv->rx_pages_per_qpl;
> > + slots = priv->rx_data_slot_cnt;
> > rx->mask = slots - 1;
> > + rx->data.raw_addressing = priv->raw_addressing;
> >
> > /* alloc rx data ring */
> > bytes = sizeof(*rx->data.data_ring) * slots;
> > @@ -156,8 +218,8 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
> > err = -ENOMEM;
> > goto abort_with_q_resources;
> > }
> > - rx->mask = slots - 1;
> > rx->cnt = 0;
> > + rx->db_threshold = priv->rx_desc_cnt / 2;
> > rx->desc.seqno = 1;
> > gve_rx_add_to_block(priv, idx);
> >
> > @@ -168,7 +230,7 @@ static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
> > rx->q_resources, rx->q_resources_bus);
> > rx->q_resources = NULL;
> > abort_filled:
> > - kvfree(rx->data.page_info);
> > + gve_rx_unfill_pages(priv, rx);
> > abort_with_slots:
> > bytes = sizeof(*rx->data.data_ring) * slots;
> > dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
> > @@ -251,8 +313,7 @@ static struct sk_buff *gve_rx_copy(struct gve_rx_ring *rx,
> > return skb;
> > }
> >
> > -static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
> > - struct napi_struct *napi,
> > +static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
> > struct gve_rx_slot_page_info *page_info,
> > u16 len)
> > {
> > @@ -271,11 +332,25 @@ static struct sk_buff *gve_rx_add_frags(struct net_device *dev,
> > static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info,
> > struct gve_rx_data_slot *data_ring)
> > {
> > - u64 addr = be64_to_cpu(data_ring->qpl_offset);
> > + u64 addr = be64_to_cpu(data_ring->addr);
> >
> > page_info->page_offset ^= PAGE_SIZE / 2;
> > addr ^= PAGE_SIZE / 2;
> > - data_ring->qpl_offset = cpu_to_be64(addr);
> > + data_ring->addr = cpu_to_be64(addr);
>
> This code is far more inefficient than it needs to be. Instead of byte
> swapping addr it should be byte swapping PAGE_SIZE / 2 since it is a
> constant. A bitwise operation should work fine as long as you are only
> performing it on two be64 values.
Ok, I'll byte swap the constant instead. Thanks for pointing this out.
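Something like (untested sketch; cpu_to_be64 of a constant folds at
compile time, so the xor becomes a single bitwise op on the __be64):

        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        page_info->page_offset ^= PAGE_SIZE / 2;
        data_ring->addr ^= offset;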
>
> Also as I mentioned before if you are just using the xor on the
> address directly then you don't need the page offset as you can use a
> mask to pull it out of the addr.
>
> > +}
> > +
> > +static struct sk_buff *
> > +gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
> > + struct gve_rx_slot_page_info *page_info, u16 len,
> > + struct napi_struct *napi,
> > + struct gve_rx_data_slot *data_slot)
> > +{
> > + struct sk_buff *skb = gve_rx_add_frags(napi, page_info, len);
> > +
> > + if (!skb)
> > + return NULL;
> > +
> > + return skb;
> > }
> >
>
> I'm not sure I see the point of this function. It seems like you
> should just replace all references to it with gve_rx_add_frags until
> you actually have something else going on here. I am assuming this is
> addressed in a later patch.
Yes, a later patch adds more to this function. I think you're right
and we can simply use gve_rx_add_frags, so I'll adjust this patch
accordingly.
>
>
> > static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > @@ -285,7 +360,9 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > struct gve_priv *priv = rx->gve;
> > struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
> > struct net_device *dev = priv->dev;
> > - struct sk_buff *skb;
> > + struct gve_rx_data_slot *data_slot;
> > + struct sk_buff *skb = NULL;
> > + dma_addr_t page_bus;
> > int pagecount;
> > u16 len;
> >
> > @@ -294,18 +371,18 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > u64_stats_update_begin(&rx->statss);
> > rx->rx_desc_err_dropped_pkt++;
> > u64_stats_update_end(&rx->statss);
> > - return true;
> > + return false;
> > }
> >
> > len = be16_to_cpu(rx_desc->len) - GVE_RX_PAD;
> > page_info = &rx->data.page_info[idx];
> > - dma_sync_single_for_cpu(&priv->pdev->dev, rx->data.qpl->page_buses[idx],
> > - PAGE_SIZE, DMA_FROM_DEVICE);
> >
> > - /* gvnic can only receive into registered segments. If the buffer
> > - * can't be recycled, our only choice is to copy the data out of
> > - * it so that we can return it to the device.
> > - */
> > + data_slot = &rx->data.data_ring[idx];
> > + page_bus = (rx->data.raw_addressing) ?
> > + be64_to_cpu(data_slot->addr) - page_info->page_offset :
> > + rx->data.qpl->page_buses[idx];
> > + dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
> > + PAGE_SIZE, DMA_FROM_DEVICE);
> >
> > if (PAGE_SIZE == 4096) {
> > if (len <= priv->rx_copybreak) {
> > @@ -316,6 +393,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > u64_stats_update_end(&rx->statss);
> > goto have_skb;
> > }
> > + if (rx->data.raw_addressing) {
> > + skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
> > + page_info, len, napi,
> > + data_slot);
> > + goto have_skb;
> > + }
> > if (unlikely(!gve_can_recycle_pages(dev))) {
> > skb = gve_rx_copy(rx, dev, napi, page_info, len);
> > goto have_skb;
> > @@ -326,12 +409,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > * the page fragment to a new SKB and pass it up the
> > * stack.
> > */
> > - skb = gve_rx_add_frags(dev, napi, page_info, len);
> > + skb = gve_rx_add_frags(napi, page_info, len);
> > if (!skb) {
> > u64_stats_update_begin(&rx->statss);
> > rx->rx_skb_alloc_fail++;
> > u64_stats_update_end(&rx->statss);
> > - return true;
> > + return false;
> > }
> > /* Make sure the kernel stack can't release the page */
> > get_page(page_info->page);
> > @@ -347,7 +430,12 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > return false;
> > }
> > } else {
> > - skb = gve_rx_copy(rx, dev, napi, page_info, len);
> > + if (rx->data.raw_addressing)
> > + skb = gve_rx_raw_addressing(&priv->pdev->dev, dev,
> > + page_info, len, napi,
> > + data_slot);
> > + else
> > + skb = gve_rx_copy(rx, dev, napi, page_info, len);
> > }
> >
> > have_skb:
> > @@ -358,7 +446,7 @@ static bool gve_rx(struct gve_rx_ring *rx, struct gve_rx_desc *rx_desc,
> > u64_stats_update_begin(&rx->statss);
> > rx->rx_skb_alloc_fail++;
> > u64_stats_update_end(&rx->statss);
> > - return true;
> > + return false;
> > }
> >
> > if (likely(feat & NETIF_F_RXCSUM)) {
> > @@ -399,19 +487,45 @@ static bool gve_rx_work_pending(struct gve_rx_ring *rx)
> > return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
> > }
> >
> > +static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
> > +{
> > + bool empty = rx->fill_cnt == rx->cnt;
> > + u32 fill_cnt = rx->fill_cnt;
> > +
> > + while (empty || ((fill_cnt & rx->mask) != (rx->cnt & rx->mask))) {
>
> So one question I would have is why do you need to mask fill_cnt and
> cnt here, but not above? Something doesn't match up.
fill_cnt and cnt are both free-running uints, with fill_cnt generally
greater than cnt: fill_cnt tracks freed/available buffers while cnt
tracks used buffers. The difference between "fill_cnt == cnt" and
"(fill_cnt & rx->mask) == (cnt & rx->mask)" matters when all the
buffers are completely used up. If all the descriptors are used up
("fill_cnt == cnt") when we attempt to refill buffers, the right-hand
side of the while loop's OR condition, "(fill_cnt & rx->mask) !=
(rx->cnt & rx->mask)", will be false and we wouldn't get to attempt
to refill the queue's buffers.
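For example, with a hypothetical 1024-slot ring (rx->mask == 1023):
- fill_cnt == 2048, cnt == 2048: ring completely used up. Both masked
  values are 0, so only the "empty" check lets the loop start; it
  then refills until fill_cnt reaches 3072 (masked values equal
  again, ring full).
- fill_cnt == 2048, cnt == 1500: 548 buffers still posted. The masked
  values differ, so the loop refills until fill_cnt reaches 2524
  (cnt + 1024, ring full).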
>
> > + struct gve_rx_slot_page_info *page_info;
> > + struct device *dev = &priv->pdev->dev;
> > + struct gve_rx_data_slot *data_slot;
> > + u32 idx = fill_cnt & rx->mask;
> > +
> > + page_info = &rx->data.page_info[idx];
> > + data_slot = &rx->data.data_ring[idx];
> > + gve_rx_free_buffer(dev, page_info, data_slot);
> > + page_info->page = NULL;
> > + if (gve_rx_alloc_buffer(priv, dev, page_info, data_slot, rx))
> > + break;
> > + empty = false;
> > + fill_cnt++;
> > + }
> > + rx->fill_cnt = fill_cnt;
> > + return true;
> > +}
> > +
> > bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
> > netdev_features_t feat)
> > {
> > struct gve_priv *priv = rx->gve;
> > + u32 work_done = 0, packets = 0;
> > struct gve_rx_desc *desc;
> > u32 cnt = rx->cnt;
> > u32 idx = cnt & rx->mask;
> > - u32 work_done = 0;
> > u64 bytes = 0;
> >
> > desc = rx->desc.desc_ring + idx;
> > while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
> > work_done < budget) {
> > + bool dropped;
> > +
> > netif_info(priv, rx_status, priv->dev,
> > "[%d] idx=%d desc=%p desc->flags_seq=0x%x\n",
> > rx->q_num, idx, desc, desc->flags_seq);
> > @@ -419,9 +533,11 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
> > "[%d] seqno=%d rx->desc.seqno=%d\n",
> > rx->q_num, GVE_SEQNO(desc->flags_seq),
> > rx->desc.seqno);
> > - bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
> > - if (!gve_rx(rx, desc, feat, idx))
> > - gve_schedule_reset(priv);
> > + dropped = !gve_rx(rx, desc, feat, idx);
> > + if (!dropped) {
> > + bytes += be16_to_cpu(desc->len) - GVE_RX_PAD;
> > + packets++;
> > + }
> > cnt++;
> > idx = cnt & rx->mask;
> > desc = rx->desc.desc_ring + idx;
> > @@ -429,15 +545,35 @@ bool gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
> > work_done++;
> > }
> >
> > - if (!work_done)
> > + if (!work_done && rx->fill_cnt - cnt > rx->db_threshold) {
> > return false;
>
> Since you are returning here you don't really need the else below. You
> can just drop the braces and use an if instead. However since
> everything you are doing here can be done even if work_done is 0 I
> would say to just drop the elseif entirely and instead leave the code
> as it was.
>
> > + } else if (work_done) {
> > + u64_stats_update_begin(&rx->statss);
> > + rx->rpackets += packets;
> > + rx->rbytes += bytes;
> > + u64_stats_update_end(&rx->statss);
> > + rx->cnt = cnt;
> > + }
>
> The block below seems much better than optimizing for an unlikely case
> where there is no work done.
Ok, I'll use just an if and remove the elseif.
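The tail of gve_clean_rx_done would then be something like (untested
sketch):

        if (!work_done && rx->fill_cnt - cnt > rx->db_threshold)
                return false;

        u64_stats_update_begin(&rx->statss);
        rx->rpackets += packets;
        rx->rbytes += bytes;
        u64_stats_update_end(&rx->statss);
        rx->cnt = cnt;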
>
>
> > - u64_stats_update_begin(&rx->statss);
> > - rx->rpackets += work_done;
> > - rx->rbytes += bytes;
> > - u64_stats_update_end(&rx->statss);
> > - rx->cnt = cnt;
> > - rx->fill_cnt += work_done;
> > + /* restock ring slots */
> > + if (!rx->data.raw_addressing) {
> > + /* In QPL mode buffs are refilled as the desc are processed */
> > + rx->fill_cnt += work_done;
> > + } else if (rx->fill_cnt - cnt <= rx->db_threshold) {
> > + /* In raw addressing mode buffs are only refilled if the avail
> > + * falls below a threshold.
> > + */
> > + if (!gve_rx_refill_buffers(priv, rx))
> > + return false;
> > +
> > + /* If we were not able to completely refill buffers, we'll want
> > + * to schedule this queue for work again to refill buffers.
> > + */
> > + if (rx->fill_cnt - cnt <= rx->db_threshold) {
> > + gve_rx_write_doorbell(priv, rx);
> > + return true;
> > + }
> > + }
> >
> > gve_rx_write_doorbell(priv, rx);
> > return gve_rx_work_pending(rx);
> > --
> > 2.29.2.222.g5d2a92d10f8-goog
> >