[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAMArcTU61G=fexf-RJDSW_sGp9dZCkJsJKC=yjg79RS9Ugjuxw@mail.gmail.com>
Date: Fri, 4 Oct 2024 19:34:45 +0900
From: Taehee Yoo <ap420073@...il.com>
To: Mina Almasry <almasrymina@...gle.com>
Cc: davem@...emloft.net, kuba@...nel.org, pabeni@...hat.com,
edumazet@...gle.com, netdev@...r.kernel.org, linux-doc@...r.kernel.org,
donald.hunter@...il.com, corbet@....net, michael.chan@...adcom.com,
kory.maincent@...tlin.com, andrew@...n.ch, maxime.chevallier@...tlin.com,
danieller@...dia.com, hengqi@...ux.alibaba.com, ecree.xilinx@...il.com,
przemyslaw.kitszel@...el.com, hkallweit1@...il.com, ahmed.zaki@...el.com,
paul.greenwalt@...el.com, rrameshbabu@...dia.com, idosch@...dia.com,
asml.silence@...il.com, kaiyuanz@...gle.com, willemb@...gle.com,
aleksander.lobakin@...el.com, dw@...idwei.uk, sridhar.samudrala@...el.com,
bcreeley@....com
Subject: Re: [PATCH net-next v3 7/7] bnxt_en: add support for device memory tcp
On Fri, Oct 4, 2024 at 3:43 AM Mina Almasry <almasrymina@...gle.com> wrote:
>
Hi Mina,
Thanks a lot for your review!
> On Thu, Oct 3, 2024 at 9:07 AM Taehee Yoo <ap420073@...il.com> wrote:
> >
> > Currently, bnxt_en driver satisfies the requirements of Device memory
> > TCP, which is tcp-data-split.
> > So, it implements Device memory TCP for bnxt_en driver.
> >
> > From now on, the aggregation ring handles netmem_ref instead of page
> > regardless of the on/off of netmem.
> > So, for the aggregation ring, memory will be handled with the netmem
> > page_pool API instead of generic page_pool API.
> >
> > If Devmem is enabled, netmem_ref is used as-is and if Devmem is not
> > enabled, netmem_ref will be converted to page and that is used.
> >
> > The driver recognizes whether devmem is set or unset based on whether
> > mp_params.mp_priv is non-NULL.
> > Only if devmem is set, it passes PP_FLAG_ALLOW_UNREADABLE_NETMEM.
> >
> > Signed-off-by: Taehee Yoo <ap420073@...il.com>
> > ---
> >
> > v3:
> > - Patch added
> >
> > drivers/net/ethernet/broadcom/Kconfig | 1 +
> > drivers/net/ethernet/broadcom/bnxt/bnxt.c | 98 +++++++++++++++--------
> > drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +-
> > 3 files changed, 66 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
> > index 75ca3ddda1f5..f37ff12d4746 100644
> > --- a/drivers/net/ethernet/broadcom/Kconfig
> > +++ b/drivers/net/ethernet/broadcom/Kconfig
> > @@ -211,6 +211,7 @@ config BNXT
> > select FW_LOADER
> > select LIBCRC32C
> > select NET_DEVLINK
> > + select NET_DEVMEM
> > select PAGE_POOL
> > select DIMLIB
> > select AUXILIARY_BUS
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > index 872b15842b11..64e07d247f97 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > @@ -55,6 +55,7 @@
> > #include <net/page_pool/helpers.h>
> > #include <linux/align.h>
> > #include <net/netdev_queues.h>
> > +#include <net/netdev_rx_queue.h>
> >
> > #include "bnxt_hsi.h"
> > #include "bnxt.h"
> > @@ -863,6 +864,22 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
> > bnapi->events &= ~BNXT_TX_CMP_EVENT;
> > }
> >
> > +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
> > + struct bnxt_rx_ring_info *rxr,
> > + unsigned int *offset,
> > + gfp_t gfp)
> > +{
> > + netmem_ref netmem;
> > +
> > + netmem = page_pool_alloc_netmem(rxr->page_pool, GFP_ATOMIC);
> > + if (!netmem)
> > + return 0;
> > + *offset = 0;
> > +
> > + *mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
> > + return netmem;
> > +}
> > +
> > static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
> > struct bnxt_rx_ring_info *rxr,
> > unsigned int *offset,
> > @@ -972,21 +989,21 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx)
> > return next;
> > }
> >
> > -static inline int bnxt_alloc_rx_page(struct bnxt *bp,
> > - struct bnxt_rx_ring_info *rxr,
> > - u16 prod, gfp_t gfp)
> > +static inline int bnxt_alloc_rx_netmem(struct bnxt *bp,
> > + struct bnxt_rx_ring_info *rxr,
> > + u16 prod, gfp_t gfp)
> > {
> > struct rx_bd *rxbd =
> > &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)];
> > struct bnxt_sw_rx_agg_bd *rx_agg_buf;
> > - struct page *page;
> > + netmem_ref netmem;
> > dma_addr_t mapping;
> > u16 sw_prod = rxr->rx_sw_agg_prod;
> > unsigned int offset = 0;
> >
> > - page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp);
> > + netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, &offset, gfp);
>
> Does __bnxt_alloc_rx_page become dead code after this change? Or is it
> still used for something?
__bnxt_alloc_rx_page() is still used.
>
> >
> > - if (!page)
> > + if (!netmem)
> > return -ENOMEM;
> >
> > if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap)))
> > @@ -996,7 +1013,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
> > rx_agg_buf = &rxr->rx_agg_ring[sw_prod];
> > rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod));
> >
> > - rx_agg_buf->page = page;
> > + rx_agg_buf->netmem = netmem;
> > rx_agg_buf->offset = offset;
> > rx_agg_buf->mapping = mapping;
> > rxbd->rx_bd_haddr = cpu_to_le64(mapping);
> > @@ -1044,7 +1061,7 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
> > struct rx_agg_cmp *agg;
> > struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf;
> > struct rx_bd *prod_bd;
> > - struct page *page;
> > + netmem_ref netmem;
> >
> > if (p5_tpa)
> > agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i);
> > @@ -1061,11 +1078,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx,
> > cons_rx_buf = &rxr->rx_agg_ring[cons];
> >
> > /* It is possible for sw_prod to be equal to cons, so
> > - * set cons_rx_buf->page to NULL first.
> > + * set cons_rx_buf->netmem to 0 first.
> > */
> > - page = cons_rx_buf->page;
> > - cons_rx_buf->page = NULL;
> > - prod_rx_buf->page = page;
> > + netmem = cons_rx_buf->netmem;
> > + cons_rx_buf->netmem = 0;
> > + prod_rx_buf->netmem = netmem;
> > prod_rx_buf->offset = cons_rx_buf->offset;
> >
> > prod_rx_buf->mapping = cons_rx_buf->mapping;
> > @@ -1192,6 +1209,7 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp,
> >
> > static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > struct bnxt_cp_ring_info *cpr,
> > + struct sk_buff *skb,
> > struct skb_shared_info *shinfo,
> > u16 idx, u32 agg_bufs, bool tpa,
> > struct xdp_buff *xdp)
> > @@ -1211,7 +1229,7 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > u16 cons, frag_len;
> > struct rx_agg_cmp *agg;
> > struct bnxt_sw_rx_agg_bd *cons_rx_buf;
> > - struct page *page;
> > + netmem_ref netmem;
> > dma_addr_t mapping;
> >
> > if (p5_tpa)
> > @@ -1223,9 +1241,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT;
> >
> > cons_rx_buf = &rxr->rx_agg_ring[cons];
> > - skb_frag_fill_page_desc(frag, cons_rx_buf->page,
> > - cons_rx_buf->offset, frag_len);
> > - shinfo->nr_frags = i + 1;
> > + if (skb) {
> > + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem,
> > + cons_rx_buf->offset, frag_len,
> > + BNXT_RX_PAGE_SIZE);
> > + } else {
> > + skb_frag_fill_page_desc(frag, netmem_to_page(cons_rx_buf->netmem),
> > + cons_rx_buf->offset, frag_len);
>
> Our intention with the whole netmem design is that drivers should
> never have to call netmem_to_page(). I.e. the driver should use netmem
> unaware of whether it's page or non-page underneath, to minimize
> complexity driver needs to handle.
>
> This netmem_to_page() call can be removed by using
> skb_frag_fill_netmem_desc() instead of the page variant. But, more
> importantly, why did the code change here? The code before calls
> skb_frag_fill_page_desc, but the new code sometimes will
> skb_frag_fill_netmem_desc() and sometimes will skb_add_rx_frag_netmem.
> I'm not sure why that logic changed.
The reason why skb_add_rx_frag_netmem() is used here is to set the
skb->unreadable flag. skb_frag_fill_netmem_desc() doesn't set
skb->unreadable because it doesn't handle the skb; it only handles the frag.
As far as I know, skb->unreadable should be set to true for devmem
TCP, or am I misunderstanding?
I tested without using skb_add_rx_frag_netmem() here, and it
immediately fails.
The "if (skb)" branch will be hit only in the devmem TCP path.
The normal packet and XDP paths will hit the "else" branch.
I will use skb_frag_fill_netmem_desc() instead of
skb_frag_fill_page_desc() in the "else" branch.
With this change, as you said, there is no netmem_to_page() in bnxt_en
driver, Thanks!
>
> > + shinfo->nr_frags = i + 1;
> > + }
> > __clear_bit(cons, rxr->rx_agg_bmap);
> >
> > /* It is possible for bnxt_alloc_rx_page() to allocate
> > @@ -1233,15 +1257,15 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp,
> > * need to clear the cons entry now.
> > */
> > mapping = cons_rx_buf->mapping;
> > - page = cons_rx_buf->page;
> > - cons_rx_buf->page = NULL;
> > + netmem = cons_rx_buf->netmem;
> > + cons_rx_buf->netmem = 0;
> >
> > - if (xdp && page_is_pfmemalloc(page))
> > + if (xdp && page_is_pfmemalloc(netmem_to_page(netmem)))
>
> Similarly, add netmem_is_pfmemalloc to netmem.h, instead of doing a
> netmem_to_page() call here I think.
Thanks, I will add netmem_is_pfmemalloc() to netmem.h in a v4 patch.
>
> > xdp_buff_set_frag_pfmemalloc(xdp);
> >
> > - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) {
> > + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) {
> > --shinfo->nr_frags;
> > - cons_rx_buf->page = page;
> > + cons_rx_buf->netmem = netmem;
> >
> > /* Update prod since possibly some pages have been
> > * allocated already.
> > @@ -1269,7 +1293,7 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
> > struct skb_shared_info *shinfo = skb_shinfo(skb);
> > u32 total_frag_len = 0;
> >
> > - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx,
> > + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, skb, shinfo, idx,
> > agg_bufs, tpa, NULL);
> > if (!total_frag_len) {
> > skb_mark_for_recycle(skb);
> > @@ -1277,9 +1301,6 @@ static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp,
> > return NULL;
> > }
> >
> > - skb->data_len += total_frag_len;
> > - skb->len += total_frag_len;
> > - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs;
> > return skb;
> > }
> >
> > @@ -1294,7 +1315,7 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp,
> > if (!xdp_buff_has_frags(xdp))
> > shinfo->nr_frags = 0;
> >
> > - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo,
> > + total_frag_len = __bnxt_rx_agg_pages(bp, cpr, NULL, shinfo,
> > idx, agg_bufs, tpa, xdp);
> > if (total_frag_len) {
> > xdp_buff_set_frags_flag(xdp);
> > @@ -3342,15 +3363,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info
> >
> > for (i = 0; i < max_idx; i++) {
> > struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i];
> > - struct page *page = rx_agg_buf->page;
> > + netmem_ref netmem = rx_agg_buf->netmem;
> >
> > - if (!page)
> > + if (!netmem)
> > continue;
> >
> > - rx_agg_buf->page = NULL;
> > + rx_agg_buf->netmem = 0;
> > __clear_bit(i, rxr->rx_agg_bmap);
> >
> > - page_pool_recycle_direct(rxr->page_pool, page);
> > + page_pool_put_full_netmem(rxr->page_pool, netmem, true);
> > }
> > }
> >
> > @@ -3608,9 +3629,11 @@ static void bnxt_free_rx_rings(struct bnxt *bp)
> >
> > static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> > struct bnxt_rx_ring_info *rxr,
> > + int queue_idx,
> > int numa_node)
> > {
> > struct page_pool_params pp = { 0 };
> > + struct netdev_rx_queue *rxq;
> >
> > pp.pool_size = bp->rx_agg_ring_size;
> > if (BNXT_RX_PAGE_MODE(bp))
> > @@ -3621,8 +3644,15 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp,
> > pp.dev = &bp->pdev->dev;
> > pp.dma_dir = bp->rx_dir;
> > pp.max_len = PAGE_SIZE;
> > - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > + pp.order = 0;
> > +
> > + rxq = __netif_get_rx_queue(bp->dev, queue_idx);
> > + if (rxq->mp_params.mp_priv)
> > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_ALLOW_UNREADABLE_NETMEM;
>
> This is not the intended use of PP_FLAG_ALLOW_UNREADABLE_NETMEM.
>
> The driver should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when it's able
> to handle unreadable netmem, it should not worry about whether
> rxq->mp_params.mp_priv is set or not.
>
> You should set PP_FLAG_ALLOW_UNREADABLE_NETMEM when HDS is enabled.
> Let core figure out if mp_params.mp_priv is enabled. All the driver
> needs to report is whether it's configured to be able to handle
> unreadable netmem (which practically means HDS is enabled).
The reason why the branch exists here is that the PP_FLAG_ALLOW_UNREADABLE_NETMEM
flag can't be used together with PP_FLAG_DMA_SYNC_DEV.
228 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
229 /* In order to request DMA-sync-for-device the page
230 * needs to be mapped
231 */
232 if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
233 return -EINVAL;
234
235 if (!pool->p.max_len)
236 return -EINVAL;
237
238 pool->dma_sync = true; //here
239
240 /* pool->p.offset has to be set according to the address
241 * offset used by the DMA engine to start copying rx data
242 */
243 }
If PP_FLAG_DMA_SYNC_DEV is set, pool->dma_sync is set to true.
347 int mp_dmabuf_devmem_init(struct page_pool *pool)
348 {
349 struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
350
351 if (!binding)
352 return -EINVAL;
353
354 if (!pool->dma_map)
355 return -EOPNOTSUPP;
356
357 if (pool->dma_sync) //here
358 return -EOPNOTSUPP;
359
360 if (pool->p.order != 0)
361 return -E2BIG;
362
363 net_devmem_dmabuf_binding_get(binding);
364 return 0;
365 }
mp_dmabuf_devmem_init() fails when pool->dma_sync is true.
tcp-data-split can be used for normal cases, not only for the devmem TCP case.
If we enable tcp-data-split and disable devmem TCP, the page_pool would not
have PP_FLAG_DMA_SYNC_DEV.
So I think checking mp_params.mp_priv is still useful.
Thanks a lot,
Taehee Yoo
>
> > + else
> > + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> >
> > + pp.queue_idx = queue_idx;
> > rxr->page_pool = page_pool_create(&pp);
> > if (IS_ERR(rxr->page_pool)) {
> > int err = PTR_ERR(rxr->page_pool);
> > @@ -3655,7 +3685,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp)
> > cpu_node = cpu_to_node(cpu);
> > netdev_dbg(bp->dev, "Allocating page pool for rx_ring[%d] on numa_node: %d\n",
> > i, cpu_node);
> > - rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node);
> > + rc = bnxt_alloc_rx_page_pool(bp, rxr, i, cpu_node);
> > if (rc)
> > return rc;
> >
> > @@ -4154,7 +4184,7 @@ static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp,
> >
> > prod = rxr->rx_agg_prod;
> > for (i = 0; i < bp->rx_agg_ring_size; i++) {
> > - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) {
> > + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) {
> > netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n",
> > ring_nr, i, bp->rx_ring_size);
> > break;
> > @@ -15063,7 +15093,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx)
> > clone->rx_sw_agg_prod = 0;
> > clone->rx_next_cons = 0;
> >
> > - rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid);
> > + rc = bnxt_alloc_rx_page_pool(bp, clone, idx, rxr->page_pool->p.nid);
> > if (rc)
> > return rc;
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > index 48f390519c35..3cf57a3c7664 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> > @@ -895,7 +895,7 @@ struct bnxt_sw_rx_bd {
> > };
> >
> > struct bnxt_sw_rx_agg_bd {
> > - struct page *page;
> > + netmem_ref netmem;
> > unsigned int offset;
> > dma_addr_t mapping;
> > };
> > --
> > 2.34.1
> >
>
>
> --
> Thanks,
> Mina
Powered by blists - more mailing lists