Message-ID: <Z7Sf9VksFLFq2GwA@boxer>
Date: Tue, 18 Feb 2025 15:57:57 +0100
From: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
To: Suman Ghosh <sumang@...vell.com>
CC: <horms@...nel.org>, <sgoutham@...vell.com>, <gakula@...vell.com>,
<sbhatta@...vell.com>, <hkelam@...vell.com>, <davem@...emloft.net>,
<edumazet@...gle.com>, <kuba@...nel.org>, <pabeni@...hat.com>,
<netdev@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
<lcherian@...vell.com>, <jerinj@...vell.com>, <john.fastabend@...il.com>,
<bbhushan2@...vell.com>, <hawk@...nel.org>, <andrew+netdev@...n.ch>,
<ast@...nel.org>, <daniel@...earbox.net>, <bpf@...r.kernel.org>,
<larysa.zaremba@...el.com>
Subject: Re: [net-next PATCH v6 3/6] octeontx2-pf: AF_XDP zero copy receive
support
On Thu, Feb 13, 2025 at 11:01:38AM +0530, Suman Ghosh wrote:
> This patch adds AF_XDP zero copy support for CN10K.
> This patch specifically adds receive side support. In this approach, once
> an XDP program with zero copy support is enabled on a specific RX queue,
> that receive queue is disabled/detached from the existing kernel
> queue and re-assigned to the UMEM memory.
>
> Signed-off-by: Suman Ghosh <sumang@...vell.com>
> ---
> .../ethernet/marvell/octeontx2/nic/Makefile | 2 +-
> .../ethernet/marvell/octeontx2/nic/cn10k.c | 7 +-
> .../marvell/octeontx2/nic/otx2_common.c | 114 ++++++++---
> .../marvell/octeontx2/nic/otx2_common.h | 6 +-
> .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 25 ++-
> .../marvell/octeontx2/nic/otx2_txrx.c | 73 +++++--
> .../marvell/octeontx2/nic/otx2_txrx.h | 6 +
> .../ethernet/marvell/octeontx2/nic/otx2_vf.c | 12 +-
> .../ethernet/marvell/octeontx2/nic/otx2_xsk.c | 182 ++++++++++++++++++
> .../ethernet/marvell/octeontx2/nic/otx2_xsk.h | 21 ++
> .../ethernet/marvell/octeontx2/nic/qos_sq.c | 2 +-
> 11 files changed, 389 insertions(+), 61 deletions(-)
> create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.c
> create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.h
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> index cb6513ab35e7..69e0778f9ac1 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
> @@ -9,7 +9,7 @@ obj-$(CONFIG_RVU_ESWITCH) += rvu_rep.o
>
> rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
> otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \
> - otx2_devlink.o qos_sq.o qos.o
> + otx2_devlink.o qos_sq.o qos.o otx2_xsk.o
> rvu_nicvf-y := otx2_vf.o
> rvu_rep-y := rep.o
>
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
> index a15cc86635d6..c3b6e0f60a79 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
> @@ -112,9 +112,12 @@ int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
> struct otx2_nic *pfvf = dev;
> int cnt = cq->pool_ptrs;
> u64 ptrs[NPA_MAX_BURST];
> + struct otx2_pool *pool;
> dma_addr_t bufptr;
> int num_ptrs = 1;
>
> + pool = &pfvf->qset.pool[cq->cq_idx];
> +
> /* Refill pool with new buffers */
> while (cq->pool_ptrs) {
> if (otx2_alloc_buffer(pfvf, cq, &bufptr)) {
> @@ -124,7 +127,9 @@ int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
> break;
> }
> cq->pool_ptrs--;
> - ptrs[num_ptrs] = (u64)bufptr + OTX2_HEAD_ROOM;
> + ptrs[num_ptrs] = pool->xsk_pool ?
> + (u64)bufptr : (u64)bufptr + OTX2_HEAD_ROOM;
> +
> num_ptrs++;
> if (num_ptrs == NPA_MAX_BURST || cq->pool_ptrs == 0) {
> __cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs,
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> index 161cf33ef89e..92b0dba07853 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
> @@ -17,6 +17,7 @@
> #include "otx2_common.h"
> #include "otx2_struct.h"
> #include "cn10k.h"
> +#include "otx2_xsk.h"
>
> static bool otx2_is_pfc_enabled(struct otx2_nic *pfvf)
> {
> @@ -549,10 +550,13 @@ static int otx2_alloc_pool_buf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> }
>
> static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> - dma_addr_t *dma)
> + dma_addr_t *dma, int qidx, int idx)
> {
> u8 *buf;
>
> + if (pool->xsk_pool)
> + return otx2_xsk_pool_alloc_buf(pfvf, pool, dma, idx);
> +
> if (pool->page_pool)
> return otx2_alloc_pool_buf(pfvf, pool, dma);
>
> @@ -571,12 +575,12 @@ static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> }
>
> int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> - dma_addr_t *dma)
> + dma_addr_t *dma, int qidx, int idx)
> {
> int ret;
>
> local_bh_disable();
> - ret = __otx2_alloc_rbuf(pfvf, pool, dma);
> + ret = __otx2_alloc_rbuf(pfvf, pool, dma, qidx, idx);
> local_bh_enable();
> return ret;
> }
> @@ -584,7 +588,8 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq,
> dma_addr_t *dma)
> {
> - if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma)))
> + if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma,
> + cq->cq_idx, cq->pool_ptrs - 1)))
> return -ENOMEM;
> return 0;
> }
> @@ -884,7 +889,7 @@ void otx2_sqb_flush(struct otx2_nic *pfvf)
> #define RQ_PASS_LVL_AURA (255 - ((95 * 256) / 100)) /* RED when 95% is full */
> #define RQ_DROP_LVL_AURA (255 - ((99 * 256) / 100)) /* Drop when 99% is full */
>
> -static int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura)
> +int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura)
> {
> struct otx2_qset *qset = &pfvf->qset;
> struct nix_aq_enq_req *aq;
> @@ -1041,7 +1046,7 @@ int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
>
> }
>
> -static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
> +int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
> {
> struct otx2_qset *qset = &pfvf->qset;
> int err, pool_id, non_xdp_queues;
> @@ -1057,11 +1062,18 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
> cq->cint_idx = qidx;
> cq->cqe_cnt = qset->rqe_cnt;
> if (pfvf->xdp_prog) {
> - pool = &qset->pool[qidx];
> xdp_rxq_info_reg(&cq->xdp_rxq, pfvf->netdev, qidx, 0);
> - xdp_rxq_info_reg_mem_model(&cq->xdp_rxq,
> - MEM_TYPE_PAGE_POOL,
> - pool->page_pool);
> + pool = &qset->pool[qidx];
> + if (pool->xsk_pool) {
> + xdp_rxq_info_reg_mem_model(&cq->xdp_rxq,
> + MEM_TYPE_XSK_BUFF_POOL,
> + NULL);
> + xsk_pool_set_rxq_info(pool->xsk_pool, &cq->xdp_rxq);
> + } else if (pool->page_pool) {
> + xdp_rxq_info_reg_mem_model(&cq->xdp_rxq,
> + MEM_TYPE_PAGE_POOL,
> + pool->page_pool);
> + }
> }
> } else if (qidx < non_xdp_queues) {
> cq->cq_type = CQ_TX;
> @@ -1281,9 +1293,10 @@ void otx2_free_bufs(struct otx2_nic *pfvf, struct otx2_pool *pool,
>
> pa = otx2_iova_to_phys(pfvf->iommu_domain, iova);
> page = virt_to_head_page(phys_to_virt(pa));
> -
> if (pool->page_pool) {
> page_pool_put_full_page(pool->page_pool, page, true);
> + } else if (pool->xsk_pool) {
> + /* Note: No way of identifying xdp_buff */
> } else {
> dma_unmap_page_attrs(pfvf->dev, iova, size,
> DMA_FROM_DEVICE,
> @@ -1298,6 +1311,7 @@ void otx2_free_aura_ptr(struct otx2_nic *pfvf, int type)
> int pool_id, pool_start = 0, pool_end = 0, size = 0;
> struct otx2_pool *pool;
> u64 iova;
> + int idx;
>
> if (type == AURA_NIX_SQ) {
> pool_start = otx2_get_pool_idx(pfvf, type, 0);
> @@ -1312,16 +1326,21 @@ void otx2_free_aura_ptr(struct otx2_nic *pfvf, int type)
>
> /* Free SQB and RQB pointers from the aura pool */
> for (pool_id = pool_start; pool_id < pool_end; pool_id++) {
> - iova = otx2_aura_allocptr(pfvf, pool_id);
> pool = &pfvf->qset.pool[pool_id];
> + iova = otx2_aura_allocptr(pfvf, pool_id);
> while (iova) {
> if (type == AURA_NIX_RQ)
> iova -= OTX2_HEAD_ROOM;
> -
> otx2_free_bufs(pfvf, pool, iova, size);
> -
> iova = otx2_aura_allocptr(pfvf, pool_id);
> }
> +
> + for (idx = 0 ; idx < pool->xdp_cnt; idx++) {
> + if (!pool->xdp[idx])
> + continue;
> +
> + xsk_buff_free(pool->xdp[idx]);
> + }
> }
> }
>
> @@ -1338,7 +1357,8 @@ void otx2_aura_pool_free(struct otx2_nic *pfvf)
> qmem_free(pfvf->dev, pool->stack);
> qmem_free(pfvf->dev, pool->fc_addr);
> page_pool_destroy(pool->page_pool);
> - pool->page_pool = NULL;
> + devm_kfree(pfvf->dev, pool->xdp);
> + pool->xsk_pool = NULL;
> }
> devm_kfree(pfvf->dev, pfvf->qset.pool);
> pfvf->qset.pool = NULL;
> @@ -1425,6 +1445,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
> int stack_pages, int numptrs, int buf_size, int type)
> {
> struct page_pool_params pp_params = { 0 };
> + struct xsk_buff_pool *xsk_pool;
> struct npa_aq_enq_req *aq;
> struct otx2_pool *pool;
> int err;
> @@ -1468,21 +1489,35 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
> aq->ctype = NPA_AQ_CTYPE_POOL;
> aq->op = NPA_AQ_INSTOP_INIT;
>
> - if (type != AURA_NIX_RQ) {
> - pool->page_pool = NULL;
> + if (type != AURA_NIX_RQ)
> + return 0;
> +
> + if (!test_bit(pool_id, pfvf->af_xdp_zc_qidx)) {
> + pp_params.order = get_order(buf_size);
> + pp_params.flags = PP_FLAG_DMA_MAP;
> + pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs);
> + pp_params.nid = NUMA_NO_NODE;
> + pp_params.dev = pfvf->dev;
> + pp_params.dma_dir = DMA_FROM_DEVICE;
> + pool->page_pool = page_pool_create(&pp_params);
> + if (IS_ERR(pool->page_pool)) {
> + netdev_err(pfvf->netdev, "Creation of page pool failed\n");
> + return PTR_ERR(pool->page_pool);
> + }
> return 0;
> }
>
> - pp_params.order = get_order(buf_size);
> - pp_params.flags = PP_FLAG_DMA_MAP;
> - pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs);
> - pp_params.nid = NUMA_NO_NODE;
> - pp_params.dev = pfvf->dev;
> - pp_params.dma_dir = DMA_FROM_DEVICE;
> - pool->page_pool = page_pool_create(&pp_params);
> - if (IS_ERR(pool->page_pool)) {
> - netdev_err(pfvf->netdev, "Creation of page pool failed\n");
> - return PTR_ERR(pool->page_pool);
> + /* Set XSK pool to support AF_XDP zero-copy */
> + xsk_pool = xsk_get_pool_from_qid(pfvf->netdev, pool_id);
> + if (xsk_pool) {
> + pool->xsk_pool = xsk_pool;
> + pool->xdp_cnt = numptrs;
> + pool->xdp = devm_kcalloc(pfvf->dev,
> + numptrs, sizeof(struct xdp_buff *), GFP_KERNEL);
What is the rationale behind having a buffer pool within your driver when
you have this very same thing within xsk_buff_pool?
You're doubling your work. Just use xsk_buff_alloc_batch() and have a
simpler ZC implementation in your driver.
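
Something along these lines (completely untested sketch, otx2_zc_refill()
is a made-up name; NPA_MAX_BURST and the aura_freeptr callback are taken
from your patch, xsk_buff_alloc_batch()/xsk_buff_xdp_get_dma() are from
include/net/xdp_sock_drv.h):

/* untested sketch: refill the aura straight out of the xsk pool,
 * no driver-side xdp_buff array to maintain
 */
static u32 otx2_zc_refill(struct otx2_nic *pfvf, struct otx2_pool *pool,
			  int pool_id, u32 count)
{
	struct xdp_buff *xdp_bufs[NPA_MAX_BURST];
	u32 nb, i;

	nb = xsk_buff_alloc_batch(pool->xsk_pool, xdp_bufs,
				  min_t(u32, count, NPA_MAX_BURST));
	for (i = 0; i < nb; i++) {
		/* xsk_buff_pool already holds the DMA mapping */
		dma_addr_t dma = xsk_buff_xdp_get_dma(xdp_bufs[i]);

		pfvf->hw_ops->aura_freeptr(pfvf, pool_id, dma);
	}

	return nb;
}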
> + if (IS_ERR(pool->xdp)) {
> + netdev_err(pfvf->netdev, "Creation of xsk pool failed\n");
> + return PTR_ERR(pool->xdp);
> + }
> }
>
> return 0;
> @@ -1543,9 +1578,18 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf)
> }
>
> for (ptr = 0; ptr < num_sqbs; ptr++) {
> - err = otx2_alloc_rbuf(pfvf, pool, &bufptr);
> - if (err)
> + err = otx2_alloc_rbuf(pfvf, pool, &bufptr, pool_id, ptr);
> + if (err) {
> + if (pool->xsk_pool) {
> + ptr--;
> + while (ptr >= 0) {
> + xsk_buff_free(pool->xdp[ptr]);
> + ptr--;
> + }
> + }
> goto err_mem;
> + }
> +
> pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr);
> sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr;
> }
> @@ -1595,11 +1639,19 @@ int otx2_rq_aura_pool_init(struct otx2_nic *pfvf)
> /* Allocate pointers and free them to aura/pool */
> for (pool_id = 0; pool_id < hw->rqpool_cnt; pool_id++) {
> pool = &pfvf->qset.pool[pool_id];
> +
> for (ptr = 0; ptr < num_ptrs; ptr++) {
> - err = otx2_alloc_rbuf(pfvf, pool, &bufptr);
> - if (err)
> + err = otx2_alloc_rbuf(pfvf, pool, &bufptr, pool_id, ptr);
> + if (err) {
> + if (pool->xsk_pool) {
> + while (ptr)
> + xsk_buff_free(pool->xdp[--ptr]);
> + }
> return -ENOMEM;
> + }
> +
> pfvf->hw_ops->aura_freeptr(pfvf, pool_id,
> + pool->xsk_pool ? bufptr :
> bufptr + OTX2_HEAD_ROOM);
> }
> }
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> index d5fbccb289df..60508971b62f 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
> @@ -532,6 +532,8 @@ struct otx2_nic {
>
> /* Inline ipsec */
> struct cn10k_ipsec ipsec;
> + /* af_xdp zero-copy */
> + unsigned long *af_xdp_zc_qidx;
> };
>
> static inline bool is_otx2_lbkvf(struct pci_dev *pdev)
> @@ -1003,7 +1005,7 @@ void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
> void otx2_free_pending_sqe(struct otx2_nic *pfvf);
> void otx2_sqb_flush(struct otx2_nic *pfvf);
> int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
> - dma_addr_t *dma);
> + dma_addr_t *dma, int qidx, int idx);
> int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable);
> void otx2_ctx_disable(struct mbox *mbox, int type, bool npa);
> int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable);
> @@ -1033,6 +1035,8 @@ void otx2_pfaf_mbox_destroy(struct otx2_nic *pf);
> void otx2_disable_mbox_intr(struct otx2_nic *pf);
> void otx2_disable_napi(struct otx2_nic *pf);
> irqreturn_t otx2_cq_intr_handler(int irq, void *cq_irq);
> +int otx2_rq_init(struct otx2_nic *pfvf, u16 qidx, u16 lpb_aura);
> +int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx);
>
> /* RSS configuration APIs*/
> int otx2_rss_init(struct otx2_nic *pfvf);
> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> index 4347a3c95350..50a42cd5d50a 100644
> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
> @@ -27,6 +27,7 @@
> #include "qos.h"
> #include <rvu_trace.h>
> #include "cn10k_ipsec.h"
> +#include "otx2_xsk.h"
>
> #define DRV_NAME "rvu_nicpf"
> #define DRV_STRING "Marvell RVU NIC Physical Function Driver"
> @@ -1662,9 +1663,7 @@ void otx2_free_hw_resources(struct otx2_nic *pf)
> struct nix_lf_free_req *free_req;
> struct mbox *mbox = &pf->mbox;
> struct otx2_cq_queue *cq;
> - struct otx2_pool *pool;
> struct msg_req *req;
> - int pool_id;
> int qidx;
>
> /* Ensure all SQE are processed */
> @@ -1705,13 +1704,6 @@ void otx2_free_hw_resources(struct otx2_nic *pf)
> /* Free RQ buffer pointers*/
> otx2_free_aura_ptr(pf, AURA_NIX_RQ);
>
> - for (qidx = 0; qidx < pf->hw.rx_queues; qidx++) {
> - pool_id = otx2_get_pool_idx(pf, AURA_NIX_RQ, qidx);
> - pool = &pf->qset.pool[pool_id];
> - page_pool_destroy(pool->page_pool);
> - pool->page_pool = NULL;
> - }
> -
> otx2_free_cq_res(pf);
>
> /* Free all ingress bandwidth profiles allocated */
> @@ -2788,6 +2780,8 @@ static int otx2_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
> switch (xdp->command) {
> case XDP_SETUP_PROG:
> return otx2_xdp_setup(pf, xdp->prog);
> + case XDP_SETUP_XSK_POOL:
> + return otx2_xsk_pool_setup(pf, xdp->xsk.pool, xdp->xsk.queue_id);
> default:
> return -EINVAL;
> }
> @@ -2865,6 +2859,7 @@ static const struct net_device_ops otx2_netdev_ops = {
> .ndo_set_vf_vlan = otx2_set_vf_vlan,
> .ndo_get_vf_config = otx2_get_vf_config,
> .ndo_bpf = otx2_xdp,
> + .ndo_xsk_wakeup = otx2_xsk_wakeup,
> .ndo_xdp_xmit = otx2_xdp_xmit,
> .ndo_setup_tc = otx2_setup_tc,
> .ndo_set_vf_trust = otx2_ndo_set_vf_trust,
> @@ -3203,16 +3198,26 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> /* Enable link notifications */
> otx2_cgx_config_linkevents(pf, true);
>
> + pf->af_xdp_zc_qidx = bitmap_zalloc(qcount, GFP_KERNEL);
if this is taken from the ice driver then be aware that we got rid of the
bitmap tracking zc enabled queues. see adbf5a42341f ("ice: remove
af_xdp_zc_qps bitmap").
in case you still have a need for it after going through the referenced
commit, please give us some justification why.
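
with the bitmap gone, the check collapses to a pool lookup from the core.
roughly (untested, otx2_get_xsk_pool() is a made-up helper;
xsk_get_pool_from_qid() is the same call you already use further up in
otx2_pool_init()):

/* untested: ask the core whether ZC is enabled on this queue
 * instead of mirroring that state in a driver bitmap
 */
static struct xsk_buff_pool *otx2_get_xsk_pool(struct otx2_nic *pfvf,
					       u16 qidx)
{
	if (!pfvf->xdp_prog)
		return NULL;

	return xsk_get_pool_from_qid(pfvf->netdev, qidx);
}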
> + if (!pf->af_xdp_zc_qidx) {
> + err = -ENOMEM;
> + goto err_sriov_cleannup;
> + }
> +
> #ifdef CONFIG_DCB
> err = otx2_dcbnl_set_ops(netdev);
> if (err)
> - goto err_pf_sriov_init;
> + goto err_free_zc_bmap;
> #endif
>
> otx2_qos_init(pf, qos_txqs);
>
> return 0;
>
> +err_free_zc_bmap:
> + bitmap_free(pf->af_xdp_zc_qidx);
> +err_sriov_cleannup:
> + otx2_sriov_vfcfg_cleanup(pf);
> err_pf_sriov_init:
> otx2_shutdown_tc(pf);
> err_mcam_flow_del:
[...]