[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230206100837.451300-11-vladimir.oltean@nxp.com>
Date: Mon, 6 Feb 2023 12:08:36 +0200
From: Vladimir Oltean <vladimir.oltean@....com>
To: netdev@...r.kernel.org
Cc: "David S. Miller" <davem@...emloft.net>,
Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>,
Claudiu Manoil <claudiu.manoil@....com>,
Björn Töpel <bjorn@...nel.org>,
Magnus Karlsson <magnus.karlsson@...el.com>,
Maciej Fijalkowski <maciej.fijalkowski@...el.com>,
Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
Jesper Dangaard Brouer <hawk@...nel.org>,
John Fastabend <john.fastabend@...il.com>, bpf@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: [RFC PATCH net-next 10/11] net: enetc: add RX support for zero-copy XDP sockets
Add support for filling an RX ring with buffers coming from an XSK umem.
Although enetc has up to 8 RX rings, we still use one of the 2 per-CPU
RX rings for XSK.
To set up an XSK pool on one of the RX queues, we use the
reconfiguration procedure which temporarily stops the rings.
Since the RX procedure in the NAPI poll function is completely different
(both the API for creating an xdp_buff, as well as refilling the ring
with memory from user space), create a separate enetc_clean_rx_ring_xsk()
function which gets called when we have both an XSK pool and an XDP
program on this RX queue.
Signed-off-by: Vladimir Oltean <vladimir.oltean@....com>
---
drivers/net/ethernet/freescale/enetc/enetc.c | 377 +++++++++++++++++-
drivers/net/ethernet/freescale/enetc/enetc.h | 3 +
.../net/ethernet/freescale/enetc/enetc_pf.c | 1 +
3 files changed, 373 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
index dee432cacf85..3990c006c011 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc.c
@@ -10,6 +10,7 @@
#include <net/ip6_checksum.h>
#include <net/pkt_sched.h>
#include <net/tso.h>
+#include <net/xdp_sock_drv.h>
u32 enetc_port_mac_rd(struct enetc_si *si, u32 reg)
{
@@ -103,6 +104,9 @@ static void enetc_free_rx_swbd(struct enetc_bdr *rx_ring,
rx_swbd->dir);
__free_page(rx_swbd->page);
rx_swbd->page = NULL;
+ } else if (rx_swbd->xsk_buff) {
+ xsk_buff_free(rx_swbd->xsk_buff);
+ rx_swbd->xsk_buff = NULL;
}
}
@@ -979,6 +983,44 @@ static int enetc_refill_rx_ring(struct enetc_bdr *rx_ring, const int buff_cnt)
return j;
}
+static int enetc_refill_rx_ring_xsk(struct enetc_bdr *rx_ring, int buff_cnt)
+{
+ struct xsk_buff_pool *pool = rx_ring->xdp.xsk_pool;
+ struct enetc_rx_swbd *rx_swbd;
+ struct xdp_buff *xsk_buff;
+ union enetc_rx_bd *rxbd;
+ int i, j;
+
+ i = rx_ring->next_to_use;
+ rxbd = enetc_rxbd(rx_ring, i);
+
+ for (j = 0; j < buff_cnt; j++) {
+ xsk_buff = xsk_buff_alloc(pool); // TODO use _batch?
+ if (!xsk_buff)
+ break;
+
+ rx_swbd = &rx_ring->rx_swbd[i];
+ rx_swbd->xsk_buff = xsk_buff;
+ rx_swbd->dma = xsk_buff_xdp_get_dma(xsk_buff);
+
+ /* update RxBD */
+ rxbd->w.addr = cpu_to_le64(rx_swbd->dma);
+ /* clear 'R' as well */
+ rxbd->r.lstatus = 0;
+
+ enetc_rxbd_next(rx_ring, &rxbd, &i);
+ }
+
+ if (likely(j)) {
+ rx_ring->next_to_use = i;
+
+ /* update ENETC's consumer index */
+ enetc_wr_reg_hot(rx_ring->rcir, rx_ring->next_to_use);
+ }
+
+ return j;
+}
+
#ifdef CONFIG_FSL_ENETC_PTP_CLOCK
static void enetc_get_rx_tstamp(struct net_device *ndev,
union enetc_rx_bd *rxbd,
@@ -1128,6 +1170,18 @@ static void enetc_add_rx_buff_to_skb(struct enetc_bdr *rx_ring, int i,
enetc_flip_rx_buff(rx_ring, rx_swbd);
}
+static void enetc_put_rx_swbd(struct enetc_bdr *rx_ring, int i)
+{
+ struct enetc_rx_swbd *rx_swbd = &rx_ring->rx_swbd[i];
+
+ if (rx_swbd->xsk_buff) {
+ xsk_buff_free(rx_swbd->xsk_buff);
+ rx_swbd->xsk_buff = NULL;
+ } else {
+ enetc_put_rx_buff(rx_ring, rx_swbd);
+ }
+}
+
static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring,
u32 bd_status,
union enetc_rx_bd **rxbd, int *i,
@@ -1136,7 +1190,7 @@ static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring,
if (likely(!(bd_status & ENETC_RXBD_LSTATUS(ENETC_RXBD_ERR_MASK))))
return false;
- enetc_put_rx_buff(rx_ring, &rx_ring->rx_swbd[*i]);
+ enetc_put_rx_swbd(rx_ring, *i);
(*buffs_missing)++;
enetc_rxbd_next(rx_ring, rxbd, i);
@@ -1144,7 +1198,7 @@ static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring,
dma_rmb();
bd_status = le32_to_cpu((*rxbd)->r.lstatus);
- enetc_put_rx_buff(rx_ring, &rx_ring->rx_swbd[*i]);
+ enetc_put_rx_swbd(rx_ring, *i);
(*buffs_missing)++;
enetc_rxbd_next(rx_ring, rxbd, i);
}
@@ -1484,6 +1538,43 @@ static void enetc_build_xdp_buff(struct enetc_bdr *rx_ring, u32 bd_status,
}
}
+static struct xdp_buff *enetc_build_xsk_buff(struct xsk_buff_pool *pool,
+ struct enetc_bdr *rx_ring,
+ u32 bd_status,
+ union enetc_rx_bd **rxbd, int *i,
+ int *buffs_missing, int *rx_byte_cnt)
+{
+ struct enetc_rx_swbd *rx_swbd = &rx_ring->rx_swbd[*i];
+ u16 size = le16_to_cpu((*rxbd)->r.buf_len);
+ struct xdp_buff *xsk_buff;
+
+ /* Multi-buffer frames are not supported in XSK mode */
+ if (unlikely(!(bd_status & ENETC_RXBD_LSTATUS_F))) {
+ while (!(bd_status & ENETC_RXBD_LSTATUS_F)) {
+ enetc_put_rx_swbd(rx_ring, *i);
+
+ (*buffs_missing)++;
+ enetc_rxbd_next(rx_ring, rxbd, i);
+ dma_rmb();
+ bd_status = le32_to_cpu((*rxbd)->r.lstatus);
+ }
+
+ return NULL;
+ }
+
+ xsk_buff = rx_swbd->xsk_buff;
+ xsk_buff_set_size(xsk_buff, size);
+ xsk_buff_dma_sync_for_cpu(xsk_buff, pool);
+
+ rx_swbd->xsk_buff = NULL;
+
+ (*buffs_missing)++;
+ (*rx_byte_cnt) += size;
+ enetc_rxbd_next(rx_ring, rxbd, i);
+
+ return xsk_buff;
+}
+
/* Convert RX buffer descriptors to TX buffer descriptors. These will be
* recycled back into the RX ring in enetc_clean_tx_ring.
*/
@@ -1659,11 +1750,136 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring,
return rx_frm_cnt;
}
+static void enetc_xsk_buff_to_skb(struct xdp_buff *xsk_buff,
+ struct enetc_bdr *rx_ring,
+ union enetc_rx_bd *rxbd,
+ struct napi_struct *napi)
+{
+ size_t len = xdp_get_buff_len(xsk_buff);
+ struct sk_buff *skb;
+
+ skb = napi_alloc_skb(napi, len);
+ if (unlikely(!skb)) {
+ rx_ring->stats.rx_alloc_errs++;
+ goto out;
+ }
+
+ skb_put_data(skb, xsk_buff->data, len);
+
+ enetc_get_offloads(rx_ring, rxbd, skb);
+
+ skb_record_rx_queue(skb, rx_ring->index);
+ skb->protocol = eth_type_trans(skb, rx_ring->ndev);
+
+ rx_ring->stats.packets++;
+ rx_ring->stats.bytes += skb->len;
+
+ napi_gro_receive(napi, skb);
+out:
+ xsk_buff_free(xsk_buff);
+}
+
+static int enetc_clean_rx_ring_xsk(struct enetc_bdr *rx_ring,
+ struct napi_struct *napi, int work_limit,
+ struct bpf_prog *prog,
+ struct xsk_buff_pool *pool)
+{
+ struct net_device *ndev = rx_ring->ndev;
+ union enetc_rx_bd *rxbd, *orig_rxbd;
+ int rx_frm_cnt = 0, rx_byte_cnt = 0;
+ int xdp_redirect_frm_cnt = 0;
+ struct xdp_buff *xsk_buff;
+ int buffs_missing, err, i;
+ bool wakeup_xsk = false;
+ u32 bd_status, xdp_act;
+
+ buffs_missing = enetc_bd_unused(rx_ring);
+ /* next descriptor to process */
+ i = rx_ring->next_to_clean;
+
+ while (likely(rx_frm_cnt < work_limit)) {
+ if (buffs_missing >= ENETC_RXBD_BUNDLE) {
+ buffs_missing -= enetc_refill_rx_ring_xsk(rx_ring,
+ buffs_missing);
+ wakeup_xsk |= (buffs_missing != 0);
+ }
+
+ rxbd = enetc_rxbd(rx_ring, i);
+ bd_status = le32_to_cpu(rxbd->r.lstatus);
+ if (!bd_status)
+ break;
+
+ enetc_wr_reg_hot(rx_ring->idr, BIT(rx_ring->index));
+ dma_rmb(); /* for reading other rxbd fields */
+
+ if (enetc_check_bd_errors_and_consume(rx_ring, bd_status,
+ &rxbd, &i,
+ &buffs_missing))
+ continue;
+
+ orig_rxbd = rxbd;
+
+ xsk_buff = enetc_build_xsk_buff(pool, rx_ring, bd_status,
+ &rxbd, &i, &buffs_missing,
+ &rx_byte_cnt);
+ if (!xsk_buff)
+ continue;
+
+ xdp_act = bpf_prog_run_xdp(prog, xsk_buff);
+ switch (xdp_act) {
+ default:
+ bpf_warn_invalid_xdp_action(ndev, prog, xdp_act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(ndev, prog, xdp_act);
+ fallthrough;
+ case XDP_DROP:
+ xsk_buff_free(xsk_buff);
+ break;
+ case XDP_PASS:
+ enetc_xsk_buff_to_skb(xsk_buff, rx_ring, orig_rxbd,
+ napi);
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(ndev, xsk_buff, prog);
+ if (unlikely(err)) {
+ if (err == -ENOBUFS)
+ wakeup_xsk = true;
+ xsk_buff_free(xsk_buff);
+ rx_ring->stats.xdp_redirect_failures++;
+ } else {
+ xdp_redirect_frm_cnt++;
+ rx_ring->stats.xdp_redirect++;
+ }
+ }
+
+ rx_frm_cnt++;
+ }
+
+ rx_ring->next_to_clean = i;
+
+ rx_ring->stats.packets += rx_frm_cnt;
+ rx_ring->stats.bytes += rx_byte_cnt;
+
+ if (xdp_redirect_frm_cnt)
+ xdp_do_flush_map();
+
+ if (xsk_uses_need_wakeup(pool)) {
+ if (wakeup_xsk)
+ xsk_set_rx_need_wakeup(pool);
+ else
+ xsk_clear_rx_need_wakeup(pool);
+ }
+
+ return rx_frm_cnt;
+}
+
static int enetc_poll(struct napi_struct *napi, int budget)
{
struct enetc_int_vector
*v = container_of(napi, struct enetc_int_vector, napi);
struct enetc_bdr *rx_ring = &v->rx_ring;
+ struct xsk_buff_pool *pool;
struct bpf_prog *prog;
bool complete = true;
int work_done;
@@ -1676,10 +1892,15 @@ static int enetc_poll(struct napi_struct *napi, int budget)
complete = false;
prog = rx_ring->xdp.prog;
- if (prog)
+ pool = rx_ring->xdp.xsk_pool;
+ if (prog && pool)
+ work_done = enetc_clean_rx_ring_xsk(rx_ring, napi, budget, prog,
+ pool);
+ else if (prog)
work_done = enetc_clean_rx_ring_xdp(rx_ring, napi, budget, prog);
else
work_done = enetc_clean_rx_ring(rx_ring, napi, budget);
+
if (work_done == budget)
complete = false;
if (work_done)
@@ -2168,7 +2389,16 @@ static void enetc_setup_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring,
rx_ring->next_to_alloc = 0;
enetc_lock_mdio();
- enetc_refill_rx_ring(rx_ring, enetc_bd_unused(rx_ring));
+ /* The XSK buffer pool and the BPF program are set up through different
+ * syscalls. From the moment the pool has been DMA mapped and until the
+ * XDP program is attached, we still need to use normal RX buffers,
+ * because we still use the normal NAPI poll routine. Only use buffers
+ * from the XSK pool when both conditions are fulfilled.
+ */
+ if (rx_ring->xdp.prog && rx_ring->xdp.xsk_pool)
+ enetc_refill_rx_ring_xsk(rx_ring, enetc_bd_unused(rx_ring));
+ else
+ enetc_refill_rx_ring(rx_ring, enetc_bd_unused(rx_ring));
enetc_unlock_mdio();
enetc_rxbdr_wr(hw, idx, ENETC_RBMR, rbmr);
@@ -2454,18 +2684,27 @@ static int enetc_xdp_rxq_mem_model_register(struct enetc_ndev_priv *priv,
int rxq)
{
struct enetc_bdr *rx_ring = priv->rx_ring[rxq];
+ struct xsk_buff_pool *pool;
+ enum xdp_mem_type type;
int err;
err = xdp_rxq_info_reg(&rx_ring->xdp.rxq, priv->ndev, rxq, 0);
if (err)
return err;
- err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp.rxq,
- MEM_TYPE_PAGE_SHARED, NULL);
- if (err)
+ pool = rx_ring->xdp.xsk_pool;
+ type = pool ? MEM_TYPE_XSK_BUFF_POOL : MEM_TYPE_PAGE_SHARED;
+
+ err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp.rxq, type, NULL);
+ if (err) {
xdp_rxq_info_unreg(&rx_ring->xdp.rxq);
+ return err;
+ }
- return err;
+ if (pool)
+ xsk_pool_set_rxq_info(pool, &rx_ring->xdp.rxq);
+
+ return 0;
}
static void enetc_xdp_rxq_mem_model_unregister(struct enetc_ndev_priv *priv,
@@ -2768,6 +3007,125 @@ static int enetc_reconfigure_xdp_cb(struct enetc_ndev_priv *priv, void *ctx)
return err;
}
+struct enetc_xsk_reconfig_ctx {
+ struct enetc_bdr *rx_ring;
+ struct xsk_buff_pool *pool;
+};
+
+static int enetc_enable_xsk_cb(struct enetc_ndev_priv *priv, void *ctx)
+{
+ struct enetc_xsk_reconfig_ctx *data = ctx;
+ struct enetc_bdr *rx_ring = data->rx_ring;
+ struct xsk_buff_pool *pool = data->pool;
+ int err;
+
+ err = xsk_pool_dma_map(pool, priv->dev, 0);
+ if (err)
+ return err;
+
+ rx_ring->xdp.xsk_pool = pool;
+
+ return 0;
+}
+
+static int enetc_disable_xsk_cb(struct enetc_ndev_priv *priv, void *ctx)
+{
+ struct enetc_xsk_reconfig_ctx *data = ctx;
+ struct enetc_bdr *rx_ring = data->rx_ring;
+ struct xsk_buff_pool *pool = data->pool;
+
+ rx_ring->xdp.xsk_pool = NULL;
+ xsk_pool_dma_unmap(pool, 0);
+
+ return 0;
+}
+
+static int enetc_enable_xsk_pool(struct net_device *ndev,
+ struct xsk_buff_pool *pool, u16 queue_id)
+{
+ struct enetc_ndev_priv *priv = netdev_priv(ndev);
+ struct enetc_xsk_reconfig_ctx ctx;
+ struct enetc_int_vector *v;
+ struct enetc_bdr *rx_ring;
+ bool extended;
+
+ if (queue_id >= priv->bdr_int_num) {
+ netdev_err(ndev, "QID %d exceeds the %d channels available\n",
+ queue_id, priv->bdr_int_num);
+ return -ERANGE;
+ }
+
+ v = priv->int_vector[queue_id];
+ rx_ring = &v->rx_ring;
+ if (rx_ring->xdp.xsk_pool) {
+ netdev_err(ndev, "QID %d already has an XSK pool attached\n",
+ rx_ring->index);
+ return -EBUSY;
+ }
+
+ /* Ensure enetc_setup_xdp_prog() won't be called before
+ * enetc_setup_xsk_pool(), because enetc_xdp_rxq_mem_model_register()
+ * depends on call ordering.
+ */
+ if (rx_ring->xdp.prog) {
+ netdev_err(ndev,
+ "Cannot use XSK if there is an XDP program already attached\n");
+ return -EINVAL;
+ }
+
+ extended = !!(priv->active_offloads & ENETC_F_RX_TSTAMP);
+ ctx.rx_ring = rx_ring;
+ ctx.pool = pool;
+
+ return enetc_reconfigure(priv, extended, enetc_enable_xsk_cb, &ctx);
+}
+
+static int enetc_disable_xsk_pool(struct net_device *ndev, u16 queue_id)
+{
+ struct enetc_ndev_priv *priv = netdev_priv(ndev);
+ struct enetc_xsk_reconfig_ctx ctx;
+ struct enetc_int_vector *v;
+ struct xsk_buff_pool *pool;
+ struct enetc_bdr *rx_ring;
+ bool extended;
+
+ if (queue_id >= priv->bdr_int_num) {
+ netdev_err(ndev, "QID %d exceeds the %d channels available\n",
+ queue_id, priv->bdr_int_num);
+ return -ERANGE;
+ }
+
+ v = priv->int_vector[queue_id];
+ rx_ring = &v->rx_ring;
+
+ pool = rx_ring->xdp.xsk_pool;
+ if (!pool) {
+ netdev_err(ndev, "QID %d does not have an XSK pool attached\n",
+ rx_ring->index);
+ return -ENOENT;
+ }
+
+ extended = !!(priv->active_offloads & ENETC_F_RX_TSTAMP);
+ ctx.rx_ring = rx_ring;
+ ctx.pool = pool;
+
+ return enetc_reconfigure(priv, extended, enetc_disable_xsk_cb, &ctx);
+}
+
+static int enetc_setup_xsk_pool(struct net_device *ndev,
+ struct xsk_buff_pool *pool,
+ u16 queue_id)
+{
+ return pool ? enetc_enable_xsk_pool(ndev, pool, queue_id) :
+ enetc_disable_xsk_pool(ndev, queue_id);
+}
+
+int enetc_xsk_wakeup(struct net_device *ndev, u32 queue_id, u32 flags)
+{
+ /* xp_assign_dev() wants this; nothing needed for RX */
+ return 0;
+}
+
static int enetc_setup_xdp_prog(struct net_device *ndev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
@@ -2798,6 +3156,9 @@ int enetc_setup_bpf(struct net_device *ndev, struct netdev_bpf *bpf)
switch (bpf->command) {
case XDP_SETUP_PROG:
return enetc_setup_xdp_prog(ndev, bpf->prog, bpf->extack);
+ case XDP_SETUP_XSK_POOL:
+ return enetc_setup_xsk_pool(ndev, bpf->xsk.pool,
+ bpf->xsk.queue_id);
default:
return -EINVAL;
}
diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
index 704aa1f9dfa3..e1a746e37c9a 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc.h
@@ -48,6 +48,7 @@ struct enetc_tx_swbd {
(SKB_WITH_OVERHEAD(ENETC_RXB_TRUESIZE) - XDP_PACKET_HEADROOM)
struct enetc_rx_swbd {
+ struct xdp_buff *xsk_buff;
struct page *page;
dma_addr_t dma;
enum dma_data_direction dir;
@@ -77,6 +78,7 @@ struct enetc_ring_stats {
struct enetc_xdp_data {
struct xdp_rxq_info rxq;
+ struct xsk_buff_pool *xsk_pool;
struct bpf_prog *prog;
int xdp_tx_in_flight;
};
@@ -424,6 +426,7 @@ int enetc_setup_tc_mqprio(struct net_device *ndev, void *type_data);
int enetc_setup_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
int enetc_xdp_xmit(struct net_device *ndev, int num_frames,
struct xdp_frame **frames, u32 flags);
+int enetc_xsk_wakeup(struct net_device *dev, u32 queue, u32 flags);
/* ethtool */
void enetc_set_ethtool_ops(struct net_device *ndev);
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index 7facc7d5261e..1a95d213683b 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -773,6 +773,7 @@ static const struct net_device_ops enetc_ndev_ops = {
.ndo_setup_tc = enetc_pf_setup_tc,
.ndo_bpf = enetc_setup_bpf,
.ndo_xdp_xmit = enetc_xdp_xmit,
+ .ndo_xsk_wakeup = enetc_xsk_wakeup,
};
static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev,
--
2.34.1
Powered by blists - more mailing lists