Date:   Fri, 18 Feb 2022 15:45:31 -0800
From:   Dimitris Michailidis <d.michailidis@...gible.com>
To:     davem@...emloft.net, kuba@...nel.org, netdev@...r.kernel.org,
        andrew@...n.ch, d.michailidis@...gible.com
Subject: [PATCH net-next v7 3/8] net/funeth: probing and netdev ops

This is the first part of the Fungible Ethernet driver. It deals with
device probing, net_device creation, and netdev ops.
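
For readers less familiar with this layer, the following is a generic,
simplified sketch of the PCI probe / net_device registration pattern that
this driver follows. It uses placeholder names (demo_*) and placeholder
PCI IDs; it is illustrative only and not code from this patch:

#include <linux/etherdevice.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/pci.h>

static int demo_open(struct net_device *dev)
{
	return 0;
}

static int demo_stop(struct net_device *dev)
{
	return 0;
}

static netdev_tx_t demo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	dev_kfree_skb_any(skb);	/* no real HW behind this skeleton */
	return NETDEV_TX_OK;
}

static const struct net_device_ops demo_netdev_ops = {
	.ndo_open	= demo_open,
	.ndo_stop	= demo_stop,
	.ndo_start_xmit	= demo_start_xmit,
};

static int demo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct net_device *netdev;
	int rc;

	rc = pcim_enable_device(pdev);
	if (rc)
		return rc;

	/* 1 Tx and 1 Rx queue are enough for the illustration */
	netdev = alloc_etherdev_mqs(0, 1, 1);
	if (!netdev)
		return -ENOMEM;

	SET_NETDEV_DEV(netdev, &pdev->dev);
	netdev->netdev_ops = &demo_netdev_ops;
	eth_hw_addr_random(netdev);

	rc = register_netdev(netdev);
	if (rc) {
		free_netdev(netdev);
		return rc;
	}

	pci_set_drvdata(pdev, netdev);
	return 0;
}

static void demo_remove(struct pci_dev *pdev)
{
	struct net_device *netdev = pci_get_drvdata(pdev);

	unregister_netdev(netdev);
	free_netdev(netdev);
}

static const struct pci_device_id demo_id_table[] = {
	{ PCI_DEVICE(0xffff, 0x0001) },	/* placeholder IDs */
	{ }
};
MODULE_DEVICE_TABLE(pci, demo_id_table);

static struct pci_driver demo_driver = {
	.name		= "demo_eth",
	.id_table	= demo_id_table,
	.probe		= demo_probe,
	.remove		= demo_remove,
};
module_pci_driver(demo_driver);

MODULE_LICENSE("Dual BSD/GPL");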

Signed-off-by: Dimitris Michailidis <dmichail@...gible.com>
---
 drivers/net/ethernet/fungible/funeth/funeth.h |  171 ++
 .../ethernet/fungible/funeth/funeth_main.c    | 2091 +++++++++++++++++
 2 files changed, 2262 insertions(+)
 create mode 100644 drivers/net/ethernet/fungible/funeth/funeth.h
 create mode 100644 drivers/net/ethernet/fungible/funeth/funeth_main.c

diff --git a/drivers/net/ethernet/fungible/funeth/funeth.h b/drivers/net/ethernet/fungible/funeth/funeth.h
new file mode 100644
index 000000000000..1250e10d21db
--- /dev/null
+++ b/drivers/net/ethernet/fungible/funeth/funeth.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+
+#ifndef _FUNETH_H
+#define _FUNETH_H
+
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/net_tstamp.h>
+#include <linux/mutex.h>
+#include <linux/seqlock.h>
+#include <linux/xarray.h>
+#include <net/devlink.h>
+#include "fun_dev.h"
+
+#define ADMIN_SQE_SIZE SZ_128
+#define ADMIN_CQE_SIZE SZ_64
+#define ADMIN_RSP_MAX_LEN (ADMIN_CQE_SIZE - sizeof(struct fun_cqe_info))
+
+#define FUN_MAX_MTU 9024
+
+#define SQ_DEPTH 512U
+#define CQ_DEPTH 1024U
+#define RQ_DEPTH (512U / (PAGE_SIZE / 4096))
+
+#define CQ_INTCOAL_USEC 10
+#define CQ_INTCOAL_NPKT 16
+#define SQ_INTCOAL_USEC 10
+#define SQ_INTCOAL_NPKT 16
+
+#define INVALID_LPORT 0xffff
+
+#define FUN_PORT_CAP_PAUSE_MASK (FUN_PORT_CAP_TX_PAUSE | FUN_PORT_CAP_RX_PAUSE)
+
+struct fun_vport_info {
+	u8 mac[ETH_ALEN];
+	u16 vlan;
+	__be16 vlan_proto;
+	u8 qos;
+	u8 spoofchk:1;
+	u8 trusted:1;
+	unsigned int max_rate;
+};
+
+/* "subclass" of fun_dev for Ethernet functions */
+struct fun_ethdev {
+	struct fun_dev fdev;
+
+	/* the function's network ports */
+	struct net_device **netdevs;
+	unsigned int num_ports;
+
+	/* configuration for the function's virtual ports */
+	unsigned int num_vports;
+	struct fun_vport_info *vport_info;
+
+	struct mutex state_mutex; /* nests inside RTNL if both taken */
+
+	unsigned int nsqs_per_port;
+};
+
+static inline struct fun_ethdev *to_fun_ethdev(struct fun_dev *p)
+{
+	return container_of(p, struct fun_ethdev, fdev);
+}
+
+struct fun_qset {
+	struct funeth_rxq **rxqs;
+	struct funeth_txq **txqs;
+	struct funeth_txq **xdpqs;
+	unsigned int nrxqs;
+	unsigned int ntxqs;
+	unsigned int nxdpqs;
+	unsigned int rxq_start;
+	unsigned int txq_start;
+	unsigned int xdpq_start;
+	unsigned int cq_depth;
+	unsigned int rq_depth;
+	unsigned int sq_depth;
+	int state;
+};
+
+/* Per netdevice driver state, i.e., netdev_priv. */
+struct funeth_priv {
+	struct fun_dev *fdev;
+	struct pci_dev *pdev;
+	struct net_device *netdev;
+
+	struct funeth_rxq * __rcu *rxqs;
+	struct funeth_txq **txqs;
+	struct funeth_txq * __rcu *xdpqs;
+
+	struct xarray irqs;
+	unsigned int num_tx_irqs;
+	unsigned int num_rx_irqs;
+	unsigned int rx_irq_ofst;
+
+	unsigned int lane_attrs;
+	u16 lport;
+
+	/* link settings */
+	u64 port_caps;
+	u64 advertising;
+	u64 lp_advertising;
+	unsigned int link_speed;
+	u8 xcvr_type;
+	u8 active_fc;
+	u8 active_fec;
+	u8 link_down_reason;
+	seqcount_t link_seq;
+
+	u32 msg_enable;
+
+	unsigned int num_xdpqs;
+
+	/* ethtool, etc. config parameters */
+	unsigned int sq_depth;
+	unsigned int rq_depth;
+	unsigned int cq_depth;
+	unsigned int cq_irq_db;
+	u8 tx_coal_usec;
+	u8 tx_coal_count;
+	u8 rx_coal_usec;
+	u8 rx_coal_count;
+
+	struct hwtstamp_config hwtstamp_cfg;
+
+	/* cumulative queue stats from earlier queue instances */
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tx_dropped;
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 rx_dropped;
+
+	/* RSS */
+	unsigned int rss_hw_id;
+	enum fun_eth_hash_alg hash_algo;
+	u8 rss_key[FUN_ETH_RSS_MAX_KEY_SIZE];
+	unsigned int indir_table_nentries;
+	u32 indir_table[FUN_ETH_RSS_MAX_INDIR_ENT];
+	dma_addr_t rss_dma_addr;
+	void *rss_cfg;
+
+	/* DMA area for port stats */
+	dma_addr_t stats_dma_addr;
+	__be64 *stats;
+
+	struct bpf_prog *xdp_prog;
+
+	struct devlink_port dl_port;
+
+	/* kTLS state */
+	unsigned int ktls_id;
+	atomic64_t tx_tls_add;
+	atomic64_t tx_tls_del;
+	atomic64_t tx_tls_resync;
+};
+
+void fun_set_ethtool_ops(struct net_device *netdev);
+int fun_port_write_cmd(struct funeth_priv *fp, int key, u64 data);
+int fun_port_read_cmd(struct funeth_priv *fp, int key, u64 *data);
+int fun_create_and_bind_tx(struct funeth_priv *fp, u32 sqid);
+int fun_replace_queues(struct net_device *dev, struct fun_qset *newqs,
+		       struct netlink_ext_ack *extack);
+int fun_change_num_queues(struct net_device *dev, unsigned int ntx,
+			  unsigned int nrx);
+void fun_set_ring_count(struct net_device *netdev, unsigned int ntx,
+			unsigned int nrx);
+int fun_config_rss(struct net_device *dev, int algo, const u8 *key,
+		   const u32 *qtable, u8 op);
+
+#endif /* _FUNETH_H */
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_main.c b/drivers/net/ethernet/fungible/funeth/funeth_main.c
new file mode 100644
index 000000000000..65012f2d2156
--- /dev/null
+++ b/drivers/net/ethernet/fungible/funeth/funeth_main.c
@@ -0,0 +1,2091 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
+
+#include <linux/bpf.h>
+#include <linux/crash_dump.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/filter.h>
+#include <linux/idr.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+
+#include "funeth.h"
+#include "funeth_devlink.h"
+#include "funeth_ktls.h"
+#include "fun_port.h"
+#include "fun_queue.h"
+#include "funeth_txrx.h"
+
+#define ADMIN_SQ_DEPTH 32
+#define ADMIN_CQ_DEPTH 64
+#define ADMIN_RQ_DEPTH 16
+
+/* Default number of Tx/Rx queues. */
+#define FUN_DFLT_QUEUES 16U
+
+enum {
+	FUN_SERV_RES_CHANGE = FUN_SERV_FIRST_AVAIL,
+	FUN_SERV_DEL_PORTS,
+};
+
+static const struct pci_device_id funeth_id_table[] = {
+	{ PCI_VDEVICE(FUNGIBLE, 0x0101) },
+	{ PCI_VDEVICE(FUNGIBLE, 0x0181) },
+	{ 0, }
+};
+
+/* Issue a port write admin command with @n key/value pairs. */
+static int fun_port_write_cmds(struct funeth_priv *fp, unsigned int n,
+			       const int *keys, const u64 *data)
+{
+	unsigned int cmd_size, i;
+	union {
+		struct fun_admin_port_req req;
+		struct fun_admin_port_rsp rsp;
+		u8 v[ADMIN_SQE_SIZE];
+	} cmd;
+
+	cmd_size = offsetof(struct fun_admin_port_req, u.write.write48) +
+		n * sizeof(struct fun_admin_write48_req);
+	if (cmd_size > sizeof(cmd) || cmd_size > ADMIN_RSP_MAX_LEN)
+		return -EINVAL;
+
+	cmd.req.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_PORT,
+						    cmd_size);
+	cmd.req.u.write =
+		FUN_ADMIN_PORT_WRITE_REQ_INIT(FUN_ADMIN_SUBOP_WRITE, 0,
+					      fp->netdev->dev_port);
+	for (i = 0; i < n; i++)
+		cmd.req.u.write.write48[i] =
+			FUN_ADMIN_WRITE48_REQ_INIT(keys[i], data[i]);
+
+	return fun_submit_admin_sync_cmd(fp->fdev, &cmd.req.common,
+					 &cmd.rsp, cmd_size, 0);
+}
+
+int fun_port_write_cmd(struct funeth_priv *fp, int key, u64 data)
+{
+	return fun_port_write_cmds(fp, 1, &key, &data);
+}
+
+/* Issue a port read admin command with @n key/value pairs. */
+static int fun_port_read_cmds(struct funeth_priv *fp, unsigned int n,
+			      const int *keys, u64 *data)
+{
+	const struct fun_admin_read48_rsp *r48rsp;
+	unsigned int cmd_size, i;
+	int rc;
+	union {
+		struct fun_admin_port_req req;
+		struct fun_admin_port_rsp rsp;
+		u8 v[ADMIN_SQE_SIZE];
+	} cmd;
+
+	cmd_size = offsetof(struct fun_admin_port_req, u.read.read48) +
+		n * sizeof(struct fun_admin_read48_req);
+	if (cmd_size > sizeof(cmd) || cmd_size > ADMIN_RSP_MAX_LEN)
+		return -EINVAL;
+
+	cmd.req.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_PORT,
+						    cmd_size);
+	cmd.req.u.read =
+		FUN_ADMIN_PORT_READ_REQ_INIT(FUN_ADMIN_SUBOP_READ, 0,
+					     fp->netdev->dev_port);
+	for (i = 0; i < n; i++)
+		cmd.req.u.read.read48[i] = FUN_ADMIN_READ48_REQ_INIT(keys[i]);
+
+	rc = fun_submit_admin_sync_cmd(fp->fdev, &cmd.req.common,
+				       &cmd.rsp, cmd_size, 0);
+	if (rc)
+		return rc;
+
+	for (r48rsp = cmd.rsp.u.read.read48, i = 0; i < n; i++, r48rsp++) {
+		data[i] = FUN_ADMIN_READ48_RSP_DATA_G(r48rsp->key_to_data);
+		dev_dbg(fp->fdev->dev,
+			"port_read_rsp lport=%u (key_to_data=0x%llx) key=%d data:%lld retval:%lld",
+			fp->lport, r48rsp->key_to_data, keys[i], data[i],
+			FUN_ADMIN_READ48_RSP_RET_G(r48rsp->key_to_data));
+	}
+	return 0;
+}
+
+int fun_port_read_cmd(struct funeth_priv *fp, int key, u64 *data)
+{
+	return fun_port_read_cmds(fp, 1, &key, data);
+}
+
+static void fun_report_link(struct net_device *netdev)
+{
+	if (netif_carrier_ok(netdev)) {
+		const struct funeth_priv *fp = netdev_priv(netdev);
+		const char *fec = "", *pause = "";
+		int speed = fp->link_speed;
+		char unit = 'M';
+
+		if (fp->link_speed >= SPEED_1000) {
+			speed /= 1000;
+			unit = 'G';
+		}
+
+		if (fp->active_fec & FUN_PORT_FEC_RS)
+			fec = ", RS-FEC";
+		else if (fp->active_fec & FUN_PORT_FEC_FC)
+			fec = ", BASER-FEC";
+
+		if ((fp->active_fc & FUN_PORT_CAP_PAUSE_MASK) == FUN_PORT_CAP_PAUSE_MASK)
+			pause = ", Tx/Rx PAUSE";
+		else if (fp->active_fc & FUN_PORT_CAP_RX_PAUSE)
+			pause = ", Rx PAUSE";
+		else if (fp->active_fc & FUN_PORT_CAP_TX_PAUSE)
+			pause = ", Tx PAUSE";
+
+		netdev_info(netdev, "Link up at %d %cb/s full-duplex%s%s\n",
+			    speed, unit, pause, fec);
+	} else {
+		netdev_info(netdev, "Link down\n");
+	}
+}
+
+static int fun_adi_write(struct fun_dev *fdev, enum fun_admin_adi_attr attr,
+			 unsigned int adi_id, const struct fun_adi_param *param)
+{
+	struct fun_admin_adi_req req = {
+		.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_ADI,
+						     sizeof(req)),
+		.u.write.subop = FUN_ADMIN_SUBOP_WRITE,
+		.u.write.attribute = attr,
+		.u.write.id = cpu_to_be32(adi_id),
+		.u.write.param = *param
+	};
+
+	return fun_submit_admin_sync_cmd(fdev, &req.common, NULL, 0, 0);
+}
+
+/* Configure RSS for the given port. @op determines whether a new RSS context
+ * is to be created or whether an existing one should be reconfigured. The
+ * remaining parameters specify the hashing algorithm, key, and indirection
+ * table.
+ *
+ * This initiates packet delivery to the Rx queues set in the indirection
+ * table.
+ */
+int fun_config_rss(struct net_device *dev, int algo, const u8 *key,
+		   const u32 *qtable, u8 op)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int table_len = fp->indir_table_nentries;
+	unsigned int len = FUN_ETH_RSS_MAX_KEY_SIZE + sizeof(u32) * table_len;
+	struct funeth_rxq **rxqs = rtnl_dereference(fp->rxqs);
+	union {
+		struct {
+			struct fun_admin_rss_req req;
+			struct fun_dataop_gl gl;
+		};
+		struct fun_admin_generic_create_rsp rsp;
+	} cmd;
+	__be32 *indir_tab;
+	u16 flags;
+	int rc;
+
+	if (op != FUN_ADMIN_SUBOP_CREATE && fp->rss_hw_id == FUN_HCI_ID_INVALID)
+		return -EINVAL;
+
+	flags = op == FUN_ADMIN_SUBOP_CREATE ?
+			FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR : 0;
+	cmd.req.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_RSS,
+						    sizeof(cmd));
+	cmd.req.u.create =
+		FUN_ADMIN_RSS_CREATE_REQ_INIT(op, flags, fp->rss_hw_id,
+					      dev->dev_port, algo,
+					      FUN_ETH_RSS_MAX_KEY_SIZE,
+					      table_len, 0,
+					      FUN_ETH_RSS_MAX_KEY_SIZE);
+	cmd.req.u.create.dataop = FUN_DATAOP_HDR_INIT(1, 0, 1, 0, len);
+	fun_dataop_gl_init(&cmd.gl, 0, 0, len, fp->rss_dma_addr);
+
+	/* write the key and indirection table into the RSS DMA area */
+	memcpy(fp->rss_cfg, key, FUN_ETH_RSS_MAX_KEY_SIZE);
+	indir_tab = fp->rss_cfg + FUN_ETH_RSS_MAX_KEY_SIZE;
+	for (rc = 0; rc < table_len; rc++)
+		*indir_tab++ = cpu_to_be32(rxqs[*qtable++]->hw_cqid);
+
+	rc = fun_submit_admin_sync_cmd(fp->fdev, &cmd.req.common,
+				       &cmd.rsp, sizeof(cmd.rsp), 0);
+	if (!rc && op == FUN_ADMIN_SUBOP_CREATE)
+		fp->rss_hw_id = be32_to_cpu(cmd.rsp.id);
+	return rc;
+}
+
+/* Destroy the HW RSS context associated with the given port. This also stops
+ * all packet delivery to our Rx queues.
+ */
+static void fun_destroy_rss(struct funeth_priv *fp)
+{
+	if (fp->rss_hw_id != FUN_HCI_ID_INVALID) {
+		fun_res_destroy(fp->fdev, FUN_ADMIN_OP_RSS, 0, fp->rss_hw_id);
+		fp->rss_hw_id = FUN_HCI_ID_INVALID;
+	}
+}
+
+static void fun_irq_aff_notify(struct irq_affinity_notify *notify,
+			       const cpumask_t *mask)
+{
+	struct fun_irq *p = container_of(notify, struct fun_irq, aff_notify);
+
+	cpumask_copy(&p->affinity_mask, mask);
+}
+
+static void fun_irq_aff_release(struct kref __always_unused *ref)
+{
+}
+
+/* Allocate an IRQ structure, assign an MSI-X index and initial affinity to it,
+ * and add it to the IRQ XArray.
+ */
+static struct fun_irq *fun_alloc_qirq(struct funeth_priv *fp, unsigned int idx,
+				      int node, unsigned int xa_idx_offset)
+{
+	struct fun_irq *irq;
+	int cpu, res;
+
+	cpu = cpumask_local_spread(idx, node);
+	node = local_memory_node(cpu_to_node(cpu));
+
+	irq = kzalloc_node(sizeof(*irq), GFP_KERNEL, node);
+	if (!irq)
+		return ERR_PTR(-ENOMEM);
+
+	res = fun_reserve_irqs(fp->fdev, 1, &irq->irq_idx);
+	if (res != 1)
+		goto free_irq;
+
+	res = xa_insert(&fp->irqs, idx + xa_idx_offset, irq, GFP_KERNEL);
+	if (res)
+		goto release_irq;
+
+	irq->irq = pci_irq_vector(fp->pdev, irq->irq_idx);
+	cpumask_set_cpu(cpu, &irq->affinity_mask);
+	irq->aff_notify.notify = fun_irq_aff_notify;
+	irq->aff_notify.release = fun_irq_aff_release;
+	irq->state = FUN_IRQ_INIT;
+	return irq;
+
+release_irq:
+	fun_release_irqs(fp->fdev, 1, &irq->irq_idx);
+free_irq:
+	kfree(irq);
+	return ERR_PTR(res);
+}
+
+static void fun_free_qirq(struct funeth_priv *fp, struct fun_irq *irq)
+{
+	netif_napi_del(&irq->napi);
+	fun_release_irqs(fp->fdev, 1, &irq->irq_idx);
+	kfree(irq);
+}
+
+/* Release the IRQs reserved for Tx/Rx queues that aren't being used. */
+static void fun_prune_queue_irqs(struct net_device *dev)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int nreleased = 0;
+	struct fun_irq *irq;
+	unsigned long idx;
+
+	xa_for_each(&fp->irqs, idx, irq) {
+		if (irq->txq || irq->rxq)  /* skip those in use */
+			continue;
+
+		xa_erase(&fp->irqs, idx);
+		fun_free_qirq(fp, irq);
+		nreleased++;
+		if (idx < fp->rx_irq_ofst)
+			fp->num_tx_irqs--;
+		else
+			fp->num_rx_irqs--;
+	}
+	netif_info(fp, intr, dev, "Released %u queue IRQs\n", nreleased);
+}
+
+/* Reserve IRQs, one per queue, to accommodate the requested queue numbers @ntx
+ * and @nrx. IRQs are added incrementally to those we already have.
+ * We hold on to allocated IRQs until garbage collection of unused IRQs is
+ * separately requested.
+ */
+static int fun_alloc_queue_irqs(struct net_device *dev, unsigned int ntx,
+				unsigned int nrx)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	int node = dev_to_node(&fp->pdev->dev);
+	struct fun_irq *irq;
+	unsigned int i;
+
+	for (i = fp->num_tx_irqs; i < ntx; i++) {
+		irq = fun_alloc_qirq(fp, i, node, 0);
+		if (IS_ERR(irq))
+			return PTR_ERR(irq);
+
+		fp->num_tx_irqs++;
+		netif_tx_napi_add(dev, &irq->napi, fun_txq_napi_poll,
+				  NAPI_POLL_WEIGHT);
+	}
+
+	for (i = fp->num_rx_irqs; i < nrx; i++) {
+		irq = fun_alloc_qirq(fp, i, node, fp->rx_irq_ofst);
+		if (IS_ERR(irq))
+			return PTR_ERR(irq);
+
+		fp->num_rx_irqs++;
+		netif_napi_add(dev, &irq->napi, fun_rxq_napi_poll,
+			       NAPI_POLL_WEIGHT);
+	}
+
+	netif_info(fp, intr, dev, "Reserved %u/%u IRQs for Tx/Rx queues\n",
+		   ntx, nrx);
+	return 0;
+}
+
+static void free_txqs(struct funeth_txq **txqs, unsigned int nqs,
+		      unsigned int start, int state)
+{
+	unsigned int i;
+
+	for (i = start; i < nqs && txqs[i]; i++)
+		txqs[i] = funeth_txq_free(txqs[i], state);
+}
+
+static int alloc_txqs(struct net_device *dev, struct funeth_txq **txqs,
+		      unsigned int nqs, unsigned int depth, unsigned int start,
+		      int state)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int i;
+	int err;
+
+	for (i = start; i < nqs; i++) {
+		err = funeth_txq_create(dev, i, depth, xa_load(&fp->irqs, i),
+					state, &txqs[i]);
+		if (err) {
+			free_txqs(txqs, nqs, start, FUN_QSTATE_DESTROYED);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static void free_rxqs(struct funeth_rxq **rxqs, unsigned int nqs,
+		      unsigned int start, int state)
+{
+	unsigned int i;
+
+	for (i = start; i < nqs && rxqs[i]; i++)
+		rxqs[i] = funeth_rxq_free(rxqs[i], state);
+}
+
+static int alloc_rxqs(struct net_device *dev, struct funeth_rxq **rxqs,
+		      unsigned int nqs, unsigned int ncqe, unsigned int nrqe,
+		      unsigned int start, int state)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int i;
+	int err;
+
+	for (i = start; i < nqs; i++) {
+		err = funeth_rxq_create(dev, i, ncqe, nrqe,
+					xa_load(&fp->irqs, i + fp->rx_irq_ofst),
+					state, &rxqs[i]);
+		if (err) {
+			free_rxqs(rxqs, nqs, start, FUN_QSTATE_DESTROYED);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static void free_xdpqs(struct funeth_txq **xdpqs, unsigned int nqs,
+		       unsigned int start, int state)
+{
+	unsigned int i;
+
+	for (i = start; i < nqs && xdpqs[i]; i++)
+		xdpqs[i] = funeth_txq_free(xdpqs[i], state);
+
+	if (state == FUN_QSTATE_DESTROYED)
+		kfree(xdpqs);
+}
+
+static struct funeth_txq **alloc_xdpqs(struct net_device *dev, unsigned int nqs,
+				       unsigned int depth, unsigned int start,
+				       int state)
+{
+	struct funeth_txq **xdpqs;
+	unsigned int i;
+	int err;
+
+	xdpqs = kcalloc(nqs, sizeof(*xdpqs), GFP_KERNEL);
+	if (!xdpqs)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = start; i < nqs; i++) {
+		err = funeth_txq_create(dev, i, depth, NULL, state, &xdpqs[i]);
+		if (err) {
+			free_xdpqs(xdpqs, nqs, start, FUN_QSTATE_DESTROYED);
+			return ERR_PTR(err);
+		}
+	}
+	return xdpqs;
+}
+
+static void fun_free_rings(struct net_device *netdev, struct fun_qset *qset)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	struct funeth_txq **xdpqs = qset->xdpqs;
+	struct funeth_rxq **rxqs = qset->rxqs;
+
+	/* qset may not specify any queues to operate on. In that case the
+	 * currently installed queues are implied.
+	 */
+	if (!rxqs) {
+		rxqs = rtnl_dereference(fp->rxqs);
+		xdpqs = rtnl_dereference(fp->xdpqs);
+		qset->txqs = fp->txqs;
+		qset->nrxqs = netdev->real_num_rx_queues;
+		qset->ntxqs = netdev->real_num_tx_queues;
+		qset->nxdpqs = fp->num_xdpqs;
+	}
+	if (!rxqs)
+		return;
+
+	if (rxqs == rtnl_dereference(fp->rxqs)) {
+		rcu_assign_pointer(fp->rxqs, NULL);
+		rcu_assign_pointer(fp->xdpqs, NULL);
+		synchronize_net();
+		fp->txqs = NULL;
+	}
+
+	free_rxqs(rxqs, qset->nrxqs, qset->rxq_start, qset->state);
+	free_txqs(qset->txqs, qset->ntxqs, qset->txq_start, qset->state);
+	free_xdpqs(xdpqs, qset->nxdpqs, qset->xdpq_start, qset->state);
+	if (qset->state == FUN_QSTATE_DESTROYED)
+		kfree(rxqs);
+
+	/* Tell the caller which queues were operated on. */
+	qset->rxqs = rxqs;
+	qset->xdpqs = xdpqs;
+}
+
+static int fun_alloc_rings(struct net_device *netdev, struct fun_qset *qset)
+{
+	struct funeth_txq **xdpqs = NULL, **txqs;
+	struct funeth_rxq **rxqs;
+	int err;
+
+	err = fun_alloc_queue_irqs(netdev, qset->ntxqs, qset->nrxqs);
+	if (err)
+		return err;
+
+	rxqs = kcalloc(qset->ntxqs + qset->nrxqs, sizeof(*rxqs), GFP_KERNEL);
+	if (!rxqs)
+		return -ENOMEM;
+
+	if (qset->nxdpqs) {
+		xdpqs = alloc_xdpqs(netdev, qset->nxdpqs, qset->sq_depth,
+				    qset->xdpq_start, qset->state);
+		if (IS_ERR(xdpqs)) {
+			err = PTR_ERR(xdpqs);
+			goto free_qvec;
+		}
+	}
+
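+	/* Tx queue pointers live in the tail of the rxqs allocation made above. */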
+	txqs = (struct funeth_txq **)&rxqs[qset->nrxqs];
+	err = alloc_txqs(netdev, txqs, qset->ntxqs, qset->sq_depth,
+			 qset->txq_start, qset->state);
+	if (err)
+		goto free_xdpqs;
+
+	err = alloc_rxqs(netdev, rxqs, qset->nrxqs, qset->cq_depth,
+			 qset->rq_depth, qset->rxq_start, qset->state);
+	if (err)
+		goto free_txqs;
+
+	qset->rxqs = rxqs;
+	qset->txqs = txqs;
+	qset->xdpqs = xdpqs;
+	return 0;
+
+free_txqs:
+	free_txqs(txqs, qset->ntxqs, qset->txq_start, FUN_QSTATE_DESTROYED);
+free_xdpqs:
+	free_xdpqs(xdpqs, qset->nxdpqs, qset->xdpq_start, FUN_QSTATE_DESTROYED);
+free_qvec:
+	kfree(rxqs);
+	return err;
+}
+
+/* Take queues to the next level. Presently this means creating them on the
+ * device.
+ */
+static int fun_advance_ring_state(struct net_device *dev, struct fun_qset *qset)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	int i, err;
+
+	for (i = 0; i < qset->nrxqs; i++) {
+		err = fun_rxq_create_dev(qset->rxqs[i],
+					 xa_load(&fp->irqs,
+						 i + fp->rx_irq_ofst));
+		if (err)
+			goto out;
+	}
+
+	for (i = 0; i < qset->ntxqs; i++) {
+		err = fun_txq_create_dev(qset->txqs[i], xa_load(&fp->irqs, i));
+		if (err)
+			goto out;
+	}
+
+	for (i = 0; i < qset->nxdpqs; i++) {
+		err = fun_txq_create_dev(qset->xdpqs[i], NULL);
+		if (err)
+			goto out;
+	}
+
+	return 0;
+
+out:
+	fun_free_rings(dev, qset);
+	return err;
+}
+
+static int fun_port_create(struct net_device *netdev)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	union {
+		struct fun_admin_port_req req;
+		struct fun_admin_port_rsp rsp;
+	} cmd;
+	int rc;
+
+	if (fp->lport != INVALID_LPORT)
+		return 0;
+
+	cmd.req.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_PORT,
+						    sizeof(cmd.req));
+	cmd.req.u.create =
+		FUN_ADMIN_PORT_CREATE_REQ_INIT(FUN_ADMIN_SUBOP_CREATE, 0,
+					       netdev->dev_port);
+
+	rc = fun_submit_admin_sync_cmd(fp->fdev, &cmd.req.common, &cmd.rsp,
+				       sizeof(cmd.rsp), 0);
+
+	if (!rc)
+		fp->lport = be16_to_cpu(cmd.rsp.u.create.lport);
+	return rc;
+}
+
+static int fun_port_destroy(struct net_device *netdev)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+
+	if (fp->lport == INVALID_LPORT)
+		return 0;
+
+	fp->lport = INVALID_LPORT;
+	return fun_res_destroy(fp->fdev, FUN_ADMIN_OP_PORT, 0,
+			       netdev->dev_port);
+}
+
+static int fun_eth_create(struct funeth_priv *fp)
+{
+	union {
+		struct fun_admin_eth_req req;
+		struct fun_admin_generic_create_rsp rsp;
+	} cmd;
+	int rc;
+
+	cmd.req.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_ETH,
+						    sizeof(cmd.req));
+	cmd.req.u.create = FUN_ADMIN_ETH_CREATE_REQ_INIT(
+				FUN_ADMIN_SUBOP_CREATE,
+				FUN_ADMIN_RES_CREATE_FLAG_ALLOCATOR,
+				0, fp->netdev->dev_port);
+
+	rc = fun_submit_admin_sync_cmd(fp->fdev, &cmd.req.common, &cmd.rsp,
+				       sizeof(cmd.rsp), 0);
+	return rc ? rc : be32_to_cpu(cmd.rsp.id);
+}
+
+static int fun_vi_create(struct funeth_priv *fp)
+{
+	struct fun_admin_vi_req req = {
+		.common = FUN_ADMIN_REQ_COMMON_INIT2(FUN_ADMIN_OP_VI,
+						     sizeof(req)),
+		.u.create = FUN_ADMIN_VI_CREATE_REQ_INIT(FUN_ADMIN_SUBOP_CREATE,
+							 0,
+							 fp->netdev->dev_port,
+							 fp->netdev->dev_port)
+	};
+
+	return fun_submit_admin_sync_cmd(fp->fdev, &req.common, NULL, 0, 0);
+}
+
+/* Helper to create an ETH flow and bind an SQ to it.
+ * Returns the ETH id (>= 0) on success or a negative error.
+ */
+int fun_create_and_bind_tx(struct funeth_priv *fp, u32 sqid)
+{
+	int rc, ethid;
+
+	ethid = fun_eth_create(fp);
+	if (ethid >= 0) {
+		rc = fun_bind(fp->fdev, FUN_ADMIN_BIND_TYPE_EPSQ, sqid,
+			      FUN_ADMIN_BIND_TYPE_ETH, ethid);
+		if (rc) {
+			fun_res_destroy(fp->fdev, FUN_ADMIN_OP_ETH, 0, ethid);
+			ethid = rc;
+		}
+	}
+	return ethid;
+}
+
+static irqreturn_t fun_queue_irq_handler(int irq, void *data)
+{
+	struct fun_irq *p = data;
+
+	if (p->rxq) {
+		prefetch(p->rxq->next_cqe_info);
+		p->rxq->irq_cnt++;
+	}
+	napi_schedule_irqoff(&p->napi);
+	return IRQ_HANDLED;
+}
+
+static int fun_enable_irqs(struct net_device *dev)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned long idx, last;
+	unsigned int qidx;
+	struct fun_irq *p;
+	const char *qtype;
+	int err;
+
+	xa_for_each(&fp->irqs, idx, p) {
+		if (p->txq) {
+			qtype = "tx";
+			qidx = p->txq->qidx;
+		} else if (p->rxq) {
+			qtype = "rx";
+			qidx = p->rxq->qidx;
+		} else {
+			continue;
+		}
+
+		if (p->state != FUN_IRQ_INIT)
+			continue;
+
+		snprintf(p->name, sizeof(p->name) - 1, "%s-%s-%u", dev->name,
+			 qtype, qidx);
+		err = request_irq(p->irq, fun_queue_irq_handler, 0, p->name, p);
+		if (err) {
+			netdev_err(dev, "Failed to allocate IRQ %u, err %d\n",
+				   p->irq, err);
+			goto unroll;
+		}
+		p->state = FUN_IRQ_REQUESTED;
+	}
+
+	xa_for_each(&fp->irqs, idx, p) {
+		if (p->state != FUN_IRQ_REQUESTED)
+			continue;
+		irq_set_affinity_notifier(p->irq, &p->aff_notify);
+		irq_set_affinity_and_hint(p->irq, &p->affinity_mask);
+		napi_enable(&p->napi);
+		p->state = FUN_IRQ_ENABLED;
+	}
+
+	return 0;
+
+unroll:
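+	/* Free any IRQs already requested in this pass before returning the error. */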
+	last = idx - 1;
+	xa_for_each_range(&fp->irqs, idx, p, 0, last)
+		if (p->state == FUN_IRQ_REQUESTED) {
+			free_irq(p->irq, p);
+			p->state = FUN_IRQ_INIT;
+		}
+
+	return err;
+}
+
+static void fun_disable_one_irq(struct fun_irq *irq)
+{
+	napi_disable(&irq->napi);
+	irq_set_affinity_notifier(irq->irq, NULL);
+	irq_update_affinity_hint(irq->irq, NULL);
+	free_irq(irq->irq, irq);
+	irq->state = FUN_IRQ_INIT;
+}
+
+static void fun_disable_irqs(struct net_device *dev)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct fun_irq *p;
+	unsigned long idx;
+
+	xa_for_each(&fp->irqs, idx, p)
+		if (p->state == FUN_IRQ_ENABLED)
+			fun_disable_one_irq(p);
+}
+
+static void fun_down(struct net_device *dev, struct fun_qset *qset)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+
+	/* If we don't have queues the data path is already down.
+	 * Note netif_running(dev) may be true.
+	 */
+	if (!rcu_access_pointer(fp->rxqs))
+		return;
+
+	/* It is also down if the queues aren't on the device. */
+	if (fp->txqs[0]->init_state >= FUN_QSTATE_INIT_FULL) {
+		netif_info(fp, ifdown, dev,
+			   "Tearing down data path on device\n");
+		fun_port_write_cmd(fp, FUN_ADMIN_PORT_KEY_DISABLE, 0);
+
+		netif_carrier_off(dev);
+		netif_tx_disable(dev);
+
+		fun_destroy_rss(fp);
+		fun_res_destroy(fp->fdev, FUN_ADMIN_OP_VI, 0, dev->dev_port);
+		fun_disable_irqs(dev);
+	}
+
+	fun_free_rings(dev, qset);
+}
+
+static int fun_up(struct net_device *dev, struct fun_qset *qset)
+{
+	static const int port_keys[] = {
+		FUN_ADMIN_PORT_KEY_STATS_DMA_LOW,
+		FUN_ADMIN_PORT_KEY_STATS_DMA_HIGH,
+		FUN_ADMIN_PORT_KEY_ENABLE
+	};
+
+	struct funeth_priv *fp = netdev_priv(dev);
+	u64 vals[] = {
+		lower_32_bits(fp->stats_dma_addr),
+		upper_32_bits(fp->stats_dma_addr),
+		FUN_PORT_FLAG_ENABLE_NOTIFY
+	};
+	int err;
+
+	netif_info(fp, ifup, dev, "Setting up data path on device\n");
+
+	if (qset->rxqs[0]->init_state < FUN_QSTATE_INIT_FULL) {
+		err = fun_advance_ring_state(dev, qset);
+		if (err)
+			return err;
+	}
+
+	err = fun_vi_create(fp);
+	if (err)
+		goto free_queues;
+
+	fp->txqs = qset->txqs;
+	rcu_assign_pointer(fp->rxqs, qset->rxqs);
+	rcu_assign_pointer(fp->xdpqs, qset->xdpqs);
+
+	err = fun_enable_irqs(dev);
+	if (err)
+		goto destroy_vi;
+
+	if (fp->rss_cfg) {
+		err = fun_config_rss(dev, fp->hash_algo, fp->rss_key,
+				     fp->indir_table, FUN_ADMIN_SUBOP_CREATE);
+	} else {
+		/* The non-RSS case has only 1 queue. */
+		err = fun_bind(fp->fdev, FUN_ADMIN_BIND_TYPE_VI, dev->dev_port,
+			       FUN_ADMIN_BIND_TYPE_EPCQ,
+			       qset->rxqs[0]->hw_cqid);
+	}
+	if (err)
+		goto disable_irqs;
+
+	err = fun_port_write_cmds(fp, 3, port_keys, vals);
+	if (err)
+		goto free_rss;
+
+	netif_tx_start_all_queues(dev);
+	return 0;
+
+free_rss:
+	fun_destroy_rss(fp);
+disable_irqs:
+	fun_disable_irqs(dev);
+destroy_vi:
+	fun_res_destroy(fp->fdev, FUN_ADMIN_OP_VI, 0, dev->dev_port);
+free_queues:
+	fun_free_rings(dev, qset);
+	return err;
+}
+
+static int funeth_open(struct net_device *netdev)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	struct fun_qset qset = {
+		.nrxqs = netdev->real_num_rx_queues,
+		.ntxqs = netdev->real_num_tx_queues,
+		.nxdpqs = fp->num_xdpqs,
+		.cq_depth = fp->cq_depth,
+		.rq_depth = fp->rq_depth,
+		.sq_depth = fp->sq_depth,
+		.state = FUN_QSTATE_INIT_FULL,
+	};
+	int rc;
+
+	rc = fun_alloc_rings(netdev, &qset);
+	if (rc)
+		return rc;
+
+	rc = fun_up(netdev, &qset);
+	if (rc) {
+		qset.state = FUN_QSTATE_DESTROYED;
+		fun_free_rings(netdev, &qset);
+	}
+
+	return rc;
+}
+
+static int funeth_close(struct net_device *netdev)
+{
+	struct fun_qset qset = { .state = FUN_QSTATE_DESTROYED };
+
+	fun_down(netdev, &qset);
+	return 0;
+}
+
+static void fun_get_stats64(struct net_device *netdev,
+			    struct rtnl_link_stats64 *stats)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	struct funeth_txq **xdpqs;
+	struct funeth_rxq **rxqs;
+	unsigned int i, start;
+
+	stats->tx_packets = fp->tx_packets;
+	stats->tx_bytes   = fp->tx_bytes;
+	stats->tx_dropped = fp->tx_dropped;
+
+	stats->rx_packets = fp->rx_packets;
+	stats->rx_bytes   = fp->rx_bytes;
+	stats->rx_dropped = fp->rx_dropped;
+
+	rcu_read_lock();
+	rxqs = rcu_dereference(fp->rxqs);
+	if (!rxqs)
+		goto unlock;
+
+	for (i = 0; i < netdev->real_num_tx_queues; i++) {
+		struct funeth_txq_stats txs;
+
+		FUN_QSTAT_READ(fp->txqs[i], start, txs);
+		stats->tx_packets += txs.tx_pkts;
+		stats->tx_bytes   += txs.tx_bytes;
+		stats->tx_dropped += txs.tx_map_err + txs.tx_len_err;
+	}
+
+	for (i = 0; i < netdev->real_num_rx_queues; i++) {
+		struct funeth_rxq_stats rxs;
+
+		FUN_QSTAT_READ(rxqs[i], start, rxs);
+		stats->rx_packets += rxs.rx_pkts;
+		stats->rx_bytes   += rxs.rx_bytes;
+		stats->rx_dropped += rxs.rx_map_err + rxs.rx_mem_drops;
+	}
+
+	xdpqs = rcu_dereference(fp->xdpqs);
+	if (!xdpqs)
+		goto unlock;
+
+	for (i = 0; i < fp->num_xdpqs; i++) {
+		struct funeth_txq_stats txs;
+
+		FUN_QSTAT_READ(xdpqs[i], start, txs);
+		stats->tx_packets += txs.tx_pkts;
+		stats->tx_bytes   += txs.tx_bytes;
+	}
+unlock:
+	rcu_read_unlock();
+}
+
+static int fun_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	int rc;
+
+	rc = fun_port_write_cmd(fp, FUN_ADMIN_PORT_KEY_MTU, new_mtu);
+	if (!rc)
+		netdev->mtu = new_mtu;
+	return rc;
+}
+
+static int fun_set_macaddr(struct net_device *netdev, void *addr)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	struct sockaddr *saddr = addr;
+	int rc;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	if (ether_addr_equal(netdev->dev_addr, saddr->sa_data))
+		return 0;
+
+	rc = fun_port_write_cmd(fp, FUN_ADMIN_PORT_KEY_MACADDR,
+				ether_addr_to_u64(saddr->sa_data));
+	if (!rc)
+		eth_hw_addr_set(netdev, saddr->sa_data);
+	return rc;
+}
+
+static int fun_get_port_attributes(struct net_device *netdev)
+{
+	static const int keys[] = {
+		FUN_ADMIN_PORT_KEY_MACADDR, FUN_ADMIN_PORT_KEY_CAPABILITIES,
+		FUN_ADMIN_PORT_KEY_ADVERT, FUN_ADMIN_PORT_KEY_MTU
+	};
+	static const int phys_keys[] = {
+		FUN_ADMIN_PORT_KEY_LANE_ATTRS,
+	};
+
+	struct funeth_priv *fp = netdev_priv(netdev);
+	u64 data[ARRAY_SIZE(keys)];
+	u8 mac[ETH_ALEN];
+	int i, rc;
+
+	rc = fun_port_read_cmds(fp, ARRAY_SIZE(keys), keys, data);
+	if (rc)
+		return rc;
+
+	for (i = 0; i < ARRAY_SIZE(keys); i++) {
+		switch (keys[i]) {
+		case FUN_ADMIN_PORT_KEY_MACADDR:
+			u64_to_ether_addr(data[i], mac);
+			if (is_zero_ether_addr(mac)) {
+				eth_hw_addr_random(netdev);
+			} else if (is_valid_ether_addr(mac)) {
+				eth_hw_addr_set(netdev, mac);
+			} else {
+				netdev_err(netdev,
+					   "device provided a bad MAC address %pM\n",
+					   mac);
+				return -EINVAL;
+			}
+			break;
+
+		case FUN_ADMIN_PORT_KEY_CAPABILITIES:
+			fp->port_caps = data[i];
+			break;
+
+		case FUN_ADMIN_PORT_KEY_ADVERT:
+			fp->advertising = data[i];
+			break;
+
+		case FUN_ADMIN_PORT_KEY_MTU:
+			netdev->mtu = data[i];
+			break;
+		}
+	}
+
+	if (!(fp->port_caps & FUN_PORT_CAP_VPORT)) {
+		rc = fun_port_read_cmds(fp, ARRAY_SIZE(phys_keys), phys_keys,
+					data);
+		if (rc)
+			return rc;
+
+		fp->lane_attrs = data[0];
+	}
+
+	if (netdev->addr_assign_type == NET_ADDR_RANDOM)
+		return fun_port_write_cmd(fp, FUN_ADMIN_PORT_KEY_MACADDR,
+					  ether_addr_to_u64(netdev->dev_addr));
+	return 0;
+}
+
+static int fun_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
+{
+	const struct funeth_priv *fp = netdev_priv(dev);
+
+	return copy_to_user(ifr->ifr_data, &fp->hwtstamp_cfg,
+			    sizeof(fp->hwtstamp_cfg)) ? -EFAULT : 0;
+}
+
+static int fun_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct hwtstamp_config cfg;
+
+	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+		return -EFAULT;
+
+	/* no TX HW timestamps */
+	cfg.tx_type = HWTSTAMP_TX_OFF;
+
+	switch (cfg.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
+		cfg.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	fp->hwtstamp_cfg = cfg;
+	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
+}
+
+static int fun_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return fun_hwtstamp_set(dev, ifr);
+	case SIOCGHWTSTAMP:
+		return fun_hwtstamp_get(dev, ifr);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Prepare the queues for XDP. */
+static int fun_enter_xdp(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	unsigned int i, nqs = num_online_cpus();
+	struct funeth_txq **xdpqs;
+	struct funeth_rxq **rxqs;
+	int err;
+
+	xdpqs = alloc_xdpqs(dev, nqs, fp->sq_depth, 0, FUN_QSTATE_INIT_FULL);
+	if (IS_ERR(xdpqs))
+		return PTR_ERR(xdpqs);
+
+	rxqs = rtnl_dereference(fp->rxqs);
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		err = fun_rxq_set_bpf(rxqs[i], prog);
+		if (err)
+			goto out;
+	}
+
+	fp->num_xdpqs = nqs;
+	rcu_assign_pointer(fp->xdpqs, xdpqs);
+	return 0;
+out:
+	while (i--)
+		fun_rxq_set_bpf(rxqs[i], NULL);
+
+	free_xdpqs(xdpqs, nqs, 0, FUN_QSTATE_DESTROYED);
+	return err;
+}
+
+/* Set the queues for non-XDP operation. */
+static void fun_end_xdp(struct net_device *dev)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct funeth_txq **xdpqs;
+	struct funeth_rxq **rxqs;
+	unsigned int i;
+
+	xdpqs = rtnl_dereference(fp->xdpqs);
+	rcu_assign_pointer(fp->xdpqs, NULL);
+	synchronize_net();
+	/* at this point both Rx and Tx XDP processing has ended */
+
+	free_xdpqs(xdpqs, fp->num_xdpqs, 0, FUN_QSTATE_DESTROYED);
+	fp->num_xdpqs = 0;
+
+	rxqs = rtnl_dereference(fp->rxqs);
+	for (i = 0; i < dev->real_num_rx_queues; i++)
+		fun_rxq_set_bpf(rxqs[i], NULL);
+}
+
+#define XDP_MAX_MTU \
+	(PAGE_SIZE - FUN_XDP_HEADROOM - VLAN_ETH_HLEN - FUN_RX_TAILROOM)
+
+static int fun_xdp_setup(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	struct bpf_prog *old_prog, *prog = xdp->prog;
+	struct funeth_priv *fp = netdev_priv(dev);
+	int i, err;
+
+	/* XDP uses at most one buffer */
+	if (prog && dev->mtu > XDP_MAX_MTU) {
+		netdev_err(dev, "device MTU %u too large for XDP\n", dev->mtu);
+		NL_SET_ERR_MSG_MOD(xdp->extack,
+				   "Device MTU too large for XDP");
+		return -EINVAL;
+	}
+
+	if (!netif_running(dev)) {
+		fp->num_xdpqs = prog ? num_online_cpus() : 0;
+	} else if (prog && !fp->xdp_prog) {
+		err = fun_enter_xdp(dev, prog);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(xdp->extack,
+					   "Failed to set queues for XDP.");
+			return err;
+		}
+	} else if (!prog && fp->xdp_prog) {
+		fun_end_xdp(dev);
+	} else {
+		struct funeth_rxq **rxqs = rtnl_dereference(fp->rxqs);
+
+		for (i = 0; i < dev->real_num_rx_queues; i++)
+			WRITE_ONCE(rxqs[i]->xdp_prog, prog);
+	}
+
+	dev->max_mtu = prog ? XDP_MAX_MTU : FUN_MAX_MTU;
+	old_prog = xchg(&fp->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int fun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return fun_xdp_setup(dev, xdp);
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct devlink_port *fun_get_devlink_port(struct net_device *netdev)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+
+	return &fp->dl_port;
+}
+
+static int fun_init_vports(struct fun_ethdev *ed, unsigned int n)
+{
+	if (ed->num_vports)
+		return -EINVAL;
+
+	ed->vport_info = kvcalloc(n, sizeof(*ed->vport_info), GFP_KERNEL);
+	if (!ed->vport_info)
+		return -ENOMEM;
+	ed->num_vports = n;
+	return 0;
+}
+
+static void fun_free_vports(struct fun_ethdev *ed)
+{
+	kvfree(ed->vport_info);
+	ed->vport_info = NULL;
+	ed->num_vports = 0;
+}
+
+static struct fun_vport_info *fun_get_vport(struct fun_ethdev *ed,
+					    unsigned int vport)
+{
+	if (!ed->vport_info || vport >= ed->num_vports)
+		return NULL;
+
+	return ed->vport_info + vport;
+}
+
+static int fun_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct fun_adi_param mac_param = {};
+	struct fun_dev *fdev = fp->fdev;
+	struct fun_ethdev *ed = to_fun_ethdev(fdev);
+	struct fun_vport_info *vi;
+	int rc = -EINVAL;
+
+	if (is_multicast_ether_addr(mac))
+		return -EINVAL;
+
+	mutex_lock(&ed->state_mutex);
+	vi = fun_get_vport(ed, vf);
+	if (!vi)
+		goto unlock;
+
+	mac_param.u.mac = FUN_ADI_MAC_INIT(ether_addr_to_u64(mac));
+	rc = fun_adi_write(fdev, FUN_ADMIN_ADI_ATTR_MACADDR, vf + 1,
+			   &mac_param);
+	if (!rc)
+		ether_addr_copy(vi->mac, mac);
+unlock:
+	mutex_unlock(&ed->state_mutex);
+	return rc;
+}
+
+static int fun_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
+			   __be16 vlan_proto)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct fun_adi_param vlan_param = {};
+	struct fun_dev *fdev = fp->fdev;
+	struct fun_ethdev *ed = to_fun_ethdev(fdev);
+	struct fun_vport_info *vi;
+	int rc = -EINVAL;
+
+	if (vlan > 4095 || qos > 7)
+		return -EINVAL;
+	if (vlan_proto && vlan_proto != htons(ETH_P_8021Q) &&
+	    vlan_proto != htons(ETH_P_8021AD))
+		return -EINVAL;
+
+	mutex_lock(&ed->state_mutex);
+	vi = fun_get_vport(ed, vf);
+	if (!vi)
+		goto unlock;
+
+	vlan_param.u.vlan = FUN_ADI_VLAN_INIT(be16_to_cpu(vlan_proto),
+					      ((u16)qos << VLAN_PRIO_SHIFT) | vlan);
+	rc = fun_adi_write(fdev, FUN_ADMIN_ADI_ATTR_VLAN, vf + 1, &vlan_param);
+	if (!rc) {
+		vi->vlan = vlan;
+		vi->qos = qos;
+		vi->vlan_proto = vlan_proto;
+	}
+unlock:
+	mutex_unlock(&ed->state_mutex);
+	return rc;
+}
+
+static int fun_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
+			   int max_tx_rate)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct fun_adi_param rate_param = {};
+	struct fun_dev *fdev = fp->fdev;
+	struct fun_ethdev *ed = to_fun_ethdev(fdev);
+	struct fun_vport_info *vi;
+	int rc = -EINVAL;
+
+	if (min_tx_rate)
+		return -EINVAL;
+
+	mutex_lock(&ed->state_mutex);
+	vi = fun_get_vport(ed, vf);
+	if (!vi)
+		goto unlock;
+
+	rate_param.u.rate = FUN_ADI_RATE_INIT(max_tx_rate);
+	rc = fun_adi_write(fdev, FUN_ADMIN_ADI_ATTR_RATE, vf + 1, &rate_param);
+	if (!rc)
+		vi->max_rate = max_tx_rate;
+unlock:
+	mutex_unlock(&ed->state_mutex);
+	return rc;
+}
+
+static int fun_get_vf_config(struct net_device *dev, int vf,
+			     struct ifla_vf_info *ivi)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct fun_ethdev *ed = to_fun_ethdev(fp->fdev);
+	const struct fun_vport_info *vi;
+
+	mutex_lock(&ed->state_mutex);
+	vi = fun_get_vport(ed, vf);
+	if (!vi)
+		goto unlock;
+
+	memset(ivi, 0, sizeof(*ivi));
+	ivi->vf = vf;
+	ether_addr_copy(ivi->mac, vi->mac);
+	ivi->vlan = vi->vlan;
+	ivi->qos = vi->qos;
+	ivi->vlan_proto = vi->vlan_proto;
+	ivi->max_tx_rate = vi->max_rate;
+	ivi->spoofchk = vi->spoofchk;
+unlock:
+	mutex_unlock(&ed->state_mutex);
+	return vi ? 0 : -EINVAL;
+}
+
+static void fun_uninit(struct net_device *dev)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+
+	fun_prune_queue_irqs(dev);
+	xa_destroy(&fp->irqs);
+}
+
+static const struct net_device_ops fun_netdev_ops = {
+	.ndo_open		= funeth_open,
+	.ndo_stop		= funeth_close,
+	.ndo_start_xmit		= fun_start_xmit,
+	.ndo_get_stats64	= fun_get_stats64,
+	.ndo_change_mtu		= fun_change_mtu,
+	.ndo_set_mac_address	= fun_set_macaddr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_eth_ioctl		= fun_ioctl,
+	.ndo_uninit		= fun_uninit,
+	.ndo_bpf		= fun_xdp,
+	.ndo_xdp_xmit		= fun_xdp_xmit_frames,
+	.ndo_set_vf_mac		= fun_set_vf_mac,
+	.ndo_set_vf_vlan	= fun_set_vf_vlan,
+	.ndo_set_vf_rate	= fun_set_vf_rate,
+	.ndo_get_vf_config	= fun_get_vf_config,
+	.ndo_get_devlink_port	= fun_get_devlink_port,
+};
+
+#define GSO_ENCAP_FLAGS (NETIF_F_GSO_GRE | NETIF_F_GSO_IPXIP4 | \
+			 NETIF_F_GSO_IPXIP6 | NETIF_F_GSO_UDP_TUNNEL | \
+			 NETIF_F_GSO_UDP_TUNNEL_CSUM)
+#define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
+#define VLAN_FEAT (NETIF_F_SG | NETIF_F_HW_CSUM | TSO_FLAGS | \
+		   GSO_ENCAP_FLAGS | NETIF_F_HIGHDMA)
+
+static void fun_dflt_rss_indir(struct funeth_priv *fp, unsigned int nrx)
+{
+	unsigned int i;
+
+	for (i = 0; i < fp->indir_table_nentries; i++)
+		fp->indir_table[i] = ethtool_rxfh_indir_default(i, nrx);
+}
+
+/* Reset the RSS indirection table to equal distribution across the current
+ * number of Rx queues. Called at init time and whenever the number of Rx
+ * queues changes subsequently. Note that this may also resize the indirection
+ * table.
+ */
+static void fun_reset_rss_indir(struct net_device *dev, unsigned int nrx)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+
+	if (!fp->rss_cfg)
+		return;
+
+	/* Set the table size to the max possible that allows an equal number
+	 * of occurrences of each CQ.
+	 */
+	fp->indir_table_nentries = rounddown(FUN_ETH_RSS_MAX_INDIR_ENT, nrx);
+	fun_dflt_rss_indir(fp, nrx);
+}
+
+/* Update the RSS LUT to contain only queues in [0, nrx). Normally this will
+ * update the LUT to an equal distribution among nrx queues. If @only_if_needed
+ * is set the LUT is left unchanged if it already does not reference any queues
+ * >= nrx.
+ */
+static int fun_rss_set_qnum(struct net_device *dev, unsigned int nrx,
+			    bool only_if_needed)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	u32 old_lut[FUN_ETH_RSS_MAX_INDIR_ENT];
+	unsigned int i, oldsz;
+	int err;
+
+	if (!fp->rss_cfg)
+		return 0;
+
+	if (only_if_needed) {
+		for (i = 0; i < fp->indir_table_nentries; i++)
+			if (fp->indir_table[i] >= nrx)
+				break;
+
+		if (i >= fp->indir_table_nentries)
+			return 0;
+	}
+
+	memcpy(old_lut, fp->indir_table, sizeof(old_lut));
+	oldsz = fp->indir_table_nentries;
+	fun_reset_rss_indir(dev, nrx);
+
+	err = fun_config_rss(dev, fp->hash_algo, fp->rss_key,
+			     fp->indir_table, FUN_ADMIN_SUBOP_MODIFY);
+	if (!err)
+		return 0;
+
+	memcpy(fp->indir_table, old_lut, sizeof(old_lut));
+	fp->indir_table_nentries = oldsz;
+	return err;
+}
+
+/* Allocate the DMA area for the RSS configuration commands to the device, and
+ * initialize the hash, hash key, indirection table size and its entries to
+ * their defaults. The indirection table defaults to equal distribution across
+ * the Rx queues.
+ */
+static int fun_init_rss(struct net_device *dev)
+{
+	struct funeth_priv *fp = netdev_priv(dev);
+	size_t size = sizeof(fp->rss_key) + sizeof(fp->indir_table);
+
+	fp->rss_hw_id = FUN_HCI_ID_INVALID;
+	if (!(fp->port_caps & FUN_PORT_CAP_OFFLOADS))
+		return 0;
+
+	fp->rss_cfg = dma_alloc_coherent(&fp->pdev->dev, size,
+					 &fp->rss_dma_addr, GFP_KERNEL);
+	if (!fp->rss_cfg)
+		return -ENOMEM;
+
+	fp->hash_algo = FUN_ETH_RSS_ALG_TOEPLITZ;
+	netdev_rss_key_fill(fp->rss_key, sizeof(fp->rss_key));
+	fun_reset_rss_indir(dev, dev->real_num_rx_queues);
+	return 0;
+}
+
+static void fun_free_rss(struct funeth_priv *fp)
+{
+	if (fp->rss_cfg) {
+		dma_free_coherent(&fp->pdev->dev,
+				  sizeof(fp->rss_key) + sizeof(fp->indir_table),
+				  fp->rss_cfg, fp->rss_dma_addr);
+		fp->rss_cfg = NULL;
+	}
+}
+
+void fun_set_ring_count(struct net_device *netdev, unsigned int ntx,
+			unsigned int nrx)
+{
+	netif_set_real_num_tx_queues(netdev, ntx);
+	if (nrx != netdev->real_num_rx_queues) {
+		netif_set_real_num_rx_queues(netdev, nrx);
+		fun_reset_rss_indir(netdev, nrx);
+	}
+}
+
+static int fun_init_stats_area(struct funeth_priv *fp)
+{
+	unsigned int nstats;
+
+	if (!(fp->port_caps & FUN_PORT_CAP_STATS))
+		return 0;
+
+	nstats = PORT_MAC_RX_STATS_MAX + PORT_MAC_TX_STATS_MAX +
+		 PORT_MAC_FEC_STATS_MAX;
+
+	fp->stats = dma_alloc_coherent(&fp->pdev->dev, nstats * sizeof(u64),
+				       &fp->stats_dma_addr, GFP_KERNEL);
+	if (!fp->stats)
+		return -ENOMEM;
+	return 0;
+}
+
+static void fun_free_stats_area(struct funeth_priv *fp)
+{
+	unsigned int nstats;
+
+	if (fp->stats) {
+		nstats = PORT_MAC_RX_STATS_MAX + PORT_MAC_TX_STATS_MAX;
+		dma_free_coherent(&fp->pdev->dev, nstats * sizeof(u64),
+				  fp->stats, fp->stats_dma_addr);
+		fp->stats = NULL;
+	}
+}
+
+static int fun_dl_port_register(struct net_device *netdev)
+{
+	struct funeth_priv *fp = netdev_priv(netdev);
+	struct devlink *dl = priv_to_devlink(fp->fdev);
+	struct devlink_port_attrs attrs = {};
+	unsigned int idx;
+
+	if (fp->port_caps & FUN_PORT_CAP_VPORT) {
+		attrs.flavour = DEVLINK_PORT_FLAVOUR_VIRTUAL;
+		idx = fp->lport;
+	} else {
+		idx = netdev->dev_port;
+		attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+		attrs.lanes = fp->lane_attrs & 7;
+		if (fp->lane_attrs & FUN_PORT_LANE_SPLIT) {
+			attrs.split = 1;
+			attrs.phys.port_number = fp->lport & ~3;
+			attrs.phys.split_subport_number = fp->lport & 3;
+		} else {
+			attrs.phys.port_number = fp->lport;
+		}
+	}
+
+	devlink_port_attrs_set(&fp->dl_port, &attrs);
+
+	return devlink_port_register(dl, &fp->dl_port, idx);
+}
+
+/* Determine the max Tx/Rx queues for a port. */
+static int fun_max_qs(struct fun_ethdev *ed, unsigned int *ntx,
+		      unsigned int *nrx)
+{
+	int neth;
+
+	if (ed->num_ports > 1 || is_kdump_kernel()) {
+		*ntx = 1;
+		*nrx = 1;
+		return 0;
+	}
+
+	neth = fun_get_res_count(&ed->fdev, FUN_ADMIN_OP_ETH);
+	if (neth < 0)
+		return neth;
+
+	/* We determine the max number of queues based on the CPU
+	 * cores, device interrupts and queues, RSS size, and device Tx flows.
+	 *
+	 * - At least 1 Rx and 1 Tx queues.
+	 * - At most 1 Rx/Tx queue per core.
+	 * - Each Rx/Tx queue needs 1 SQ.
+	 */
+	*ntx = min(ed->nsqs_per_port - 1, num_online_cpus());
+	*nrx = *ntx;
+	if (*ntx > neth)
+		*ntx = neth;
+	if (*nrx > FUN_ETH_RSS_MAX_INDIR_ENT)
+		*nrx = FUN_ETH_RSS_MAX_INDIR_ENT;
+	return 0;
+}
+
+static void fun_queue_defaults(struct net_device *dev, unsigned int nsqs)
+{
+	unsigned int ntx, nrx;
+
+	ntx = min(dev->num_tx_queues, FUN_DFLT_QUEUES);
+	nrx = min(dev->num_rx_queues, FUN_DFLT_QUEUES);
+	if (ntx <= nrx) {
+		ntx = min(ntx, nsqs / 2);
+		nrx = min(nrx, nsqs - ntx);
+	} else {
+		nrx = min(nrx, nsqs / 2);
+		ntx = min(ntx, nsqs - nrx);
+	}
+
+	netif_set_real_num_tx_queues(dev, ntx);
+	netif_set_real_num_rx_queues(dev, nrx);
+}
+
+/* Replace the existing Rx/Tx/XDP queues with equal number of queues with
+ * different settings, e.g. depth. This is a disruptive replacement that
+ * temporarily shuts down the data path and should be limited to changes that
+ * can't be applied to live queues. The old queues are always discarded.
+ */
+int fun_replace_queues(struct net_device *dev, struct fun_qset *newqs,
+		       struct netlink_ext_ack *extack)
+{
+	struct fun_qset oldqs = { .state = FUN_QSTATE_DESTROYED };
+	struct funeth_priv *fp = netdev_priv(dev);
+	int err;
+
+	newqs->nrxqs = dev->real_num_rx_queues;
+	newqs->ntxqs = dev->real_num_tx_queues;
+	newqs->nxdpqs = fp->num_xdpqs;
+	newqs->state = FUN_QSTATE_INIT_SW;
+	err = fun_alloc_rings(dev, newqs);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Unable to allocate memory for new queues, keeping current settings");
+		return err;
+	}
+
+	fun_down(dev, &oldqs);
+
+	err = fun_up(dev, newqs);
+	if (!err)
+		return 0;
+
+	/* The new queues couldn't be installed. We do not retry the old queues
+	 * as they look the same to the device as the new queues and would
+	 * similarly fail.
+	 */
+	newqs->state = FUN_QSTATE_DESTROYED;
+	fun_free_rings(dev, newqs);
+	NL_SET_ERR_MSG_MOD(extack, "Unable to restore the data path with the new queues.");
+	return err;
+}
+
+/* Change the number of Rx/Tx queues of a device while it is up. This is done
+ * by incrementally adding/removing queues to meet the new requirements while
+ * handling ongoing traffic.
+ */
+int fun_change_num_queues(struct net_device *dev, unsigned int ntx,
+			  unsigned int nrx)
+{
+	unsigned int keep_tx = min(dev->real_num_tx_queues, ntx);
+	unsigned int keep_rx = min(dev->real_num_rx_queues, nrx);
+	struct funeth_priv *fp = netdev_priv(dev);
+	struct fun_qset oldqs = {
+		.rxqs = rtnl_dereference(fp->rxqs),
+		.txqs = fp->txqs,
+		.nrxqs = dev->real_num_rx_queues,
+		.ntxqs = dev->real_num_tx_queues,
+		.rxq_start = keep_rx,
+		.txq_start = keep_tx,
+		.state = FUN_QSTATE_DESTROYED
+	};
+	struct fun_qset newqs = {
+		.nrxqs = nrx,
+		.ntxqs = ntx,
+		.rxq_start = keep_rx,
+		.txq_start = keep_tx,
+		.cq_depth = fp->cq_depth,
+		.rq_depth = fp->rq_depth,
+		.sq_depth = fp->sq_depth,
+		.state = FUN_QSTATE_INIT_FULL
+	};
+	int i, err;
+
+	err = fun_alloc_rings(dev, &newqs);
+	if (err)
+		goto free_irqs;
+
+	err = fun_enable_irqs(dev); /* of any newly added queues */
+	if (err)
+		goto free_rings;
+
+	/* copy the queues we are keeping to the new set */
+	memcpy(newqs.rxqs, oldqs.rxqs, keep_rx * sizeof(*oldqs.rxqs));
+	memcpy(newqs.txqs, fp->txqs, keep_tx * sizeof(*fp->txqs));
+
+	if (nrx < dev->real_num_rx_queues) {
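+		/* make sure the RSS table no longer references the Rx queues being removed */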
+		err = fun_rss_set_qnum(dev, nrx, true);
+		if (err)
+			goto disable_tx_irqs;
+
+		for (i = nrx; i < dev->real_num_rx_queues; i++)
+			fun_disable_one_irq(container_of(oldqs.rxqs[i]->napi,
+							 struct fun_irq, napi));
+
+		netif_set_real_num_rx_queues(dev, nrx);
+	}
+
+	if (ntx < dev->real_num_tx_queues)
+		netif_set_real_num_tx_queues(dev, ntx);
+
+	rcu_assign_pointer(fp->rxqs, newqs.rxqs);
+	fp->txqs = newqs.txqs;
+	synchronize_net();
+
+	if (ntx > dev->real_num_tx_queues)
+		netif_set_real_num_tx_queues(dev, ntx);
+
+	if (nrx > dev->real_num_rx_queues) {
+		netif_set_real_num_rx_queues(dev, nrx);
+		fun_rss_set_qnum(dev, nrx, false);
+	}
+
+	/* disable interrupts of any excess Tx queues */
+	for (i = keep_tx; i < oldqs.ntxqs; i++)
+		fun_disable_one_irq(oldqs.txqs[i]->irq);
+
+	fun_free_rings(dev, &oldqs);
+	fun_prune_queue_irqs(dev);
+	return 0;
+
+disable_tx_irqs:
+	for (i = oldqs.ntxqs; i < ntx; i++)
+		fun_disable_one_irq(newqs.txqs[i]->irq);
+free_rings:
+	newqs.state = FUN_QSTATE_DESTROYED;
+	fun_free_rings(dev, &newqs);
+free_irqs:
+	fun_prune_queue_irqs(dev);
+	return err;
+}
+
+static int fun_create_netdev(struct fun_ethdev *ed, unsigned int portid)
+{
+	struct fun_dev *fdev = &ed->fdev;
+	struct net_device *netdev;
+	struct funeth_priv *fp;
+	unsigned int ntx, nrx;
+	int rc;
+
+	rc = fun_max_qs(ed, &ntx, &nrx);
+	if (rc)
+		return rc;
+
+	netdev = alloc_etherdev_mqs(sizeof(*fp), ntx, nrx);
+	if (!netdev) {
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	netdev->dev_port = portid;
+	fun_queue_defaults(netdev, ed->nsqs_per_port);
+
+	fp = netdev_priv(netdev);
+	fp->fdev = fdev;
+	fp->pdev = to_pci_dev(fdev->dev);
+	fp->netdev = netdev;
+	xa_init(&fp->irqs);
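+	/* Tx queue IRQs use xarray indices starting at 0, Rx IRQs start at ntx. */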
+	fp->rx_irq_ofst = ntx;
+	seqcount_init(&fp->link_seq);
+
+	fp->lport = INVALID_LPORT;
+	rc = fun_port_create(netdev);
+	if (rc)
+		goto free_netdev;
+
+	/* bind port to admin CQ for async events */
+	rc = fun_bind(fdev, FUN_ADMIN_BIND_TYPE_PORT, portid,
+		      FUN_ADMIN_BIND_TYPE_EPCQ, 0);
+	if (rc)
+		goto destroy_port;
+
+	rc = fun_get_port_attributes(netdev);
+	if (rc)
+		goto destroy_port;
+
+	rc = fun_init_rss(netdev);
+	if (rc)
+		goto destroy_port;
+
+	rc = fun_init_stats_area(fp);
+	if (rc)
+		goto free_rss;
+
+	SET_NETDEV_DEV(netdev, fdev->dev);
+	netdev->netdev_ops = &fun_netdev_ops;
+
+	netdev->hw_features = NETIF_F_SG | NETIF_F_RXHASH | NETIF_F_RXCSUM;
+	if (fp->port_caps & FUN_PORT_CAP_OFFLOADS)
+		netdev->hw_features |= NETIF_F_HW_CSUM | TSO_FLAGS;
+	if (fp->port_caps & FUN_PORT_CAP_ENCAP_OFFLOADS)
+		netdev->hw_features |= GSO_ENCAP_FLAGS;
+
+	netdev->features |= netdev->hw_features | NETIF_F_HIGHDMA;
+	netdev->vlan_features = netdev->features & VLAN_FEAT;
+	netdev->mpls_features = netdev->vlan_features;
+	netdev->hw_enc_features = netdev->hw_features;
+
+	netdev->min_mtu = ETH_MIN_MTU;
+	netdev->max_mtu = FUN_MAX_MTU;
+
+	fun_set_ethtool_ops(netdev);
+
+	/* configurable parameters */
+	fp->sq_depth = min(SQ_DEPTH, fdev->q_depth);
+	fp->cq_depth = min(CQ_DEPTH, fdev->q_depth);
+	fp->rq_depth = min_t(unsigned int, RQ_DEPTH, fdev->q_depth);
+	fp->rx_coal_usec  = CQ_INTCOAL_USEC;
+	fp->rx_coal_count = CQ_INTCOAL_NPKT;
+	fp->tx_coal_usec  = SQ_INTCOAL_USEC;
+	fp->tx_coal_count = SQ_INTCOAL_NPKT;
+	fp->cq_irq_db = FUN_IRQ_CQ_DB(fp->rx_coal_usec, fp->rx_coal_count);
+
+	rc = fun_dl_port_register(netdev);
+	if (rc)
+		goto free_stats;
+
+	fp->ktls_id = FUN_HCI_ID_INVALID;
+	fun_ktls_init(netdev);            /* optional, failure OK */
+
+	netif_carrier_off(netdev);
+	ed->netdevs[portid] = netdev;
+	rc = register_netdev(netdev);
+	if (rc)
+		goto unreg_devlink;
+
+	if (fp->dl_port.devlink)
+		devlink_port_type_eth_set(&fp->dl_port, netdev);
+
+	return 0;
+
+unreg_devlink:
+	ed->netdevs[portid] = NULL;
+	fun_ktls_cleanup(fp);
+	if (fp->dl_port.devlink)
+		devlink_port_unregister(&fp->dl_port);
+free_stats:
+	fun_free_stats_area(fp);
+free_rss:
+	fun_free_rss(fp);
+destroy_port:
+	fun_port_destroy(netdev);
+free_netdev:
+	free_netdev(netdev);
+done:
+	dev_err(fdev->dev, "couldn't allocate port %u, error %d", portid, rc);
+	return rc;
+}
+
+static void fun_destroy_netdev(struct net_device *netdev)
+{
+	struct funeth_priv *fp;
+
+	fp = netdev_priv(netdev);
+	if (fp->dl_port.devlink) {
+		devlink_port_type_clear(&fp->dl_port);
+		devlink_port_unregister(&fp->dl_port);
+	}
+	unregister_netdev(netdev);
+	fun_ktls_cleanup(fp);
+	fun_free_stats_area(fp);
+	fun_free_rss(fp);
+	fun_port_destroy(netdev);
+	free_netdev(netdev);
+}
+
+static int fun_create_ports(struct fun_ethdev *ed, unsigned int nports)
+{
+	struct fun_dev *fd = &ed->fdev;
+	int i, rc;
+
+	/* The admin queue takes 1 IRQ and 2 SQs. */
+	ed->nsqs_per_port = min(fd->num_irqs - 1,
+				fd->kern_end_qid - 2) / nports;
+	if (ed->nsqs_per_port < 2) {
+		dev_err(fd->dev, "Too few SQs for %u ports", nports);
+		return -EINVAL;
+	}
+
+	ed->netdevs = kcalloc(nports, sizeof(*ed->netdevs), GFP_KERNEL);
+	if (!ed->netdevs)
+		return -ENOMEM;
+
+	ed->num_ports = nports;
+	for (i = 0; i < nports; i++) {
+		rc = fun_create_netdev(ed, i);
+		if (rc)
+			goto free_netdevs;
+	}
+
+	return 0;
+
+free_netdevs:
+	while (i)
+		fun_destroy_netdev(ed->netdevs[--i]);
+	kfree(ed->netdevs);
+	ed->netdevs = NULL;
+	ed->num_ports = 0;
+	return rc;
+}
+
+static void fun_destroy_ports(struct fun_ethdev *ed)
+{
+	unsigned int i;
+
+	for (i = 0; i < ed->num_ports; i++)
+		fun_destroy_netdev(ed->netdevs[i]);
+
+	kfree(ed->netdevs);
+	ed->netdevs = NULL;
+	ed->num_ports = 0;
+}
+
+static void fun_update_link_state(const struct fun_ethdev *ed,
+				  const struct fun_admin_port_notif *notif)
+{
+	unsigned int port_idx = be16_to_cpu(notif->id);
+	struct net_device *netdev;
+	struct funeth_priv *fp;
+
+	if (port_idx >= ed->num_ports)
+		return;
+
+	netdev = ed->netdevs[port_idx];
+	fp = netdev_priv(netdev);
+
+	write_seqcount_begin(&fp->link_seq);
+	fp->link_speed = be32_to_cpu(notif->speed) * 10;  /* 10 Mbps->Mbps */
+	fp->active_fc = notif->flow_ctrl;
+	fp->active_fec = notif->fec;
+	fp->xcvr_type = notif->xcvr_type;
+	fp->link_down_reason = notif->link_down_reason;
+	fp->lp_advertising = be64_to_cpu(notif->lp_advertising);
+
+	if ((notif->link_state | notif->missed_events) & FUN_PORT_FLAG_MAC_DOWN)
+		netif_carrier_off(netdev);
+	if (notif->link_state & FUN_PORT_FLAG_MAC_UP)
+		netif_carrier_on(netdev);
+
+	write_seqcount_end(&fp->link_seq);
+	fun_report_link(netdev);
+}
+
+/* handler for async events delivered through the admin CQ */
+static void fun_event_cb(struct fun_dev *fdev, void *entry)
+{
+	u8 op = ((struct fun_admin_rsp_common *)entry)->op;
+
+	if (op == FUN_ADMIN_OP_PORT) {
+		const struct fun_admin_port_notif *rsp = entry;
+
+		if (rsp->subop == FUN_ADMIN_SUBOP_NOTIFY) {
+			fun_update_link_state(to_fun_ethdev(fdev), rsp);
+		} else if (rsp->subop == FUN_ADMIN_SUBOP_RES_COUNT) {
+			const struct fun_admin_res_count_rsp *r = entry;
+
+			if (r->count.data)
+				set_bit(FUN_SERV_RES_CHANGE, &fdev->service_flags);
+			else
+				set_bit(FUN_SERV_DEL_PORTS, &fdev->service_flags);
+			fun_serv_sched(fdev);
+		} else {
+			dev_info(fdev->dev, "adminq event unexpected op %u subop %u",
+				 op, rsp->subop);
+		}
+	} else {
+		dev_info(fdev->dev, "adminq event unexpected op %u", op);
+	}
+}
+
+/* handler for pending work managed by the service task */
+static void fun_service_cb(struct fun_dev *fdev)
+{
+	struct fun_ethdev *ed = to_fun_ethdev(fdev);
+	int rc;
+
+	if (test_and_clear_bit(FUN_SERV_DEL_PORTS, &fdev->service_flags))
+		fun_destroy_ports(ed);
+
+	if (!test_and_clear_bit(FUN_SERV_RES_CHANGE, &fdev->service_flags))
+		return;
+
+	rc = fun_get_res_count(fdev, FUN_ADMIN_OP_PORT);
+	if (rc < 0 || rc == ed->num_ports)
+		return;
+
+	if (ed->num_ports)
+		fun_destroy_ports(ed);
+	if (rc)
+		fun_create_ports(ed, rc);
+}
+
+static int funeth_sriov_configure(struct pci_dev *pdev, int nvfs)
+{
+	struct fun_dev *fdev = pci_get_drvdata(pdev);
+	struct fun_ethdev *ed = to_fun_ethdev(fdev);
+	int rc;
+
+	if (nvfs == 0) {
+		if (pci_vfs_assigned(pdev)) {
+			dev_warn(&pdev->dev,
+				 "Cannot disable SR-IOV while VFs are assigned\n");
+			return -EPERM;
+		}
+
+		mutex_lock(&ed->state_mutex);
+		fun_free_vports(ed);
+		mutex_unlock(&ed->state_mutex);
+		pci_disable_sriov(pdev);
+		return 0;
+	}
+
+	rc = pci_enable_sriov(pdev, nvfs);
+	if (rc)
+		return rc;
+
+	mutex_lock(&ed->state_mutex);
+	rc = fun_init_vports(ed, nvfs);
+	mutex_unlock(&ed->state_mutex);
+	if (rc) {
+		pci_disable_sriov(pdev);
+		return rc;
+	}
+
+	return nvfs;
+}
+
+static int funeth_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct fun_dev_params aqreq = {
+		.cqe_size_log2 = ilog2(ADMIN_CQE_SIZE),
+		.sqe_size_log2 = ilog2(ADMIN_SQE_SIZE),
+		.cq_depth      = ADMIN_CQ_DEPTH,
+		.sq_depth      = ADMIN_SQ_DEPTH,
+		.rq_depth      = ADMIN_RQ_DEPTH,
+		.min_msix      = 2,              /* 1 Rx + 1 Tx */
+		.event_cb      = fun_event_cb,
+		.serv_cb       = fun_service_cb,
+	};
+	struct devlink *devlink;
+	struct fun_ethdev *ed;
+	struct fun_dev *fdev;
+	int rc;
+
+	devlink = fun_devlink_alloc(&pdev->dev);
+	if (!devlink) {
+		dev_err(&pdev->dev, "devlink alloc failed\n");
+		return -ENOMEM;
+	}
+
+	ed = devlink_priv(devlink);
+	mutex_init(&ed->state_mutex);
+
+	fdev = &ed->fdev;
+	rc = fun_dev_enable(fdev, pdev, &aqreq, KBUILD_MODNAME);
+	if (rc)
+		goto free_devlink;
+
+	rc = fun_get_res_count(fdev, FUN_ADMIN_OP_PORT);
+	if (rc > 0)
+		rc = fun_create_ports(ed, rc);
+	if (rc < 0)
+		goto disable_dev;
+
+	fun_serv_restart(fdev);
+	fun_devlink_register(devlink);
+	return 0;
+
+disable_dev:
+	fun_dev_disable(fdev);
+free_devlink:
+	mutex_destroy(&ed->state_mutex);
+	fun_devlink_free(devlink);
+	return rc;
+}
+
+static void funeth_remove(struct pci_dev *pdev)
+{
+	struct fun_dev *fdev = pci_get_drvdata(pdev);
+	struct devlink *devlink;
+	struct fun_ethdev *ed;
+
+	ed = to_fun_ethdev(fdev);
+	devlink = priv_to_devlink(ed);
+	fun_devlink_unregister(devlink);
+
+#ifdef CONFIG_PCI_IOV
+	funeth_sriov_configure(pdev, 0);
+#endif
+
+	fun_serv_stop(fdev);
+	fun_destroy_ports(ed);
+	fun_dev_disable(fdev);
+	mutex_destroy(&ed->state_mutex);
+
+	fun_devlink_free(devlink);
+}
+
+static struct pci_driver funeth_driver = {
+	.name		 = KBUILD_MODNAME,
+	.id_table	 = funeth_id_table,
+	.probe		 = funeth_probe,
+	.remove		 = funeth_remove,
+	.shutdown	 = funeth_remove,
+	.sriov_configure = funeth_sriov_configure,
+};
+
+module_pci_driver(funeth_driver);
+
+MODULE_AUTHOR("Dimitris Michailidis <dmichail@...gible.com>");
+MODULE_DESCRIPTION("Fungible Ethernet Network Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DEVICE_TABLE(pci, funeth_id_table);
-- 
2.25.1
