netdev - Re: [PATCH net-next v2] Add Mellanox BlueField Gigabit Ethernet driver

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200730173059.7440e21c@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com>
Date:   Thu, 30 Jul 2020 17:30:59 -0700
From:   Jakub Kicinski <kuba@...nel.org>
To:     David Thompson <dthompson@...lanox.com>
Cc:     netdev@...r.kernel.org, davem@...emloft.net, jiri@...lanox.com,
        Asmaa Mnebhi <asmaa@...lanox.com>
Subject: Re: [PATCH net-next v2] Add Mellanox BlueField Gigabit Ethernet
 driver

On Thu, 30 Jul 2020 18:53:58 -0400 David Thompson wrote:
> The logic in "mlxbf_gige_mdio.c" is the driver controlling
> the Mellanox BlueField hardware that interacts with a PHY
> device via MDIO/MDC pins.  This driver does the following:
>   - At driver probe time, it configures several BlueField MDIO
>     parameters such as sample rate, full drive, voltage and MDC
>     based on values read from ACPI table.
>   - It defines functions to read and write MDIO registers and
>     registers the MDIO bus.
>   - It defines the phy interrupt handler reporting a
>     link up/down status change
>   - This driver's probe is invoked from the main driver logic
>     while the phy interrupt handler is registered in ndo_open.

These parts will definitely need Andrew or other PHY maintainers 
to look at. Good luck with the ACPI stuff :)

> Reported-by: kernel test robot <lkp@...el.com>

Remove this, it only applies if the entire patch is prompted by the
reporter. I doubt the build bot made you write this driver.

> +config MLXBF_GIGE
> +	tristate "Mellanox Technologies BlueField Gigabit Ethernet support"
> +	depends on (ARM64 || COMPILE_TEST) && ACPI && INET

Why do you depend on INET? :S

> +	for (i = 0; i < priv->rx_q_entries; i++) {
> +		/* Allocate a receive buffer for this RX WQE. The DMA
> +		 * form (dma_addr_t) of the receive buffer address is
> +		 * stored in the RX WQE array (via 'rx_wqe_ptr') where
> +		 * it is accessible by the GigE device. The VA form of
> +		 * the receive buffer is stored in 'rx_buf[]' array in
> +		 * the driver private storage for housekeeping.
> +		 */
> +		priv->rx_buf[i] = dma_alloc_coherent(priv->dev,
> +						     MLXBF_GIGE_DEFAULT_BUF_SZ,
> +						     &rx_buf_dma,
> +						     GFP_KERNEL);

Do the buffers have to be in coherent memory? That's kinda strange.

> +		if (!priv->rx_buf[i])
> +			goto free_wqe_and_buf;
> +
> +		*rx_wqe_ptr++ = rx_buf_dma;
> +	}

> +	/* Enable RX DMA to write new packets to memory */
> +	writeq(MLXBF_GIGE_RX_DMA_EN, priv->base + MLXBF_GIGE_RX_DMA);

Looks a little strange that you enable DMA and then do further config
like CRC stripping.

> +	/* Enable removal of CRC during RX */
> +	data = readq(priv->base + MLXBF_GIGE_RX);
> +	data |= MLXBF_GIGE_RX_STRIP_CRC_EN;
> +	writeq(data, priv->base + MLXBF_GIGE_RX);
> +
> +	/* Enable RX MAC filter pass and discard counters */
> +	writeq(MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC_EN,
> +	       priv->base + MLXBF_GIGE_RX_MAC_FILTER_COUNT_DISC);
> +	writeq(MLXBF_GIGE_RX_MAC_FILTER_COUNT_PASS_EN,
> +	       priv->base + MLXBF_GIGE_RX_MAC_FILTER_COUNT_PASS);
> +
> +	/* Clear MLXBF_GIGE_INT_MASK 'receive pkt' bit to
> +	 * indicate readiness to receive pkts
> +	 */

Readiness to receive interrupts - I presume.

> +	data = readq(priv->base + MLXBF_GIGE_INT_MASK);
> +	data &= ~MLXBF_GIGE_INT_MASK_RX_RECEIVE_PACKET;
> +	writeq(data, priv->base + MLXBF_GIGE_INT_MASK);
> +
> +	writeq(ilog2(priv->rx_q_entries),
> +	       priv->base + MLXBF_GIGE_RX_WQE_SIZE_LOG2);

This is probably what enables the reception of packets?

> +	return 0;
> +
> +free_wqe_and_buf:
> +	rx_wqe_ptr = priv->rx_wqe_base;
> +	for (j = 0; j < i; j++) {
> +		dma_free_coherent(priv->dev, MLXBF_GIGE_DEFAULT_BUF_SZ,
> +				  priv->rx_buf[j], *rx_wqe_ptr);
> +		rx_wqe_ptr++;
> +	}
> +	dma_free_coherent(priv->dev, wq_size,
> +			  priv->rx_wqe_base, priv->rx_wqe_base_dma);
> +	return -ENOMEM;
> +}
> +
> +/* Transmit Initialization
> + * 1) Allocates TX WQE array using coherent DMA mapping
> + * 2) Allocates TX completion counter using coherent DMA mapping
> + */
> +static int mlxbf_gige_tx_init(struct mlxbf_gige *priv)
> +{
> +	size_t size;
> +
> +	size = MLXBF_GIGE_TX_WQE_SZ * priv->tx_q_entries;
> +	priv->tx_wqe_base = dma_alloc_coherent(priv->dev, size,
> +					       &priv->tx_wqe_base_dma,
> +					       GFP_KERNEL);
> +	if (!priv->tx_wqe_base)
> +		return -ENOMEM;
> +
> +	priv->tx_wqe_next = priv->tx_wqe_base;
> +
> +	/* Write TX WQE base address into MMIO reg */
> +	writeq(priv->tx_wqe_base_dma, priv->base + MLXBF_GIGE_TX_WQ_BASE);
> +
> +	/* Allocate address for TX completion count */
> +	priv->tx_cc = dma_alloc_coherent(priv->dev, MLXBF_GIGE_TX_CC_SZ,
> +					 &priv->tx_cc_dma, GFP_KERNEL);
> +

nit: don't put empty lines between a call and error checking

> +	if (!priv->tx_cc) {
> +		dma_free_coherent(priv->dev, size,
> +				  priv->tx_wqe_base, priv->tx_wqe_base_dma);
> +		return -ENOMEM;
> +	}
> +
> +	/* Write TX CC base address into MMIO reg */
> +	writeq(priv->tx_cc_dma, priv->base + MLXBF_GIGE_TX_CI_UPDATE_ADDRESS);
> +
> +	writeq(ilog2(priv->tx_q_entries),
> +	       priv->base + MLXBF_GIGE_TX_WQ_SIZE_LOG2);
> +
> +	priv->prev_tx_ci = 0;
> +	priv->tx_pi = 0;
> +
> +	return 0;
> +}

> +/* Start of struct ethtool_ops functions */
> +static int mlxbf_gige_get_regs_len(struct net_device *netdev)
> +{
> +	/* Return size of MMIO register space (in bytes).
> +	 *
> +	 * NOTE: MLXBF_GIGE_MAC_CFG is the last defined register offset,
> +	 * so use that plus size of single register to derive total size
> +	 */
> +	return MLXBF_GIGE_MAC_CFG + 8;

Please make a define for this, instead of the comment.

> +}
> +
> +static void mlxbf_gige_get_regs(struct net_device *netdev,
> +				struct ethtool_regs *regs, void *p)
> +{
> +	struct mlxbf_gige *priv = netdev_priv(netdev);
> +	__be64 *buff = p;
> +	int reg;
> +
> +	regs->version = MLXBF_GIGE_REGS_VERSION;
> +
> +	/* Read entire MMIO register space and store results
> +	 * into the provided buffer. Each 64-bit word is converted
> +	 * to big-endian to make the output more readable.
> +	 *
> +	 * NOTE: by design, a read to an offset without an existing
> +	 *       register will be acknowledged and return zero.
> +	 */
> +	for (reg = 0; reg <= MLXBF_GIGE_MAC_CFG; reg += 8)

Also with a define this will look less weird.

> +		*buff++ = cpu_to_be64(readq(priv->base + reg));

Can you use memcpy_fromio()?

> +}
> +
> +static void mlxbf_gige_get_ringparam(struct net_device *netdev,
> +				     struct ethtool_ringparam *ering)
> +{
> +	struct mlxbf_gige *priv = netdev_priv(netdev);
> +
> +	memset(ering, 0, sizeof(*ering));

No need, core clears this.

> +	ering->rx_max_pending = MLXBF_GIGE_MAX_RXQ_SZ;
> +	ering->tx_max_pending = MLXBF_GIGE_MAX_TXQ_SZ;
> +	ering->rx_pending = priv->rx_q_entries;
> +	ering->tx_pending = priv->tx_q_entries;
> +}
> +
> +static int mlxbf_gige_set_ringparam(struct net_device *netdev,
> +				    struct ethtool_ringparam *ering)
> +{
> +	const struct net_device_ops *ops = netdev->netdev_ops;
> +	struct mlxbf_gige *priv = netdev_priv(netdev);
> +	int new_rx_q_entries, new_tx_q_entries;
> +
> +	/* Device does not have separate queues for small/large frames */
> +	if (ering->rx_mini_pending || ering->rx_jumbo_pending)
> +		return -EINVAL;
> +
> +	/* Round up to supported values */
> +	new_rx_q_entries = roundup_pow_of_two(ering->rx_pending);
> +	new_tx_q_entries = roundup_pow_of_two(ering->tx_pending);
> +
> +	/* Range check the new values */
> +	if (new_tx_q_entries < MLXBF_GIGE_MIN_TXQ_SZ ||
> +	    new_tx_q_entries > MLXBF_GIGE_MAX_TXQ_SZ ||
> +	    new_rx_q_entries < MLXBF_GIGE_MIN_RXQ_SZ ||
> +	    new_rx_q_entries > MLXBF_GIGE_MAX_RXQ_SZ)

No need to check against max values, core does that.

> +		return -EINVAL;
> +
> +	/* If queue sizes did not change, exit now */
> +	if (new_rx_q_entries == priv->rx_q_entries &&
> +	    new_tx_q_entries == priv->tx_q_entries)
> +		return 0;
> +
> +	if (netif_running(netdev))
> +		ops->ndo_stop(netdev);
> +
> +	priv->rx_q_entries = new_rx_q_entries;
> +	priv->tx_q_entries = new_tx_q_entries;
> +
> +	if (netif_running(netdev))
> +		ops->ndo_open(netdev);

Please write the driver in a way where full stopping and starting the
interface, risking memory allocation failures is not necessary to
reconfigure rings.

> +	return 0;
> +}
> +
> +static void mlxbf_gige_get_drvinfo(struct net_device *netdev,
> +				   struct ethtool_drvinfo *info)
> +{
> +	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
> +	strlcpy(info->bus_info, dev_name(&netdev->dev), sizeof(info->bus_info));
> +}

If this is all you report - you don't need to implement drvinfo.
Core fills those in.

> +static void mlxbf_gige_get_ethtool_stats(struct net_device *netdev,
> +					 struct ethtool_stats *estats,
> +					 u64 *data)
> +{
> +	struct mlxbf_gige *priv = netdev_priv(netdev);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&priv->lock, flags);

Why do you take a lock around stats?

> +	/* Fill data array with interface statistics
> +	 *
> +	 * NOTE: the data writes must be in
> +	 *       sync with the strings shown in
> +	 *       the mlxbf_gige_ethtool_stats_keys[] array
> +	 *
> +	 * NOTE2: certain statistics below are zeroed upon
> +	 *        port disable, so the calculation below
> +	 *        must include the "cached" value of the stat
> +	 *        plus the value read directly from hardware.
> +	 *        Cached statistics are currently:
> +	 *          rx_din_dropped_pkts
> +	 *          rx_filter_passed_pkts
> +	 *          rx_filter_discard_pkts
> +	 */
> +	*data++ = netdev->stats.rx_bytes;
> +	*data++ = netdev->stats.rx_packets;
> +	*data++ = netdev->stats.tx_bytes;
> +	*data++ = netdev->stats.tx_packets;

Please don't duplicate standard stats in ethtool.

> +	*data++ = priv->stats.hw_access_errors;
> +	*data++ = priv->stats.tx_invalid_checksums;
> +	*data++ = priv->stats.tx_small_frames;
> +	*data++ = priv->stats.tx_index_errors;
> +	*data++ = priv->stats.sw_config_errors;
> +	*data++ = priv->stats.sw_access_errors;
> +	*data++ = priv->stats.rx_truncate_errors;
> +	*data++ = priv->stats.rx_mac_errors;
> +	*data++ = (priv->stats.rx_din_dropped_pkts +
> +		   readq(priv->base + MLXBF_GIGE_RX_DIN_DROP_COUNTER));
> +	*data++ = priv->stats.tx_fifo_full;
> +	*data++ = (priv->stats.rx_filter_passed_pkts +
> +		   readq(priv->base + MLXBF_GIGE_RX_PASS_COUNTER_ALL));
> +	*data++ = (priv->stats.rx_filter_discard_pkts +
> +		   readq(priv->base + MLXBF_GIGE_RX_DISC_COUNTER_ALL));
> +
> +	spin_unlock_irqrestore(&priv->lock, flags);
> +}

> +static int mlxbf_gige_get_link_ksettings(struct net_device *netdev,
> +					 struct ethtool_link_ksettings *link_ksettings)
> +{
> +	struct phy_device *phydev = netdev->phydev;
> +	u32 supported, advertising;
> +	u32 lp_advertising = 0;
> +	int status;
> +
> +	supported = SUPPORTED_TP | SUPPORTED_1000baseT_Full |
> +		    SUPPORTED_Autoneg | SUPPORTED_Pause;
> +
> +	advertising = ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg |
> +		      ADVERTISED_Pause;
> +
> +	status = phy_read(phydev, MII_LPA);
> +	if (status >= 0)
> +		lp_advertising = mii_lpa_to_ethtool_lpa_t(status & 0xffff);
> +
> +	status = phy_read(phydev, MII_STAT1000);
> +	if (status >= 0)
> +		lp_advertising |= mii_stat1000_to_ethtool_lpa_t(status & 0xffff);
> +
> +	ethtool_convert_legacy_u32_to_link_mode(link_ksettings->link_modes.supported,
> +						supported);
> +	ethtool_convert_legacy_u32_to_link_mode(link_ksettings->link_modes.advertising,
> +						advertising);
> +	ethtool_convert_legacy_u32_to_link_mode(link_ksettings->link_modes.lp_advertising,
> +						lp_advertising);
> +
> +	link_ksettings->base.autoneg = AUTONEG_ENABLE;
> +	link_ksettings->base.speed = SPEED_1000;
> +	link_ksettings->base.duplex = DUPLEX_FULL;

Hm. You're reporting this even if the link is down?

> +	link_ksettings->base.port = PORT_TP;
> +	link_ksettings->base.phy_address = MLXBF_GIGE_MDIO_DEFAULT_PHY_ADDR;
> +	link_ksettings->base.transceiver = XCVR_INTERNAL;
> +	link_ksettings->base.mdio_support = ETH_MDIO_SUPPORTS_C22;
> +	link_ksettings->base.eth_tp_mdix = ETH_TP_MDI_INVALID;
> +	link_ksettings->base.eth_tp_mdix_ctrl = ETH_TP_MDI_INVALID;

> +static irqreturn_t mlxbf_gige_rx_intr(int irq, void *dev_id)
> +{
> +	struct mlxbf_gige *priv;
> +
> +	priv = dev_id;
> +
> +	priv->rx_intr_count++;
> +
> +	/* Driver has been interrupted because a new packet is available,
> +	 * but do not process packets at this time.  Instead, disable any
> +	 * further "packet rx" interrupts and tell the networking subsystem
> +	 * to poll the driver to pick up all available packets.

I must say these comments really are excessive.

> +	 * NOTE: GigE silicon automatically disables "packet rx" interrupt by
> +	 *       setting MLXBF_GIGE_INT_MASK bit0 upon triggering the interrupt
> +	 *       to the ARM cores.  Software needs to re-enable "packet rx"
> +	 *       interrupts by clearing MLXBF_GIGE_INT_MASK bit0.
> +	 */
> +
> +	/* Tell networking subsystem to poll GigE driver */
> +	napi_schedule(&priv->napi);

_irqoff

> +
> +	return IRQ_HANDLED;
> +}
> +static u16 mlxbf_gige_tx_buffs_avail(struct mlxbf_gige *priv)
> +{
> +	unsigned long flags;
> +	u16 avail;
> +
> +	spin_lock_irqsave(&priv->lock, flags);
> +
> +	if (priv->prev_tx_ci == priv->tx_pi)
> +		avail = priv->tx_q_entries - 1;
> +	else
> +		avail = ((priv->tx_q_entries + priv->prev_tx_ci - priv->tx_pi)
> +			  % priv->tx_q_entries) - 1;

Because this is unsigned logic, and q_entries is a power of 2 you
probably don't need the if, and the modulo.

> +	spin_unlock_irqrestore(&priv->lock, flags);
> +
> +	return avail;
> +}
> +
> +static bool mlxbf_gige_handle_tx_complete(struct mlxbf_gige *priv)
> +{
> +	struct net_device_stats *stats;
> +	u16 tx_wqe_index;
> +	u64 *tx_wqe_addr;
> +	u64 tx_status;
> +	u16 tx_ci;
> +
> +	tx_status = readq(priv->base + MLXBF_GIGE_TX_STATUS);
> +	if (tx_status & MLXBF_GIGE_TX_STATUS_DATA_FIFO_FULL)
> +		priv->stats.tx_fifo_full++;
> +	tx_ci = readq(priv->base + MLXBF_GIGE_TX_CONSUMER_INDEX);
> +	stats = &priv->netdev->stats;
> +
> +	/* Transmit completion logic needs to loop until the completion
> +	 * index (in SW) equals TX consumer index (from HW).  These
> +	 * parameters are unsigned 16-bit values and the wrap case needs
> +	 * to be supported, that is TX consumer index wrapped from 0xFFFF
> +	 * to 0 while TX completion index is still < 0xFFFF.
> +	 */
> +	for (; priv->prev_tx_ci != tx_ci; priv->prev_tx_ci++) {
> +		tx_wqe_index = priv->prev_tx_ci % priv->tx_q_entries;
> +		/* Each TX WQE is 16 bytes. The 8 MSB store the 2KB TX
> +		 * buffer address and the 8 LSB contain information
> +		 * about the TX WQE.
> +		 */
> +		tx_wqe_addr = priv->tx_wqe_base +
> +			       (tx_wqe_index * MLXBF_GIGE_TX_WQE_SZ_QWORDS);
> +
> +		stats->tx_packets++;
> +		stats->tx_bytes += MLXBF_GIGE_TX_WQE_PKT_LEN(tx_wqe_addr);
> +		dma_free_coherent(priv->dev, MLXBF_GIGE_DEFAULT_BUF_SZ,
> +				  priv->tx_buf[tx_wqe_index], *tx_wqe_addr);
> +		priv->tx_buf[tx_wqe_index] = NULL;
> +	}
> +
> +	/* Since the TX ring was likely just drained, check if TX queue
> +	 * had previously been stopped and now that there are TX buffers
> +	 * available the TX queue can be awakened.
> +	 */
> +	if (netif_queue_stopped(priv->netdev) &&
> +	    mlxbf_gige_tx_buffs_avail(priv)) {
> +		netif_wake_queue(priv->netdev);
> +	}

No need for parens.

> +
> +	return true;
> +}
> +
> +static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts)
> +{
> +	struct net_device *netdev = priv->netdev;
> +	u16 rx_pi_rem, rx_ci_rem;
> +	struct sk_buff *skb;
> +	u64 *rx_cqe_addr;
> +	u64 datalen;
> +	u64 rx_cqe;
> +	u16 rx_ci;
> +	u16 rx_pi;
> +	u8 *pktp;
> +
> +	/* Index into RX buffer array is rx_pi w/wrap based on RX_CQE_SIZE */
> +	rx_pi = readq(priv->base + MLXBF_GIGE_RX_WQE_PI);
> +	rx_pi_rem = rx_pi % priv->rx_q_entries;
> +	pktp = priv->rx_buf[rx_pi_rem];
> +	rx_cqe_addr = priv->rx_cqe_base + rx_pi_rem;
> +	rx_cqe = *rx_cqe_addr;
> +	datalen = rx_cqe & MLXBF_GIGE_RX_CQE_PKT_LEN_MASK;
> +
> +	if ((rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_MASK) == 0) {
> +		/* Packet is OK, increment stats */
> +		netdev->stats.rx_packets++;
> +		netdev->stats.rx_bytes += datalen;
> +
> +		skb = dev_alloc_skb(datalen);
> +		if (!skb) {
> +			netdev->stats.rx_dropped++;
> +			return false;
> +		}
> +
> +		memcpy(skb_put(skb, datalen), pktp, datalen);
> +
> +		skb->dev = netdev;
> +		skb->protocol = eth_type_trans(skb, netdev);
> +		skb->ip_summed = CHECKSUM_NONE; /* device did not checksum packet */
> +
> +		netif_receive_skb(skb);
> +	} else if (rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_MAC_ERR) {
> +		priv->stats.rx_mac_errors++;
> +	} else if (rx_cqe & MLXBF_GIGE_RX_CQE_PKT_STATUS_TRUNCATED) {
> +		priv->stats.rx_truncate_errors++;
> +	}
> +
> +	/* Let hardware know we've replenished one buffer */
> +	writeq(rx_pi + 1, priv->base + MLXBF_GIGE_RX_WQE_PI);
> +
> +	(*rx_pkts)++;
> +	rx_pi = readq(priv->base + MLXBF_GIGE_RX_WQE_PI);
> +	rx_pi_rem = rx_pi % priv->rx_q_entries;
> +	rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI);
> +	rx_ci_rem = rx_ci % priv->rx_q_entries;

Looks really wasteful to keep reading IO registers on every packet,
while you may already know there is more from the previous iteration..

> +	return rx_pi_rem != rx_ci_rem;
> +}
> +
> +/* Driver poll() function called by NAPI infrastructure */
> +static int mlxbf_gige_poll(struct napi_struct *napi, int budget)
> +{
> +	struct mlxbf_gige *priv;
> +	bool remaining_pkts;
> +	int work_done = 0;
> +	u64 data;
> +
> +	priv = container_of(napi, struct mlxbf_gige, napi);
> +
> +	mlxbf_gige_handle_tx_complete(priv);
> +
> +	do {
> +		remaining_pkts = mlxbf_gige_rx_packet(priv, &work_done);
> +	} while (remaining_pkts && work_done < budget);
> +
> +	/* If amount of work done < budget, turn off NAPI polling
> +	 * via napi_complete_done(napi, work_done) and then
> +	 * re-enable interrupts.
> +	 */
> +	if (work_done < budget && napi_complete_done(napi, work_done)) {
> +		/* Clear MLXBF_GIGE_INT_MASK 'receive pkt' bit to
> +		 * indicate receive readiness
> +		 */
> +		data = readq(priv->base + MLXBF_GIGE_INT_MASK);
> +		data &= ~MLXBF_GIGE_INT_MASK_RX_RECEIVE_PACKET;
> +		writeq(data, priv->base + MLXBF_GIGE_INT_MASK);
> +	}
> +
> +	return work_done;
> +}
> +
> +static int mlxbf_gige_request_irqs(struct mlxbf_gige *priv)
> +{
> +	int err;
> +
> +	err = devm_request_irq(priv->dev, priv->error_irq,
> +			       mlxbf_gige_error_intr, 0, "mlxbf_gige_error",
> +			       priv);
> +	if (err) {
> +		dev_err(priv->dev, "Request error_irq failure\n");
> +		return err;
> +	}
> +
> +	err = devm_request_irq(priv->dev, priv->rx_irq,
> +			       mlxbf_gige_rx_intr, 0, "mlxbf_gige_rx",
> +			       priv);
> +	if (err) {
> +		dev_err(priv->dev, "Request rx_irq failure\n");
> +		return err;
> +	}
> +
> +	err = devm_request_irq(priv->dev, priv->llu_plu_irq,
> +			       mlxbf_gige_llu_plu_intr, 0, "mlxbf_gige_llu_plu",
> +			       priv);
> +	if (err) {
> +		dev_err(priv->dev, "Request llu_plu_irq failure\n");
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +
> +static void mlxbf_gige_free_irqs(struct mlxbf_gige *priv)
> +{
> +	devm_free_irq(priv->dev, priv->error_irq, priv);
> +	devm_free_irq(priv->dev, priv->rx_irq, priv);
> +	devm_free_irq(priv->dev, priv->llu_plu_irq, priv);

What's the point of devm if you free them manually, just use normal
routines, please.

> +}

> +static void mlxbf_gige_clean_port(struct mlxbf_gige *priv)
> +{
> +	u64 control, status;
> +	int cnt;
> +
> +	/* Set the CLEAN_PORT_EN bit to trigger SW reset */
> +	control = readq(priv->base + MLXBF_GIGE_CONTROL);
> +	control |= MLXBF_GIGE_CONTROL_CLEAN_PORT_EN;
> +	writeq(control, priv->base + MLXBF_GIGE_CONTROL);
> +
> +	/* Loop waiting for status ready bit to assert */
> +	cnt = 1000;
> +	do {
> +		status = readq(priv->base + MLXBF_GIGE_STATUS);
> +		if (status & MLXBF_GIGE_STATUS_READY)
> +			break;
> +		usleep_range(50, 100);
> +	} while (--cnt > 0);

check out linux/iopoll.h

> +	/* Clear the CLEAN_PORT_EN bit at end of this loop */
> +	control = readq(priv->base + MLXBF_GIGE_CONTROL);
> +	control &= ~MLXBF_GIGE_CONTROL_CLEAN_PORT_EN;
> +	writeq(control, priv->base + MLXBF_GIGE_CONTROL);
> +}
> +
> +static int mlxbf_gige_open(struct net_device *netdev)
> +{
> +	struct mlxbf_gige *priv = netdev_priv(netdev);
> +	struct phy_device *phydev = netdev->phydev;
> +	u64 int_en;
> +	int err;
> +
> +	mlxbf_gige_cache_stats(priv);
> +	mlxbf_gige_clean_port(priv);
> +	mlxbf_gige_rx_init(priv);
> +	mlxbf_gige_tx_init(priv);

These can fail.

> +	netif_napi_add(netdev, &priv->napi, mlxbf_gige_poll, NAPI_POLL_WEIGHT);
> +	napi_enable(&priv->napi);
> +	netif_start_queue(netdev);
> +
> +	err = mlxbf_gige_request_irqs(priv);

Request IRQs before you start enabling NAPI and doing stuff that can't
fail.

> +	if (err)
> +		return err;
> +
> +	phy_start(phydev);
> +
> +	/* Set bits in INT_EN that we care about */
> +	int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR |
> +		 MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS |
> +		 MLXBF_GIGE_INT_EN_TX_SMALL_FRAME_SIZE |
> +		 MLXBF_GIGE_INT_EN_TX_PI_CI_EXCEED_WQ_SIZE |
> +		 MLXBF_GIGE_INT_EN_SW_CONFIG_ERROR |
> +		 MLXBF_GIGE_INT_EN_SW_ACCESS_ERROR |
> +		 MLXBF_GIGE_INT_EN_RX_RECEIVE_PACKET;
> +	writeq(int_en, priv->base + MLXBF_GIGE_INT_EN);
> +
> +	return 0;
> +}

> +static netdev_tx_t mlxbf_gige_start_xmit(struct sk_buff *skb,
> +					 struct net_device *netdev)
> +{
> +	struct mlxbf_gige *priv = netdev_priv(netdev);
> +	dma_addr_t tx_buf_dma;
> +	u8 *tx_buf = NULL;
> +	u64 *tx_wqe_addr;
> +	u64 word2;
> +
> +	/* Check that there is room left in TX ring */
> +	if (!mlxbf_gige_tx_buffs_avail(priv)) {
> +		/* TX ring is full, inform stack but do not free SKB */
> +		netif_stop_queue(netdev);
> +		netdev->stats.tx_dropped++;
> +		return NETDEV_TX_BUSY;

Stop the queue when the last index gets used, not when a frame that
overflows the ring arrives. re-queuing skbs is expensive.

> +	}
> +
> +	/* Allocate ptr for buffer */
> +	if (skb->len < MLXBF_GIGE_DEFAULT_BUF_SZ)
> +		tx_buf = dma_alloc_coherent(priv->dev, MLXBF_GIGE_DEFAULT_BUF_SZ,
> +					    &tx_buf_dma, GFP_KERNEL);
> +
> +	if (!tx_buf) {
> +		/* Free incoming skb, could not alloc TX buffer */
> +		dev_kfree_skb(skb);
> +		netdev->stats.tx_dropped++;
> +		return NET_XMIT_DROP;
> +	}
> +
> +	priv->tx_buf[priv->tx_pi % priv->tx_q_entries] = tx_buf;
> +
> +	/* Copy data from skb to allocated TX buffer
> +	 *
> +	 * NOTE: GigE silicon will automatically pad up to
> +	 *       minimum packet length if needed.
> +	 */
> +	skb_copy_bits(skb, 0, tx_buf, skb->len);

Why do you copy data for both TX and RX?! Can't you map the skb data
directly?

> +	/* Get address of TX WQE */
> +	tx_wqe_addr = priv->tx_wqe_next;
> +
> +	mlxbf_gige_update_tx_wqe_next(priv);
> +
> +	/* Put PA of buffer address into first 64-bit word of TX WQE */
> +	*tx_wqe_addr = tx_buf_dma;
> +
> +	/* Set TX WQE pkt_len appropriately */
> +	word2 = skb->len & MLXBF_GIGE_TX_WQE_PKT_LEN_MASK;
> +
> +	/* Write entire 2nd word of TX WQE */
> +	*(tx_wqe_addr + 1) = word2;
> +
> +	priv->tx_pi++;
> +
> +	/* Create memory barrier before write to TX PI */
> +	wmb();

please implement xmit_more

> +	writeq(priv->tx_pi, priv->base + MLXBF_GIGE_TX_PRODUCER_INDEX);
> +
> +	/* Free incoming skb, contents already copied to HW */
> +	dev_kfree_skb(skb);

No, you can't do that. The socket that produced this data will think
it's already on the wire and bombard you with more data.

> +	return NETDEV_TX_OK;
> +}

> +static const struct net_device_ops mlxbf_gige_netdev_ops = {
> +	.ndo_open		= mlxbf_gige_open,
> +	.ndo_stop		= mlxbf_gige_stop,
> +	.ndo_start_xmit		= mlxbf_gige_start_xmit,
> +	.ndo_set_mac_address	= eth_mac_addr,
> +	.ndo_validate_addr	= eth_validate_addr,
> +	.ndo_do_ioctl		= mlxbf_gige_do_ioctl,
> +	.ndo_set_rx_mode        = mlxbf_gige_set_rx_mode,

You must report standard stats.

> +};
> +
> +static u64 mlxbf_gige_mac_to_u64(u8 *addr)
> +{
> +	u64 mac = 0;
> +	int i;
> +
> +	for (i = 0; i < ETH_ALEN; i++) {
> +		mac <<= 8;
> +		mac |= addr[i];
> +	}
> +	return mac;
> +}

ether_addr_to_u64()

> +static void mlxbf_gige_u64_to_mac(u8 *addr, u64 mac)
> +{
> +	int i;
> +
> +	for (i = ETH_ALEN; i > 0; i--) {
> +		addr[i - 1] = mac & 0xFF;
> +		mac >>= 8;
> +	}
> +}

u64_to_ether_addr()