Date:	Thu, 14 Jul 2016 14:28:21 +0300
From:	Amir Levy <amir.jer.levy@...el.com>
To:	andreas.noever@...il.com, gregkh@...uxfoundation.org,
	bhelgaas@...gle.com
Cc:	linux-pci@...r.kernel.org, linux-kernel@...r.kernel.org,
	netdev@...r.kernel.org, thunderbolt-linux@...el.com,
	mika.westerberg@...el.com, tomas.winkler@...el.com,
	Amir Levy <amir.jer.levy@...el.com>
Subject: [PATCH v3 7/8] thunderbolt: Networking transmit and receive

Handle transmission to the second peer and reception from it.
This includes communication with the upper layer (the network stack)
and configuration of the Thunderbolt(TM) HW.

Signed-off-by: Amir Levy <amir.jer.levy@...el.com>
---
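For context on the framing scheme used below: every Thunderbolt network frame
starts with a struct tbt_frame_header (frame_size, frame_index, frame_id,
frame_count), and a packet larger than one frame is split across consecutive
frames that the receiver reassembles by matching frame_id and the running
frame_index against frame_count (see tbt_net_xmit_frame() and
tbt_net_check_frame() in net.c). The following is only a minimal userspace
sketch of the transmit-side fragmentation; FRAME_DATA_MAX, MAX_FRAMES and
split_packet() are illustrative names, not part of this patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FRAME_DATA_MAX	4096	/* illustrative per-frame payload limit */
#define MAX_FRAMES	16	/* illustrative bound, not from this patch */

/* mirrors the layout of struct tbt_frame_header in net.c */
struct frame_hdr {
	uint32_t frame_size;	/* payload bytes carried by this frame */
	uint16_t frame_index;	/* running index of the frame in the packet */
	uint16_t frame_id;	/* matches frames to a specific packet */
	uint32_t frame_count;	/* how many frames assemble the full packet */
};

/* split one packet into frames; returns the number of frames used */
static unsigned int split_packet(const uint8_t *pkt, uint32_t len, uint16_t id,
				 struct frame_hdr hdrs[MAX_FRAMES],
				 uint8_t data[MAX_FRAMES][FRAME_DATA_MAX])
{
	unsigned int count = (len + FRAME_DATA_MAX - 1) / FRAME_DATA_MAX;
	unsigned int i;

	for (i = 0; i < count; i++) {
		uint32_t chunk = len > FRAME_DATA_MAX ? FRAME_DATA_MAX : len;

		hdrs[i].frame_size = chunk;
		hdrs[i].frame_index = i;
		hdrs[i].frame_id = id;
		hdrs[i].frame_count = count;
		memcpy(data[i], pkt, chunk);
		pkt += chunk;
		len -= chunk;
	}
	return count;
}

int main(void)
{
	static uint8_t pkt[9000];	/* example packet, size chosen arbitrarily */
	static struct frame_hdr hdrs[MAX_FRAMES];
	static uint8_t data[MAX_FRAMES][FRAME_DATA_MAX];
	unsigned int n = split_packet(pkt, sizeof(pkt), 1, hdrs, data);

	printf("split into %u frames, last frame carries %u bytes\n",
	       n, (unsigned int)hdrs[n - 1].frame_size);
	return 0;
}

The receive side in tbt_net_check_frame() does the inverse bookkeeping: it
validates frame_count, checks that frame_index increments while frame_id stays
constant, and accumulates frame_size until the whole packet can be handed to
the network stack via napi_gro_receive().
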
 drivers/thunderbolt/icm/icm_nhi.c |   15 +
 drivers/thunderbolt/icm/net.c     | 1475 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1490 insertions(+)

diff --git a/drivers/thunderbolt/icm/icm_nhi.c b/drivers/thunderbolt/icm/icm_nhi.c
index 060bb38..f8b0527 100644
--- a/drivers/thunderbolt/icm/icm_nhi.c
+++ b/drivers/thunderbolt/icm/icm_nhi.c
@@ -1045,6 +1045,7 @@ static irqreturn_t nhi_msi(int __always_unused irq, void *data)
 {
 	struct tbt_nhi_ctxt *nhi_ctxt = data;
 	u32 isr0, isr1, imr0, imr1;
+	int i;
 
 	/* clear on read */
 	isr0 = ioread32(nhi_ctxt->iobase + REG_RING_NOTIFY_BASE);
@@ -1067,6 +1068,20 @@ static irqreturn_t nhi_msi(int __always_unused irq, void *data)
 
 	spin_unlock(&nhi_ctxt->lock);
 
+	for (i = 0; i < nhi_ctxt->num_ports; ++i) {
+		struct net_device *net_dev =
+				nhi_ctxt->net_devices[i].net_dev;
+		if (net_dev) {
+			u8 path = PATH_FROM_PORT(nhi_ctxt->num_paths, i);
+
+			if (isr0 & REG_RING_INT_RX_PROCESSED(
+					path, nhi_ctxt->num_paths))
+				tbt_net_rx_msi(net_dev);
+			if (isr0 & REG_RING_INT_TX_PROCESSED(path))
+				tbt_net_tx_msi(net_dev);
+		}
+	}
+
 	if (isr0 & REG_RING_INT_RX_PROCESSED(TBT_ICM_RING_NUM,
 					     nhi_ctxt->num_paths))
 		schedule_work(&nhi_ctxt->icm_msgs_work);
diff --git a/drivers/thunderbolt/icm/net.c b/drivers/thunderbolt/icm/net.c
index e983dfb..77cc843 100644
--- a/drivers/thunderbolt/icm/net.c
+++ b/drivers/thunderbolt/icm/net.c
@@ -135,6 +135,17 @@ struct approve_inter_domain_connection_cmd {
 
 };
 
+struct tbt_frame_header {
+	/* size of the data carried by the frame */
+	__le32 frame_size;
+	/* running index on the frames */
+	__le16 frame_index;
+	/* ID of the frame to match frames to specific packet */
+	__le16 frame_id;
+	/* how many frames assemble a full packet */
+	__le32 frame_count;
+};
+
 enum neg_event {
 	RECEIVE_LOGOUT = NUM_MEDIUM_STATUSES,
 	RECEIVE_LOGIN_RESPONSE,
@@ -142,15 +153,81 @@ enum neg_event {
 	NUM_NEG_EVENTS
 };
 
+enum frame_status {
+	GOOD_FRAME,
+	GOOD_AS_FIRST_FRAME,
+	GOOD_AS_FIRST_MULTICAST_FRAME,
+	FRAME_NOT_READY,
+	FRAME_ERROR,
+};
+
+enum packet_filter {
+	/* all multicast MAC addresses */
+	PACKET_TYPE_ALL_MULTICAST,
+	/* all types of MAC addresses: multicast, unicast and broadcast */
+	PACKET_TYPE_PROMISCUOUS,
+	/* all unicast MAC addresses */
+	PACKET_TYPE_UNICAST_PROMISCUOUS,
+};
+
 enum disconnect_path_stage {
 	STAGE_1 = BIT(0),
 	STAGE_2 = BIT(1)
 };
 
+struct tbt_net_stats {
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tx_errors;
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 rx_length_errors;
+	u64 rx_over_errors;
+	u64 rx_crc_errors;
+	u64 rx_missed_errors;
+	u64 multicast;
+};
+
+static const char tbt_net_gstrings_stats[][ETH_GSTRING_LEN] = {
+	"tx_packets",
+	"tx_bytes",
+	"tx_errors",
+	"rx_packets",
+	"rx_bytes",
+	"rx_length_errors",
+	"rx_over_errors",
+	"rx_crc_errors",
+	"rx_missed_errors",
+	"multicast",
+};
+
+struct tbt_buffer {
+	dma_addr_t dma;
+	union {
+		struct tbt_frame_header *hdr;
+		struct page *page;
+	};
+	u32 page_offset;
+};
+
+struct tbt_desc_ring {
+	/* pointer to the descriptor ring memory */
+	struct tbt_buf_desc *desc;
+	/* physical address of the descriptor ring */
+	dma_addr_t dma;
+	/* array of buffer structs */
+	struct tbt_buffer *buffers;
+	/* last descriptor that was associated with a buffer */
+	u16 last_allocated;
+	/* next descriptor to check for DD status bit */
+	u16 next_to_clean;
+};
+
 /**
 *  struct tbt_port - the basic tbt_port structure
 *  @tbt_nhi_ctxt:		context of the nhi controller.
 *  @net_dev:			networking device object.
+*  @napi:			network API
 *  @login_retry_work:		work queue for sending login requests.
 *  @login_response_work:	work queue for sending login responses.
 *  @work_struct logout_work:	work queue for sending logout requests.
@@ -166,6 +243,11 @@ enum disconnect_path_stage {
 *  @login_retry_count:		counts number of login retries sent.
 *  @local_depth:		depth of the remote peer in the chain.
 *  @transmit_path:		routing parameter for the icm.
+*  @tx_ring:			transmit ring from where the packets are sent.
+*  @rx_ring:			receive ring  where the packets are received.
+*  @stats:			network statistics of the rx/tx packets.
+*  @packet_filters:		defines filters for the received packets.
+*  @multicast_hash_table:	hash table of multicast addresses.
 *  @frame_id:			counting ID of frames.
 *  @num:			port number.
 *  @local_path:			routing parameter for the icm.
@@ -175,6 +257,7 @@ enum disconnect_path_stage {
 struct tbt_port {
 	struct tbt_nhi_ctxt *nhi_ctxt;
 	struct net_device *net_dev;
+	struct napi_struct napi;
 	struct delayed_work login_retry_work;
 	struct work_struct login_response_work;
 	struct work_struct logout_work;
@@ -190,6 +273,17 @@ struct tbt_port {
 	u8 login_retry_count;
 	u8 local_depth;
 	u8 transmit_path;
+	struct tbt_desc_ring tx_ring ____cacheline_aligned_in_smp;
+	struct tbt_desc_ring rx_ring;
+	struct tbt_net_stats stats;
+	u32 packet_filters;
+	/*
+	 * hash table of 1024 boolean entries with hashing of
+	 * the multicast address
+	 */
+	u32 multicast_hash_table[DIV_ROUND_UP(
+					TBT_NET_MULTICAST_HASH_TABLE_SIZE,
+					BITS_PER_U32)];
 	u16 frame_id;
 	u8 num;
 	u8 local_path;
@@ -236,6 +330,8 @@ static void tbt_net_tear_down(struct net_device *net_dev, bool send_logout)
 		      (port->local_path * REG_OPTS_STEP);
 		u32 rx_reg_val = ioread32(rx_reg) & ~REG_OPTS_E2E_EN;
 
+		napi_disable(&port->napi);
+
 		tx_reg = iobase + REG_TX_OPTIONS_BASE +
 			 (port->local_path * REG_OPTS_STEP);
 		tx_reg_val = ioread32(tx_reg) & ~REG_OPTS_E2E_EN;
@@ -277,8 +373,1340 @@ static void tbt_net_tear_down(struct net_device *net_dev, bool send_logout)
 				       port->nhi_ctxt->num_paths);
 		spin_unlock_irqrestore(&port->nhi_ctxt->lock, flags);
 	}
+
+	port->rx_ring.next_to_clean = 0;
+	port->rx_ring.last_allocated = TBT_NET_NUM_RX_BUFS - 1;
+
+}
+
+void tbt_net_tx_msi(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	void __iomem *iobase = port->nhi_ctxt->iobase;
+	u32 prod_cons, prod, cons;
+
+	prod_cons = ioread32(TBT_RING_CONS_PROD_REG(iobase, REG_TX_RING_BASE,
+						    port->local_path));
+	prod = TBT_REG_RING_PROD_EXTRACT(prod_cons);
+	cons = TBT_REG_RING_CONS_EXTRACT(prod_cons);
+	if (prod >= TBT_NET_NUM_TX_BUFS || cons >= TBT_NET_NUM_TX_BUFS)
+		return;
+
+	if (TBT_NUM_BUFS_BETWEEN(prod, cons, TBT_NET_NUM_TX_BUFS) >=
+							TX_WAKE_THRESHOLD) {
+		netif_wake_queue(port->net_dev);
+	} else {
+		spin_lock(&port->nhi_ctxt->lock);
+		/* enable TX interrupt */
+		RING_INT_ENABLE_TX(iobase, port->local_path);
+		spin_unlock(&port->nhi_ctxt->lock);
+	}
+}
+
+static irqreturn_t tbt_net_tx_msix(int __always_unused irq, void *data)
+{
+	struct tbt_port *port = data;
+	void __iomem *iobase = port->nhi_ctxt->iobase;
+	u32 prod_cons, prod, cons;
+
+	prod_cons = ioread32(TBT_RING_CONS_PROD_REG(iobase,
+						    REG_TX_RING_BASE,
+						    port->local_path));
+	prod = TBT_REG_RING_PROD_EXTRACT(prod_cons);
+	cons = TBT_REG_RING_CONS_EXTRACT(prod_cons);
+	if (prod < TBT_NET_NUM_TX_BUFS && cons < TBT_NET_NUM_TX_BUFS &&
+	    TBT_NUM_BUFS_BETWEEN(prod, cons, TBT_NET_NUM_TX_BUFS) >=
+							TX_WAKE_THRESHOLD) {
+		spin_lock(&port->nhi_ctxt->lock);
+		/* disable TX interrupt */
+		RING_INT_DISABLE_TX(iobase, port->local_path);
+		spin_unlock(&port->nhi_ctxt->lock);
+
+		netif_wake_queue(port->net_dev);
+	}
+
+	return IRQ_HANDLED;
+}
+
+void tbt_net_rx_msi(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	napi_schedule_irqoff(&port->napi);
+}
+
+static irqreturn_t tbt_net_rx_msix(int __always_unused irq, void *data)
+{
+	struct tbt_port *port = data;
+
+	if (likely(napi_schedule_prep(&port->napi))) {
+		struct tbt_nhi_ctxt *nhi_ctx = port->nhi_ctxt;
+
+		spin_lock(&nhi_ctx->lock);
+		/* disable RX interrupt */
+		RING_INT_DISABLE_RX(nhi_ctx->iobase, port->local_path,
+				    nhi_ctx->num_paths);
+		spin_unlock(&nhi_ctx->lock);
+
+		__napi_schedule_irqoff(&port->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void tbt_net_pull_tail(struct sk_buff *skb)
+{
+	skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+	unsigned int pull_len;
+	unsigned char *va;
+
+	/*
+	 * it is valid to use page_address instead of kmap since we are
+	 * working with pages allocated out of the lomem pool
+	 */
+	va = skb_frag_address(frag);
+
+	pull_len = eth_get_headlen(va, TBT_NET_RX_HDR_SIZE);
+
+	/* align pull length to size of long to optimize memcpy performance */
+	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
+
+	/* update all of the pointers */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+}
+
+static inline bool tbt_net_alloc_mapped_page(struct device *dev,
+					     struct tbt_buffer *buf, gfp_t gfp)
+{
+	if (!buf->page) {
+		buf->page = alloc_page(gfp | __GFP_COLD);
+		if (unlikely(!buf->page))
+			return false;
+
+		buf->dma = dma_map_page(dev, buf->page, 0, PAGE_SIZE,
+					DMA_FROM_DEVICE);
+		if (dma_mapping_error(dev, buf->dma)) {
+			__free_page(buf->page);
+			buf->page = NULL;
+			return false;
+		}
+		buf->page_offset = 0;
+	}
+	return true;
+}
+
+static bool tbt_net_alloc_rx_buffers(struct device *dev,
+				     struct tbt_desc_ring *rx_ring,
+				     u16 cleaned_count, void __iomem *reg,
+				     gfp_t gfp)
+{
+	u16 i = (rx_ring->last_allocated + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+	bool res = false;
+
+	while (cleaned_count--) {
+		struct tbt_buf_desc *desc = &rx_ring->desc[i];
+		struct tbt_buffer *buf = &rx_ring->buffers[i];
+
+		/* making sure next_to_clean won't get old buffer */
+		desc->attributes = cpu_to_le32(DESC_ATTR_REQ_STS |
+					       DESC_ATTR_INT_EN);
+		if (tbt_net_alloc_mapped_page(dev, buf, gfp)) {
+			res = true;
+			rx_ring->last_allocated = i;
+			i = (i + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+			desc->phys = cpu_to_le64(buf->dma + buf->page_offset);
+		} else {
+			break;
+		}
+	}
+
+	if (res) {
+		iowrite32((rx_ring->last_allocated << REG_RING_CONS_SHIFT) &
+			  REG_RING_CONS_MASK, reg);
+	}
+
+	return res;
+}
+
+static inline bool tbt_net_multicast_mac_set(const u32 *multicast_hash_table,
+					     const u8 *ether_addr)
+{
+	u16 hash_val = TBT_NET_ETHER_ADDR_HASH(ether_addr);
+
+	return !!(multicast_hash_table[hash_val / BITS_PER_U32] &
+		  BIT(hash_val % BITS_PER_U32));
+}
+
+static enum frame_status tbt_net_check_frame(struct tbt_port *port,
+					     u16 frame_num, u32 *count,
+					     u16 index, u16 *id, u32 *size)
+{
+	struct tbt_desc_ring *rx_ring = &port->rx_ring;
+	__le32 desc_attr = rx_ring->desc[frame_num].attributes;
+	enum frame_status res = GOOD_AS_FIRST_FRAME;
+	u32 len, frame_count, frame_size;
+	struct tbt_frame_header *hdr;
+
+	if (!(desc_attr & cpu_to_le32(DESC_ATTR_DESC_DONE)))
+		return FRAME_NOT_READY;
+
+	rmb(); /* read other fields from desc after checking DD */
+
+	if (unlikely(desc_attr & cpu_to_le32(DESC_ATTR_RX_CRC_ERR))) {
+		++port->stats.rx_crc_errors;
+		goto err;
+	} else if (unlikely(desc_attr &
+				cpu_to_le32(DESC_ATTR_RX_BUF_OVRN_ERR))) {
+		++port->stats.rx_over_errors;
+		goto err;
+	}
+
+	len = (le32_to_cpu(desc_attr) & DESC_ATTR_LEN_MASK)
+	      >> DESC_ATTR_LEN_SHIFT;
+	if (len == 0)
+		len = TBT_RING_MAX_FRAME_SIZE;
+	/* should be greater than just header i.e. contains data */
+	if (unlikely(len <= sizeof(struct tbt_frame_header))) {
+		++port->stats.rx_length_errors;
+		goto err;
+	}
+
+	prefetchw(rx_ring->buffers[frame_num].page);
+	hdr = page_address(rx_ring->buffers[frame_num].page) +
+				rx_ring->buffers[frame_num].page_offset;
+	/* prefetch first cache line of first page */
+	prefetch(hdr);
+
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(&port->nhi_ctxt->pdev->dev,
+				      rx_ring->buffers[frame_num].dma,
+				      rx_ring->buffers[frame_num].page_offset,
+				      TBT_RING_MAX_FRAME_SIZE,
+				      DMA_FROM_DEVICE);
+
+	frame_count = le32_to_cpu(hdr->frame_count);
+	frame_size = le32_to_cpu(hdr->frame_size);
+
+	if (unlikely((frame_size > len - sizeof(struct tbt_frame_header)) ||
+		     (frame_size == 0))) {
+		++port->stats.rx_length_errors;
+		goto err;
+	}
+	/*
+	 * In case we're in the middle of a packet, validate the frame header
+	 * based on the first fragment of the packet
+	 */
+	if (*count) {
+		/* check the frame count fits the count field */
+		if (frame_count != *count) {
+			++port->stats.rx_length_errors;
+			goto check_as_first;
+		}
+
+		/*
+		 * check the frame identifiers are incremented correctly,
+		 * and id is matching
+		 */
+		if ((le16_to_cpu(hdr->frame_index) != index) ||
+		    (le16_to_cpu(hdr->frame_id) != *id)) {
+			++port->stats.rx_missed_errors;
+			goto check_as_first;
+		}
+
+		*size += frame_size;
+		if (*size > TBT_NET_MTU) {
+			++port->stats.rx_length_errors;
+			goto err;
+		}
+		res = GOOD_FRAME;
+	} else { /* start of packet, validate the frame header */
+		const u8 *addr;
+
+check_as_first:
+		rx_ring->next_to_clean = frame_num;
+
+		/* validate the first packet has a valid frame count */
+		if (unlikely(frame_count == 0 ||
+			     frame_count > (TBT_NET_NUM_RX_BUFS / 4))) {
+			++port->stats.rx_length_errors;
+			goto err;
+		}
+
+		/* validate the first packet has a valid frame index */
+		if (hdr->frame_index != 0) {
+			++port->stats.rx_missed_errors;
+			goto err;
+		}
+
+		BUILD_BUG_ON(TBT_NET_RX_HDR_SIZE > TBT_RING_MAX_FRM_DATA_SZ);
+		if ((frame_count > 1) && (frame_size < TBT_NET_RX_HDR_SIZE)) {
+			++port->stats.rx_length_errors;
+			goto err;
+		}
+
+		addr = (u8 *)(hdr + 1);
+
+		/* check the packet can go through the filter */
+		if (is_multicast_ether_addr(addr)) {
+			if (!is_broadcast_ether_addr(addr)) {
+				if ((port->packet_filters &
+				     (BIT(PACKET_TYPE_PROMISCUOUS) |
+				      BIT(PACKET_TYPE_ALL_MULTICAST))) ||
+				    tbt_net_multicast_mac_set(
+					port->multicast_hash_table, addr))
+					res = GOOD_AS_FIRST_MULTICAST_FRAME;
+				else
+					goto err;
+			}
+		} else if (!(port->packet_filters &
+			     (BIT(PACKET_TYPE_PROMISCUOUS) |
+			      BIT(PACKET_TYPE_UNICAST_PROMISCUOUS))) &&
+			   !ether_addr_equal(port->net_dev->dev_addr, addr)) {
+			goto err;
+		}
+
+		*size = frame_size;
+		*count = frame_count;
+		*id = le16_to_cpu(hdr->frame_id);
+	}
+
+#if (PREFETCH_STRIDE < 128)
+	prefetch((u8 *)hdr + PREFETCH_STRIDE);
+#endif
+
+	return res;
+
+err:
+	rx_ring->next_to_clean = (frame_num + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+	return FRAME_ERROR;
+}
+
+static inline unsigned int tbt_net_max_frm_data_size(
+						__maybe_unused u32 frame_size)
+{
+#if (TBT_NUM_FRAMES_PER_PAGE > 1)
+	return ALIGN(frame_size + sizeof(struct tbt_frame_header),
+		     L1_CACHE_BYTES) -
+	       sizeof(struct tbt_frame_header);
+#else
+	return TBT_RING_MAX_FRM_DATA_SZ;
+#endif
+}
+
+static int tbt_net_poll(struct napi_struct *napi, int budget)
+{
+	struct tbt_port *port = container_of(napi, struct tbt_port, napi);
+	void __iomem *reg = TBT_RING_CONS_PROD_REG(port->nhi_ctxt->iobase,
+						   REG_RX_RING_BASE,
+						   port->local_path);
+	struct tbt_desc_ring *rx_ring = &port->rx_ring;
+	u16 cleaned_count = TBT_NUM_BUFS_BETWEEN(rx_ring->last_allocated,
+						 rx_ring->next_to_clean,
+						 TBT_NET_NUM_RX_BUFS);
+	unsigned long flags;
+	int rx_packets = 0;
+
+loop:
+	while (likely(rx_packets < budget)) {
+		struct sk_buff *skb;
+		enum frame_status status;
+		bool multicast = false;
+		u32 frame_count = 0, size;
+		u16 j, frame_id;
+		int i;
+
+		/*
+		 * return some buffers to hardware, one at a time is too slow
+		 * so allocate TBT_NET_RX_BUFFER_WRITE buffers at the same time
+		 */
+		if (cleaned_count >= TBT_NET_RX_BUFFER_WRITE) {
+			tbt_net_alloc_rx_buffers(&port->nhi_ctxt->pdev->dev,
+						 rx_ring, cleaned_count, reg,
+						 GFP_ATOMIC);
+			cleaned_count = 0;
+		}
+
+		status = tbt_net_check_frame(port, rx_ring->next_to_clean,
+					     &frame_count, 0, &frame_id,
+					     &size);
+		if (status == FRAME_NOT_READY)
+			break;
+
+		if (status == FRAME_ERROR) {
+			++cleaned_count;
+			continue;
+		}
+
+		multicast = (status == GOOD_AS_FIRST_MULTICAST_FRAME);
+
+		/*
+		 *  i is incremented up to the frame_count frames received,
+		 *  j cyclically walks the ring locations, starting from
+		 *  the next frame to clean
+		 */
+		j = (rx_ring->next_to_clean + 1);
+		j &= (TBT_NET_NUM_RX_BUFS - 1);
+		for (i = 1; i < frame_count; ++i) {
+			status = tbt_net_check_frame(port, j, &frame_count, i,
+						     &frame_id, &size);
+			if (status == FRAME_NOT_READY)
+				goto out;
+
+			j = (j + 1) & (TBT_NET_NUM_RX_BUFS - 1);
+
+			/* if a new frame is found, start over */
+			if (status == GOOD_AS_FIRST_FRAME ||
+			    status == GOOD_AS_FIRST_MULTICAST_FRAME) {
+				multicast = (status ==
+					     GOOD_AS_FIRST_MULTICAST_FRAME);
+				cleaned_count += i;
+				i = 0;
+				continue;
+			}
+
+			if (status == FRAME_ERROR) {
+				cleaned_count += (i + 1);
+				goto loop;
+			}
+		}
+
+		/* allocate a skb to store the frags */
+		skb = netdev_alloc_skb_ip_align(port->net_dev,
+						TBT_NET_RX_HDR_SIZE);
+		if (unlikely(!skb))
+			break;
+
+		/*
+		 * we will be copying header into skb->data in
+		 * tbt_net_pull_tail so it is in our interest to prefetch
+		 * it now to avoid a possible cache miss
+		 */
+		prefetchw(skb->data);
+
+		/*
+		 * if the overall packet size is smaller than
+		 * TBT_NET_RX_HDR_SIZE, the small buffer size we decided
+		 * to allocate as the base for RX
+		 */
+		if (size <= TBT_NET_RX_HDR_SIZE) {
+			struct tbt_buffer *buf =
+				&(rx_ring->buffers[rx_ring->next_to_clean]);
+			u8 *va = page_address(buf->page) + buf->page_offset +
+				 sizeof(struct tbt_frame_header);
+
+			memcpy(__skb_put(skb, size), va,
+			       ALIGN(size, sizeof(long)));
+
+			/*
+			 * Reuse the buffer as-is, as long as it is local.
+			 * Access to local memory is faster than non-local
+			 * memory, so reuse it when possible.
+			 * If it is not local, free it and reallocate later.
+			 */
+			if (likely(page_to_nid(buf->page) == numa_node_id()))
+				/* sync the buffer for use by the device */
+				dma_sync_single_range_for_device(
+						&port->nhi_ctxt->pdev->dev,
+						buf->dma, buf->page_offset,
+						TBT_RING_MAX_FRAME_SIZE,
+						DMA_FROM_DEVICE);
+			else {
+				/* this page cannot be reused so discard it */
+				put_page(buf->page);
+				buf->page = NULL;
+				dma_unmap_page(&port->nhi_ctxt->pdev->dev,
+					       buf->dma, PAGE_SIZE,
+					       DMA_FROM_DEVICE);
+			}
+			rx_ring->next_to_clean = (rx_ring->next_to_clean + 1) &
+						 (TBT_NET_NUM_RX_BUFS - 1);
+		} else {
+			for (i = 0; i < frame_count;  ++i) {
+				struct tbt_buffer *buf = &(rx_ring->buffers[
+						rx_ring->next_to_clean]);
+				struct tbt_frame_header *hdr =
+						page_address(buf->page) +
+						buf->page_offset;
+				u32 frm_size = le32_to_cpu(hdr->frame_size);
+
+				unsigned int truesize =
+					tbt_net_max_frm_data_size(frm_size);
+
+				/* add frame to skb struct */
+				skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+						buf->page,
+						sizeof(struct tbt_frame_header)
+							+ buf->page_offset,
+						frm_size, truesize);
+
+#if (TBT_NUM_FRAMES_PER_PAGE > 1)
+				/* move offset up to the next cache line */
+				buf->page_offset += (truesize +
+					sizeof(struct tbt_frame_header));
+
+				/*
+				 * we can reuse buffer if there is space
+				 * available and it is local
+				 */
+				if (page_to_nid(buf->page) == numa_node_id()
+				    && buf->page_offset <=
+					PAGE_SIZE - TBT_RING_MAX_FRAME_SIZE) {
+					/*
+					 * bump ref count on page before
+					 * it is given to the stack
+					 */
+					get_page(buf->page);
+					/*
+					 * sync the buffer for use by the
+					 * device
+					 */
+					dma_sync_single_range_for_device(
+						&port->nhi_ctxt->pdev->dev,
+						buf->dma, buf->page_offset,
+						TBT_RING_MAX_FRAME_SIZE,
+						DMA_FROM_DEVICE);
+				} else
+#endif
+				{
+					buf->page = NULL;
+					dma_unmap_page(
+						&port->nhi_ctxt->pdev->dev,
+						buf->dma, PAGE_SIZE,
+						DMA_FROM_DEVICE);
+				}
+
+				rx_ring->next_to_clean =
+						(rx_ring->next_to_clean + 1) &
+						(TBT_NET_NUM_RX_BUFS - 1);
+			}
+			/*
+			 * place header from the first
+			 * fragment in linear portion of buffer
+			 */
+			tbt_net_pull_tail(skb);
+		}
+
+		/* pad short packets */
+		if (unlikely(skb->len < ETH_ZLEN)) {
+			int pad_len = ETH_ZLEN - skb->len;
+
+			/* The skb is freed on error */
+			if (unlikely(skb_pad(skb, pad_len))) {
+				cleaned_count += frame_count;
+				continue;
+			}
+			__skb_put(skb, pad_len);
+		}
+
+		skb->protocol = eth_type_trans(skb, port->net_dev);
+		napi_gro_receive(&port->napi, skb);
+
+		++rx_packets;
+		port->stats.rx_bytes += size;
+		if (multicast)
+			++port->stats.multicast;
+		cleaned_count += frame_count;
+	}
+
+out:
+	port->stats.rx_packets += rx_packets;
+
+	if (cleaned_count)
+		tbt_net_alloc_rx_buffers(&port->nhi_ctxt->pdev->dev,
+					 rx_ring, cleaned_count, reg,
+					 GFP_ATOMIC);
+
+	/* If all work not completed, return budget and keep polling */
+	if (rx_packets >= budget)
+		return budget;
+
+	/* Work is done so exit the polling mode and re-enable the interrupt */
+	napi_complete(napi);
+
+	spin_lock_irqsave(&port->nhi_ctxt->lock, flags);
+	/* enable RX interrupt */
+	RING_INT_ENABLE_RX(port->nhi_ctxt->iobase, port->local_path,
+			   port->nhi_ctxt->num_paths);
+
+	spin_unlock_irqrestore(&port->nhi_ctxt->lock, flags);
+
+	return 0;
+}
+
+static int tbt_net_open(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	int res = 0;
+	int i, j;
+
+	/* change link state to off until path establishment finishes */
+	netif_carrier_off(net_dev);
+
+	/*
+	 * if we previously succeeded in allocating MSI-X entries,
+	 * request IRQs for them now:
+	 *  2=tx data port 0,
+	 *  3=rx data port 0,
+	 *  4=tx data port 1,
+	 *  5=rx data port 1,
+	 *  ...
+	 *  otherwise, if MSI is used, nhi_msi handles the ICM and data paths
+	 */
+	if (port->nhi_ctxt->msix_entries) {
+		char name[] = "tbt-net-xx-xx";
+
+		scnprintf(name, sizeof(name), "tbt-net-rx-%02u", port->num);
+		res = devm_request_irq(&port->nhi_ctxt->pdev->dev,
+			port->nhi_ctxt->msix_entries[3+(port->num*2)].vector,
+			tbt_net_rx_msix, 0, name, port);
+		if (res) {
+			netif_err(port, ifup, net_dev, "request_irq %s failed %d\n",
+				  name, res);
+			goto out;
+		}
+		name[8] = 't';
+		res = devm_request_irq(&port->nhi_ctxt->pdev->dev,
+			port->nhi_ctxt->msix_entries[2+(port->num*2)].vector,
+			tbt_net_tx_msix, 0, name, port);
+		if (res) {
+			netif_err(port, ifup, net_dev, "request_irq %s failed %d\n",
+				  name, res);
+			goto request_irq_failure;
+		}
+	}
+	/*
+	 * Verify that all buffer sizes are well defined.
+	 * Start by checking that frame(s) do not tip over
+	 * the page boundary
+	 */
+	BUILD_BUG_ON(TBT_NUM_FRAMES_PER_PAGE < 1);
+	/*
+	 * Just to make sure we have enough room to contain
+	 * 3 max-MTU packets for TX
+	 */
+	BUILD_BUG_ON((TBT_NET_NUM_TX_BUFS * TBT_RING_MAX_FRAME_SIZE) <
+		     (TBT_NET_MTU * 3));
+	/* make sure the number of TX Buffers is power of 2 */
+	BUILD_BUG_ON_NOT_POWER_OF_2(TBT_NET_NUM_TX_BUFS);
+	/*
+	 * Just to make sure we have enough room to contain
+	 * 3 max-MTU packets for RX
+	 */
+	BUILD_BUG_ON((TBT_NET_NUM_RX_BUFS * TBT_RING_MAX_FRAME_SIZE) <
+		     (TBT_NET_MTU * 3));
+	/* make sure the number of RX Buffers is power of 2 */
+	BUILD_BUG_ON_NOT_POWER_OF_2(TBT_NET_NUM_RX_BUFS);
+
+	port->rx_ring.last_allocated = TBT_NET_NUM_RX_BUFS - 1;
+
+	port->tx_ring.buffers = vzalloc(TBT_NET_NUM_TX_BUFS *
+					sizeof(struct tbt_buffer));
+	if (!port->tx_ring.buffers)
+		goto ring_alloc_failure;
+	port->rx_ring.buffers = vzalloc(TBT_NET_NUM_RX_BUFS *
+					sizeof(struct tbt_buffer));
+	if (!port->rx_ring.buffers)
+		goto ring_alloc_failure;
+
+	/*
+	 * Allocate TX and RX descriptors
+	 * if the total size is less than a page, do a central allocation
+	 * Otherwise, split TX and RX
+	 */
+	if (TBT_NET_SIZE_TOTAL_DESCS <= PAGE_SIZE) {
+		port->tx_ring.desc = dmam_alloc_coherent(
+				&port->nhi_ctxt->pdev->dev,
+				TBT_NET_SIZE_TOTAL_DESCS,
+				&port->tx_ring.dma,
+				GFP_KERNEL | __GFP_ZERO);
+		if (!port->tx_ring.desc)
+			goto ring_alloc_failure;
+		/* RX starts where TX finishes */
+		port->rx_ring.desc = &port->tx_ring.desc[TBT_NET_NUM_TX_BUFS];
+		port->rx_ring.dma = port->tx_ring.dma +
+			(TBT_NET_NUM_TX_BUFS * sizeof(struct tbt_buf_desc));
+	} else {
+		port->tx_ring.desc = dmam_alloc_coherent(
+				&port->nhi_ctxt->pdev->dev,
+				TBT_NET_NUM_TX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				&port->tx_ring.dma,
+				GFP_KERNEL | __GFP_ZERO);
+		if (!port->tx_ring.desc)
+			goto ring_alloc_failure;
+		port->rx_ring.desc = dmam_alloc_coherent(
+				&port->nhi_ctxt->pdev->dev,
+				TBT_NET_NUM_RX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				&port->rx_ring.dma,
+				GFP_KERNEL | __GFP_ZERO);
+		if (!port->rx_ring.desc)
+			goto rx_desc_alloc_failure;
+	}
+
+	/* allocate TX buffers and configure the descriptors */
+	for (i = 0; i < TBT_NET_NUM_TX_BUFS; i++) {
+		port->tx_ring.buffers[i].hdr = dma_alloc_coherent(
+			&port->nhi_ctxt->pdev->dev,
+			TBT_NUM_FRAMES_PER_PAGE * TBT_RING_MAX_FRAME_SIZE,
+			&port->tx_ring.buffers[i].dma,
+			GFP_KERNEL);
+		if (!port->tx_ring.buffers[i].hdr)
+			goto buffers_alloc_failure;
+
+		port->tx_ring.desc[i].phys =
+				cpu_to_le64(port->tx_ring.buffers[i].dma);
+		port->tx_ring.desc[i].attributes =
+				cpu_to_le32(DESC_ATTR_REQ_STS |
+					    TBT_NET_DESC_ATTR_SOF_EOF);
+
+		/*
+		 * In case the page is bigger than the frame size,
+		 * make the next buffer descriptors point to
+		 * the next frame memory addresses within the page
+		 */
+		for (i++, j = 1; (i < TBT_NET_NUM_TX_BUFS) &&
+				 (j < TBT_NUM_FRAMES_PER_PAGE); i++, j++) {
+			port->tx_ring.buffers[i].dma =
+				port->tx_ring.buffers[i - 1].dma +
+				TBT_RING_MAX_FRAME_SIZE;
+			port->tx_ring.buffers[i].hdr =
+				(void *)(port->tx_ring.buffers[i - 1].hdr) +
+				TBT_RING_MAX_FRAME_SIZE;
+			/* move the next offset i.e. TBT_RING_MAX_FRAME_SIZE */
+			port->tx_ring.buffers[i].page_offset =
+				port->tx_ring.buffers[i - 1].page_offset +
+				TBT_RING_MAX_FRAME_SIZE;
+			port->tx_ring.desc[i].phys =
+				cpu_to_le64(port->tx_ring.buffers[i].dma);
+			port->tx_ring.desc[i].attributes =
+				cpu_to_le32(DESC_ATTR_REQ_STS |
+					    TBT_NET_DESC_ATTR_SOF_EOF);
+		}
+		i--;
+	}
+
+	port->negotiation_status =
+			BIT(port->nhi_ctxt->net_devices[port->num].medium_sts);
+	if (port->negotiation_status == BIT(MEDIUM_READY_FOR_CONNECTION)) {
+		port->login_retry_count = 0;
+		queue_delayed_work(port->nhi_ctxt->net_workqueue,
+				   &port->login_retry_work, 0);
+	}
+
+	netif_info(port, ifup, net_dev, "Thunderbolt(TM) Networking port %u - ready for ThunderboltIP negotiation\n",
+		   port->num);
+	return 0;
+
+buffers_alloc_failure:
+	/*
+	 * Roll back the Tx buffers that were already allocated
+	 * before the failure
+	 */
+	for (i--; i >= 0; i--) {
+		/* free only for first buffer allocation */
+		if (port->tx_ring.buffers[i].page_offset == 0)
+			dma_free_coherent(&port->nhi_ctxt->pdev->dev,
+					  TBT_NUM_FRAMES_PER_PAGE *
+						TBT_RING_MAX_FRAME_SIZE,
+					  port->tx_ring.buffers[i].hdr,
+					  port->tx_ring.buffers[i].dma);
+		port->tx_ring.buffers[i].hdr = NULL;
+	}
+	/*
+	 * For a central allocation, free everything at once;
+	 * otherwise free RX and then TX separately
+	 */
+	if (TBT_NET_SIZE_TOTAL_DESCS <= PAGE_SIZE) {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_SIZE_TOTAL_DESCS,
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+		port->rx_ring.desc = NULL;
+	} else {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_RX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->rx_ring.desc,
+				   port->rx_ring.dma);
+		port->rx_ring.desc = NULL;
+rx_desc_alloc_failure:
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_TX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+	}
+	port->tx_ring.desc = NULL;
+ring_alloc_failure:
+	vfree(port->tx_ring.buffers);
+	port->tx_ring.buffers = NULL;
+	vfree(port->rx_ring.buffers);
+	port->rx_ring.buffers = NULL;
+	res = -ENOMEM;
+	netif_err(port, ifup, net_dev, "Thunderbolt(TM) Networking port %u - unable to allocate memory\n",
+		  port->num);
+
+	if (!port->nhi_ctxt->msix_entries)
+		goto out;
+
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[2 + (port->num * 2)].vector,
+		      port);
+request_irq_failure:
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[3 + (port->num * 2)].vector,
+		      port);
+out:
+	return res;
+}
+
+static int tbt_net_close(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	int i;
+
+	/*
+	 * Close connection, disable rings, flow controls
+	 * and interrupts
+	 */
+	tbt_net_tear_down(net_dev, !(port->negotiation_status &
+				     BIT(RECEIVE_LOGOUT)));
+
+	cancel_work_sync(&port->login_response_work);
+	cancel_work_sync(&port->logout_work);
+	cancel_work_sync(&port->status_reply_work);
+	cancel_work_sync(&port->approve_inter_domain_work);
+
+	/* Rollback the Tx buffers that were allocated */
+	for (i = 0; i < TBT_NET_NUM_TX_BUFS; i++) {
+		if (port->tx_ring.buffers[i].page_offset == 0)
+			dma_free_coherent(&port->nhi_ctxt->pdev->dev,
+					  TBT_NUM_FRAMES_PER_PAGE *
+						TBT_RING_MAX_FRAME_SIZE,
+					  port->tx_ring.buffers[i].hdr,
+					  port->tx_ring.buffers[i].dma);
+		port->tx_ring.buffers[i].hdr = NULL;
+	}
+	/* Unmap the Rx buffers that were allocated */
+	for (i = 0; i < TBT_NET_NUM_RX_BUFS; i++)
+		if (port->rx_ring.buffers[i].page) {
+			put_page(port->rx_ring.buffers[i].page);
+			port->rx_ring.buffers[i].page = NULL;
+			dma_unmap_page(&port->nhi_ctxt->pdev->dev,
+				       port->rx_ring.buffers[i].dma, PAGE_SIZE,
+				       DMA_FROM_DEVICE);
+		}
+
+	/*
+	 * For a central allocation, free everything at once;
+	 * otherwise free RX and then TX separately
+	 */
+	if (TBT_NET_SIZE_TOTAL_DESCS <= PAGE_SIZE) {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_SIZE_TOTAL_DESCS,
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+		port->rx_ring.desc = NULL;
+	} else {
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_RX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->rx_ring.desc,
+				   port->rx_ring.dma);
+		port->rx_ring.desc = NULL;
+		dmam_free_coherent(&port->nhi_ctxt->pdev->dev,
+				   TBT_NET_NUM_TX_BUFS *
+						sizeof(struct tbt_buf_desc),
+				   port->tx_ring.desc,
+				   port->tx_ring.dma);
+	}
+	port->tx_ring.desc = NULL;
+
+	vfree(port->tx_ring.buffers);
+	port->tx_ring.buffers = NULL;
+	vfree(port->rx_ring.buffers);
+	port->rx_ring.buffers = NULL;
+
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[3 + (port->num * 2)].vector,
+		      port);
+	devm_free_irq(&port->nhi_ctxt->pdev->dev,
+		      port->nhi_ctxt->msix_entries[2 + (port->num * 2)].vector,
+		      port);
+
+	netif_info(port, ifdown, net_dev, "Thunderbolt(TM) Networking port %u - is down\n",
+		   port->num);
+
+	return 0;
+}
+
+static bool tbt_net_xmit_csum(struct sk_buff *skb,
+			      struct tbt_desc_ring *tx_ring, u32 first,
+			      u32 last, u32 frame_count)
+{
+
+	struct tbt_frame_header *hdr = tx_ring->buffers[first].hdr;
+	__wsum wsum = (__force __wsum)htonl(skb->len -
+					    skb_transport_offset(skb));
+	int offset = skb_transport_offset(skb);
+	__sum16 *tucso;  /* TCP UDP Checksum Segment Offset */
+	__be16 protocol = skb->protocol;
+	u8 *dest = (u8 *)(hdr + 1);
+	int len;
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		for (; first != last;
+			first = (first + 1) & (TBT_NET_NUM_TX_BUFS - 1)) {
+			hdr = tx_ring->buffers[first].hdr;
+			hdr->frame_count = cpu_to_le32(frame_count);
+		}
+		return true;
+	}
+
+	if (protocol == htons(ETH_P_8021Q)) {
+		struct vlan_hdr *vhdr, vh;
+
+		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh);
+		if (!vhdr)
+			return false;
+
+		protocol = vhdr->h_vlan_encapsulated_proto;
+	}
+
+	/*
+	 * Data points to the beginning of the packet.
+	 * Compute the absolute position of the checksum
+	 * within the packet.
+	 * ipcso will update the IP checksum.
+	 * tucso will update the TCP/UDP checksum.
+	 */
+	if (protocol == htons(ETH_P_IP)) {
+		__sum16 *ipcso = (__sum16 *)(dest +
+			((u8 *)&(ip_hdr(skb)->check) - skb->data));
+
+		*ipcso = 0;
+		*ipcso = ip_fast_csum(dest + skb_network_offset(skb),
+				      ip_hdr(skb)->ihl);
+		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(tcp_hdr(skb)->check) - skb->data));
+		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(udp_hdr(skb)->check) - skb->data));
+		else
+			return false;
+
+		*tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
+					    ip_hdr(skb)->daddr, 0,
+					    ip_hdr(skb)->protocol, 0);
+	} else if (skb_is_gso(skb)) {
+		if (skb_is_gso_v6(skb)) {
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(tcp_hdr(skb)->check) - skb->data));
+			*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						  &ipv6_hdr(skb)->daddr,
+						  0, IPPROTO_TCP, 0);
+		} else if ((protocol == htons(ETH_P_IPV6)) &&
+			   (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) {
+			tucso = (__sum16 *)(dest +
+				((u8 *)&(udp_hdr(skb)->check) - skb->data));
+			*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+						  &ipv6_hdr(skb)->daddr,
+						  0, IPPROTO_UDP, 0);
+		} else {
+			return false;
+		}
+	} else if (protocol == htons(ETH_P_IPV6)) {
+		tucso = (__sum16 *)(dest + skb_checksum_start_offset(skb) +
+				    skb->csum_offset);
+		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+					  &ipv6_hdr(skb)->daddr,
+					  0, ipv6_hdr(skb)->nexthdr, 0);
+	} else {
+		return false;
+	}
+
+	/* First frame was headers, the rest of the frames are data */
+	for (; first != last; first = (first + 1) & (TBT_NET_NUM_TX_BUFS - 1),
+								offset = 0) {
+		hdr = tx_ring->buffers[first].hdr;
+		dest = (u8 *)(hdr + 1) + offset;
+		len = le32_to_cpu(hdr->frame_size) - offset;
+		wsum = csum_partial(dest, len, wsum);
+		hdr->frame_count = cpu_to_le32(frame_count);
+	}
+	*tucso = csum_fold(wsum);
+
+	return true;
+}
+
+static netdev_tx_t tbt_net_xmit_frame(struct sk_buff *skb,
+				      struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	void __iomem *iobase = port->nhi_ctxt->iobase;
+	void __iomem *reg = TBT_RING_CONS_PROD_REG(iobase,
+						   REG_TX_RING_BASE,
+						   port->local_path);
+	struct tbt_desc_ring *tx_ring = &port->tx_ring;
+	struct tbt_frame_header *hdr;
+	u32 prod_cons, prod, cons, first;
+	/* len equivalent to the fragment length */
+	unsigned int len = skb_headlen(skb);
+	/* data_len is overall packet length */
+	unsigned int data_len = skb->len;
+	u32 frm_idx, frag_num = 0;
+	const u8 *src = skb->data;
+	bool unmap = false;
+	__le32 *attr;
+	u8 *dest;
+
+	if (unlikely(data_len == 0 || data_len > TBT_NET_MTU))
+		goto invalid_packet;
+
+	prod_cons = ioread32(reg);
+	prod = TBT_REG_RING_PROD_EXTRACT(prod_cons);
+	cons = TBT_REG_RING_CONS_EXTRACT(prod_cons);
+	if (prod >= TBT_NET_NUM_TX_BUFS || cons >= TBT_NET_NUM_TX_BUFS)
+		goto tx_error;
+
+	if (data_len > (TBT_NUM_BUFS_BETWEEN(prod, cons, TBT_NET_NUM_TX_BUFS) *
+			TBT_RING_MAX_FRM_DATA_SZ)) {
+		unsigned long flags;
+
+		netif_stop_queue(net_dev);
+
+		spin_lock_irqsave(&port->nhi_ctxt->lock, flags);
+		/*
+		 * Enable TX interrupt to be notified about available buffers
+		 * and restart transmission upon this.
+		 */
+		RING_INT_ENABLE_TX(iobase, port->local_path);
+		spin_unlock_irqrestore(&port->nhi_ctxt->lock, flags);
+
+		return NETDEV_TX_BUSY;
+	}
+
+	first = prod;
+	attr = &tx_ring->desc[prod].attributes;
+	hdr = tx_ring->buffers[prod].hdr;
+	dest = (u8 *)(hdr + 1);
+	/* if overall packet is bigger than the frame data size */
+	for (frm_idx = 0; data_len > TBT_RING_MAX_FRM_DATA_SZ; ++frm_idx) {
+		u32 size_left = TBT_RING_MAX_FRM_DATA_SZ;
+
+		*attr &= cpu_to_le32(~(DESC_ATTR_LEN_MASK |
+				      DESC_ATTR_INT_EN |
+				      DESC_ATTR_DESC_DONE));
+		hdr->frame_size = cpu_to_le32(TBT_RING_MAX_FRM_DATA_SZ);
+		hdr->frame_index = cpu_to_le16(frm_idx);
+		hdr->frame_id = cpu_to_le16(port->frame_id);
+
+		do {
+			if (len > size_left) {
+				/*
+				 * Copy a full frame of data into the tx
+				 * buffer, then break and go to the
+				 * next frame
+				 */
+				memcpy(dest, src, size_left);
+				len -= size_left;
+				dest += size_left;
+				src += size_left;
+				break;
+			}
+
+			memcpy(dest, src, len);
+			size_left -= len;
+			dest += len;
+
+			if (unmap) {
+				kunmap_atomic((void *)src);
+				unmap = false;
+			}
+			/*
+			 * Ensure all fragments have been processed
+			 */
+			if (frag_num < skb_shinfo(skb)->nr_frags) {
+				const skb_frag_t *frag =
+					&(skb_shinfo(skb)->frags[frag_num]);
+				len = skb_frag_size(frag);
+				/* map and then unmap quickly */
+				src = kmap_atomic(skb_frag_page(frag)) +
+							frag->page_offset;
+				unmap = true;
+				++frag_num;
+			} else if (unlikely(size_left > 0)) {
+				goto invalid_packet;
+			}
+		} while (size_left > 0);
+
+		data_len -= TBT_RING_MAX_FRM_DATA_SZ;
+		prod = (prod + 1) & (TBT_NET_NUM_TX_BUFS - 1);
+		attr = &tx_ring->desc[prod].attributes;
+		hdr = tx_ring->buffers[prod].hdr;
+		dest = (u8 *)(hdr + 1);
+	}
+
+	*attr &= cpu_to_le32(~(DESC_ATTR_LEN_MASK | DESC_ATTR_DESC_DONE));
+	/* Enable the interrupt, for resuming from a stopped queue later */
+	*attr |= cpu_to_le32(DESC_ATTR_INT_EN |
+		(((sizeof(struct tbt_frame_header) + data_len) <<
+		  DESC_ATTR_LEN_SHIFT) & DESC_ATTR_LEN_MASK));
+	hdr->frame_size = cpu_to_le32(data_len);
+	hdr->frame_index = cpu_to_le16(frm_idx);
+	hdr->frame_id = cpu_to_le16(port->frame_id);
+
+	/* In case the remaining data_len is smaller than a frame */
+	while (len < data_len) {
+		memcpy(dest, src, len);
+		data_len -= len;
+		dest += len;
+
+		if (unmap) {
+			kunmap_atomic((void *)src);
+			unmap = false;
+		}
+
+		if (frag_num < skb_shinfo(skb)->nr_frags) {
+			const skb_frag_t *frag =
+					&(skb_shinfo(skb)->frags[frag_num]);
+			len = skb_frag_size(frag);
+			src = kmap_atomic(skb_frag_page(frag)) +
+							frag->page_offset;
+			unmap = true;
+			++frag_num;
+		} else if (unlikely(data_len > 0)) {
+			goto invalid_packet;
+		}
+	}
+	memcpy(dest, src, data_len);
+	if (unmap) {
+		kunmap_atomic((void *)src);
+		unmap = false;
+	}
+
+	++frm_idx;
+	prod = (prod + 1) & (TBT_NET_NUM_TX_BUFS - 1);
+
+	if (!tbt_net_xmit_csum(skb, tx_ring, first, prod, frm_idx))
+		goto invalid_packet;
+
+	if (port->match_frame_id)
+		++port->frame_id;
+
+	prod_cons &= ~REG_RING_PROD_MASK;
+	prod_cons |= (prod << REG_RING_PROD_SHIFT) & REG_RING_PROD_MASK;
+	wmb(); /* make sure producer update is done after buffers are ready */
+	iowrite32(prod_cons, reg);
+
+	++port->stats.tx_packets;
+	port->stats.tx_bytes += skb->len;
+
+	dev_consume_skb_any(skb);
+	return NETDEV_TX_OK;
+
+invalid_packet:
+	netif_err(port, tx_err, net_dev, "port %u invalid transmit packet\n",
+		  port->num);
+tx_error:
+	++port->stats.tx_errors;
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+static void tbt_net_set_rx_mode(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+	struct netdev_hw_addr *ha;
+
+	if (net_dev->flags & IFF_PROMISC)
+		port->packet_filters |= BIT(PACKET_TYPE_PROMISCUOUS);
+	else
+		port->packet_filters &= ~BIT(PACKET_TYPE_PROMISCUOUS);
+	if (net_dev->flags & IFF_ALLMULTI)
+		port->packet_filters |= BIT(PACKET_TYPE_ALL_MULTICAST);
+	else
+		port->packet_filters &= ~BIT(PACKET_TYPE_ALL_MULTICAST);
+
+	/* if you have more than a single MAC address */
+	if (netdev_uc_count(net_dev) > 1)
+		port->packet_filters |= BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+	/* if there is a single MAC address */
+	else if (netdev_uc_count(net_dev) == 1) {
+		netdev_for_each_uc_addr(ha, net_dev)
+			/* checks whether the MAC is what we set */
+			if (ether_addr_equal(ha->addr, net_dev->dev_addr))
+				port->packet_filters &=
+					~BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+			else
+				port->packet_filters |=
+					BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+	} else {
+		port->packet_filters &= ~BIT(PACKET_TYPE_UNICAST_PROMISCUOUS);
+	}
+
+	/* Populate the multicast hash table with received MAC addresses */
+	memset(port->multicast_hash_table, 0,
+	       sizeof(port->multicast_hash_table));
+	netdev_for_each_mc_addr(ha, net_dev) {
+		u16 hash_val = TBT_NET_ETHER_ADDR_HASH(ha->addr);
+
+		port->multicast_hash_table[hash_val / BITS_PER_U32] |=
+						BIT(hash_val % BITS_PER_U32);
+	}
+
+}
+
+static struct rtnl_link_stats64 *tbt_net_get_stats64(
+					struct net_device *net_dev,
+					struct rtnl_link_stats64 *stats)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	memset(stats, 0, sizeof(*stats));
+	stats->tx_packets = port->stats.tx_packets;
+	stats->tx_bytes = port->stats.tx_bytes;
+	stats->tx_errors = port->stats.tx_errors;
+	stats->rx_packets = port->stats.rx_packets;
+	stats->rx_bytes = port->stats.rx_bytes;
+	stats->rx_length_errors = port->stats.rx_length_errors;
+	stats->rx_over_errors = port->stats.rx_over_errors;
+	stats->rx_crc_errors = port->stats.rx_crc_errors;
+	stats->rx_missed_errors = port->stats.rx_missed_errors;
+	stats->rx_errors = stats->rx_length_errors + stats->rx_over_errors +
+			   stats->rx_crc_errors + stats->rx_missed_errors;
+	stats->multicast = port->stats.multicast;
+	return stats;
 }
 
+static int tbt_net_set_mac_address(struct net_device *net_dev, void *addr)
+{
+	struct sockaddr *saddr = addr;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(net_dev->dev_addr, saddr->sa_data, net_dev->addr_len);
+
+	return 0;
+}
+
+static int tbt_net_change_mtu(struct net_device *net_dev, int new_mtu)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	/* MTU < 68 is an error and causes problems on some kernels */
+	if (new_mtu < 68 || new_mtu > (TBT_NET_MTU - ETH_HLEN))
+		return -EINVAL;
+
+	netif_info(port, probe, net_dev, "Thunderbolt(TM) Networking port %u - changing MTU from %u to %d\n",
+		   port->num, net_dev->mtu, new_mtu);
+
+	net_dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static const struct net_device_ops tbt_netdev_ops = {
+	/* called when the network interface is brought up */
+	.ndo_open		= tbt_net_open,
+	/* called when the network interface is brought down */
+	.ndo_stop		= tbt_net_close,
+	.ndo_start_xmit		= tbt_net_xmit_frame,
+	.ndo_set_rx_mode	= tbt_net_set_rx_mode,
+	.ndo_get_stats64	= tbt_net_get_stats64,
+	.ndo_set_mac_address	= tbt_net_set_mac_address,
+	.ndo_change_mtu		= tbt_net_change_mtu,
+	.ndo_validate_addr	= eth_validate_addr,
+};
+
+static int tbt_net_get_settings(__maybe_unused struct net_device *net_dev,
+				struct ethtool_cmd *ecmd)
+{
+	ecmd->supported |= SUPPORTED_20000baseKR2_Full;
+	ecmd->advertising |= ADVERTISED_20000baseKR2_Full;
+	ecmd->autoneg = AUTONEG_DISABLE;
+	ecmd->transceiver = XCVR_INTERNAL;
+	ecmd->supported |= SUPPORTED_FIBRE;
+	ecmd->advertising |= ADVERTISED_FIBRE;
+	ecmd->port = PORT_FIBRE;
+	ethtool_cmd_speed_set(ecmd, SPEED_20000);
+	ecmd->duplex = DUPLEX_FULL;
+
+	return 0;
+}
+
+
+static u32 tbt_net_get_msglevel(struct net_device *net_dev)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	return port->msg_enable;
+}
+
+static void tbt_net_set_msglevel(struct net_device *net_dev, u32 data)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	port->msg_enable = data;
+}
+
+static void tbt_net_get_strings(__maybe_unused struct net_device *net_dev,
+				u32 stringset, u8 *data)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(data, tbt_net_gstrings_stats,
+		       sizeof(tbt_net_gstrings_stats));
+}
+
+static void tbt_net_get_ethtool_stats(struct net_device *net_dev,
+				      __maybe_unused struct ethtool_stats *sts,
+				      u64 *data)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	memcpy(data, &port->stats, sizeof(port->stats));
+}
+
+static int tbt_net_get_sset_count(__maybe_unused struct net_device *net_dev,
+				  int sset)
+{
+	if (sset == ETH_SS_STATS)
+		return sizeof(tbt_net_gstrings_stats) / ETH_GSTRING_LEN;
+	return -EOPNOTSUPP;
+}
+
+static void tbt_net_get_drvinfo(struct net_device *net_dev,
+				struct ethtool_drvinfo *drvinfo)
+{
+	struct tbt_port *port = netdev_priv(net_dev);
+
+	strlcpy(drvinfo->driver, "Thunderbolt(TM) Networking",
+		sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
+
+	strlcpy(drvinfo->bus_info, pci_name(port->nhi_ctxt->pdev),
+		sizeof(drvinfo->bus_info));
+	drvinfo->n_stats = tbt_net_get_sset_count(net_dev, ETH_SS_STATS);
+}
+
+static const struct ethtool_ops tbt_net_ethtool_ops = {
+	.get_settings		= tbt_net_get_settings,
+	.get_drvinfo		= tbt_net_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_msglevel		= tbt_net_get_msglevel,
+	.set_msglevel		= tbt_net_set_msglevel,
+	.get_strings		= tbt_net_get_strings,
+	.get_ethtool_stats	= tbt_net_get_ethtool_stats,
+	.get_sset_count		= tbt_net_get_sset_count,
+};
+
 static inline int send_message(struct tbt_port *port, const char *func,
 				enum pdf_value pdf, u32 msg_len, const u8 *msg)
 {
@@ -515,6 +1943,10 @@ void negotiation_events(struct net_device *net_dev,
 		/* configure TX ring */
 		reg = iobase + REG_TX_RING_BASE +
 		      (port->local_path * REG_RING_STEP);
+		iowrite32(lower_32_bits(port->tx_ring.dma),
+			  reg + REG_RING_PHYS_LO_OFFSET);
+		iowrite32(upper_32_bits(port->tx_ring.dma),
+			  reg + REG_RING_PHYS_HI_OFFSET);
 
 		tx_ring_conf = (TBT_NET_NUM_TX_BUFS << REG_RING_SIZE_SHIFT) &
 				REG_RING_SIZE_MASK;
@@ -557,6 +1989,10 @@ void negotiation_events(struct net_device *net_dev,
 		 */
 		reg = iobase + REG_RX_RING_BASE +
 		      (port->local_path * REG_RING_STEP);
+		iowrite32(lower_32_bits(port->rx_ring.dma),
+			  reg + REG_RING_PHYS_LO_OFFSET);
+		iowrite32(upper_32_bits(port->rx_ring.dma),
+			  reg + REG_RING_PHYS_HI_OFFSET);
 
 		rx_ring_conf = (TBT_NET_NUM_RX_BUFS << REG_RING_SIZE_SHIFT) &
 				REG_RING_SIZE_MASK;
@@ -566,6 +2002,17 @@ void negotiation_events(struct net_device *net_dev,
 				REG_RING_BUF_SIZE_MASK;
 
 		iowrite32(rx_ring_conf, reg + REG_RING_SIZE_OFFSET);
+		/* allocate RX buffers and configure the descriptors */
+		if (!tbt_net_alloc_rx_buffers(&port->nhi_ctxt->pdev->dev,
+					      &port->rx_ring,
+					      TBT_NET_NUM_RX_BUFS,
+					      reg + REG_RING_CONS_PROD_OFFSET,
+					      GFP_KERNEL)) {
+			netif_err(port, link, net_dev, "Thunderbolt(TM) Networking port %u - no memory for receive buffers\n",
+				  port->num);
+			tbt_net_tear_down(net_dev, true);
+			break;
+		}
 
 		spin_lock_irqsave(&port->nhi_ctxt->lock, flags);
 		/* enable RX interrupt */
@@ -578,6 +2025,7 @@ void negotiation_events(struct net_device *net_dev,
 		netif_info(port, link, net_dev, "Thunderbolt(TM) Networking port %u - ready\n",
 			   port->num);
 
+		napi_enable(&port->napi);
 		netif_carrier_on(net_dev);
 		netif_start_queue(net_dev);
 		break;
@@ -788,15 +2236,42 @@ struct net_device *nhi_alloc_etherdev(struct tbt_nhi_ctxt *nhi_ctxt,
 	scnprintf(net_dev->name, sizeof(net_dev->name), "tbtnet%%dp%hhu",
 		  port_num);
 
+	net_dev->netdev_ops = &tbt_netdev_ops;
+
+	netif_napi_add(net_dev, &port->napi, tbt_net_poll, NAPI_POLL_WEIGHT);
+
+	net_dev->hw_features = NETIF_F_SG |
+			       NETIF_F_ALL_TSO |
+			       NETIF_F_UFO |
+			       NETIF_F_GRO |
+			       NETIF_F_IP_CSUM |
+			       NETIF_F_IPV6_CSUM;
+	net_dev->features = net_dev->hw_features;
+	if (nhi_ctxt->pci_using_dac)
+		net_dev->features |= NETIF_F_HIGHDMA;
+
 	INIT_DELAYED_WORK(&port->login_retry_work, login_retry);
 	INIT_WORK(&port->login_response_work, login_response);
 	INIT_WORK(&port->logout_work, logout);
 	INIT_WORK(&port->status_reply_work, status_reply);
 	INIT_WORK(&port->approve_inter_domain_work, approve_inter_domain);
 
+	net_dev->ethtool_ops = &tbt_net_ethtool_ops;
+
+	tbt_net_change_mtu(net_dev, TBT_NET_MTU - ETH_HLEN);
+
+	if (register_netdev(net_dev))
+		goto err_register;
+
+	netif_carrier_off(net_dev);
+
 	netif_info(port, probe, net_dev,
 		   "Thunderbolt(TM) Networking port %u - MAC Address: %pM\n",
 		   port_num, net_dev->dev_addr);
 
 	return net_dev;
+
+err_register:
+	free_netdev(net_dev);
+	return NULL;
 }
-- 
2.7.4
