Message-Id: <f85e6be597ae30fd4dab776b924c0cd0a66514c0.1312314817.git.mirq-linux@rere.qmqm.pl>
Date:	Tue,  2 Aug 2011 22:24:35 +0200 (CEST)
From:	Michał Mirosław <mirq-linux@...e.qmqm.pl>
To:	netdev@...r.kernel.org
Subject: [RFC PATCH] common receive API + r8169 use

Here is a preliminary version of a common RX path for network drivers. The idea
is an extension of Eric Dumazet's patch introducing build_skb() (which is
incorporated here for easier testing).
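
For reviewers who want the shape of the API before digging into the diff, a
driver hooks into it roughly as follows (condensed from the r8169 changes
below; the foo_* callbacks, priv, err, NAPI_WEIGHT, RX_BUF_SIZE and struct
foo_rx_desc are placeholders, the netdev_* symbols are what this patch adds):

	static const struct netdev_ring_ops foo_rx_ring_ops = {
		.add_buffer	 = foo_add_rx_buffer,	/* post buffer + DMA addr to HW ring */
		.get_buffer_addr = foo_get_rx_buffer_addr, /* DMA addr of descriptor i */
		.process_buffer	 = foo_rx_buffer,	/* consume one completed descriptor */
		.poll_complete	 = foo_rx_complete,	/* re-enable RX interrupts */
	};

	/* probe: */
	netdev_add_ring(dev, &priv->rx_ring, &foo_rx_ring_ops, NAPI_WEIGHT);

	/* open: */
	err = netdev_alloc_ring(&priv->rx_ring, &pdev->dev,
				sizeof(struct foo_rx_desc), NUM_RX_DESC);
	/* err < 0 means allocation failed */
	priv->rx_ring.bufsz = RX_BUF_SIZE;
	netdev_fill_rx_ring(&priv->rx_ring);
	napi_enable(&priv->rx_ring.napi);

	/* down/close: */
	napi_disable(&priv->rx_ring.napi);
	netdev_clear_rx_ring(&priv->rx_ring);
	netdev_free_ring(&priv->rx_ring, sizeof(struct foo_rx_desc));

netdev_rx_poll() then runs from NAPI context, calling ops.process_buffer()
until the budget is exhausted or the ring is empty, and refilling the ring
through ops.add_buffer() via netdev_fill_rx_ring().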

Future plans:
 - extend this API to devices which can do split buffer receives correctly
   and use napi_gro_frags() instead;
 - implement DaveM's idea of RX buffer handling (fill first, process
   if buffers available) in parallel to my version (process first, refill
   later);
 - get rid of the indirect calls in the fast path (process_buffer() and
   add_buffer()) - ideas? inline netdev_rx_poll() and pass the callbacks to it
   (one possible shape is sketched below)?
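
On that last point, one possible shape (purely an illustration, not part of
this patch; __netdev_rx_poll() and rtl_rx_poll() are made-up names, while
rtl_rx_buffer()/rtl_rx_complete() are from the r8169 changes below): make the
poll loop an inline helper that takes the callbacks as direct arguments, so
each driver instantiates its own poll function and the compiler can resolve
(and inline) the calls:

	static inline int __netdev_rx_poll(struct netdev_ring *ring, int budget,
		int (*process)(struct netdev_ring *),
		void (*complete)(struct netdev_ring *))
	{
		int work = 0;

		/* same logic as netdev_rx_poll(), callbacks bound at compile time */
		while (work < budget && process(ring) != -ENOENT)
			++work;

		netdev_fill_rx_ring(ring);

		if (work < budget) {
			complete(ring);
			if (process(ring) == -ENOENT)
				napi_complete(&ring->napi);
			else	/* raced with an RX indication */
				++work;
		}

		return work;
	}

	/* in the driver: */
	static int rtl_rx_poll(struct napi_struct *napi, int budget)
	{
		struct netdev_ring *ring =
			container_of(napi, struct netdev_ring, napi);

		return __netdev_rx_poll(ring, budget, rtl_rx_buffer,
					rtl_rx_complete);
	}

This keeps the generic logic in one place but moves the per-driver binding to
compile time; whether the compiler actually turns the calls into direct ones
would need checking.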

A version rebased on v3.0 has been running successfully for about a week on one
laptop with an onboard r8169, with no problems so far. For net-next this needs
retesting because of the changes in device reset handling.

Card ID:

r8169 0000:05:00.0: eth0: RTL8168e/8111e at 0xffffc90000678000, 78:2b:cb:ec:df:54, XID 0c200000, ver 32, IRQ 45

lspci -v:

05:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168B PCI Express Gigabit Ethernet controller (rev 06)
        Subsystem: Dell Device 04b2
        Flags: bus master, fast devsel, latency 0, IRQ 45
        I/O ports at d000 [size=256]
        Memory at f1104000 (64-bit, prefetchable) [size=4K]
        Memory at f1100000 (64-bit, prefetchable) [size=16K]
        Capabilities: [40] Power Management version 3
        Capabilities: [50] MSI: Enable+ Count=1/1 Maskable- 64bit+
        Capabilities: [70] Express Endpoint, MSI 01
        Capabilities: [b0] MSI-X: Enable- Count=4 Masked-
        Capabilities: [d0] Vital Product Data
        Capabilities: [100] Advanced Error Reporting
        Capabilities: [140] Virtual Channel
        Capabilities: [160] Device Serial Number [...]
        Kernel driver in use: r8169

Signed-off-by: Michał Mirosław <mirq-linux@...e.qmqm.pl>
---
 drivers/net/r8169.c       |  204 ++++++++++++++++++++++++++++++++++++-----
 include/linux/netdevice.h |  227 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/skbuff.c         |   49 ++++++++++
 3 files changed, 457 insertions(+), 23 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 7d9c650..c0813fd 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -7,6 +7,7 @@
  *
  * See MAINTAINERS file for support contact information.
  */
+//#define NO_COMMON_RX_API
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
@@ -33,7 +34,7 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 
-#define RTL8169_VERSION "2.3LK-NAPI"
+#define RTL8169_VERSION "in-tree+mq"
 #define MODULENAME "r8169"
 #define PFX MODULENAME ": "
 
@@ -651,6 +652,7 @@ struct rtl8169_private {
 	dma_addr_t TxPhyAddr;
 	dma_addr_t RxPhyAddr;
 	void *Rx_databuff[NUM_RX_DESC];	/* Rx data buffers */
+	struct netdev_ring rx_ring;
 	struct ring_info tx_skb[NUM_TX_DESC];	/* Tx data buffers */
 	struct timer_list timer;
 	u16 cp_cmd;
@@ -728,6 +730,20 @@ static void rtl8169_down(struct net_device *dev);
 static void rtl8169_rx_clear(struct rtl8169_private *tp);
 static int rtl8169_poll(struct napi_struct *napi, int budget);
 
+static int rtl_add_rx_buffer(struct netdev_ring *ring, void *buf,
+	dma_addr_t dma);
+static dma_addr_t rtl_get_rx_buffer_addr(struct netdev_ring *ring,
+	unsigned int i);
+static int rtl_rx_buffer(struct netdev_ring *ring);
+static void rtl_rx_complete(struct netdev_ring *ring);
+
+static const struct netdev_ring_ops rtl_rx_ring_ops = {
+	.add_buffer = rtl_add_rx_buffer,
+	.get_buffer_addr = rtl_get_rx_buffer_addr,
+	.process_buffer = rtl_rx_buffer,
+	.poll_complete = rtl_rx_complete,
+};
+
 static u32 ocp_read(struct rtl8169_private *tp, u8 mask, u16 reg)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -3729,6 +3745,9 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev->base_addr = (unsigned long) ioaddr;
 
 	netif_napi_add(dev, &tp->napi, rtl8169_poll, R8169_NAPI_WEIGHT);
+#ifndef NO_COMMON_RX_API
+	netdev_add_ring(dev, &tp->rx_ring, &rtl_rx_ring_ops, R8169_NAPI_WEIGHT);
+#endif
 
 	/* don't enable SG, IP_CSUM and TSO by default - it might not work
 	 * properly for all devices */
@@ -3761,9 +3780,10 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	pci_set_drvdata(pdev, dev);
 
-	netif_info(tp, probe, dev, "%s at 0x%lx, %pM, XID %08x IRQ %d\n",
+	netif_info(tp, probe, dev, "%s at 0x%lx, %pM, XID %08x, ver %u, IRQ %d\n",
 		   rtl_chip_infos[chipset].name, dev->base_addr, dev->dev_addr,
-		   (u32)(RTL_R32(TxConfig) & 0x9cf0f8ff), dev->irq);
+		   (u32)(RTL_R32(TxConfig) & 0x9cf0f8ff), tp->mac_version,
+		   dev->irq);
 
 	if (tp->mac_version == RTL_GIGA_MAC_VER_27 ||
 	    tp->mac_version == RTL_GIGA_MAC_VER_28 ||
@@ -3883,12 +3903,17 @@ static int rtl8169_open(struct net_device *dev)
 					     &tp->TxPhyAddr, GFP_KERNEL);
 	if (!tp->TxDescArray)
 		goto err_pm_runtime_put;
-
+#ifdef NO_COMMON_RX_API
 	tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES,
 					     &tp->RxPhyAddr, GFP_KERNEL);
 	if (!tp->RxDescArray)
 		goto err_free_tx_0;
-
+#else
+	retval = netdev_alloc_ring(&tp->rx_ring, &pdev->dev, sizeof(struct RxDesc),
+		NUM_RX_DESC);
+	if (retval < 0)
+		goto err_free_tx_0;
+#endif
 	retval = rtl8169_init_ring(dev);
 	if (retval < 0)
 		goto err_free_rx_1;
@@ -3906,6 +3931,7 @@ static int rtl8169_open(struct net_device *dev)
 		goto err_release_fw_2;
 
 	napi_enable(&tp->napi);
+	napi_enable(&tp->rx_ring.napi);
 
 	rtl8169_init_phy(dev, tp);
 
@@ -3926,9 +3952,14 @@ err_release_fw_2:
 	rtl_release_firmware(tp);
 	rtl8169_rx_clear(tp);
 err_free_rx_1:
+#ifdef NO_COMMON_RX_API
 	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
 			  tp->RxPhyAddr);
 	tp->RxDescArray = NULL;
+#else
+	netdev_clear_rx_ring(&tp->rx_ring);
+	netdev_free_ring(&tp->rx_ring, sizeof(struct RxDesc));
+#endif
 err_free_tx_0:
 	dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
 			  tp->TxPhyAddr);
@@ -3998,8 +4029,13 @@ static void rtl_set_rx_tx_desc_registers(struct rtl8169_private *tp,
 	 */
 	RTL_W32(TxDescStartAddrHigh, ((u64) tp->TxPhyAddr) >> 32);
 	RTL_W32(TxDescStartAddrLow, ((u64) tp->TxPhyAddr) & DMA_BIT_MASK(32));
+#ifdef NO_COMMON_RX_API
 	RTL_W32(RxDescAddrHigh, ((u64) tp->RxPhyAddr) >> 32);
 	RTL_W32(RxDescAddrLow, ((u64) tp->RxPhyAddr) & DMA_BIT_MASK(32));
+#else
+	RTL_W32(RxDescAddrHigh, ((u64) tp->rx_ring.desc_dma) >> 32);
+	RTL_W32(RxDescAddrLow, ((u64) tp->rx_ring.desc_dma) & DMA_BIT_MASK(32));
+#endif
 }
 
 static u16 rtl_rw_cpluscmd(void __iomem *ioaddr)
@@ -4808,6 +4844,29 @@ static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc)
 	desc->opts1 |= cpu_to_le32(RingEnd);
 }
 
+static int rtl_add_rx_buffer(struct netdev_ring *ring, void *buf,
+	dma_addr_t dma)
+{
+	unsigned next_tail = (ring->tail + 1) & (NUM_RX_DESC - 1);
+	struct RxDesc *rxd = (struct RxDesc *)ring->desc_table + ring->tail;
+
+	if (next_tail == ACCESS_ONCE(ring->head))
+		return -ENOSPC;
+	ring->buf_table[ring->tail] = buf;
+	ring->tail = next_tail;
+
+	rtl8169_map_to_asic(rxd, dma, 0);
+	return 0;
+}
+
+static dma_addr_t rtl_get_rx_buffer_addr(struct netdev_ring *ring,
+	unsigned int i)
+{
+	struct RxDesc *rxd = (struct RxDesc *)ring->desc_table + i;
+
+	return le64_to_cpu(rxd->addr);
+}
+
 static int rtl8169_rx_fill(struct rtl8169_private *tp)
 {
 	unsigned int i;
@@ -4841,9 +4900,16 @@ static int rtl8169_init_ring(struct net_device *dev)
 	rtl8169_init_ring_indexes(tp);
 
 	memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
+#ifdef NO_COMMON_RX_API
 	memset(tp->Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *));
 
 	return rtl8169_rx_fill(tp);
+#else
+	rtl8169_mark_as_last_descriptor((struct RxDesc *)tp->rx_ring.desc_table +
+		NUM_RX_DESC - 1);
+	tp->rx_ring.bufsz = 0x4000;
+	return netdev_fill_rx_ring(&tp->rx_ring);
+#endif
 }
 
 static void rtl8169_unmap_tx_skb(struct device *d, struct ring_info *tx_skb,
@@ -4905,6 +4971,7 @@ static void rtl8169_wait_for_quiescence(struct net_device *dev)
 	synchronize_irq(dev->irq);
 
 	/* Wait for any pending NAPI task to complete */
+	napi_disable(&tp->rx_ring.napi);
 	napi_disable(&tp->napi);
 
 	rtl8169_irq_mask_and_ack(ioaddr);
@@ -4912,6 +4979,7 @@ static void rtl8169_wait_for_quiescence(struct net_device *dev)
 	tp->intr_mask = 0xffff;
 	RTL_W16(IntrMask, tp->intr_event);
 	napi_enable(&tp->napi);
+	napi_enable(&tp->rx_ring.napi);
 }
 
 static void rtl8169_reinit_task(struct work_struct *work)
@@ -4947,7 +5015,9 @@ static void rtl8169_reset_task(struct work_struct *work)
 	struct rtl8169_private *tp =
 		container_of(work, struct rtl8169_private, task.work);
 	struct net_device *dev = tp->dev;
+#ifdef NO_COMMON_RX_API
 	int i;
+#endif
 
 	rtnl_lock();
 
@@ -4955,10 +5025,12 @@ static void rtl8169_reset_task(struct work_struct *work)
 		goto out_unlock;
 
 	rtl8169_wait_for_quiescence(dev);
-
+#ifdef NO_COMMON_RX_API
 	for (i = 0; i < NUM_RX_DESC; i++)
 		rtl8169_mark_to_asic(tp->RxDescArray + i, rx_buf_sz);
-
+#else
+	netdev_reset_rx_ring(&tp->rx_ring, tp->rx_ring.bufsz);
+#endif
 	rtl8169_tx_clear(tp);
 
 	rtl8169_hw_reset(tp);
@@ -5356,6 +5428,91 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 	return count;
 }
 
+static int rtl_rx_buffer(struct netdev_ring *ring)
+{
+	struct net_device *dev = ring->napi.dev;
+	struct RxDesc *rxd = (struct RxDesc *)ring->desc_table + ring->head;
+	dma_addr_t dma = le64_to_cpu(rxd->addr);
+	void *buf = ring->buf_table[ring->head];
+	struct sk_buff *skb;
+	u32 status;
+
+	status = le32_to_cpu(ACCESS_ONCE(rxd->opts1));
+	if (status & DescOwn)
+		return -ENOENT;
+
+	netdev_dbg(dev, "RxDesc[%d] = %08x %08x %016llx %p\n",
+		ring->head, status, le32_to_cpu(rxd->opts2), dma, buf);
+
+	/*
+	 * release this descriptor - it won't be reused at least until
+	 * netdev_reuse_rx_buffer() or this function returns.
+	 */
+	if (!(status & RingEnd))
+		++ring->head;
+	else
+		ring->head = 0;
+
+ 	if (unlikely(status & RxRES)) {
+		dev->stats.rx_errors++;
+		if (status & (RxRWT | RxRUNT))
+			dev->stats.rx_length_errors++;
+		if (status & RxCRC)
+			dev->stats.rx_crc_errors++;
+		if (status & RxFOVF) {
+			rtl8169_schedule_work(dev, rtl8169_reset_task);
+			dev->stats.rx_fifo_errors++;
+		}
+		netdev_reuse_rx_buffer(ring, buf, dma);
+		return -EINVAL;
+	}
+
+	/*
+	 * The chipset is broken regarding incoming fragmented
+	 * frames. If frame size > RxMaxSize, chip fills all fragment
+	 * descriptors with flags and size from first fragment.
+	 * It ignores size set in the free buffer's descriptor.
+	 */
+	if (unlikely(rtl8169_fragmented_frame(status))) {
+		dev->stats.rx_dropped++;
+		dev->stats.rx_length_errors++;
+		netdev_reuse_rx_buffer(ring, buf, dma);
+		return -EINVAL;
+	}
+
+	skb = netdev_wrap_rx_buffer(dev, ring, buf, dma,
+		(status & 0x1FFF) - ETH_FCS_LEN);
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	skb->protocol = eth_type_trans(skb, dev);
+	rtl8169_rx_csum(skb, status);
+	rtl8169_rx_vlan_tag(rxd, skb);
+
+	dev->stats.rx_bytes += skb->len;
+	dev->stats.rx_packets++;
+	napi_gro_receive(&ring->napi, skb);
+
+	return 0;
+}
+
+static void rtl_rx_complete(struct netdev_ring *ring)
+{
+	struct rtl8169_private *tp = container_of(ring, struct rtl8169_private, rx_ring);
+	void __iomem *ioaddr = tp->mmio_addr;
+
+	/* We need to force the visibility of tp->intr_mask
+	 * for other CPUs, as we can lose an MSI interrupt
+	 * and potentially wait for a retransmit timeout if we don't.
+	 * The posted write to IntrMask is safe, as it will
+	 * eventually make it to the chip and we won't lose anything
+	 * until it does.
+	 */
+	tp->intr_mask = 0xffff;
+	wmb();
+	RTL_W16(IntrMask, tp->intr_event);
+}
+
 static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 {
 	struct net_device *dev = dev_instance;
@@ -5426,6 +5583,7 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 			RTL_W16(IntrMask, tp->intr_event & ~tp->napi_event);
 			tp->intr_mask = ~tp->napi_event;
 
+			napi_schedule(&tp->rx_ring.napi);
 			if (likely(napi_schedule_prep(&tp->napi)))
 				__napi_schedule(&tp->napi);
 			else
@@ -5453,22 +5611,16 @@ static int rtl8169_poll(struct napi_struct *napi, int budget)
 	void __iomem *ioaddr = tp->mmio_addr;
 	int work_done;
 
+#ifdef NO_COMMON_RX_API
 	work_done = rtl8169_rx_interrupt(dev, tp, ioaddr, (u32) budget);
+#else
+	work_done = 0;
+#endif
 	rtl8169_tx_interrupt(dev, tp, ioaddr);
 
 	if (work_done < budget) {
 		napi_complete(napi);
-
-		/* We need for force the visibility of tp->intr_mask
-		 * for other CPUs, as we can loose an MSI interrupt
-		 * and potentially wait for a retransmit timeout if we don't.
-		 * The posted write to IntrMask is safe, as it will
-		 * eventually make it to the chip and we won't loose anything
-		 * until it does.
-		 */
-		tp->intr_mask = 0xffff;
-		wmb();
-		RTL_W16(IntrMask, tp->intr_event);
+		rtl_rx_complete(&tp->rx_ring);
 	}
 
 	return work_done;
@@ -5494,6 +5646,7 @@ static void rtl8169_down(struct net_device *dev)
 
 	netif_stop_queue(dev);
 
+	napi_disable(&tp->rx_ring.napi);
 	napi_disable(&tp->napi);
 
 	spin_lock_irq(&tp->lock);
@@ -5514,9 +5667,11 @@ static void rtl8169_down(struct net_device *dev)
 	synchronize_sched();  /* FIXME: should this be synchronize_irq()? */
 
 	rtl8169_tx_clear(tp);
-
+#ifdef NO_COMMON_RX_API
 	rtl8169_rx_clear(tp);
-
+#else
+	netdev_clear_rx_ring(&tp->rx_ring);
+#endif
 	rtl_pll_power_down(tp);
 }
 
@@ -5534,13 +5689,16 @@ static int rtl8169_close(struct net_device *dev)
 
 	free_irq(dev->irq, dev);
 
-	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
-			  tp->RxPhyAddr);
 	dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
 			  tp->TxPhyAddr);
 	tp->TxDescArray = NULL;
+#ifdef NO_COMMON_RX_API
+	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+			  tp->RxPhyAddr);
 	tp->RxDescArray = NULL;
-
+#else
+	netdev_free_ring(&tp->rx_ring, sizeof(struct RxDesc));
+#endif
 	pm_runtime_put_sync(&pdev->dev);
 
 	return 0;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ddee79b..d29218d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1502,6 +1502,231 @@ struct napi_gro_cb {
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
 
+
+/* generic receive ring handling */
+
+struct netdev_ring;
+
+struct netdev_ring_ops {
+	int (*add_buffer)(struct netdev_ring *ring, void *buf, dma_addr_t dma);
+	dma_addr_t (*get_buffer_addr)(struct netdev_ring *ring, unsigned int i);
+	int (*process_buffer)(struct netdev_ring *ring);
+	void (*poll_complete)(struct netdev_ring *ring);
+};
+
+struct netdev_ring {
+	struct napi_struct napi;
+	struct netdev_ring_ops ops;
+
+	unsigned int head, tail;
+
+	void **buf_table;
+	void *desc_table;
+
+	struct device *dev;
+	void *next_buf;
+	dma_addr_t next_dma;
+	size_t bufsz;
+
+	dma_addr_t desc_dma;
+	size_t size;
+};
+
+static inline
+void netdev_free_ring(struct netdev_ring *ring, size_t elem_size)
+{
+	kfree(ring->buf_table);
+	if (ring->desc_table)
+		dma_free_coherent(ring->dev, ring->size * elem_size,
+			ring->desc_table, ring->desc_dma);
+
+	ring->buf_table = NULL;
+	ring->desc_table = NULL;
+}
+
+static inline
+int netdev_alloc_ring(struct netdev_ring *ring, struct device *dma_dev,
+	size_t elem_size, unsigned int n_elems)
+{
+	ring->head = ring->tail = 0;
+	ring->size = n_elems;
+	ring->dev = dma_dev;
+	ring->desc_table = dma_alloc_coherent(dma_dev, ring->size * elem_size,
+		&ring->desc_dma, GFP_KERNEL);
+	ring->buf_table = kcalloc(n_elems, sizeof(*ring->buf_table),
+		GFP_KERNEL);
+
+	if (likely(ring->desc_table && ring->buf_table))
+		return 0;
+
+	netdev_free_ring(ring, elem_size);
+	return -ENOMEM;
+}
+
+#define SKB_DATA_SZ(x) \
+	(SKB_DATA_ALIGN((x) + NET_SKB_PAD) - \
+	 SKB_DATA_ALIGN(SKB_WITH_OVERHEAD(0)))
+
+static inline
+int netdev_fill_rx_ring(struct netdev_ring *ring)
+{
+	void *buf;
+	dma_addr_t dma;
+	int n = 0;
+
+	if (ring->next_buf) {
+		if (ring->ops.add_buffer(ring, ring->next_buf, ring->next_dma))
+			return 0;
+		ring->next_buf = NULL;
+		n = 1;
+	}
+
+	for(;; ++n) {
+		/* max buf = 8kB-8, 8B aligned */
+		buf = kmalloc(SKB_DATA_SZ(ring->bufsz), GFP_KERNEL);
+		if (!buf)
+			break;
+		dma = dma_map_single(ring->dev, buf + NET_SKB_PAD,
+			ring->bufsz, DMA_FROM_DEVICE);	// DMA dir
+		if (unlikely(dma_mapping_error(ring->dev, dma))) {
+			kfree(buf);
+			break;
+		}
+		if (ring->ops.add_buffer(ring, buf + NET_SKB_PAD, dma)) {
+			ring->next_buf = buf + NET_SKB_PAD;
+			ring->next_dma = dma;
+			break;
+		}
+	}
+
+	return n;
+}
+
+static inline
+void netdev_clear_rx_ring(struct netdev_ring *ring)
+{
+	dma_addr_t dma;
+	void *buf;
+
+	if (ring->next_buf) {
+		buf = ring->next_buf;
+		dma = ring->next_dma;
+		ring->next_buf = NULL;
+		goto free_buf;
+	}
+
+	while (ring->tail != ring->head) {
+		if (!ring->tail)
+			ring->tail = ring->size;
+		--ring->tail;
+
+		buf = ring->buf_table[ring->tail];
+		dma = ring->ops.get_buffer_addr(ring, ring->tail);
+free_buf:
+		dma_unmap_single(ring->dev, dma, ring->bufsz, DMA_FROM_DEVICE);
+		kfree(buf - NET_SKB_PAD);
+	}
+}
+
+static inline
+void netdev_reset_rx_ring(struct netdev_ring *ring, size_t new_bufsz)
+{
+	netdev_clear_rx_ring(ring);
+	ring->head = ring->tail = 0;
+	ring->bufsz = new_bufsz;
+	netdev_fill_rx_ring(ring);
+}
+
+struct sk_buff *build_skb(void *data, unsigned int size);
+
+static inline
+void netdev_reuse_rx_buffer(struct netdev_ring *ring,
+	void *data, dma_addr_t dma)
+{
+	if (likely(!ring->ops.add_buffer(ring, data, dma)))
+		return;
+
+	if (ring->next_buf) {
+		dma_unmap_single(ring->dev, dma, ring->bufsz, DMA_FROM_DEVICE);
+		kfree(data - NET_SKB_PAD);
+	} else {
+		ring->next_buf = data;
+		ring->next_dma = dma;
+	}
+}
+
+static inline
+struct sk_buff *netdev_wrap_rx_buffer(struct net_device *dev,
+	struct netdev_ring *ring, void *data, dma_addr_t dma, unsigned int len)
+{
+	size_t bufsz = ring->bufsz;
+	struct sk_buff *skb;
+
+	if (len < 256/* rx_copybreak */) {
+		skb = netdev_alloc_skb_ip_align(dev, len);
+		if (likely(skb)) {
+			dma_sync_single_for_cpu(ring->dev, dma, len, DMA_FROM_DEVICE);
+			skb_copy_to_linear_data(skb, data, len);
+			netdev_reuse_rx_buffer(ring, data, dma);
+			goto finish_skb;
+		}
+	}
+
+	dma_unmap_single(ring->dev, dma, bufsz, DMA_FROM_DEVICE);
+	skb = build_skb(data - NET_SKB_PAD, bufsz + NET_SKB_PAD);
+	if (!skb) {
+		dma = dma_map_single(ring->dev, data, bufsz, DMA_FROM_DEVICE);
+		if (likely(!dma_mapping_error(ring->dev, dma)))
+			netdev_reuse_rx_buffer(ring, data, dma);
+		else
+			kfree(data - NET_SKB_PAD);
+		return NULL;
+	}
+
+	skb_reserve(skb, NET_SKB_PAD);
+	skb->dev = dev;
+
+finish_skb:
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static int netdev_rx_poll(struct napi_struct *napi, int budget)
+{
+	struct netdev_ring *ring = container_of(napi, struct netdev_ring, napi);
+	int max = budget;
+
+	while (budget > 0) {
+		if (ring->ops.process_buffer(ring) == -ENOENT)
+			break;
+
+		--budget;
+	}
+
+	netdev_fill_rx_ring(ring);
+
+	if (budget) {
+		ring->ops.poll_complete(ring);
+		if (ring->ops.process_buffer(ring) == -ENOENT)
+			napi_complete(&ring->napi);
+		else /* raced with rx indication - just continue polling */
+			--budget;
+	}
+
+	return max - budget;
+}
+
+static inline void netdev_add_ring(struct net_device *dev, struct netdev_ring *ring,
+	const struct netdev_ring_ops *ops, int weight)
+{
+	ring->ops = *ops;
+	netif_napi_add(dev, &ring->napi, netdev_rx_poll, weight);
+}
+
+
+
+
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	struct net_device	*dev;	/* NULL is wildcarded here	     */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2beda82..92fad68 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3162,3 +3162,52 @@ void __skb_warn_lro_forwarding(const struct sk_buff *skb)
 			   " while LRO is enabled\n", skb->dev->name);
 }
 EXPORT_SYMBOL(__skb_warn_lro_forwarding);
+
+ /**
+  * build_skb - build a network buffer
+  * @data: data buffer provided by caller
+  * @size: size of data buffer, not including skb_shared_info
+  *
+  * Allocate a new &sk_buff. Caller provides space holding head and
+  * skb_shared_info. Mostly used in driver RX path.
+  * The return is the buffer. On a failure the return is %NULL.
+  * Notes :
+  *  Before IO, driver allocates only the data buffer where the NIC will put
+  *  the incoming frame. Driver SHOULD add room at head (NET_SKB_PAD) and
+  *  MUST add room at tail (to hold skb_shared_info).
+  *  After IO, driver calls build_skb() to get a hot skb instead of a cold one
+  *  before giving the packet to the stack. RX rings contain only data buffers,
+  *  not full skbs.
+  */
+struct sk_buff *build_skb(void *data, unsigned int size)
+{
+	struct skb_shared_info *shinfo;
+	struct sk_buff *skb;
+
+	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	size = SKB_DATA_ALIGN(size);
+
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	skb->truesize = size + sizeof(struct sk_buff);
+	atomic_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + size;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->mac_header = ~0U;
+#endif
+
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+	kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(build_skb);
+
-- 
1.7.5.4
