[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <45DC7BF8.1030507@myri.com>
Date: Wed, 21 Feb 2007 18:06:00 +0100
From: Brice Goglin <brice@...i.com>
To: Jeff Garzik <jeff@...zik.org>
CC: netdev@...r.kernel.org
Subject: [PATCH 2/2] myri10ge: large receive offload
Add Large Receive Offload implemented in software.
Signed-off-by: Brice Goglin <brice@...i.com>
---
drivers/net/myri10ge/myri10ge.c | 422 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 422 insertions(+)
Index: linux-rc/drivers/net/myri10ge/myri10ge.c
===================================================================
--- linux-rc.orig/drivers/net/myri10ge/myri10ge.c 2007-02-21 17:42:22.000000000 +0100
+++ linux-rc/drivers/net/myri10ge/myri10ge.c 2007-02-21 17:55:22.000000000 +0100
@@ -61,6 +61,8 @@
#include <linux/moduleparam.h>
#include <linux/io.h>
#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/tcp.h>
#include <asm/byteorder.h>
#include <asm/io.h>
#include <asm/processor.h>
@@ -145,11 +147,32 @@
int pkt_done; /* packets completed */
};
+struct myri10ge_lro_packet {
+ struct hlist_node lro_node;
+ struct sk_buff *skb;
+ int timestamp;
+ __u32 tsval;
+ __u32 tsecr;
+ __u32 source_ip;
+ __u32 dest_ip;
+ __u32 next_seq;
+ __u32 ack_seq;
+ __wsum data_csum;
+ __u16 window;
+ __u16 source_port;
+ __u16 dest_port;
+ __u16 append_cnt;
+ __u16 mss;
+ __u16 vlan_tci;
+};
+
struct myri10ge_rx_done {
struct mcp_slot *entry;
dma_addr_t bus;
int cnt;
int idx;
+ struct hlist_head lro_active;
+ struct hlist_head lro_free;
};
struct myri10ge_priv {
@@ -161,6 +184,10 @@
struct myri10ge_rx_done rx_done;
int small_bytes;
int big_bytes;
+ int lro_flushed;
+ int lro_queued;
+ int lro_too_many_streams;
+ int lro_bad_csum;
struct net_device *dev;
struct net_device_stats stats;
u8 __iomem *sram;
@@ -274,6 +301,10 @@
module_param(myri10ge_debug, int, 0);
MODULE_PARM_DESC(myri10ge_debug, "Debug level (0=none,...,16=all)");
+static int myri10ge_lro = 8;
+module_param(myri10ge_lro, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_lro, "Enable large N receive offload queues\n");
+
static int myri10ge_fill_thresh = 256;
module_param(myri10ge_fill_thresh, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(myri10ge_fill_thresh, "Number of empty rx slots allowed\n");
@@ -808,6 +839,9 @@
mgp->rx_done.idx = 0;
mgp->rx_done.cnt = 0;
mgp->link_changes = 0;
+ mgp->lro_queued = 0;
+ mgp->lro_flushed = 0;
+ mgp->lro_too_many_streams = 0;
status = myri10ge_update_mac_address(mgp, mgp->dev->dev_addr);
myri10ge_change_promisc(mgp, 0, 0);
myri10ge_change_pause(mgp, mgp->pause);
@@ -876,6 +910,357 @@
skb_pull(skb, MXGEFW_PAD);
}
+/* debug aid to check for "bad" hardware */
+
+static void
+myri10ge_frag_trim(struct skb_frag_struct *rx_frags, int old_len, int trim)
+{
+ struct skb_frag_struct *frag;
+ int offset = 0;
+ int new_len = old_len - trim;
+ int old_size;
+
+ /* find the frag where the IP payload ends. This
+ * should almost always be the 1st fragment */
+ frag = rx_frags;
+ while (offset + frag->size < new_len) {
+ offset += frag->size;
+ frag++;
+ }
+ /* adjust its length */
+ old_size = frag->size;
+ frag->size = new_len - offset;
+
+ /* release any excess pages */
+ offset += old_size;
+ while (offset < old_len) {
+ frag++;
+ offset += frag->size;
+ put_page(frag->page);
+ }
+}
+
+static inline int myri10ge_lro_csum(int tcplen, struct iphdr *iph, __wsum csum)
+{
+ if (unlikely(ip_fast_csum((u8 *) iph, iph->ihl)))
+ return -1;
+
+ if (unlikely(csum_tcpudp_magic(iph->saddr, iph->daddr,
+ tcplen, IPPROTO_TCP, csum)))
+ return -1;
+ return 0;
+}
+
+static inline void
+myri10ge_lro_flush(struct myri10ge_priv *mgp, struct myri10ge_lro_packet *lro)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ struct sk_buff *skb;
+ u32 *ts_ptr;
+ u32 tcplen;
+
+ skb = lro->skb;
+
+ if (lro->append_cnt) {
+ /* incorporate the new len into the ip header and
+ * re-calculate the checksum, Note that
+ * eth_type_trans() left skb->data at the start of
+ * the vlan header, so we need to skip past it to
+ * get to the IP header */
+ if (lro->vlan_tci) {
+ iph = (struct iphdr *)(skb->data + VLAN_HLEN);
+ iph->tot_len = htons(skb->len - VLAN_HLEN);
+ } else {
+ iph = (struct iphdr *)skb->data;
+ iph->tot_len = htons(skb->len);
+ }
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ /* incorporate the latest ack into the tcp header */
+ th = (struct tcphdr *)(iph + 1);
+ th->ack_seq = lro->ack_seq;
+ th->window = lro->window;
+
+ /* incorporate latest timestamp into the tcp header */
+ if (lro->timestamp) {
+ ts_ptr = (u32 *) (th + 1);
+ ts_ptr[1] = htonl(lro->tsval);
+ ts_ptr[2] = lro->tsecr;
+ }
+
+ /*
+ * update checksum in tcp header by re-calculating the
+ * tcp pseudoheader checksum, and adding it to the checksum
+ * of the tcp payload data
+ */
+ th->check = 0;
+ tcplen = ntohs(iph->tot_len) - sizeof(*iph);
+ th->check = tcp_v4_check(tcplen, iph->saddr, iph->daddr,
+ csum_partial((char *)th,
+ th->doff << 2,
+ lro->data_csum));
+
+ skb->truesize = skb->len + sizeof(struct sk_buff);
+ }
+
+ skb_shinfo(skb)->gso_size = lro->mss;
+ netif_receive_skb(skb);
+ mgp->dev->last_rx = jiffies;
+ mgp->lro_queued += lro->append_cnt + 1;
+ mgp->lro_flushed++;
+ lro->skb = NULL;
+ lro->timestamp = 0;
+ lro->append_cnt = 0;
+ hlist_add_head(&lro->lro_node, &mgp->rx_done.lro_free);
+}
+
+static int
+myri10ge_lro_rx(struct myri10ge_priv *mgp, u8 * va,
+ struct skb_frag_struct *rx_frags, int *len, __wsum csum)
+{
+ struct ethhdr *eh;
+ struct vlan_ethhdr *vh;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ struct myri10ge_lro_packet *lro;
+ u32 *ts_ptr = NULL; /* XXX -Wuninitialized */
+ struct sk_buff *skb;
+ struct skb_frag_struct *skb_frags;
+ struct hlist_node *node;
+ int opt_bytes, tcp_data_len, tcp_hdr_len, hlen, trim, llhlen;
+ __u32 seq;
+ __u16 ip_len, vlan_tci;
+
+ /* check to see that it is IP */
+ eh = (struct ethhdr *)(va + MXGEFW_PAD);
+ if (eh->h_proto == ntohs(ETH_P_IP)) {
+ llhlen = ETH_HLEN;
+ vlan_tci = 0;
+ } else if (eh->h_proto == ntohs(ETH_P_8021Q)) {
+ vh = (struct vlan_ethhdr *)(va + MXGEFW_PAD);
+ if (vh->h_vlan_encapsulated_proto != ntohs(ETH_P_IP))
+ return -1;
+ llhlen = VLAN_ETH_HLEN;
+ vlan_tci = vh->h_vlan_TCI;
+ /* HW checksum starts after the ethernet header, we
+ * must subtract off the VLAN header's checksum before
+ * csum can be used */
+ csum = csum_sub(csum,
+ csum_partial(va + MXGEFW_PAD + ETH_HLEN,
+ VLAN_HLEN, 0));
+ } else {
+ return -1;
+ }
+
+ /* now check to see if it is TCP */
+ iph = (struct iphdr *)(va + llhlen + MXGEFW_PAD);
+ if (iph->protocol != IPPROTO_TCP)
+ return -1;
+
+ /* ensure there are no options */
+ if ((iph->ihl << 2) != sizeof(*iph))
+ return -1;
+
+ /* .. and the packet is not fragmented */
+ if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+ return -1;
+
+ /* find the TCP header */
+ th = (struct tcphdr *)(iph + 1);
+
+ /* ensure no bits set besides ack or psh */
+ if (th->fin || th->syn || th->rst || th->urg || th->ece
+ || th->cwr || !th->ack)
+ return -1;
+
+ /* check for timestamps. Since the only option we handle are
+ * timestamps, we only have to handle the simple case of
+ * aligned timestamps */
+
+ opt_bytes = (th->doff << 2) - sizeof(*th);
+ tcp_hdr_len = sizeof(*th) + opt_bytes;
+ if (opt_bytes != 0) {
+ ts_ptr = (u32 *) (th + 1);
+ if (unlikely(opt_bytes != TCPOLEN_TSTAMP_ALIGNED) ||
+ (*ts_ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+ | (TCPOPT_TIMESTAMP << 8)
+ | TCPOLEN_TIMESTAMP))) {
+ return -1;
+ }
+ }
+
+ ip_len = ntohs(iph->tot_len);
+ tcp_data_len = ip_len - (th->doff << 2) - sizeof(*iph);
+
+ /*
+ * If frame is padded beyond the end of the IP packet,
+ * then we must trim the extra bytes off the end.
+ */
+ trim = *len - (ip_len + llhlen + MXGEFW_PAD);
+ if (trim != 0) {
+ /* ensure we received the full frame */
+ if (unlikely(trim < 0))
+ return -1;
+ /* trim off any padding */
+ myri10ge_frag_trim(rx_frags, *len, trim);
+ *len -= trim;
+ }
+
+ hlen = ip_len + llhlen - tcp_data_len;
+
+ seq = ntohl(th->seq);
+
+ if (unlikely(myri10ge_lro_csum(tcp_hdr_len + tcp_data_len, iph, csum))) {
+ mgp->lro_bad_csum++;
+ return -1;
+ }
+
+ /* now we have a packet that might be eligible for LRO,
+ * so see if it matches anything we might expect */
+
+ hlist_for_each_entry(lro, node, &mgp->rx_done.lro_active, lro_node) {
+ if (lro->source_port == th->source &&
+ lro->dest_port == th->dest &&
+ lro->source_ip == iph->saddr &&
+ lro->dest_ip == iph->daddr && lro->vlan_tci == vlan_tci) {
+ /* Try to append it */
+
+ if (unlikely(seq != lro->next_seq)) {
+ /* out of order packet */
+ hlist_del(&lro->lro_node);
+ myri10ge_lro_flush(mgp, lro);
+ return -1;
+ }
+ if (lro->timestamp) {
+ __u32 tsval = ntohl(*(ts_ptr + 1));
+ /* make sure timestamp values are increasing */
+ if (unlikely(lro->tsval > tsval ||
+ *(ts_ptr + 2) == 0)) {
+ return -1;
+ }
+ lro->tsval = tsval;
+ lro->tsecr = *(ts_ptr + 2);
+ }
+ lro->next_seq += tcp_data_len;
+ lro->ack_seq = th->ack_seq;
+ skb = lro->skb;
+
+ /* subtract off the checksum of the tcp header
+ * from the hardware checksum, and add it to the
+ * stored tcp data checksum. csum_block_add()
+ * is used, as the total length so far may be
+ * odd
+ */
+ lro->data_csum =
+ csum_block_add(lro->data_csum,
+ csum_sub(csum,
+ csum_partial((u8 *) th,
+ tcp_hdr_len,
+ 0)),
+ skb->data_len);
+ lro->window = th->window;
+ skb->data_len += tcp_data_len;
+ skb->len += tcp_data_len;
+ if (tcp_data_len > lro->mss)
+ lro->mss = tcp_data_len;
+
+ /* pull off the header and firmware pad
+ * before we copy the data */
+
+ hlen += MXGEFW_PAD;
+ rx_frags[0].page_offset += hlen;
+ rx_frags[0].size -= hlen;
+ *len -= hlen;
+ skb_frags =
+ &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags];
+ /* if it was just header (like a TCP ack with
+ * no data), release the page */
+ if (*len <= 0) {
+ put_page(rx_frags[0].page);
+ } else {
+ while (*len > 0) {
+ memcpy(skb_frags, rx_frags,
+ sizeof(*skb_frags));
+ *len -= rx_frags->size;
+ rx_frags++;
+ skb_frags++;
+ skb_shinfo(skb)->nr_frags++;
+ }
+ }
+
+ lro->append_cnt++;
+
+ /* cheap, conservative test. We may waste
+ * some slots with a 1500 byte mtu */
+ if (skb_shinfo(skb)->nr_frags
+ + MYRI10GE_MAX_FRAGS_PER_FRAME > MAX_SKB_FRAGS
+ || mgp->dev->mtu + skb->len > 65535) {
+ hlist_del(&lro->lro_node);
+ myri10ge_lro_flush(mgp, lro);
+ }
+ return 0;
+ }
+ }
+
+ /* start a new packet */
+ if (!hlist_empty(&mgp->rx_done.lro_free)) {
+ lro = hlist_entry(mgp->rx_done.lro_free.first,
+ struct myri10ge_lro_packet, lro_node);
+ /* allocate an skb to attach the page(s) to */
+
+ skb = netdev_alloc_skb(mgp->dev, hlen + 16);
+ if (unlikely(skb == NULL))
+ return -1;
+
+ myri10ge_rx_skb_build(skb, va, rx_frags, *len,
+ hlen + MXGEFW_PAD);
+ skb->protocol = eth_type_trans(skb, mgp->dev);
+ skb->dev = mgp->dev;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ lro->skb = skb;
+ lro->source_ip = iph->saddr;
+ lro->dest_ip = iph->daddr;
+ lro->source_port = th->source;
+ lro->dest_port = th->dest;
+ lro->next_seq = seq + tcp_data_len;
+ lro->mss = tcp_data_len;
+ lro->ack_seq = th->ack_seq;
+
+ /* save the checksum of just the TCP payload by
+ * subtracting off the checksum of the TCP header from
+ * the entire hardware checksum
+ */
+ lro->data_csum = csum_sub(csum,
+ csum_partial((u8 *) th,
+ tcp_hdr_len, 0));
+ lro->window = th->window;
+ lro->vlan_tci = vlan_tci;
+ /* record timestamp if it is present */
+ if (opt_bytes) {
+ lro->timestamp = 1;
+ lro->tsval = ntohl(*(ts_ptr + 1));
+ lro->tsecr = *(ts_ptr + 2);
+ }
+ /* remove first packet from freelist.. */
+ hlist_del(&lro->lro_node);
+ /* .. and insert at the front of the active list */
+ hlist_add_head(&lro->lro_node, &mgp->rx_done.lro_active);
+
+ /* release the page if there was no data. We do it
+ * down here since the code above refers to the
+ * contents of the page */
+ if (skb_shinfo(skb)->frags[0].size <= 0) {
+ put_page(skb_shinfo(skb)->frags[0].page);
+ skb_shinfo(skb)->nr_frags = 0;
+ }
+ return 0;
+ }
+ mgp->lro_too_many_streams++;
+ return -1;
+}
+
static void
myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
int bytes, int watchdog)
@@ -983,9 +1368,14 @@
remainder -= MYRI10GE_ALLOC_SIZE;
}
+ if (mgp->csum_flag && myri10ge_lro &&
+ (0 == myri10ge_lro_rx(mgp, va, rx_frags, &len, csum)))
+ return 1;
hlen = MYRI10GE_HLEN > len ? len : MYRI10GE_HLEN;
/* allocate an skb to attach the page(s) to. */
+ /* This is done
+ * after trying LRO, so as to avoid skb allocation overheads */
skb = netdev_alloc_skb(dev, MYRI10GE_HLEN + 16);
if (unlikely(skb == NULL)) {
@@ -1073,6 +1463,8 @@
static inline void myri10ge_clean_rx_done(struct myri10ge_priv *mgp, int *limit)
{
struct myri10ge_rx_done *rx_done = &mgp->rx_done;
+ struct hlist_node *node, *node2;
+ struct myri10ge_lro_packet *lro;
unsigned long rx_bytes = 0;
unsigned long rx_packets = 0;
unsigned long rx_ok;
@@ -1105,6 +1497,11 @@
}
rx_done->idx = idx;
rx_done->cnt = cnt;
+ hlist_for_each_entry_safe(lro, node, node2, &mgp->rx_done.lro_active,
+ lro_node) {
+ hlist_del(&lro->lro_node);
+ myri10ge_lro_flush(mgp, lro);
+ }
mgp->stats.rx_packets += rx_packets;
mgp->stats.rx_bytes += rx_bytes;
@@ -1338,6 +1735,7 @@
"read_dma_bw_MBs", "write_dma_bw_MBs", "read_write_dma_bw_MBs",
"serial_number", "tx_pkt_start", "tx_pkt_done",
"tx_req", "tx_done", "rx_small_cnt", "rx_big_cnt",
+ "lro_queued", "lro_flushed", "lro_too_many_streams", "lro_bad_csum",
"wake_queue", "stop_queue", "watchdog_resets", "tx_linearized",
"link_changes", "link_up", "dropped_link_overflow",
"dropped_link_error_or_filtered", "dropped_multicast_filtered",
@@ -1388,6 +1786,10 @@
data[i++] = (unsigned int)mgp->tx.done;
data[i++] = (unsigned int)mgp->rx_small.cnt;
data[i++] = (unsigned int)mgp->rx_big.cnt;
+ data[i++] = (unsigned int)mgp->lro_queued;
+ data[i++] = (unsigned int)mgp->lro_flushed;
+ data[i++] = (unsigned int)mgp->lro_too_many_streams;
+ data[i++] = (unsigned int)mgp->lro_bad_csum;
data[i++] = (unsigned int)mgp->wake_queue;
data[i++] = (unsigned int)mgp->stop_queue;
data[i++] = (unsigned int)mgp->watchdog_resets;
@@ -1527,6 +1929,18 @@
goto abort_with_rx_big_ring;
}
+ bytes = sizeof(struct myri10ge_lro_packet);
+ INIT_HLIST_HEAD(&mgp->rx_done.lro_free);
+ INIT_HLIST_HEAD(&mgp->rx_done.lro_active);
+ for (i = 0; i < myri10ge_lro; i++) {
+ struct myri10ge_lro_packet *lro;
+ lro = kzalloc(bytes, GFP_KERNEL);
+ if (lro != NULL) {
+ INIT_HLIST_NODE(&lro->lro_node);
+ hlist_add_head(&lro->lro_node, &mgp->rx_done.lro_free);
+ }
+ }
+
return 0;
abort_with_rx_big_ring:
@@ -1573,10 +1987,18 @@
struct myri10ge_priv *mgp;
struct sk_buff *skb;
struct myri10ge_tx_buf *tx;
+ struct hlist_node *node, *node2;
+ struct myri10ge_lro_packet *lro;
int i, len, idx;
mgp = netdev_priv(dev);
+ hlist_for_each_entry_safe(lro, node, node2, &mgp->rx_done.lro_active,
+ lro_node) {
+ hlist_del(&lro->lro_node);
+ kfree(lro);
+ }
+
for (i = mgp->rx_big.cnt; i < mgp->rx_big.fill_cnt; i++) {
idx = i & mgp->rx_big.mask;
if (i == mgp->rx_big.fill_cnt - 1)
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists