[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Wed, 11 Nov 2009 09:20:42 +0100
From: Eric Dumazet <eric.dumazet@...il.com>
To: Tom Herbert <therbert@...gle.com>
CC: David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [PATCH 1/2] rps: core implementation
Tom Herbert a écrit :
> Third version of RPS.
>
> Signed-off-by: Tom Herbert <therbert@...gle.com>
> ---
> include/linux/interrupt.h | 1 +
> include/linux/netdevice.h | 18 ++++
> include/linux/skbuff.h | 2 +
> net/core/dev.c | 227 ++++++++++++++++++++++++++++++++++++++-------
> net/core/net-sysfs.c | 135 +++++++++++++++++++++++++++
> 5 files changed, 348 insertions(+), 35 deletions(-)
>
I must say this is really exciting :)
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index b78cf81..fa91194 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -345,6 +345,7 @@ enum
> TIMER_SOFTIRQ,
> NET_TX_SOFTIRQ,
> NET_RX_SOFTIRQ,
> + NET_RPS_SOFTIRQ,
> BLOCK_SOFTIRQ,
> BLOCK_IOPOLL_SOFTIRQ,
> TASKLET_SOFTIRQ,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 8380009..c1b1bbb 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -639,6 +639,18 @@ struct net_device_ops {
> };
>
> /*
> + * Structure for Receive Packet Steering. Length of map and array of CPU ID's.
> + */
> +struct rps_map {
> + int len;
> + u16 map[0];
> +};
> +
> +/* Maximum size of RPS map (for allocation) */
> +#define RPS_MAP_SIZE (sizeof(struct rps_map) + \
> + (num_possible_cpus() * sizeof(u16)))
> +
The problem with possible CPUs is that the number can be very large on some arches,
yet only a few CPUs may actually be online...
In this kind of situation, get_rps_cpu() will return -1 most of the time,
defeating the goal of RPS?
> +/*
> * The DEVICE structure.
> * Actually, this whole structure is a big mistake. It mixes I/O
> * data with strictly "high-level" data, and it has to know about
> @@ -807,6 +819,9 @@ struct net_device
> void *ax25_ptr; /* AX.25 specific data */
> struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
> assign before registering */
> + void *rps_maps; /* Array of per-NAPI maps for
> +							   receive packet steering */
> + int rps_num_maps; /* Number of RPS maps */
>
> /*
> * Cache line mostly used on receive path (including eth_type_trans())
> @@ -1217,6 +1232,9 @@ struct softnet_data
> struct Qdisc *output_queue;
> struct sk_buff_head input_pkt_queue;
> struct list_head poll_list;
> +
> + struct call_single_data csd;
> +
> struct sk_buff *completion_queue;
>
> struct napi_struct backlog;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0c68fbd..95feac7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -396,6 +396,8 @@ struct sk_buff {
>
> __u16 vlan_tci;
>
> + __u32 rxhash;
> +
> sk_buff_data_t transport_header;
> sk_buff_data_t network_header;
> sk_buff_data_t mac_header;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 28b0b9e..735e7e3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1976,6 +1976,162 @@ int weight_p __read_mostly = 64; /*
> old backlog weight */
>
> DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
>
> +static u32 simple_hashrnd;
> +
> +/**
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> + u32 addr1, addr2, ports;
> + struct ipv6hdr *ip6;
> + struct iphdr *ip;
> + u32 hash, ihl;
> + u8 ip_proto;
> + int cpu;
> + struct rps_map *map = NULL;
> +
> + if (dev->rps_num_maps) {
> + /*
> + * Locate the map corresponding to the NAPI queue that
> + * the packet was received on.
> + */
> + int index = skb_get_rx_queue(skb);
> + if (index < 0 || index >= dev->rps_num_maps)
> + index = 0;
> +
> + map = (struct rps_map *)
> + (dev->rps_maps + (RPS_MAP_SIZE * index));
> + if (!map->len)
> + map = NULL;
> + }
> +
> + if (!map)
> + return -1;
> +
> + hash = skb->rxhash;
> + if (hash)
> + goto got_hash; /* Skip hash computation on packet header */
> +
> + switch (skb->protocol) {
> + case __constant_htons(ETH_P_IP):
> + if (!pskb_may_pull(skb, sizeof(*ip)))
> + return -1;
> +
> + ip = (struct iphdr *) skb->data;
> + ip_proto = ip->protocol;
> + addr1 = ip->saddr;
> + addr2 = ip->daddr;
> + ihl = ip->ihl;
> + break;
> + case __constant_htons(ETH_P_IPV6):
> + if (!pskb_may_pull(skb, sizeof(*ip6)))
> + return -1;
> +
> + ip6 = (struct ipv6hdr *) skb->data;
> + ip_proto = ip6->nexthdr;
> + addr1 = ip6->saddr.s6_addr32[3];
> + addr2 = ip6->daddr.s6_addr32[3];
> + ihl = (40 >> 2);
> + break;
> + default:
> + return -1;
> + }
> + ports = 0;
> + switch (ip_proto) {
> + case IPPROTO_TCP:
> + case IPPROTO_UDP:
> + case IPPROTO_DCCP:
> + case IPPROTO_ESP:
> + case IPPROTO_AH:
> + case IPPROTO_SCTP:
> + case IPPROTO_UDPLITE:
> + if (pskb_may_pull(skb, (ihl * 4) + 4))
> + ports = *((u32 *) (skb->data + (ihl * 4)));
> + break;
> +
> + default:
> + break;
> + }
> +
> + hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);
I wonder if you tried to exchange addr1/addr2 and port1/port2 so that conntracking/routing
is also sped up...
i.e. make sure the hash will be the same regardless of the direction of the packet.
union {
u32 port;
u16 ports[2];
} p;
if (addr1 < addr2)
	swap(addr1, addr2);
if (p.ports[0] < p.ports[1])
	swap(p.ports[0], p.ports[1]);
hash = jhash_3words(addr1, addr2, p.port, simple_hashrnd);
I think I'll try to extend your patches with TX completion recycling too.
Ie record in skb the cpu number of original sender, and queue skb to
remote queue for destruction (sock_wfree() call and expensive scheduler calls...)
(This probably needs driver cooperation, instead of calling consume_skb(),
use a different function)
Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists