lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 11 Nov 2009 09:20:42 +0100
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Tom Herbert <therbert@...gle.com>
CC:	David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [PATCH 1/2] rps: core implementation

Tom Herbert a écrit :
> Third version of RPS.
> 
> Signed-off-by: Tom Herbert <therbert@...gle.com>
> ---
>  include/linux/interrupt.h |    1 +
>  include/linux/netdevice.h |   18 ++++
>  include/linux/skbuff.h    |    2 +
>  net/core/dev.c            |  227 ++++++++++++++++++++++++++++++++++++++-------
>  net/core/net-sysfs.c      |  135 +++++++++++++++++++++++++++
>  5 files changed, 348 insertions(+), 35 deletions(-)
> 

I must say this is really exciting :)

> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index b78cf81..fa91194 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -345,6 +345,7 @@ enum
>  	TIMER_SOFTIRQ,
>  	NET_TX_SOFTIRQ,
>  	NET_RX_SOFTIRQ,
> +	NET_RPS_SOFTIRQ,
>  	BLOCK_SOFTIRQ,
>  	BLOCK_IOPOLL_SOFTIRQ,
>  	TASKLET_SOFTIRQ,
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 8380009..c1b1bbb 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -639,6 +639,18 @@ struct net_device_ops {
>  };
> 
>  /*
> + * Structure for Receive Packet Steering.  Length of map and array of CPU ID's.
> + */
> +struct rps_map {
> +	int len;
> +	u16 map[0];
> +};
> +
> +/* Maximum size of RPS map (for allocation) */
> +#define RPS_MAP_SIZE (sizeof(struct rps_map) + \
> +    (num_possible_cpus() * sizeof(u16)))
> +

The problem with num_possible_cpus() is that the number can be very large on some arches,
while only a few cpus are actually online...

In this kind of situation, get_rps_cpu() will return -1 most of the time,
defeating the goal of RPS?


> +/*
>   *	The DEVICE structure.
>   *	Actually, this whole structure is a big mistake.  It mixes I/O
>   *	data with strictly "high-level" data, and it has to know about
> @@ -807,6 +819,9 @@ struct net_device
>  	void			*ax25_ptr;	/* AX.25 specific data */
>  	struct wireless_dev	*ieee80211_ptr;	/* IEEE 802.11 specific data,
>  						   assign before registering */
> +	void			*rps_maps;	/* Array of per-NAPI maps for
> +						   receive packet steering */
> +	int			rps_num_maps;	/* Number of RPS maps */
> 
>  /*
>   * Cache line mostly used on receive path (including eth_type_trans())
> @@ -1217,6 +1232,9 @@ struct softnet_data
>  	struct Qdisc		*output_queue;
>  	struct sk_buff_head	input_pkt_queue;
>  	struct list_head	poll_list;
> +
> +	struct call_single_data	csd;
> +
>  	struct sk_buff		*completion_queue;
> 
>  	struct napi_struct	backlog;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 0c68fbd..95feac7 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -396,6 +396,8 @@ struct sk_buff {
> 
>  	__u16			vlan_tci;
> 
> +	__u32			rxhash;
> +
>  	sk_buff_data_t		transport_header;
>  	sk_buff_data_t		network_header;
>  	sk_buff_data_t		mac_header;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 28b0b9e..735e7e3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1976,6 +1976,162 @@ int weight_p __read_mostly = 64;            /*
> old backlog weight */
> 
>  DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
> 
> +static u32 simple_hashrnd;
> +
> +/**
> + * get_rps_cpu is called from netif_receive_skb and returns the target
> + * CPU from the RPS map of the receiving NAPI instance for a given skb.
> + */
> +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> +{
> +	u32 addr1, addr2, ports;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u32 hash, ihl;
> +	u8 ip_proto;
> +	int cpu;
> +	struct rps_map *map = NULL;
> +
> +	if (dev->rps_num_maps) {
> +		/*
> +		 * Locate the map corresponding to the NAPI queue that
> +		 * the packet was received on.
> +		 */
> +		int index = skb_get_rx_queue(skb);
> +		if (index < 0 || index >= dev->rps_num_maps)
> +			index = 0;
> +
> +		map = (struct rps_map *)
> +		    (dev->rps_maps + (RPS_MAP_SIZE * index));
> +		if (!map->len)
> +			map = NULL;
> +	}
> +
> +	if (!map)
> +		return -1;
> +
> +	hash = skb->rxhash;
> +	if (hash)
> +		goto got_hash; /* Skip hash computation on packet header */
> +
> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!pskb_may_pull(skb, sizeof(*ip)))
> +			return -1;
> +
> +		ip = (struct iphdr *) skb->data;
> +		ip_proto = ip->protocol;
> +		addr1 = ip->saddr;
> +		addr2 = ip->daddr;
> +		ihl = ip->ihl;
> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		if (!pskb_may_pull(skb, sizeof(*ip6)))
> +			return -1;
> +
> +		ip6 = (struct ipv6hdr *) skb->data;
> +		ip_proto = ip6->nexthdr;
> +		addr1 = ip6->saddr.s6_addr32[3];
> +		addr2 = ip6->daddr.s6_addr32[3];
> +		ihl = (40 >> 2);
> +		break;
> +	default:
> +		return -1;
> +	}
> +	ports = 0;
> +	switch (ip_proto) {
> +	case IPPROTO_TCP:
> +	case IPPROTO_UDP:
> +	case IPPROTO_DCCP:
> +	case IPPROTO_ESP:
> +	case IPPROTO_AH:
> +	case IPPROTO_SCTP:
> +	case IPPROTO_UDPLITE:
> +		if (pskb_may_pull(skb, (ihl * 4) + 4))
> +			ports = *((u32 *) (skb->data + (ihl * 4)));
> +		break;
> +
> +	default:
> +		break;
> +	}
> +
> +	hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);

I wonder if you tried to exchange addr1/addr2 and port1/port2 so that conntracking/routing
is also sped up...

ie make sure hash will be the same regardless of the direction of packet.

union {
	u32 port;
	u16 ports[2];
} p;

if (addr1 < addr2)
	swap(addr1, addr2);

if (p.ports[0] < p.ports[1])
	swap(p.ports[0], p.ports[1]);

hash = jhash_3words(addr1, addr2, ports, simple_hashrnd);


I think I'll try to extend your patches with TX completion recycling too.

I.e., record in the skb the cpu number of the original sender, and queue the skb to a
remote queue for destruction (sock_wfree() call and expensive scheduler calls...)

(This probably needs driver cooperation, instead of calling consume_skb(),
use a different function)

Thanks
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ