[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.DEB.2.00.1301201542470.2060@pokey.mtv.corp.google.com>
Date: Sun, 20 Jan 2013 16:07:48 -0800 (PST)
From: Tom Herbert <therbert@...gle.com>
To: netdev@...r.kernel.org, davem@...emloft.net
cc: netdev@...kandruth.co.uk, eric.dumazet@...il.com
Subject: [PATCH 4/5 v2] soreuseport: TCP/IPv6 implementation
Motivation for soreuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have it's own listener socket. This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads. In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming simple event loop:
while (1) { accept(); process() }, wakeup does not promote fairness
among the sockets. We have seen the disproportion to be as high
as 3:1 ratio between thread accepting most connections and the one
accepting the fewest. With so_reusport the distribution is
uniform.
Signed-off-by: Tom Herbert <therbert@...gle.com>
---
include/net/inet6_hashtables.h | 5 ++++-
include/net/netfilter/nf_tproxy_core.h | 1 +
net/ipv6/inet6_connection_sock.c | 19 ++++++++++++++-----
net/ipv6/inet6_hashtables.c | 19 ++++++++++++++++---
net/ipv6/tcp_ipv6.c | 4 +++-
5 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 9e34c87..7ca75cb 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net,
extern struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ const struct in6_addr *saddr,
+ const __be16 sport,
const struct in6_addr *daddr,
const unsigned short hnum,
const int dif);
@@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net,
if (sk)
return sk;
- return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif);
+ return inet6_lookup_listener(net, hashinfo, saddr, sport,
+ daddr, hnum, dif);
}
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 1937964..36d9379 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -152,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
break;
case NFT_LOOKUP_LISTENER:
sk = inet6_lookup_listener(net, &tcp_hashinfo,
+ saddr, sport,
daddr, ntohs(dport),
in->ifindex);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3064785..e4297a3 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -32,6 +32,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
{
const struct sock *sk2;
const struct hlist_node *node;
+ int reuse = sk->sk_reuse;
+ int reuseport = sk->sk_reuseport;
+ int uid = sock_i_uid((struct sock *)sk);
/* We must walk the whole port owner list in this case. -DaveM */
/*
@@ -42,11 +45,17 @@ int inet6_csk_bind_conflict(const struct sock *sk,
if (sk != sk2 &&
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
- (!sk->sk_reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- ipv6_rcv_saddr_equal(sk, sk2))
- break;
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+ if ((!reuse || !sk2->sk_reuse ||
+ sk2->sk_state == TCP_LISTEN) &&
+ (!reuseport || !sk2->sk_reuseport ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid,
+ sock_i_uid((struct sock *)sk2))))) {
+ if (ipv6_rcv_saddr_equal(sk, sk2))
+ break;
+ }
+ }
}
return node != NULL;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index dea17fd..32b4a16 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
struct sock *inet6_lookup_listener(struct net *net,
- struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
+ struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
+ const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif)
{
struct sock *sk;
const struct hlist_nulls_node *node;
struct sock *result;
- int score, hiscore;
+ int score, hiscore, matches = 0, reuseport = 0;
+ u32 phash = 0;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
rcu_read_lock();
begin:
result = NULL;
- hiscore = -1;
+ hiscore = 0;
sk_nulls_for_each(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
hiscore = score;
result = sk;
+ reuseport = sk->sk_reuseport;
+ if (reuseport) {
+ phash = inet6_ehashfn(net, daddr, hnum,
+ saddr, sport);
+ matches = 1;
+ }
+ } else if (score == hiscore && reuseport) {
+ matches++;
+ if (((u64)phash * matches) >> 32 == 0)
+ result = sk;
+ phash = next_pseudo_random32(phash);
}
}
/*
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3701c3c..06087e5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -834,7 +834,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match.
*/
sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, &ipv6h->daddr,
+ &tcp_hashinfo, &ipv6h->saddr,
+ th->source, &ipv6h->daddr,
ntohs(th->source), inet6_iif(skb));
if (!sk1)
return;
@@ -1598,6 +1599,7 @@ do_time_wait:
struct sock *sk2;
sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+ &ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest), inet6_iif(skb));
if (sk2 != NULL) {
--
1.7.7.3
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists