[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20081223114202.GA14173@ioremap.net>
Date: Tue, 23 Dec 2008 14:42:02 +0300
From: Evgeniy Polyakov <zbr@...emap.net>
To: David Miller <davem@...emloft.net>
Cc: netdev@...r.kernel.org
Subject: Re: [PATCH] Allowing more than 64k bound to zero port connections.
On Mon, Dec 22, 2008 at 07:51:16PM -0800, David Miller (davem@...emloft.net) wrote:
> > Attached patch allows to remove this limit. Currently inet port
> > selection algorithm runs over the whole bind hash table and checks if
> > appropriate hash bucket does not use randomly selected port. When it
> > found given cell, system binds socket to the selected port. If sockets
> > are not freed, this will be finished after local port range is
> > exhausted, not even trying to check if bound sockets have reuse socket
> > option and thus could share the bucket.
>
> I've reviewed this enough to believe that it is implemented
> properly.
>
> However I want to do some research about socket semantics in
> this area before applying this. I'm travelling and don't
> have my favorite books with me, so this will have to wait
> until later this week.
Ok, no problem, have a nice vacation.
I've attached an updated patch (tested on .24 though), which fixes a race
where a 'usual' socket can sneak into the bucket so that it stops being
fastreuse, while we still add an additional fastreuse socket there,
which may then trigger a WARN_ON.
The fix is to check whether the bucket changed its fastreuse to negative and
start again in that case; otherwise the socket can be safely added. The
subsequent bucket search will not scan the whole table, but will take the
first random port which matches our fastreuse expectations, since we already
know that all buckets are non-empty. This small optimization affects
only the case when all buckets are non-empty and we failed to insert
a reuse socket because a usual one sneaked in.
Signed-off-by: Evgeniy Polyakov <zbr@...emap.net>
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5cc182f..757b6a9 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -80,6 +80,7 @@ struct inet_bind_bucket {
struct net *ib_net;
unsigned short port;
signed short fastreuse;
+ int num_owners;
struct hlist_node node;
struct hlist_head owners;
};
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bd1278a..67788e4 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -99,18 +99,31 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
local_bh_disable();
if (!snum) {
int remaining, rover, low, high;
+ int smallest_size, smallest_rover, get_random = 0;
+again:
inet_get_local_port_range(&low, &high);
remaining = (high - low) + 1;
- rover = net_random() % remaining + low;
+ smallest_rover = rover = net_random() % remaining + low;
+ smallest_size = ~0;
do {
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
- if (tb->ib_net == net && tb->port == rover)
+ if (tb->ib_net == net && tb->port == rover) {
+ if (tb->fastreuse > 0 &&
+ sk->sk_reuse &&
+ sk->sk_state != TCP_LISTEN &&
+ tb->num_owners < smallest_size) {
+ smallest_size = tb->num_owners;
+ smallest_rover = rover;
+ if (get_random)
+ break;
+ }
goto next;
+ }
break;
next:
spin_unlock(&head->lock);
@@ -125,9 +138,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
* the top level, not from the 'break;' statement.
*/
ret = 1;
- if (remaining <= 0)
+ if (remaining <= 0) {
+ if (smallest_size != ~0) {
+ head = &hashinfo->bhash[inet_bhashfn(net, smallest_rover, hashinfo->bhash_size)];
+ spin_lock(&head->lock);
+ inet_bind_bucket_for_each(tb, node, &head->chain)
+ if (tb->port == smallest_rover && tb->fastreuse > 0)
+ goto tb_found;
+ spin_unlock(&head->lock);
+ get_random = 1;
+ goto again;
+ }
goto fail;
-
+ }
/* OK, here is the one we will use. HEAD is
* non-NULL and we hold it's mutex.
*/
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 4498190..4970a03 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -38,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->ib_net = hold_net(net);
tb->port = snum;
tb->fastreuse = 0;
+ tb->num_owners = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
}
@@ -61,6 +62,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
{
inet_sk(sk)->num = snum;
sk_add_bind_node(sk, &tb->owners);
+ tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
}
@@ -78,6 +80,7 @@ static void __inet_put_port(struct sock *sk)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
+ tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -450,9 +453,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
*/
inet_bind_bucket_for_each(tb, node, &head->chain) {
if (tb->ib_net == net && tb->port == port) {
- WARN_ON(hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
goto next_port;
+ WARN_ON(hlist_empty(&tb->owners));
if (!check_established(death_row, sk,
port, &tw))
goto ok;
--
Evgeniy Polyakov
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists