commit eedcf97aca5a1c483475216696d5e79fa868ab66 Author: Stephen Buck Date: Thu Aug 12 16:19:19 2010 +1000 Put a tproxy socket in the correct bind bucket When __inet_inherit_port() is called on a tproxy connection, the parent's inet_num is not the same as the child's inet_num. This results in the child socket being added to the wrong inet_bind_bucket. In addition to this, the discrepency causes the wrong lock is held, so that if 2 connections with different destination ports arrive at the same time, the list may be corrupted. Similarly, the wrong lock is held when the socket is removed from the bind hash in __inet_put_port(). When the parent's inet_num is different to the child's inet_num, the inet_bind_bucket needs to be retrieved directly from the hash table. This keeps the child's inet_num and icsk_bind_hash in sync eliminating the described locking errors. Below is a walk through the original code, using a scenario where the tproxy listen socket is bound to port 9999, and has a new http connection redirected to it: // inet_sk(sk)->inet_num = 9999 // inet_sk(child)->inet_num = 80 // inet_csk(sk)->icsk_bind_hash is a bucket where // inet_csk(sk)->icsk_bind_hash->port = 9999 // inet_csk(child)->icsk_bind_hash is uninitialised void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, table->bhash_size); // head is retrieved using inet_sk(child)->inet_num, so we end up with // head = &table->bhash[80] struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); // Here the child is queued in the same list as the parent. This means that // the child is queued in a inet_bind_bucket where // inet_bind_bucket->port = 9999, but the lock being held is bhash[80].lock tb = inet_csk(sk)->icsk_bind_hash; sk_add_bind_node(child, &tb->owners); // Now inet_csk(child)->icsk_bind_hash is assigned to the bucket where // inet_bind_bucket->port = 9999 resulting in the wrong lock being taken // for the list being manipulated in __inet_put_port() inet_csk(child)->icsk_bind_hash = tb; spin_unlock(&head->lock); } // inet_sk(sk)->inet_num = 9999 // inet_sk(child)->inet_num = 80 // inet_csk(sk)->icsk_bind_hash is a bucket where // inet_csk(sk)->icsk_bind_hash->port = 9999 // inet_csk(child)->icsk_bind_hash is a bucket where // inet_csk(sk)->icsk_bind_hash->port = 9999 diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2b79377..1ba982f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -105,13 +105,24 @@ EXPORT_SYMBOL(inet_put_port); void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, + unsigned short port = inet_sk(child)->inet_num; + const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (tb->port != port) { + struct hlist_node *node; + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (net_eq(ib_net(tb), sock_net(sk)) && tb->port == port) + break; + } + if (!node) + tb = inet_bind_bucket_create(table->bind_bucket_cachep, + sock_net(sk), head, port); + } sk_add_bind_node(child, &tb->owners); inet_csk(child)->icsk_bind_hash = tb; spin_unlock(&head->lock);