[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <56252257.3090508@oracle.com>
Date:	Mon, 19 Oct 2015 10:03:19 -0700
From:	santosh shilimkar <santosh.shilimkar@...cle.com>
To:	David Laight <David.Laight@...LAB.COM>
Cc:	netdev@...r.kernel.org, davem@...emloft.net,
	linux-kernel@...r.kernel.org, ssantosh@...nel.org
Subject: Re: [RFC PATCH] RDS: convert bind hash table to re-sizable hashtable
Hi David L,
On 10/14/2015 2:15 PM, Santosh Shilimkar wrote:
> From: Santosh Shilimkar <ssantosh@...nel.org>
>
> To further improve the RDS connection scalabilty on massive systems
> where number of sockets grows into tens of thousands  of sockets, there
> is a need of larger bind hashtable. Pre-allocated 8K or 16K table is
> not very flexible in terms of memory utilisation. The rhashtable
> infrastructure gives us the flexibility to grow the hashtbable based
> on use and also comes up with inbuilt efficient bucket(chain) handling.
>
> Cc: David Laight <David.Laight@...LAB.COM>
> Cc: David Miller <davem@...emloft.net>
> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@...cle.com>
> ---
> As promised in last series review, here is an RFC to conver RDS to make
> use of re-sizable hash tables.
Just want to check if you have any comments since you reviewed the
earlier patch and suggested the rhash.
>
>   net/rds/af_rds.c |  10 ++++-
>   net/rds/bind.c   | 127 ++++++++++++++++++++-----------------------------------
>   net/rds/rds.h    |   7 ++-
>   3 files changed, 58 insertions(+), 86 deletions(-)
>
> diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
> index 384ea1e..b5476aeb 100644
> --- a/net/rds/af_rds.c
> +++ b/net/rds/af_rds.c
> @@ -573,6 +573,7 @@ static void rds_exit(void)
>   	rds_threads_exit();
>   	rds_stats_exit();
>   	rds_page_exit();
> +	rds_bind_lock_destroy();
>   	rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
>   	rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
>   }
> @@ -582,11 +583,14 @@ static int rds_init(void)
>   {
>   	int ret;
>
> -	rds_bind_lock_init();
> +	ret = rds_bind_lock_init();
> +	if (ret)
> +		goto out;
>
>   	ret = rds_conn_init();
>   	if (ret)
> -		goto out;
> +		goto out_bind;
> +
>   	ret = rds_threads_init();
>   	if (ret)
>   		goto out_conn;
> @@ -620,6 +624,8 @@ out_conn:
>   	rds_conn_exit();
>   	rds_cong_exit();
>   	rds_page_exit();
> +out_bind:
> +	rds_bind_lock_destroy();
>   out:
>   	return ret;
>   }
> diff --git a/net/rds/bind.c b/net/rds/bind.c
> index bc6b93e..199e4cc 100644
> --- a/net/rds/bind.c
> +++ b/net/rds/bind.c
> @@ -38,54 +38,18 @@
>   #include <linux/ratelimit.h>
>   #include "rds.h"
>
> -struct bind_bucket {
> -	rwlock_t                lock;
> -	struct hlist_head	head;
> +static struct rhashtable bind_hash_table;
> +
> +static struct rhashtable_params ht_parms = {
> +	.nelem_hint = 768,
> +	.key_len = sizeof(u64),
> +	.key_offset = offsetof(struct rds_sock, rs_bound_key),
> +	.head_offset = offsetof(struct rds_sock, rs_bound_node),
> +	.max_size = 16384,
> +	.min_size = 1024,
> +	.automatic_shrinking = true,
>   };
>
> -#define BIND_HASH_SIZE 1024
> -static struct bind_bucket bind_hash_table[BIND_HASH_SIZE];
> -
> -static struct bind_bucket *hash_to_bucket(__be32 addr, __be16 port)
> -{
> -	return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
> -				  (BIND_HASH_SIZE - 1));
> -}
> -
> -/* must hold either read or write lock (write lock for insert != NULL) */
> -static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket,
> -					__be32 addr, __be16 port,
> -					struct rds_sock *insert)
> -{
> -	struct rds_sock *rs;
> -	struct hlist_head *head = &bucket->head;
> -	u64 cmp;
> -	u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
> -
> -	hlist_for_each_entry(rs, head, rs_bound_node) {
> -		cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
> -		      be16_to_cpu(rs->rs_bound_port);
> -
> -		if (cmp == needle) {
> -			rds_sock_addref(rs);
> -			return rs;
> -		}
> -	}
> -
> -	if (insert) {
> -		/*
> -		 * make sure our addr and port are set before
> -		 * we are added to the list.
> -		 */
> -		insert->rs_bound_addr = addr;
> -		insert->rs_bound_port = port;
> -		rds_sock_addref(insert);
> -
> -		hlist_add_head(&insert->rs_bound_node, head);
> -	}
> -	return NULL;
> -}
> -
>   /*
>    * Return the rds_sock bound at the given local address.
>    *
> @@ -94,18 +58,14 @@ static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket,
>    */
>   struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
>   {
> +	u64 key = ((u64)addr << 32) | port;
>   	struct rds_sock *rs;
> -	unsigned long flags;
> -	struct bind_bucket *bucket = hash_to_bucket(addr, port);
>
> -	read_lock_irqsave(&bucket->lock, flags);
> -	rs = rds_bind_lookup(bucket, addr, port, NULL);
> -	read_unlock_irqrestore(&bucket->lock, flags);
> -
> -	if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) {
> -		rds_sock_put(rs);
> +	rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
> +	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> +		rds_sock_addref(rs);
> +	else
>   		rs = NULL;
> -	}
>
>   	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
>   		ntohs(port));
> @@ -116,10 +76,9 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
>   /* returns -ve errno or +ve port */
>   static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
>   {
> -	unsigned long flags;
>   	int ret = -EADDRINUSE;
>   	u16 rover, last;
> -	struct bind_bucket *bucket;
> +	u64 key;
>
>   	if (*port != 0) {
>   		rover = be16_to_cpu(*port);
> @@ -130,23 +89,31 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
>   	}
>
>   	do {
> -		struct rds_sock *rrs;
>   		if (rover == 0)
>   			rover++;
>
> -		bucket = hash_to_bucket(addr, cpu_to_be16(rover));
> -		write_lock_irqsave(&bucket->lock, flags);
> -		rrs = rds_bind_lookup(bucket, addr, cpu_to_be16(rover), rs);
> -		write_unlock_irqrestore(&bucket->lock, flags);
> -		if (!rrs) {
> +		key = ((u64)addr << 32) | cpu_to_be16(rover);
> +		if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
> +			continue;
> +
> +		rs->rs_bound_key = key;
> +		rs->rs_bound_addr = addr;
> +		rs->rs_bound_port = cpu_to_be16(rover);
> +		rs->rs_bound_node.next = NULL;
> +		rds_sock_addref(rs);
> +		if (!rhashtable_insert_fast(&bind_hash_table,
> +					    &rs->rs_bound_node, ht_parms)) {
>   			*port = rs->rs_bound_port;
>   			ret = 0;
>   			rdsdebug("rs %p binding to %pI4:%d\n",
>   			  rs, &addr, (int)ntohs(*port));
>   			break;
>   		} else {
> -			rds_sock_put(rrs);
> +			rds_sock_put(rs);
> +			ret = -ENOMEM;
> +			break;
>   		}
> +
>   	} while (rover++ != last);
>
>   	return ret;
> @@ -154,23 +121,17 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
>
>   void rds_remove_bound(struct rds_sock *rs)
>   {
> -	unsigned long flags;
> -	struct bind_bucket *bucket =
> -		hash_to_bucket(rs->rs_bound_addr, rs->rs_bound_port);
> -
> -	write_lock_irqsave(&bucket->lock, flags);
>
> -	if (rs->rs_bound_addr) {
> -		rdsdebug("rs %p unbinding from %pI4:%d\n",
> -		  rs, &rs->rs_bound_addr,
> -		  ntohs(rs->rs_bound_port));
> +	if (!rs->rs_bound_addr)
> +		return;
>
> -		hlist_del_init(&rs->rs_bound_node);
> -		rds_sock_put(rs);
> -		rs->rs_bound_addr = 0;
> -	}
> +	rdsdebug("rs %p unbinding from %pI4:%d\n",
> +		 rs, &rs->rs_bound_addr,
> +		 ntohs(rs->rs_bound_port));
>
> -	write_unlock_irqrestore(&bucket->lock, flags);
> +	rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
> +	rds_sock_put(rs);
> +	rs->rs_bound_addr = 0;
>   }
>
>   int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
> @@ -217,10 +178,12 @@ out:
>   	return ret;
>   }
>
> -void rds_bind_lock_init(void)
> +void rds_bind_lock_destroy(void)
>   {
> -	int i;
> +	rhashtable_destroy(&bind_hash_table);
> +}
>
> -	for (i = 0; i < BIND_HASH_SIZE; i++)
> -		rwlock_init(&bind_hash_table[i].lock);
> +int rds_bind_lock_init(void)
> +{
> +	return rhashtable_init(&bind_hash_table, &ht_parms);
>   }
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index 121fb81..0068840 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -7,6 +7,7 @@
>   #include <rdma/rdma_cm.h>
>   #include <linux/mutex.h>
>   #include <linux/rds.h>
> +#include <linux/rhashtable.h>
>
>   #include "info.h"
>
> @@ -472,7 +473,8 @@ struct rds_sock {
>   	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
>   	 * support.
>   	 */
> -	struct hlist_node	rs_bound_node;
> +	struct rhash_head	rs_bound_node;
> +	u64			rs_bound_key;
>   	__be32			rs_bound_addr;
>   	__be32			rs_conn_addr;
>   	__be16			rs_bound_port;
> @@ -603,7 +605,8 @@ extern wait_queue_head_t rds_poll_waitq;
>   int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
>   void rds_remove_bound(struct rds_sock *rs);
>   struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
> -void rds_bind_lock_init(void);
> +int rds_bind_lock_init(void);
> +void rds_bind_lock_destroy(void);
>
>   /* cong.c */
>   int rds_cong_get_maps(struct rds_connection *conn);
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Powered by blists - more mailing lists
 
