Date:	Tue, 27 Jan 2009 17:23:38 +0100
From:	Eric Dumazet <dada1@...mosbay.com>
To:	Patrick McHardy <kaber@...sh.net>
CC:	Rick Jones <rick.jones2@...com>,
	Linux Network Development list <netdev@...r.kernel.org>,
	Netfilter Developers <netfilter-devel@...r.kernel.org>,
	Stephen Hemminger <shemminger@...tta.com>
Subject: Re: 32 core net-next stack/netfilter "scaling"

Patrick McHardy wrote:
> Eric Dumazet wrote:
>> [PATCH] netfilter: Get rid of central rwlock in tcp conntracking
>>
> >> TCP connection tracking suffers from huge contention on a global rwlock,
> >> used to protect tcp conntracking state.
> >> As each TCP conntrack state has no relation to the others, we can
> >> switch to a fine-grained lock, using a spinlock per "struct ip_ct_tcp".
> >>
> >> tcp_print_conntrack() doesn't need to lock anything to read
> >> ct->proto.tcp.state, so this speeds up /proc/net/ip_conntrack as well.
> 
> That's an interesting test case, but one lock per conntrack just for
> TCP tracking seems like overkill. We're trying to keep the conntrack
> structures as small as possible, so I'd prefer an array of spinlocks
> or something like that.

Please find a new version of the patch, using an array of spinlocks.

I chose to define the array at the nf_conntrack level: TCP conntracking
is the first user of this array, but we might find other users later.
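
For readers less familiar with the pattern, here is a minimal user-space
sketch of the same idea (illustrative only: NUM_LOCKS, struct tracked_obj
and obj_lock_addr() are made-up names, and pthread mutexes stand in for
kernel spinlocks). Each object hashes its own address into a shared,
fixed-size lock array, so no per-object lock storage is needed and
unrelated objects rarely collide on the same lock:

/* Build with: gcc -pthread sketch.c */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_LOCKS 64		/* power of two, analogous to CT_HASH_LOCK_SZ */

static pthread_mutex_t obj_locks[NUM_LOCKS];

struct tracked_obj {
	int state;		/* protected by the hashed lock, like ct->proto.tcp */
};

/* Hash the object's address into the lock array, as ct_lock_addr() does. */
static pthread_mutex_t *obj_lock_addr(const struct tracked_obj *obj)
{
	uintptr_t hash = (uintptr_t)obj;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	return &obj_locks[hash % NUM_LOCKS];
}

static void obj_set_state(struct tracked_obj *obj, int state)
{
	pthread_mutex_t *lock = obj_lock_addr(obj);

	pthread_mutex_lock(lock);
	obj->state = state;
	pthread_mutex_unlock(lock);
}

int main(void)
{
	struct tracked_obj a = { 0 }, b = { 0 };
	int i;

	/* Initialize the shared lock array once, like the loop in
	 * nf_conntrack_init_init_net(). */
	for (i = 0; i < NUM_LOCKS; i++)
		pthread_mutex_init(&obj_locks[i], NULL);

	obj_set_state(&a, 1);
	obj_set_state(&b, 2);
	printf("a.state=%d b.state=%d\n", a.state, b.state);
	return 0;
}

Two distinct objects usually hash to different slots, so updates to them
do not serialize on one global lock, while the lock array itself stays a
few kilobytes regardless of how many objects exist.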

Thanks

[PATCH] netfilter: Get rid of central rwlock in tcp conntracking

TCP connection tracking suffers from huge contention on a global rwlock,
used to protect tcp conntracking state.

As each TCP conntrack state has no relation to the others, we can
switch to a fine-grained lock. Using an array of spinlocks avoids
enlarging the connection tracking structures, while still giving
reasonable fanout.

tcp_print_conntrack() doesn't need to lock anything to read
ct->proto.tcp.state, so this speeds up /proc/net/ip_conntrack as well.

nf_conntrack_hash_rnd_initted and nf_conntrack_hash_rnd are now declared
__read_mostly.

Signed-off-by: Eric Dumazet <dada1@...mosbay.com>
Reported-by: Rick Jones <rick.jones2@...com>
---
 include/net/netfilter/nf_conntrack.h   |   32 +++++++++++++++++++++
 net/netfilter/nf_conntrack_core.c      |   10 ++++--
 net/netfilter/nf_conntrack_proto_tcp.c |   34 ++++++++++-------------
 3 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 2e0c536..288aff5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -129,6 +129,38 @@ struct nf_conn
 	struct rcu_head rcu;
 };
 
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
+	defined(CONFIG_PROVE_LOCKING)
+
+/*
+ * We reserve 16 locks per cpu (typical cache line size is 64 bytes)
+ * maxed to 512 locks so that ct_hlock[] uses at most 2048 bytes of memory.
+ * (on lockdep we have a quite big spinlock_t, so keep the size down there)
+ */
+#ifdef CONFIG_LOCKDEP
+#define CT_HASH_LOCK_SZ 64
+#elif NR_CPUS >= 32
+#define CT_HASH_LOCK_SZ 512
+#else
+#define CT_HASH_LOCK_SZ (roundup_pow_of_two(NR_CPUS) * 16)
+#endif
+
+extern spinlock_t ct_hlock[CT_HASH_LOCK_SZ];
+
+#else
+#define CT_HASH_LOCK_SZ 0
+#endif
+static inline spinlock_t *ct_lock_addr(const struct nf_conn *ct)
+{
+	if (CT_HASH_LOCK_SZ) {
+		unsigned long hash = (unsigned long)ct;
+		hash ^= hash >> 16;
+		hash ^= hash >> 8;
+		return &ct_hlock[hash % CT_HASH_LOCK_SZ];
+	}
+	return NULL;
+}
+
 static inline struct nf_conn *
 nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
 {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 90ce9dd..68822d8 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -61,9 +61,9 @@ struct nf_conn nf_conntrack_untracked __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
 
 static struct kmem_cache *nf_conntrack_cachep __read_mostly;
-
-static int nf_conntrack_hash_rnd_initted;
-static unsigned int nf_conntrack_hash_rnd;
+spinlock_t ct_hlock[CT_HASH_LOCK_SZ];
+static int nf_conntrack_hash_rnd_initted __read_mostly;
+static unsigned int nf_conntrack_hash_rnd __read_mostly;
 
 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
 				  unsigned int size, unsigned int rnd)
@@ -1141,7 +1141,7 @@ module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 static int nf_conntrack_init_init_net(void)
 {
 	int max_factor = 8;
-	int ret;
+	int i, ret;
 
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1174,6 +1174,8 @@ static int nf_conntrack_init_init_net(void)
 		ret = -ENOMEM;
 		goto err_cache;
 	}
+	for (i = 0; i < CT_HASH_LOCK_SZ; i++)
+		spin_lock_init(&ct_hlock[i]);
 
 	ret = nf_conntrack_proto_init();
 	if (ret < 0)
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index a1edb9c..247e82f 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -26,9 +26,6 @@
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_log.h>
 
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
 /* "Be conservative in what you do,
     be liberal in what you accept from others."
     If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -297,9 +294,7 @@ static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
 {
 	enum tcp_conntrack state;
 
-	read_lock_bh(&tcp_lock);
 	state = ct->proto.tcp.state;
-	read_unlock_bh(&tcp_lock);
 
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
@@ -705,14 +700,14 @@ void nf_conntrack_tcp_update(const struct sk_buff *skb,
 
 	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(ct_lock_addr(ct));
 	/*
 	 * We have to worry for the ack in the reply packet only...
 	 */
 	if (after(end, ct->proto.tcp.seen[dir].td_end))
 		ct->proto.tcp.seen[dir].td_end = end;
 	ct->proto.tcp.last_end = end;
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(ct_lock_addr(ct));
 	pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
 		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 		 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -821,7 +816,7 @@ static int tcp_packet(struct nf_conn *ct,
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	BUG_ON(th == NULL);
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(ct_lock_addr(ct));
 	old_state = ct->proto.tcp.state;
 	dir = CTINFO2DIR(ctinfo);
 	index = get_conntrack_index(th);
@@ -851,7 +846,7 @@ static int tcp_packet(struct nf_conn *ct,
 		        && ct->proto.tcp.last_index == TCP_RST_SET)) {
 			/* Attempt to reopen a closed/aborted connection.
 			 * Delete this connection and look up again. */
-			write_unlock_bh(&tcp_lock);
+			spin_unlock_bh(ct_lock_addr(ct));
 
 			/* Only repeat if we can actually remove the timer.
 			 * Destruction may already be in progress in process
@@ -887,7 +882,7 @@ static int tcp_packet(struct nf_conn *ct,
 			 * that the client cannot but retransmit its SYN and
 			 * thus initiate a clean new session.
 			 */
-			write_unlock_bh(&tcp_lock);
+			spin_unlock_bh(ct_lock_addr(ct));
 			if (LOG_INVALID(net, IPPROTO_TCP))
 				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 					  "nf_ct_tcp: killing out of sync session ");
@@ -900,7 +895,7 @@ static int tcp_packet(struct nf_conn *ct,
 		ct->proto.tcp.last_end =
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
 
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(ct_lock_addr(ct));
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid packet ignored ");
@@ -909,7 +904,7 @@ static int tcp_packet(struct nf_conn *ct,
 		/* Invalid packet */
 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 			 dir, get_conntrack_index(th), old_state);
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(ct_lock_addr(ct));
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid state ");
@@ -940,7 +935,7 @@ static int tcp_packet(struct nf_conn *ct,
 
 	if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
 			   skb, dataoff, th, pf)) {
-		write_unlock_bh(&tcp_lock);
+		spin_unlock_bh(ct_lock_addr(ct));
 		return -NF_ACCEPT;
 	}
      in_window:
@@ -969,7 +964,7 @@ static int tcp_packet(struct nf_conn *ct,
 		timeout = nf_ct_tcp_timeout_unacknowledged;
 	else
 		timeout = tcp_timeouts[new_state];
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(ct_lock_addr(ct));
 
 	nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
 	if (new_state != old_state)
@@ -1022,6 +1017,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		pr_debug("nf_ct_tcp: invalid new deleting.\n");
 		return false;
 	}
+	spin_lock_init(ct_lock_addr(ct));
 
 	if (new_state == TCP_CONNTRACK_SYN_SENT) {
 		/* SYN packet */
@@ -1092,7 +1088,7 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
 	struct nlattr *nest_parms;
 	struct nf_ct_tcp_flags tmp = {};
 
-	read_lock_bh(&tcp_lock);
+	spin_lock_bh(ct_lock_addr(ct));
 	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
 	if (!nest_parms)
 		goto nla_put_failure;
@@ -1112,14 +1108,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
 	tmp.flags = ct->proto.tcp.seen[1].flags;
 	NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
 		sizeof(struct nf_ct_tcp_flags), &tmp);
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(ct_lock_addr(ct));
 
 	nla_nest_end(skb, nest_parms);
 
 	return 0;
 
 nla_put_failure:
-	read_unlock_bh(&tcp_lock);
+	spin_unlock_bh(ct_lock_addr(ct));
 	return -1;
 }
 
@@ -1150,7 +1146,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
 	    nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
 		return -EINVAL;
 
-	write_lock_bh(&tcp_lock);
+	spin_lock_bh(ct_lock_addr(ct));
 	if (tb[CTA_PROTOINFO_TCP_STATE])
 		ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
 
@@ -1177,7 +1173,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
 		ct->proto.tcp.seen[1].td_scale =
 			nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
 	}
-	write_unlock_bh(&tcp_lock);
+	spin_unlock_bh(ct_lock_addr(ct));
 
 	return 0;
 }

