Message-ID: <49CE568A.9090104@cosmosbay.com>
Date: Sat, 28 Mar 2009 17:55:38 +0100
From: Eric Dumazet <dada1@...mosbay.com>
To: Patrick McHardy <kaber@...sh.net>
CC: Stephen Hemminger <shemminger@...tta.com>,
David Miller <davem@...emloft.net>,
Rick Jones <rick.jones2@...com>, netdev@...r.kernel.org,
netfilter-devel@...r.kernel.org
Subject: [PATCH] netfilter: finer grained nf_conn locking
Eric Dumazet wrote:
> Patrick McHardy wrote:
>> Stephen Hemminger wrote:
>>
>>> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state {
>>>
>>> struct ip_ct_tcp
>>> {
>>> + spinlock_t lock;
>>> struct ip_ct_tcp_state seen[2]; /* connection parameters per
>>> direction */
>>> u_int8_t state; /* state of the connection (enum
>>> tcp_conntrack) */
>>> /* For detecting stale connections */
>> Eric already posted a patch to use an array of locks, which is
>> a better approach IMO since it keeps the size of the conntrack
>> entries down.
>
> Yes, we probably can use an array for short-lived lock sections.
>
> The extra cost of computing the lock address is quite small if
> the array size is known at compile time.
>
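For illustration: with a power-of-two array whose size is a compile-time
constant, the lock address is a shift and a mask, no divide (untested
sketch, all names made up):

/* untested sketch; assumes <linux/spinlock.h> and <linux/cache.h> */
#define CT_NR_LOCKS 256 /* any power of two */
static spinlock_t ct_locks[CT_NR_LOCKS]; /* spin_lock_init() each at boot */

static inline spinlock_t *ct_lock_addr(const void *obj)
{
	/* divide by the object alignment first, so that consecutive
	 * objects spread over all slots; the mask is folded by the
	 * compiler since CT_NR_LOCKS is a compile-time constant. */
	return &ct_locks[((unsigned long)obj / L1_CACHE_BYTES) &
			 (CT_NR_LOCKS - 1)];
}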
> Stephen, we might also use this same array to protect the nf_conn_acct_find(ct)
> updates as well (I am referring to your RFT 3/4 patch:
> "Use mod_timer_noact to remove nf_conntrack_lock"); something like the sketch below.
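Sketch only, on top of your 3/4 patch, so names and context are
assumptions, not part of the patch further down:

/* sketch: accounting update under the same per-conntrack lock,
 * instead of nf_conntrack_lock (assumes the RFT 3/4 patch);
 * caller context: conntrack packet path, softirq */
struct nf_conn_counter *acct;

spin_lock_bh(nf_conn_lock_addr(ct));
acct = nf_conn_acct_find(ct);
if (acct) {
	acct[CTINFO2DIR(ctinfo)].packets++;
	acct[CTINFO2DIR(ctinfo)].bytes += skb->len;
}
spin_unlock_bh(nf_conn_lock_addr(ct));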
>
> Here is a repost of patch Patrick is referring to :
>
>
> [PATCH] netfilter: Get rid of central rwlock in tcp conntracking
>
> TCP connection tracking suffers from huge contention on a global rwlock
> used to protect TCP conntrack state.
>
> As the tcp conntrack states of different connections are independent
> of each other, we can switch to fine grained locking. Using an array
> of spinlocks avoids enlarging the connection tracking structures, yet
> gives reasonable fanout.
>
> tcp_print_conntrack() doesn't need to lock anything to read
> ct->proto.tcp.state, so this speeds up /proc/net/ip_conntrack as well.
>
> nf_conntrack_hash_rnd_initted & nf_conntrack_hash_rnd are declared __read_mostly.
>
Hi Patrick
Apparently we could not finish the removal of tcp_lock for 2.6.30 :(
Stephen suggested using a 4-byte hole in struct nf_conntrack,
but that only works on 64-bit arches, and only if sizeof(spinlock_t) <= 4.
We could do a hybrid thing: use nf_conn.ct_general.lock on 64-bit arches
when sizeof(spinlock_t) <= 4, and fall back to a carefully sized array
of spinlocks in all other cases...
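The helpers go in a new include/net/netfilter/nf_conntrack_lock.h; that
file is not part of the diff below, so here is an untested sketch of what
it could contain, inferred from the call sites:

/* untested sketch of the missing header, inferred from the callers */
#ifndef _NF_CONNTRACK_LOCK_H
#define _NF_CONNTRACK_LOCK_H

#include <linux/spinlock.h>
#include <linux/hash.h>
#include <net/netfilter/nf_conntrack.h>

enum {
	NF_CONN_LOCK_INTERNAL,	/* lock fits in the 32bit hole of ct_general */
	NF_CONN_LOCK_EXTERNAL,	/* lock taken from the hashed global array */
};

extern spinlock_t *nf_conn_hlocks;
extern unsigned int nf_conn_hlocks_mask;

static inline int nf_conn_lock_type(void)
{
#if BITS_PER_LONG == 64
	if (sizeof(spinlock_t) <= sizeof(int))
		return NF_CONN_LOCK_INTERNAL;
#endif
	return NF_CONN_LOCK_EXTERNAL;
}

static inline spinlock_t *nf_conn_lock_addr(struct nf_conn *ct)
{
#if BITS_PER_LONG == 64
	if (nf_conn_lock_type() == NF_CONN_LOCK_INTERNAL)
		return (spinlock_t *)&ct->ct_general.pad;
#endif
	return nf_conn_hlocks +
	       (hash_ptr(ct, 32) & nf_conn_hlocks_mask);
}

static inline void nf_conn_lock_init(struct nf_conn *ct)
{
	if (nf_conn_lock_type() == NF_CONN_LOCK_INTERNAL)
		spin_lock_init(nf_conn_lock_addr(ct));
}

#endif /* _NF_CONNTRACK_LOCK_H */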
Thank you
[PATCH] netfilter: finer grained nf_conn locking
Introduce a fine grained lock infrastructure for nf_conn.
Where possible, we use a 32bit hole on 64bit arches;
otherwise we use a global array of hashed spinlocks, so the
size of "struct nf_conn" doesn't change.
Get rid of the central tcp_lock rwlock used in TCP conntracking,
using this infrastructure for better performance on SMP.
"tbench 8" results on my 8 core machine (32bit kernel, with
conntracking on) : 2319 MB/s instead of 2284 MB/s
Signed-off-by: Eric Dumazet <dada1@...mosbay.com>
---
include/linux/skbuff.h | 9 ++-
include/net/netfilter/nf_conntrack_l4proto.h | 2
net/netfilter/nf_conntrack_core.c | 47 +++++++++++++++++
net/netfilter/nf_conntrack_netlink.c | 4 -
net/netfilter/nf_conntrack_proto_tcp.c | 35 +++++-------
5 files changed, 74 insertions(+), 23 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb1981f..c737f47 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -96,7 +96,14 @@ struct pipe_inode_info;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack {
- atomic_t use;
+ atomic_t use;
+#if BITS_PER_LONG == 64
+ /*
+ * On 64bit arches, we might use this 32bit hole for spinlock
+ * (if a spinlock_t fits)
+ */
+ int pad;
+#endif
};
#endif
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index ba32ed7..d66bea9 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -63,7 +63,7 @@ struct nf_conntrack_l4proto
/* convert protoinfo to nfnetink attributes */
int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
- const struct nf_conn *ct);
+ struct nf_conn *ct);
/* Calculate protoinfo nlattr size */
int (*nlattr_size)(void);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8020db6..408d287 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -32,6 +32,7 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_lock.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
@@ -523,6 +524,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
return ERR_PTR(-ENOMEM);
}
+ nf_conn_lock_init(ct);
atomic_set(&ct->ct_general.use, 1);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
@@ -1033,11 +1035,50 @@ void nf_conntrack_flush(struct net *net, u32 pid, int report)
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush);
+spinlock_t *nf_conn_hlocks __read_mostly;
+unsigned int nf_conn_hlocks_mask __read_mostly;
+
+static int nf_conn_hlocks_init(void)
+{
+ if (nf_conn_lock_type() == NF_CONN_LOCK_EXTERNAL) {
+ size_t sz;
+ int i;
+#if defined(CONFIG_PROVE_LOCKING)
+ unsigned int nr_slots = 256;
+#else
+ /* 4 nodes per cpu on VSMP, or 256 slots per cpu */
+ unsigned int nr_slots = max(256UL, ((4UL << INTERNODE_CACHE_SHIFT) /
+ sizeof(spinlock_t)));
+ nr_slots = roundup_pow_of_two(num_possible_cpus() * nr_slots);
+#endif
+ sz = nr_slots * sizeof(spinlock_t);
+ if (sz > PAGE_SIZE)
+ nf_conn_hlocks = vmalloc(sz);
+ else
+ nf_conn_hlocks = kmalloc(sz, GFP_KERNEL);
+ if (!nf_conn_hlocks)
+ return -ENOMEM;
+ nf_conn_hlocks_mask = nr_slots - 1;
+ for (i = 0; i < nr_slots; i++)
+ spin_lock_init(nf_conn_hlocks + i);
+ }
+ return 0;
+}
+
+static void nf_conn_hlocks_fini(void)
+{
+ if (is_vmalloc_addr(nf_conn_hlocks))
+ vfree(nf_conn_hlocks);
+ else
+ kfree(nf_conn_hlocks);
+}
+
static void nf_conntrack_cleanup_init_net(void)
{
nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
kmem_cache_destroy(nf_conntrack_cachep);
+ nf_conn_hlocks_fini();
}
static void nf_conntrack_cleanup_net(struct net *net)
@@ -1170,6 +1211,10 @@ static int nf_conntrack_init_init_net(void)
int max_factor = 8;
int ret;
+ ret = nf_conn_hlocks_init();
+ if (ret)
+ goto err_hlocks;
+
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 512 buckets. >= 1GB machines have 16384 buckets. */
if (!nf_conntrack_htable_size) {
@@ -1217,6 +1262,8 @@ err_helper:
err_proto:
kmem_cache_destroy(nf_conntrack_cachep);
err_cache:
+ nf_conn_hlocks_fini();
+err_hlocks:
return ret;
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c6439c7..89ea035 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -144,7 +144,7 @@ nla_put_failure:
}
static inline int
-ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct)
{
struct nf_conntrack_l4proto *l4proto;
struct nlattr *nest_proto;
@@ -351,7 +351,7 @@ nla_put_failure:
static int
ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
int event, int nowait,
- const struct nf_conn *ct)
+ struct nf_conn *ct)
{
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b5ccf2b..bb5fc24 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -23,14 +23,13 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_lock.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
/* "Be conservative in what you do,
be liberal in what you accept from others."
@@ -300,9 +299,7 @@ static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
{
enum tcp_conntrack state;
- read_lock_bh(&tcp_lock);
state = ct->proto.tcp.state;
- read_unlock_bh(&tcp_lock);
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
@@ -708,14 +705,14 @@ void nf_conntrack_tcp_update(const struct sk_buff *skb,
end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
- write_lock_bh(&tcp_lock);
+ spin_lock_bh(nf_conn_lock_addr(ct));
/*
* We have to worry for the ack in the reply packet only...
*/
if (after(end, ct->proto.tcp.seen[dir].td_end))
ct->proto.tcp.seen[dir].td_end = end;
ct->proto.tcp.last_end = end;
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -824,7 +821,7 @@ static int tcp_packet(struct nf_conn *ct,
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
- write_lock_bh(&tcp_lock);
+ spin_lock_bh(nf_conn_lock_addr(ct));
old_state = ct->proto.tcp.state;
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
@@ -854,7 +851,7 @@ static int tcp_packet(struct nf_conn *ct,
&& ct->proto.tcp.last_index == TCP_RST_SET)) {
/* Attempt to reopen a closed/aborted connection.
* Delete this connection and look up again. */
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
/* Only repeat if we can actually remove the timer.
* Destruction may already be in progress in process
@@ -890,7 +887,7 @@ static int tcp_packet(struct nf_conn *ct,
* that the client cannot but retransmit its SYN and
* thus initiate a clean new session.
*/
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: killing out of sync session ");
@@ -903,7 +900,7 @@ static int tcp_packet(struct nf_conn *ct,
ct->proto.tcp.last_end =
segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid packet ignored ");
@@ -912,7 +909,7 @@ static int tcp_packet(struct nf_conn *ct,
/* Invalid packet */
pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
dir, get_conntrack_index(th), old_state);
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
if (LOG_INVALID(net, IPPROTO_TCP))
nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
@@ -943,7 +940,7 @@ static int tcp_packet(struct nf_conn *ct,
if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
skb, dataoff, th, pf)) {
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
return -NF_ACCEPT;
}
in_window:
@@ -972,7 +969,7 @@ static int tcp_packet(struct nf_conn *ct,
timeout = nf_ct_tcp_timeout_unacknowledged;
else
timeout = tcp_timeouts[new_state];
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
if (new_state != old_state)
@@ -1090,12 +1087,12 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
#include <linux/netfilter/nfnetlink_conntrack.h>
static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
- const struct nf_conn *ct)
+ struct nf_conn *ct)
{
struct nlattr *nest_parms;
struct nf_ct_tcp_flags tmp = {};
- read_lock_bh(&tcp_lock);
+ spin_lock_bh(nf_conn_lock_addr(ct));
nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
if (!nest_parms)
goto nla_put_failure;
@@ -1115,14 +1112,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
tmp.flags = ct->proto.tcp.seen[1].flags;
NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
sizeof(struct nf_ct_tcp_flags), &tmp);
- read_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
nla_nest_end(skb, nest_parms);
return 0;
nla_put_failure:
- read_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
return -1;
}
@@ -1153,7 +1150,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
return -EINVAL;
- write_lock_bh(&tcp_lock);
+ spin_lock_bh(nf_conn_lock_addr(ct));
if (tb[CTA_PROTOINFO_TCP_STATE])
ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
@@ -1180,7 +1177,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
ct->proto.tcp.seen[1].td_scale =
nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
}
- write_unlock_bh(&tcp_lock);
+ spin_unlock_bh(nf_conn_lock_addr(ct));
return 0;
}
--