[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1275409203.2738.227.camel@edumazet-laptop>
Date: Tue, 01 Jun 2010 18:20:03 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Patrick McHardy <kaber@...sh.net>
Cc: Netfilter Developers <netfilter-devel@...r.kernel.org>,
netdev <netdev@...r.kernel.org>
Subject: [RFC nf-next-2.6] conntrack: per cpu nf_conntrack_untracked
Le mardi 01 juin 2010 à 12:41 +0200, Patrick McHardy a écrit :
> > BTW, I notice nf_conntrack_untracked is incorrectly annotated
> > '__read_mostly'.
> >
> > It can be written very often :(
> >
> > Should'nt we special case it and let be really const ?
>
> That would need quite a bit of special-casing to avoid touching
> the reference counts. So far this is completely hidden, so I'd
> say it just shouldn't be marked __read_mostly. Alternatively we
> can make "untracked" a nfctinfo state.
I tried this suggestion (a new IP_CT_UNTRACKED ctinfo) on top of a per_cpu
untracked ct, but it's a bit hard.
For example, I cannot find a way to change ctnetlink_conntrack_event() :
if (ct == &nf_conntrack_untracked)
return 0;
Maybe this piece of code is not necessary because we should never get here
anyway; or does it mean several packets could store events for this (shared)
ct?
Obviously, an IPS_UNTRACKED status bit would be much easier to implement.
Would that be acceptable?
Preliminary patch with IP_CT_UNTRACKED, probably not working at all...
include/linux/netfilter/nf_conntrack_common.h | 3 +
include/net/netfilter/nf_conntrack.h | 11 +++--
include/net/netfilter/nf_conntrack_core.h | 2
net/ipv4/netfilter/nf_nat_core.c | 4 +
net/ipv4/netfilter/nf_nat_standalone.c | 2
net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 4 -
net/netfilter/nf_conntrack_core.c | 32 +++++++++------
net/netfilter/nf_conntrack_netlink.c | 6 +-
net/netfilter/xt_CT.c | 13 +++---
net/netfilter/xt_NOTRACK.c | 4 -
net/netfilter/xt_TEE.c | 8 +--
net/netfilter/xt_cluster.c | 2
net/netfilter/xt_conntrack.c | 2
net/netfilter/xt_socket.c | 2
14 files changed, 58 insertions(+), 37 deletions(-)
diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index 14e6d32..5f7c947 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -15,6 +15,9 @@ enum ip_conntrack_info {
IP_CT_DIR_ORIGINAL); may be a retransmission. */
IP_CT_NEW,
+ /* Untracked */
+ IP_CT_UNTRACKED,
+
/* >= this indicates reply direction */
IP_CT_IS_REPLY,
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index bde095f..884ade9 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -175,7 +175,7 @@ static inline struct nf_conn *
nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
{
*ctinfo = skb->nfctinfo;
- return (struct nf_conn *)skb->nfct;
+ return container_of(skb->nfct, struct nf_conn, ct_general);
}
/* decrement reference count on a conntrack */
@@ -261,7 +261,7 @@ extern s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
u32 seq);
/* Fake conntrack entry for untracked connections */
-extern struct nf_conn nf_conntrack_untracked;
+DECLARE_PER_CPU(struct nf_conn, pcpu_nf_conntrack_untracked);
/* Iterate over all conntracks: if iter returns true, it's deleted. */
extern void
@@ -291,7 +291,12 @@ static inline int nf_ct_is_dying(struct nf_conn *ct)
static inline int nf_ct_is_untracked(const struct sk_buff *skb)
{
- return (skb->nfct == &nf_conntrack_untracked.ct_general);
+ return (skb->nfctinfo == IP_CT_UNTRACKED);
+}
+
+static inline int nf_ct_is_tracked(const struct sk_buff *skb)
+{
+ return (skb->nfctinfo != IP_CT_UNTRACKED);
}
extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3d7524f..8dd05ea 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
struct nf_conn *ct = (struct nf_conn *)skb->nfct;
int ret = NF_ACCEPT;
- if (ct && ct != &nf_conntrack_untracked) {
+ if (ct && nf_ct_is_tracked(skb)) {
if (!nf_ct_is_confirmed(ct))
ret = __nf_conntrack_confirm(skb);
if (likely(ret == NF_ACCEPT))
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 4f8bddb..a797999 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -719,6 +719,7 @@ static int __init nf_nat_init(void)
{
size_t i;
int ret;
+ int cpu;
need_ipv4_conntrack();
@@ -742,7 +743,8 @@ static int __init nf_nat_init(void)
spin_unlock_bh(&nf_nat_lock);
/* Initialize fake conntrack so that NAT will skip it */
- nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+ for_each_possible_cpu(cpu)
+ per_cpu(pcpu_nf_conntrack_untracked,cpu).status |= IPS_NAT_DONE_MASK;
l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index beb2581..17af2bb 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -98,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
return NF_ACCEPT;
/* Don't try to NAT if this packet is not conntracked */
- if (ct == &nf_conntrack_untracked)
+ if (ctinfo == IP_CT_UNTRACKED)
return NF_ACCEPT;
nat = nfct_nat(ct);
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 9be8177..b67029c 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -208,8 +208,8 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
type = icmp6h->icmp6_type - 130;
if (type >= 0 && type < sizeof(noct_valid_new) &&
noct_valid_new[type]) {
- skb->nfct = &nf_conntrack_untracked.ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ skb->nfct = &__get_cpu_var(pcpu_nf_conntrack_untracked).ct_general;
+ skb->nfctinfo = IP_CT_UNTRACKED;
nf_conntrack_get(skb->nfct);
return NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index eeeb8bc..eea5df1 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -62,8 +62,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
-struct nf_conn nf_conntrack_untracked __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
+DEFINE_PER_CPU(struct nf_conn, pcpu_nf_conntrack_untracked);
+EXPORT_PER_CPU_SYMBOL(pcpu_nf_conntrack_untracked);
static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;
@@ -1185,10 +1185,16 @@ static void nf_ct_release_dying_list(struct net *net)
static void nf_conntrack_cleanup_init_net(void)
{
- /* wait until all references to nf_conntrack_untracked are dropped */
- while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
+ int cpu, use;
+ for (;;) {
+ use = 0;
+ for_each_possible_cpu(cpu)
+ use += atomic_read(&per_cpu(pcpu_nf_conntrack_untracked, cpu).ct_general.use) - 1;
+ /* wait until all references to nf_conntrack_untracked are dropped */
+ if (!use)
+ break;
schedule();
-
+ }
nf_conntrack_helper_fini();
nf_conntrack_proto_fini();
#ifdef CONFIG_NF_CONNTRACK_ZONES
@@ -1325,6 +1331,7 @@ static int nf_conntrack_init_init_net(void)
{
int max_factor = 8;
int ret;
+ int cpu;
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 512 buckets. >= 1GB machines have 16384 buckets. */
@@ -1362,14 +1369,15 @@ static int nf_conntrack_init_init_net(void)
if (ret < 0)
goto err_extend;
#endif
- /* Set up fake conntrack: to never be deleted, not in any hashes */
-#ifdef CONFIG_NET_NS
- nf_conntrack_untracked.ct_net = &init_net;
-#endif
- atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
- /* - and look it like as a confirmed connection */
- set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
+ /* Set up fake conntracks: to never be deleted, not in any hashes */
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(pcpu_nf_conntrack_untracked, cpu);
+ write_pnet(&ct->ct_net, &init_net);
+ atomic_set(&ct->ct_general.use, 1);
+ /* - and look it like as a confirmed connection */
+ __set_bit(IPS_CONFIRMED_BIT, &ct->status);
+ }
return 0;
#ifdef CONFIG_NF_CONNTRACK_ZONES
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index c42ff6a..ac21514 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -479,9 +479,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
unsigned int flags = 0, group;
int err;
- /* ignore our fake conntrack entry */
- if (ct == &nf_conntrack_untracked)
- return 0;
+// /* ignore our fake conntrack entry */
+// if (ct == &nf_conntrack_untracked)
+// return 0;
if (events & (1 << IPCT_DESTROY)) {
type = IPCTNL_MSG_CT_DELETE;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 562bf32..5723f9a 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -29,9 +29,13 @@ static unsigned int xt_ct_target(struct sk_buff *skb,
if (skb->nfct != NULL)
return XT_CONTINUE;
+ skb->nfctinfo = IP_CT_NEW;
+ if (info->flags & XT_CT_NOTRACK) {
+ ct = &__get_cpu_var(pcpu_nf_conntrack_untracked);
+ skb->nfctinfo = IP_CT_UNTRACKED;
+ }
atomic_inc(&ct->ct_general.use);
skb->nfct = &ct->ct_general;
- skb->nfctinfo = IP_CT_NEW;
return XT_CONTINUE;
}
@@ -67,8 +71,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
if (info->flags & XT_CT_NOTRACK) {
- ct = &nf_conntrack_untracked;
- atomic_inc(&ct->ct_general.use);
+ ct = &__get_cpu_var(pcpu_nf_conntrack_untracked);
goto out;
}
@@ -132,14 +135,14 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par)
struct nf_conn *ct = info->ct;
struct nf_conn_help *help;
- if (ct != &nf_conntrack_untracked) {
+ if (!(info->flags & XT_CT_NOTRACK)) {
help = nfct_help(ct);
if (help)
module_put(help->helper->me);
nf_ct_l3proto_module_put(par->family);
+ nf_ct_put(info->ct);
}
- nf_ct_put(info->ct);
}
static struct xt_target xt_ct_tg __read_mostly = {
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index 512b912..9547b58 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -23,8 +23,8 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
If there is a real ct entry correspondig to this packet,
it'll hang aroun till timing out. We don't deal with it
for performance reasons. JK */
- skb->nfct = &nf_conntrack_untracked.ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ skb->nfct = &__get_cpu_var(pcpu_nf_conntrack_untracked).ct_general;
+ skb->nfctinfo = IP_CT_UNTRACKED;
nf_conntrack_get(skb->nfct);
return XT_CONTINUE;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 859d9fd..b8e46b3 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -104,8 +104,8 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef WITH_CONNTRACK
/* Avoid counting cloned packets towards the original connection. */
nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_conntrack_untracked.ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ skb->nfct = &__get_cpu_var(pcpu_nf_conntrack_untracked).ct_general;
+ skb->nfctinfo = IP_CT_UNTRACKED;
nf_conntrack_get(skb->nfct);
#endif
/*
@@ -177,8 +177,8 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
#ifdef WITH_CONNTRACK
nf_conntrack_put(skb->nfct);
- skb->nfct = &nf_conntrack_untracked.ct_general;
- skb->nfctinfo = IP_CT_NEW;
+ skb->nfct = &__get_cpu_var(pcpu_nf_conntrack_untracked).ct_general;
+ skb->nfctinfo = IP_CT_UNTRACKED;
nf_conntrack_get(skb->nfct);
#endif
if (par->hooknum == NF_INET_PRE_ROUTING ||
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 30b95a1..b26f94d 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -120,7 +120,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
if (ct == NULL)
return false;
- if (ct == &nf_conntrack_untracked)
+ if (nf_ct_is_untracked(skb))
return false;
if (ct->master)
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 39681f1..95bcfbb 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -123,7 +123,7 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
ct = nf_ct_get(skb, &ctinfo);
- if (ct == &nf_conntrack_untracked)
+ if (nf_ct_is_untracked(skb))
statebit = XT_CONNTRACK_STATE_UNTRACKED;
else if (ct != NULL)
statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3d54c23..1f760b5 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -127,7 +127,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
* reply packet of an established SNAT-ted connection. */
ct = nf_ct_get(skb, &ctinfo);
- if (ct && (ct != &nf_conntrack_untracked) &&
+ if (ct && nf_ct_is_tracked(skb) &&
((iph->protocol != IPPROTO_ICMP &&
ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) ||
(iph->protocol == IPPROTO_ICMP &&
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists