[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1272648388.2301.15.camel@edumazet-laptop>
Date: Fri, 30 Apr 2010 19:26:28 +0200
From: Eric Dumazet <eric.dumazet@...il.com>
To: Brian Bloniarz <bmb@...enacr.com>
Cc: hadi@...erus.ca, Changli Gao <xiaosuo@...il.com>,
David Miller <davem@...emloft.net>, therbert@...gle.com,
shemminger@...tta.com, netdev@...r.kernel.org,
Eilon Greenstein <eilong@...adcom.com>
Subject: Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU
conversion
Le vendredi 30 avril 2010 à 09:55 -0400, Brian Bloniarz a écrit :
>
> This patch boots for me, I haven't noticed any strangeness yet.
>
> I ran a few benchmarks (the multicast fan-out mcasttest.c
> from last year, a few other things we have lying around).
> I think I see a modest improvement from this and your other
> 2 packets. Presumably the big wins are where multiple cores
> perform bh for the same socket, that's not the case in
> these benchmarks. If it's appropriate:
>
> Tested-by: Brian Bloniarz <bmb@...enacr.com>
>
> > Next one will be able to coalesce wakeup calls (they'll be delayed at
> > the end of net_rx_action(), like a patch I did last year to help
> > multicast reception)
>
> Keep em coming :)
Thanks for testing !
Here is a respin of "net: relax dst refcnt in input path"
patch for net-next-2.6
Not ready for inclusion, but seems to work quite well on multicast
load : I get about 20% more packets on mcasttest
(Avoid atomic ops on dst entries on input path, and partly on forwading
path). On mccasttest, all sockets share same dst, so producer/consumers
all fight on a single cache line.
Old ref (for informations) :
http://kerneltrap.org/mailarchive/linux-netdev/2009/7/22/6248753
Not-Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
include/linux/skbuff.h | 45 +++++++++++++++++++++++++++++++++-
include/net/dst.h | 47 +++++++++++++++++++++++++++++++++---
include/net/route.h | 2 -
include/net/sock.h | 2 +
net/bridge/br_netfilter.c | 2 -
net/core/dev.c | 3 ++
net/core/skbuff.c | 3 +-
net/core/sock.c | 6 ++++
net/ipv4/arp.c | 2 -
net/ipv4/icmp.c | 8 +++---
net/ipv4/ip_forward.c | 1
net/ipv4/ip_fragment.c | 2 -
net/ipv4/ip_input.c | 2 -
net/ipv4/ip_options.c | 11 ++++----
net/ipv4/netfilter.c | 8 +++---
net/ipv4/route.c | 15 +++++++----
net/ipv4/xfrm4_input.c | 2 -
net/ipv6/ip6_tunnel.c | 2 -
net/netfilter/nf_queue.c | 2 +
net/sched/sch_generic.c | 2 -
20 files changed, 136 insertions(+), 31 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 82f5116..6195bcf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -414,16 +414,59 @@ struct sk_buff {
#include <asm/system.h>
+/*
+ * skb might have a dst pointer attached, refcounted or not
+ * _skb_dst low order bit is set if refcount was taken
+ */
+#define SKB_DST_NOREF 1UL
+#define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
+
+/**
+ * skb_dst - returns skb dst_entry
+ * @skb: buffer
+ *
+ * Returns skb dst_entry, regardless of reference taken or not.
+ */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
- return (struct dst_entry *)skb->_skb_dst;
+ return (struct dst_entry *)(skb->_skb_dst & SKB_DST_PTRMASK);
}
+/**
+ * skb_dst_set - sets skb dst
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was taken on dst and should
+ * be released by skb_dst_drop()
+ */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
skb->_skb_dst = (unsigned long)dst;
}
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was _not_ taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+ skb->_skb_dst = (unsigned long)dst | SKB_DST_NOREF;
+}
+
+/**
+ * skb_dst_is_noref - Test is skb dst isnt refcounted
+ * @skb: buffer
+ */
+static inline bool skb_dst_is_noref(const struct sk_buff *skb)
+{
+ return (skb->_skb_dst & SKB_DST_NOREF) && skb_dst(skb);
+}
+
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
return (struct rtable *)skb_dst(skb);
diff --git a/include/net/dst.h b/include/net/dst.h
index aac5a5f..ad6ea9e 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time)
dst->lastuse = time;
}
+static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
+{
+ dst->__use++;
+ dst->lastuse = time;
+}
+
static inline
struct dst_entry * dst_clone(struct dst_entry * dst)
{
@@ -177,11 +183,46 @@ struct dst_entry * dst_clone(struct dst_entry * dst)
}
extern void dst_release(struct dst_entry *dst);
+
+static inline void __skb_dst_drop(unsigned long _skb_dst)
+{
+ if (!(_skb_dst & SKB_DST_NOREF))
+ dst_release((struct dst_entry *)(_skb_dst & SKB_DST_PTRMASK));
+}
+
+/**
+ * skb_dst_drop - drops skb dst
+ * @skb: buffer
+ *
+ * Drops dst reference count if a reference was taken.
+ */
static inline void skb_dst_drop(struct sk_buff *skb)
{
- if (skb->_skb_dst)
- dst_release(skb_dst(skb));
- skb->_skb_dst = 0UL;
+ if (skb->_skb_dst) {
+ __skb_dst_drop(skb->_skb_dst);
+ skb->_skb_dst = 0UL;
+ }
+}
+
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+ nskb->_skb_dst = oskb->_skb_dst;
+ if (!(nskb->_skb_dst & SKB_DST_NOREF))
+ dst_clone(skb_dst(nskb));
+}
+
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+ if (skb->_skb_dst & SKB_DST_NOREF) {
+ skb->_skb_dst &= ~SKB_DST_NOREF;
+ dst_clone(skb_dst(skb));
+ }
}
/* Children define the path of the packet through the
diff --git a/include/net/route.h b/include/net/route.h
index 2c9fba7..443f6d4 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -112,7 +112,7 @@ extern void rt_cache_flush_batch(void);
extern int __ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
-extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
+extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin, bool noref);
extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev);
extern void ip_rt_send_redirect(struct sk_buff *skb);
diff --git a/include/net/sock.h b/include/net/sock.h
index d361c77..0a0f14d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -598,6 +598,8 @@ static inline int sk_stream_memory_free(struct sock *sk)
/* OOB backlog add */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
+ /* dont let skb dst not referenced, we are going to leave rcu lock */
+ skb_dst_force(skb);
if (!sk->sk_backlog.tail) {
sk->sk_backlog.head = sk->sk_backlog.tail = skb;
} else {
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 4c4977d..c943ad4 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -350,7 +350,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
}
nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
if (dnat_took_place(skb)) {
- if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+ if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev, false))) {
struct flowi fl = {
.nl_u = {
.ip4_u = {
diff --git a/net/core/dev.c b/net/core/dev.c
index 100dcbd..c331b0e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2047,6 +2047,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
+ if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+ skb_dst_force(skb);
__qdisc_update_bstats(q, skb->len);
if (sch_direct_xmit(skb, q, dev, txq, root_lock))
__qdisc_run(q);
@@ -2055,6 +2057,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
+ skb_dst_force(skb);
rc = qdisc_enqueue_root(skb, q);
qdisc_run(q);
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4218ff4..f400196 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -531,7 +531,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->transport_header = old->transport_header;
new->network_header = old->network_header;
new->mac_header = old->mac_header;
- skb_dst_set(new, dst_clone(skb_dst(old)));
+
+ skb_dst_copy(new, old);
new->rxhash = old->rxhash;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
diff --git a/net/core/sock.c b/net/core/sock.c
index 5104175..894bed6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
*/
skb_len = skb->len;
+ /* we escape from rcu protected region, make sure we dont leak
+ * a norefcounted dst
+ */
+ skb_dst_force(skb);
+
spin_lock_irqsave(&list->lock, flags);
skb->dropcount = atomic_read(&sk->sk_drops);
__skb_queue_tail(list, skb);
@@ -1535,6 +1540,7 @@ static void __release_sock(struct sock *sk)
do {
struct sk_buff *next = skb->next;
+ WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6e74706..502ac9f 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -854,7 +854,7 @@ static int arp_process(struct sk_buff *skb)
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- ip_route_input(skb, tip, sip, 0, dev) == 0) {
+ ip_route_input(skb, tip, sip, 0, dev, true) == 0) {
rt = skb_rtable(skb);
addr_type = rt->rt_type;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f3d339f..a113c08 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
err = __ip_route_output_key(net, &rt2, &fl);
else {
struct flowi fl2 = {};
- struct dst_entry *odst;
+ unsigned long odst;
fl2.fl4_dst = fl.fl4_src;
if (ip_route_output_key(net, &rt2, &fl2))
goto relookup_failed;
/* Ugh! */
- odst = skb_dst(skb_in);
+ odst = skb_in->_skb_dst; /* save old dst */
err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
- RT_TOS(tos), rt2->u.dst.dev);
+ RT_TOS(tos), rt2->u.dst.dev, false);
dst_release(&rt2->u.dst);
rt2 = skb_rtable(skb_in);
- skb_dst_set(skb_in, odst);
+ skb_in->_skb_dst = odst; /* restore old dst */
}
if (err)
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index af10942..0f58609 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -57,6 +57,7 @@ int ip_forward(struct sk_buff *skb)
struct rtable *rt; /* Route we use */
struct ip_options * opt = &(IPCB(skb)->opt);
+/* pr_err("ip_forward() skb->dst=%lx\n", skb->_skb_dst);*/
if (skb_warn_if_lro(skb))
goto drop;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 75347ea..cbcde7a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -220,7 +220,7 @@ static void ip_expire(unsigned long arg)
if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
const struct iphdr *iph = ip_hdr(head);
int err = ip_route_input(head, iph->daddr, iph->saddr,
- iph->tos, head->dev);
+ iph->tos, head->dev, false);
if (unlikely(err))
goto out_rcu_unlock;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f8ab7a3..5d365e8 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -332,7 +332,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev);
+ skb->dev, true);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4c09a31..1b65d68 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
unsigned char *optptr = skb_network_header(skb) + opt->srr;
struct rtable *rt = skb_rtable(skb);
struct rtable *rt2;
+ unsigned long odst;
int err;
if (!opt->srr)
@@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- rt = skb_rtable(skb);
+ odst = skb->_skb_dst;
skb_dst_set(skb, NULL);
- err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev, false);
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
- ip_rt_put(rt2);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_drop(skb);
+ skb->_skb_dst = odst;
return -EINVAL;
}
- ip_rt_put(rt);
+ __skb_dst_drop(odst);
if (rt2->rt_type != RTN_LOCAL)
break;
/* Superfast 8) loopback forward */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 82fb43c..e505007 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi fl = {};
- struct dst_entry *odst;
+ unsigned long odst;
unsigned int hh_len;
unsigned int type;
@@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
if (ip_route_output_key(net, &rt, &fl) != 0)
return -1;
- odst = skb_dst(skb);
+ odst = skb->_skb_dst;
if (ip_route_input(skb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+ RT_TOS(iph->tos), rt->u.dst.dev, false) != 0) {
dst_release(&rt->u.dst);
return -1;
}
dst_release(&rt->u.dst);
- dst_release(odst);
+ __skb_dst_drop(odst);
}
if (skb_dst(skb)->error)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a947428..4f169ce 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2300,7 +2300,7 @@ martian_source:
}
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+ u8 tos, struct net_device *dev, bool noref)
{
struct rtable * rth;
unsigned hash;
@@ -2326,10 +2326,15 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->fl.mark == skb->mark &&
net_eq(dev_net(rth->u.dst.dev), net) &&
!rt_is_expired(rth)) {
- dst_use(&rth->u.dst, jiffies);
+ if (noref) {
+ dst_use_noref(&rth->u.dst, jiffies);
+ skb_dst_set_noref(skb, &rth->u.dst);
+ } else {
+ dst_use(&rth->u.dst, jiffies);
+ skb_dst_set(skb, &rth->u.dst);
+ }
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
- skb_dst_set(skb, &rth->u.dst);
return 0;
}
RT_CACHE_STAT_INC(in_hlist_search);
@@ -2991,7 +2996,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
local_bh_disable();
- err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev, false);
local_bh_enable();
rt = skb_rtable(skb);
@@ -3055,7 +3060,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (rt_is_expired(rt))
continue;
- skb_dst_set(skb, dst_clone(&rt->u.dst));
+ skb_dst_set_noref(skb, dst_clone(&rt->u.dst));
if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1, NLM_F_MULTI) <= 0) {
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index c791bb6..0366cbc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -28,7 +28,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev))
+ skb->dev, true))
goto drop;
}
return dst_input(skb);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 2599870..7ae0fa5 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -570,7 +570,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
} else {
ip_rt_put(rt);
if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
- skb2->dev) ||
+ skb2->dev, false) ||
skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
goto out;
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index c49ef21..cb3cde4 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -9,6 +9,7 @@
#include <linux/rcupdate.h>
#include <net/protocol.h>
#include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
#include "nf_internals.h"
@@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb,
dev_hold(physoutdev);
}
#endif
+ skb_dst_force(skb);
afinfo->saveroute(skb, entry);
status = qh->outfn(entry, queuenum);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index aeddabf..21e3976 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -179,7 +179,7 @@ static inline int qdisc_restart(struct Qdisc *q)
skb = dequeue_skb(q);
if (unlikely(!skb))
return 0;
-
+ WARN_ON_ONCE(skb_dst_is_noref(skb));
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists