Message-ID: <1284739159.3391.95.camel@edumazet-laptop>
Date:	Fri, 17 Sep 2010 17:59:19 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	David Miller <davem@...emloft.net>
Cc:	netdev <netdev@...r.kernel.org>
Subject: [PATCH net-next-2.6] ipv4: add rcu annotations in route.c

Use the __rcu attribute where appropriate.

Use rcu_dereference_raw() in contexts where no lock is held.

Use rcu_dereference_check() in contexts where the chain spinlock is
held.

Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
---
Tested with CONFIG_PROVE_RCU=y.
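
For readers unfamiliar with the annotations, here is a minimal sketch of the
pattern this patch applies (hypothetical struct item / struct bucket, not the
actual route.c types): an __rcu-annotated pointer that sparse can check, a
raw dereference where no lock is held and the value is only tested, and
rcu_dereference_check() backed by lockdep where the per-bucket spinlock is
held. The patch wraps that last form in the rt_safederef() helper added below.

	#include <linux/types.h>
	#include <linux/spinlock.h>
	#include <linux/rcupdate.h>

	struct item {
		struct item __rcu *next;
	};

	struct bucket {
		struct item __rcu *chain;	/* sparse flags plain loads/stores */
		spinlock_t	   lock;
	};

	/* No lock held: the pointer is only tested, never dereferenced. */
	static bool bucket_empty(struct bucket *b)
	{
		return rcu_dereference_raw(b->chain) == NULL;
	}

	/* Chain spinlock held: lockdep verifies the condition under PROVE_RCU. */
	static struct item *bucket_first_locked(struct bucket *b)
	{
		return rcu_dereference_check(b->chain, lockdep_is_held(&b->lock));
	}

	/* Writer side: publication still goes through rcu_assign_pointer(). */
	static void bucket_push(struct bucket *b, struct item *it)
	{
		spin_lock_bh(&b->lock);
		rcu_assign_pointer(it->next,
				   rcu_dereference_check(b->chain,
							 lockdep_is_held(&b->lock)));
		rcu_assign_pointer(b->chain, it);
		spin_unlock_bh(&b->lock);
	}

With CONFIG_PROVE_RCU=y, calling bucket_first_locked() without the lock (and
outside an RCU read-side critical section) should trigger a lockdep splat,
which is exactly the kind of misuse these annotations are meant to catch.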

 include/net/dst.h |    2 
 net/ipv4/route.c  |  182 ++++++++++++++++++++++++++------------------
 2 files changed, 110 insertions(+), 74 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 81d1413..ce4a9b9 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -94,7 +94,7 @@ struct dst_entry {
 	unsigned long		lastuse;
 	union {
 		struct dst_entry *next;
-		struct rtable    *rt_next;
+		struct rtable __rcu *rt_next;
 		struct rt6_info   *rt6_next;
 		struct dn_route  *dn_next;
 	};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e24d48d..d011911 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -199,7 +199,7 @@ const __u8 ip_tos2prio[16] = {
  */
 
 struct rt_hash_bucket {
-	struct rtable	*chain;
+	struct rtable __rcu	*chain;
 };
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
@@ -249,7 +249,7 @@ static inline void rt_hash_lock_init(void)
 #endif
 
 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
-static unsigned			rt_hash_mask __read_mostly;
+static unsigned int		rt_hash_mask __read_mostly;
 static unsigned int		rt_hash_log  __read_mostly;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
@@ -281,7 +281,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
 	struct rtable *r = NULL;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rt_hash_table[st->bucket].chain)
+		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 			continue;
 		rcu_read_lock_bh();
 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
@@ -301,23 +301,24 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 {
 	struct rt_cache_iter_state *st = seq->private;
 
-	r = r->dst.rt_next;
+	r = rcu_dereference_bh(r->dst.rt_next);
 	while (!r) {
 		rcu_read_unlock_bh();
 		do {
 			if (--st->bucket < 0)
 				return NULL;
-		} while (!rt_hash_table[st->bucket].chain);
+		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
+		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 	}
-	return rcu_dereference_bh(r);
+	return r;
 }
 
 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 					struct rtable *r)
 {
 	struct rt_cache_iter_state *st = seq->private;
+
 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 		if (dev_net(r->dst.dev) != seq_file_net(seq))
 			continue;
@@ -340,6 +341,7 @@ static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct rt_cache_iter_state *st = seq->private;
+
 	if (*pos)
 		return rt_cache_get_idx(seq, *pos - 1);
 	st->genid = rt_genid(seq_file_net(seq));
@@ -622,7 +624,7 @@ static inline int rt_fast_clean(struct rtable *rth)
 	/* Kill broadcast/multicast entries very aggresively, if they
 	   collide in hash table with more useful entries */
 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rth->fl.iif && rth->dst.rt_next;
+		rth->fl.iif && rcu_dereference_raw(rth->dst.rt_next);
 }
 
 static inline int rt_valuable(struct rtable *rth)
@@ -708,6 +710,9 @@ static inline int rt_is_expired(struct rtable *rth)
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
+#define rt_safederef(X, LOCKP) \
+	rcu_dereference_check(X, lockdep_is_held(LOCKP))
+
 /*
  * Perform a full scan of hash table and free all entries.
  * Can be called by a softirq or a process.
@@ -716,51 +721,55 @@ static inline int rt_is_expired(struct rtable *rth)
 static void rt_do_flush(int process_context)
 {
 	unsigned int i;
-	struct rtable *rth, *next;
-	struct rtable * tail;
+	struct rtable *rth, *next, *tail;
+	spinlock_t *lockp;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
 		if (process_context && need_resched())
 			cond_resched();
-		rth = rt_hash_table[i].chain;
+		rth = rcu_dereference_raw(rt_hash_table[i].chain);
 		if (!rth)
 			continue;
 
-		spin_lock_bh(rt_hash_lock_addr(i));
+		lockp = rt_hash_lock_addr(i);
+		spin_lock_bh(lockp);
 #ifdef CONFIG_NET_NS
 		{
-		struct rtable ** prev, * p;
+		struct rtable __rcu **prev;
+		struct rtable *p;
 
-		rth = rt_hash_table[i].chain;
+		rth = rt_safederef(rt_hash_table[i].chain, lockp);
 
 		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail; tail = tail->dst.rt_next)
+		for (tail = rth; tail != NULL;
+		     tail = rt_safederef(tail->dst.rt_next, lockp))
 			if (!rt_is_expired(tail))
 				break;
+
 		if (rth != tail)
-			rt_hash_table[i].chain = tail;
+			rcu_assign_pointer(rt_hash_table[i].chain, tail);
 
 		/* call rt_free on entries after the tail requiring flush */
 		prev = &rt_hash_table[i].chain;
-		for (p = *prev; p; p = next) {
-			next = p->dst.rt_next;
+		for (p = rt_safederef(*prev, lockp); p; p = next) {
+			next = rt_safederef(p->dst.rt_next, lockp);
 			if (!rt_is_expired(p)) {
 				prev = &p->dst.rt_next;
 			} else {
-				*prev = next;
+				rcu_assign_pointer(*prev, next);
 				rt_free(p);
 			}
 		}
 		}
 #else
-		rth = rt_hash_table[i].chain;
-		rt_hash_table[i].chain = NULL;
+		rth = rt_safederef(rt_hash_table[i].chain, lockp);
+		rcu_assign_pointer(rt_hash_table[i].chain, NULL);
 		tail = NULL;
 #endif
-		spin_unlock_bh(rt_hash_lock_addr(i));
+		spin_unlock_bh(lockp);
 
 		for (; rth != tail; rth = next) {
-			next = rth->dst.rt_next;
+			next = rcu_dereference_raw(rth->dst.rt_next);
 			rt_free(rth);
 		}
 	}
@@ -784,14 +793,15 @@ static void rt_do_flush(int process_context)
  * Returns 0 if an alias is found.
  * Returns ONE if rth has no alias before itself.
  */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
+static int has_noalias(const struct rtable *head, const struct rtable *rth,
+		       spinlock_t *lockp)
 {
 	const struct rtable *aux = head;
 
 	while (aux != rth) {
 		if (compare_hash_inputs(&aux->fl, &rth->fl))
 			return 0;
-		aux = aux->dst.rt_next;
+		aux = rt_safederef(aux->dst.rt_next, lockp);
 	}
 	return ONE;
 }
@@ -800,7 +810,8 @@ static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	unsigned long delta;
@@ -817,6 +828,7 @@ static void rt_check_expire(void)
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
 		unsigned long length;
+		spinlock_t *lockp;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
@@ -826,14 +838,16 @@ static void rt_check_expire(void)
 
 		samples++;
 
-		if (*rthp == NULL)
+		if (rcu_dereference_raw(*rthp) == NULL)
 			continue;
 		length = 0;
-		spin_lock_bh(rt_hash_lock_addr(i));
-		while ((rth = *rthp) != NULL) {
-			prefetch(rth->dst.rt_next);
+		lockp = rt_hash_lock_addr(i);
+		spin_lock_bh(lockp);
+		while ((rth = rt_safederef(*rthp, lockp)) != NULL) {
+			prefetch(rcu_dereference_raw(rth->dst.rt_next));
 			if (rt_is_expired(rth)) {
-				*rthp = rth->dst.rt_next;
+				rcu_assign_pointer(*rthp,
+					rt_safederef(rth->dst.rt_next, lockp));
 				rt_free(rth);
 				continue;
 			}
@@ -851,17 +865,18 @@ nofree:
 					 * attributes don't unfairly skew
 					 * the length computation
 					 */
-					length += has_noalias(rt_hash_table[i].chain, rth);
+					length += has_noalias(rt_hash_table[i].chain, rth, lockp);
 					continue;
 				}
 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 				goto nofree;
 
 			/* Cleanup aged off entries. */
-			*rthp = rth->dst.rt_next;
+			rcu_assign_pointer(*rthp,
+				rt_safederef(rth->dst.rt_next, lockp));
 			rt_free(rth);
 		}
-		spin_unlock_bh(rt_hash_lock_addr(i));
+		spin_unlock_bh(lockp);
 		sum += length;
 		sum2 += length*length;
 	}
@@ -942,7 +957,8 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	static unsigned long last_gc;
 	static int rover;
 	static int equilibrium;
-	struct rtable *rth, **rthp;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	unsigned long now = jiffies;
 	int goal;
 
@@ -991,22 +1007,25 @@ static int rt_garbage_collect(struct dst_ops *ops)
 
 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 			unsigned long tmo = expire;
+			spinlock_t *lockp;
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = *rthp) != NULL) {
+			lockp = rt_hash_lock_addr(k);
+			spin_lock_bh(lockp);
+			while ((rth = rt_safederef(*rthp, lockp)) != NULL) {
 				if (!rt_is_expired(rth) &&
 					!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
 					rthp = &rth->dst.rt_next;
 					continue;
 				}
-				*rthp = rth->dst.rt_next;
+				rcu_assign_pointer(*rthp,
+					rt_safederef(rth->dst.rt_next, lockp));
 				rt_free(rth);
 				goal--;
 			}
-			spin_unlock_bh(rt_hash_lock_addr(k));
+			spin_unlock_bh(lockp);
 			if (goal <= 0)
 				break;
 		}
@@ -1061,27 +1080,30 @@ out:	return 0;
 /*
  * Returns number of entries in a hash chain that have different hash_inputs
  */
-static int slow_chain_length(const struct rtable *head)
+static int slow_chain_length(const struct rtable *head, spinlock_t *lockp)
 {
 	int length = 0;
 	const struct rtable *rth = head;
 
 	while (rth) {
-		length += has_noalias(head, rth);
-		rth = rth->dst.rt_next;
+		length += has_noalias(head, rth, lockp);
+		rth = rt_safederef(rth->dst.rt_next, lockp);
 	}
 	return length >> FRACT_BITS;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt,
+static int rt_intern_hash(unsigned int hash, struct rtable *rt,
 			  struct rtable **rp, struct sk_buff *skb, int ifindex)
 {
-	struct rtable	*rth, **rthp;
+	struct rtable	*rth;
+	struct rtable __rcu **rthp;
 	unsigned long	now;
-	struct rtable *cand, **candp;
+	struct rtable *cand;
+	struct rtable __rcu **candp;
 	u32 		min_score;
 	int		chain_length;
 	int attempts = !in_softirq();
+	spinlock_t	*lockp;
 
 restart:
 	chain_length = 0;
@@ -1124,23 +1146,26 @@ restart:
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = *rthp) != NULL) {
+	lockp = rt_hash_lock_addr(hash);
+	spin_lock_bh(lockp);
+	while ((rth = rt_safederef(*rthp, lockp)) != NULL) {
 		if (rt_is_expired(rth)) {
-			*rthp = rth->dst.rt_next;
+			rcu_assign_pointer(*rthp,
+				rt_safederef(rth->dst.rt_next, lockp));
 			rt_free(rth);
 			continue;
 		}
 		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 			/* Put it first */
-			*rthp = rth->dst.rt_next;
+			rcu_assign_pointer(*rthp,
+				rt_safederef(rth->dst.rt_next, lockp));
 			/*
 			 * Since lookup is lockfree, the deletion
 			 * must be visible to another weakly ordered CPU before
 			 * the insertion at the start of the hash chain.
 			 */
 			rcu_assign_pointer(rth->dst.rt_next,
-					   rt_hash_table[hash].chain);
+				rt_safederef(rt_hash_table[hash].chain, lockp));
 			/*
 			 * Since lookup is lockfree, the update writes
 			 * must be ordered for consistency on SMP.
@@ -1148,7 +1173,7 @@ restart:
 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
 			dst_use(&rth->dst, now);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
+			spin_unlock_bh(lockp);
 
 			rt_drop(rt);
 			if (rp)
@@ -1181,12 +1206,15 @@ restart:
 		 * only 2 entries per bucket. We will see.
 		 */
 		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->dst.rt_next;
+			rcu_assign_pointer(*candp,
+				rt_safederef(cand->dst.rt_next, lockp));
 			rt_free(cand);
 		}
 	} else {
 		if (chain_length > rt_chain_length_max &&
-		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
+		    slow_chain_length(rt_safederef(rt_hash_table[hash].chain,
+							  lockp),
+				      lockp) > rt_chain_length_max) {
 			struct net *net = dev_net(rt->dst.dev);
 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
 			if (!rt_caching(net)) {
@@ -1194,7 +1222,7 @@ restart:
 					rt->dst.dev->name, num);
 			}
 			rt_emergency_hash_rebuild(net);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
+			spin_unlock_bh(lockp);
 
 			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 					ifindex, rt_genid(net));
@@ -1208,7 +1236,7 @@ restart:
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
-			spin_unlock_bh(rt_hash_lock_addr(hash));
+			spin_unlock_bh(lockp);
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -1237,14 +1265,17 @@ restart:
 		}
 	}
 
-	rt->dst.rt_next = rt_hash_table[hash].chain;
+	rcu_assign_pointer(rt->dst.rt_next,
+		rt_safederef(rt_hash_table[hash].chain, lockp));
 
 #if RT_CACHE_DEBUG >= 2
-	if (rt->dst.rt_next) {
+	if (rt_safederef(rt->dst.rt_next, lockp)) {
 		struct rtable *trt;
 		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
 		       hash, &rt->rt_dst);
-		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
+		for (trt = rt_safederef(rt->dst.rt_next, lockp);
+		     trt;
+		     trt = rt_safederef(trt->dst.rt_next, lockp))
 			printk(" . %pI4", &trt->rt_dst);
 		printk("\n");
 	}
@@ -1256,7 +1287,7 @@ restart:
 	 */
 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
-	spin_unlock_bh(rt_hash_lock_addr(hash));
+	spin_unlock_bh(lockp);
 
 skip_hashing:
 	if (rp)
@@ -1319,22 +1350,26 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned hash, struct rtable *rt)
+static void rt_del(unsigned int hash, struct rtable *rt)
 {
-	struct rtable **rthp, *aux;
+	struct rtable __rcu **rthp;
+	struct rtable *aux;
+	spinlock_t *lockp = rt_hash_lock_addr(hash);
 
 	rthp = &rt_hash_table[hash].chain;
-	spin_lock_bh(rt_hash_lock_addr(hash));
+
+	spin_lock_bh(lockp);
 	ip_rt_put(rt);
-	while ((aux = *rthp) != NULL) {
+	while ((aux = rt_safederef(*rthp, lockp)) != NULL) {
 		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->dst.rt_next;
+			rcu_assign_pointer(*rthp,
+				rt_safederef(aux->dst.rt_next, lockp));
 			rt_free(aux);
 			continue;
 		}
 		rthp = &aux->dst.rt_next;
 	}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
+	spin_unlock_bh(lockp);
 }
 
 /* called in rcu_read_lock() section */
@@ -1343,7 +1378,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 {
 	int i, k;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	struct rtable *rth, **rthp;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
 	__be32  skeys[2] = { saddr, 0 };
 	int  ikeys[2] = { dev->ifindex, 0 };
 	struct netevent_redirect netevent;
@@ -1373,10 +1409,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
 	for (i = 0; i < 2; i++) {
 		for (k = 0; k < 2; k++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
+			unsigned int hash = rt_hash(daddr, skeys[i], ikeys[k],
 						rt_genid(net));
 
-			rthp=&rt_hash_table[hash].chain;
+			rthp = &rt_hash_table[hash].chain;
 
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
@@ -1484,7 +1520,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   (rt->dst.expires &&
 			    time_after_eq(jiffies, rt->dst.expires))) {
-			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+			unsigned int hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
 #if RT_CACHE_DEBUG >= 1
@@ -2052,7 +2088,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    struct in_device *in_dev,
 			    __be32 daddr, __be32 saddr, u32 tos)
 {
-	struct rtable* rth = NULL;
+	struct rtable *rth = NULL;
 	int err;
 	unsigned hash;
 
@@ -2097,12 +2133,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			    .iif = dev->ifindex };
 	unsigned	flags = 0;
 	u32		itag = 0;
-	struct rtable * rth;
+	struct rtable	*rth;
 	unsigned	hash;
 	__be32		spec_dst;
 	int		err = -EINVAL;
 	int		free_res = 0;
-	struct net    * net = dev_net(dev);
+	struct net	*net = dev_net(dev);
 
 	/* IP on this device is disabled. */
 
@@ -2696,7 +2732,7 @@ out:	return err;
 int __ip_route_output_key(struct net *net, struct rtable **rp,
 			  const struct flowi *flp)
 {
-	unsigned hash;
+	unsigned int hash;
 	struct rtable *rth;
 
 	if (!rt_caching(net))


