lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20150224205108.26106.33975.stgit@ahduyck-vm-fedora20>
Date:	Tue, 24 Feb 2015 12:51:08 -0800
From:	Alexander Duyck <alexander.h.duyck@...hat.com>
To:	netdev@...r.kernel.org
Subject: [RFC PATCH 29/29] fib_trie: Push bits up one level,
 and move leaves up into parent key_vector array

This is the last bit to get the key_vector up into the parent key_vector
array.  With this the bits field is moved from the local node up to the
parent, and as a result the key_vector is now defunct.

Since the key_vector is now defunct we can do a number of things.  The
first was to remove the leaf allocation since they are now just elements in
the key_vector array contained in the tnode and trie structures, and the
second was to rearrange the fib_table_lookup and fib_find_node functions to
take advantage of the fact that the trie has been pushed up one level.

Signed-off-by: Alexander Duyck <alexander.h.duyck@...hat.com>
---
 net/ipv4/fib_trie.c |  484 ++++++++++++++++++++++++---------------------------
 1 file changed, 229 insertions(+), 255 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0953247..8f48a03 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -112,7 +112,7 @@ struct tnode {
 	t_key full_children;		/* KEYLENGTH bits needed */
 	struct key_vector __rcu *parent;
 	struct key_vector kv[0];
-#define tn_bits kv[0].bits
+#define tn_bits kv[1].tn_pos
 };
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -160,7 +160,6 @@ static size_t tnode_free_size;
 static const int sync_pages = 128;
 
 static struct kmem_cache *fn_alias_kmem __read_mostly;
-static struct kmem_cache *trie_leaf_kmem __read_mostly;
 
 static inline struct tnode *tn_info(struct key_vector *kv)
 {
@@ -190,9 +189,9 @@ static inline struct fib_table *table_info(struct key_vector *kv)
 /* This provides us with the number of children in this node, in the case of a
  * leaf this will return 0 meaning none of the children are accessible.
  */
-static inline unsigned long child_length(const struct key_vector *tn)
+static inline unsigned long child_length(struct key_vector *tn)
 {
-	return (1ul << tn->bits) & ~(1ul);
+	return (1ul << tn_info(tn)->tn_bits);
 }
 
 #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
@@ -297,9 +296,7 @@ static void __node_free_rcu(struct rcu_head *head)
 {
 	struct tnode *n = container_of(head, struct tnode, rcu);
 
-	if (!n->tn_bits)
-		kmem_cache_free(trie_leaf_kmem, n);
-	else if (n->tn_bits <= TNODE_KMALLOC_MAX)
+	if (n->tn_bits <= TNODE_KMALLOC_MAX)
 		kfree(n);
 	else
 		vfree(n);
@@ -325,55 +322,12 @@ static inline void empty_child_dec(struct key_vector *n)
 	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
 }
 
-static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
-{
-	struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
-	struct key_vector *l = kv->kv;
-
-	if (!kv)
-		return NULL;
-
-	/* initialize key vector */
-	l->key = key;
-	l->pos = 0;
-	l->bits = 0;
-	l->slen = fa->fa_slen;
-	l->tn_pos = 0;
-
-	/* link leaf to fib alias */
-	INIT_HLIST_HEAD(&l->leaf);
-	hlist_add_head(&fa->fa_list, &l->leaf);
-
-	return l;
-}
-
 /* Check whether a tnode 'n' is "full", i.e. it is an internal node
  * and no bits are skipped. See discussion in dyntree paper p. 6
  */
 static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
 {
-	return n && IS_TNODE(n) && ((n->tn_pos + n->bits) == tn->tn_pos);
-}
-
-static void drop_child(struct key_vector *tn, t_key key)
-{
-	/* update parent tnode statistics */
-	if (!IS_TRIE(tn)) {
-		unsigned long i = get_index(key, tn);
-		struct key_vector *n = get_child(tn + i);
-
-		if (n) {
-			empty_child_inc(tn);
-			if (tnode_full(tn, n))
-				tn_info(tn)->full_children--;
-		}
-
-		/* update offset to correct key_vector for update */
-		tn += i;
-	}
-
-	/* clear tnode pointers */
-	RCU_INIT_POINTER(tn->tnode, NULL);
+	return IS_TNODE(n) && ((n->pos + n->bits) == tn->tn_pos);
 }
 
 /* Add a child at position i overwriting the old value.
@@ -385,12 +339,12 @@ static void put_child(struct key_vector *tn, unsigned long i,
 	struct key_vector *tnode = get_child(n);
 
 	if (!IS_TRIE(tn)) {
-		struct key_vector *chi = get_child(tn + i);
+		struct key_vector *chi = tn + i;
 
 		BUG_ON(i >= child_length(tn));
 
 		/* update emptyChildren and fullChildren */
-		if (chi) {
+		if (get_child(chi)) {
 			empty_child_inc(tn);
 			if (tnode_full(tn, chi))
 				tn_info(tn)->full_children--;
@@ -399,7 +353,7 @@ static void put_child(struct key_vector *tn, unsigned long i,
 			struct key_vector *kv = get_vector(tn);
 
 			empty_child_dec(tn);
-			if (tnode_full(tn, tnode))
+			if (tnode_full(tn, n))
 				tn_info(tn)->full_children++;
 
 			/* update suffix length */
@@ -408,11 +362,12 @@ static void put_child(struct key_vector *tn, unsigned long i,
 		}
 
 		/* update offset to correct key_vector for update */
-		tn += i;
+		tn = chi;
 	}
 
 	tn->key = n->key;
 	tn->pos = n->pos;
+	tn->bits = n->bits;
 	tn->slen = n->slen;
 
 	rcu_assign_pointer(tn->tnode, tnode);
@@ -424,54 +379,60 @@ static void update_children(struct key_vector *tn)
 
 	/* update all of the child parent pointers */
 	for (i = IS_TRIE(tn) ? 1 : child_length(tn); i;) {
-		struct key_vector *inode = get_child(tn + --i);
-
-		if (!inode)
-			continue;
+		struct key_vector *inode = tn + --i;
 
 		/* Either update the children of a tnode that
 		 * already belongs to us or update the child
 		 * to point to ourselves.
 		 */
-		if (node_parent(inode) == tn)
-			update_children(inode);
-		else
-			node_set_parent(inode, tn);
+		if (IS_TNODE(inode)) {
+			struct key_vector *n = get_child(inode);
+
+			if (!n)
+				continue;
+
+			if (node_parent(n) == tn)
+				update_children(n);
+			else
+				node_set_parent(n, tn);
+		} else if (inode->leaf.first) {
+			/* update hash to point to us */
+			rcu_assign_pointer(inode->leaf.first->pprev,
+					   &inode->leaf.first);
+		}
 	}
 }
 
-static void leaf_init(struct key_vector *tn, t_key key, struct key_vector *l)
+static void leaf_init(struct key_vector *tn, t_key key, struct fib_alias *fa)
 {
-	/* link leaf to parent */
-	NODE_INIT_PARENT(l, tn);
-
 	/* update parent node stats */
 	if (!IS_TRIE(tn)) {
 		struct key_vector *kv = get_vector(tn);
 		unsigned long i = get_index(key, tn);
-		struct key_vector *n = get_child(tn + i);
 
 		BUG_ON(i >= child_length(tn));
 
-		if (!n)
-			empty_child_dec(tn);
-		else if (tnode_full(tn, n))
-			tn_info(tn)->full_children--;
+		empty_child_dec(tn);
 
 		/* update suffix length */
-		if (kv->slen < l->slen)
-			kv->slen = l->slen;
+		if (kv->slen < fa->fa_slen)
+			kv->slen = fa->fa_slen;
 
 		/* update offset to correct key_vector for update */
 		tn += i;
 	}
 
+	/* We should always be handed an empty slot */
+	BUG_ON(!hlist_empty(&tn->leaf));
+
 	/* populate key vector */
 	tn->key = key;
 	tn->pos = 0;
-	tn->slen = l->slen;
+	tn->bits = 0;
+	tn->slen = fa->fa_slen;
 
-	rcu_assign_pointer(tn->tnode, l);
+	/* clean the area and drop in the new leaf */
+	hlist_add_head(&fa->fa_list, &tn->leaf);
 }
 
 static struct key_vector *tnode_new(struct key_vector *tn, t_key key,
@@ -496,11 +457,11 @@ static struct key_vector *tnode_new(struct key_vector *tn, t_key key,
 	/* update parent node stats */
 	if (!IS_TRIE(tn)) {
 		unsigned long idx = get_index(key, tn);
-		struct key_vector *n = get_child(tn + idx);
+		struct key_vector *n = tn + idx;
 
 		BUG_ON(idx >= child_length(tn));
 
-		if (!n)
+		if (!get_child(n))
 			empty_child_dec(tn);
 		else if (tnode_full(tn, n))
 			tn_info(tn)->full_children--;
@@ -508,7 +469,7 @@ static struct key_vector *tnode_new(struct key_vector *tn, t_key key,
 			tn_info(tn)->full_children++;
 
 		/* update offset to correct key_vector for update */
-		tn += idx;
+		tn = n;
 	}
 
 	/* populate tn_info section */
@@ -518,7 +479,7 @@ static struct key_vector *tnode_new(struct key_vector *tn, t_key key,
 	else
 		tnode->empty_children = 1ul << bits;
 	tnode->kv[0].tn_pos = pos;
-	tnode->kv[0].bits = bits;
+	tnode->tn_bits = bits;
 
 	/* populate keys as though we are full of leaves */
 	for (i = (1ul << bits); i--;)
@@ -527,6 +488,7 @@ static struct key_vector *tnode_new(struct key_vector *tn, t_key key,
 	/* populate key vector */
 	tn->key = key;
 	tn->pos = pos;
+	tn->bits = bits;
 	tn->slen = pos;
 
 	rcu_assign_pointer(tn->tnode, tnode->kv);
@@ -618,7 +580,7 @@ static void tnode_free(struct key_vector *tn)
 
 	while (head) {
 		head = head->next;
-		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
+		tnode_free_size += TNODE_SIZE(child_length(tn));
 		node_free(tn);
 
 		tn = container_of(head, struct tnode, rcu)->kv;
@@ -664,11 +626,12 @@ static struct key_vector *resize_children(struct net *net, struct trie *t,
 
 	/* resize children now that oldtnode is freed */
 	for (i = child_length(tn); i;) {
-		struct key_vector *inode = get_child(tn + --i);
+		struct key_vector *inode = tn + --i;
+		struct key_vector *tnode = get_child(inode);
 
 		/* resize child node */
-		if (tnode_full(tn, inode))
-			tn = resize(net, t, inode);
+		if (tnode && tnode_full(tn, inode))
+			tn = resize(net, t, tnode);
 	}
 
 	return node_parent(tn);
@@ -689,7 +652,7 @@ static struct key_vector *inflate(struct net *net, struct trie *t,
 		return NULL;
 
 	tn = tnode_new(pn, oldtnode->key,
-		       oldtnode->tn_pos - 1, oldtnode->bits + 1);
+		       oldtnode->tn_pos - 1, tn_info(oldtnode)->tn_bits + 1);
 	if (!tn)
 		goto notnode;
 
@@ -712,7 +675,7 @@ static struct key_vector *inflate(struct net *net, struct trie *t,
 			continue;
 
 		/* A leaf or an internal node with skipped bits */
-		if (!tnode_full(oldtnode, tnode)) {
+		if (!tnode_full(oldtnode, inode)) {
 			put_child(tn, get_index(inode->key, tn), inode);
 			continue;
 		}
@@ -721,7 +684,7 @@ static struct key_vector *inflate(struct net *net, struct trie *t,
 		tnode_free_append(oldtnode, tnode);
 
 		/* An internal node with two children */
-		if (tnode->bits == 1) {
+		if (inode->bits == 1) {
 			put_child(tn, 2 * i + 1, tnode + 1);
 			put_child(tn, 2 * i, tnode);
 			continue;
@@ -742,11 +705,11 @@ static struct key_vector *inflate(struct net *net, struct trie *t,
 		 * two new keys.
 		 */
 		node1 = tnode_new(tn, inode->key | m,
-				  inode->pos, tnode->bits - 1);
+				  inode->pos, inode->bits - 1);
 		if (!node1)
 			goto nomem;
 		node0 = tnode_new(tn, inode->key,
-				  inode->pos, tnode->bits - 1);
+				  inode->pos, inode->bits - 1);
 
 		tnode_free_append(tn, node1);
 		if (!node0)
@@ -790,7 +753,7 @@ static struct key_vector *halve(struct net *net, struct trie *t,
 		return NULL;
 
 	tn = tnode_new(pn, oldtnode->key,
-		       oldtnode->tn_pos + 1, oldtnode->bits - 1);
+		       oldtnode->tn_pos + 1, tn_info(oldtnode)->tn_bits - 1);
 	if (!tn)
 		goto notnode;
 
@@ -842,13 +805,12 @@ notnode:
 static struct key_vector *collapse(struct net *net, struct trie *t,
 				   struct key_vector *oldtnode)
 {
-	struct key_vector *pn = node_parent(oldtnode);
+	struct key_vector *n, *pn = node_parent(oldtnode);
 	unsigned long i;
 
 	/* scan the tnode looking for that one child that might still exist */
 	for (i = child_length(oldtnode); i--;) {
-		struct key_vector *n = get_child(oldtnode + i);
-
+		n = get_child(oldtnode + i);
 		if (!n)
 			continue;
 
@@ -865,11 +827,24 @@ static struct key_vector *collapse(struct net *net, struct trie *t,
 		node_free(oldtnode);
 
 		/* resize child since it could be promoted to root */
-		return IS_TNODE(n) ? resize(net, t, n) : pn;
+		return IS_TNODE(oldtnode + i) ? resize(net, t, n) : pn;
 	}
 
 	/* no children, just update pointer to NULL */
-	drop_child(pn, oldtnode->key);
+	n = pn;
+
+	/* update parent tnode statistics */
+	if (!IS_TRIE(pn)) {
+		/* update offset to correct key_vector for update */
+		n = get_vector(oldtnode);
+
+		empty_child_inc(pn);
+		if (tnode_full(pn, n))
+			tn_info(pn)->full_children--;
+	}
+
+	/* clear tnode pointers */
+	RCU_INIT_POINTER(n->tnode, NULL);
 	node_free(oldtnode);
 
 	return pn;
@@ -909,7 +884,7 @@ static unsigned char update_suffix(struct key_vector *tn)
 		 * 0 and 1 << (bits - 1) could have that as their suffix
 		 * length.
 		 */
-		if ((slen + 1) >= (tn->pos + tp->bits))
+		if ((slen + 1) >= (tn->pos + tn->bits))
 			break;
 	}
 
@@ -1004,7 +979,7 @@ static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
 
 	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
 
-	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
+	return (used > 1) && (tn_info(tn)->tn_bits > 1) && ((100 * used) < threshold);
 }
 
 static inline bool should_collapse(struct key_vector *tn)
@@ -1014,7 +989,7 @@ static inline bool should_collapse(struct key_vector *tn)
 	used -= tn_info(tn)->empty_children;
 
 	/* account for bits == KEYLENGTH case */
-	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
+	if ((tn_info(tn)->tn_bits == KEYLENGTH) && tn_info(tn)->full_children)
 		used -= KEY_MAX;
 
 	/* One child or none, time to drop us from the trie */
@@ -1096,11 +1071,6 @@ static struct key_vector *resize(struct net *net, struct trie *t,
 
 static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
 {
-	struct key_vector *n = tp + get_index(l->key, tp);
-
-	/* update our local vector first */
-	n->slen = l->slen;
-
 	/* work our way back up the trie sorting out slen in the key vectors */
 	while (!IS_TRIE(tp)) {
 		/* if the suffix doesn't change then we are done */
@@ -1113,20 +1083,13 @@ static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l)
 
 static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l)
 {
-	struct key_vector *n = tn + get_index(l->key, tn);
-
-	/* update our local vector first */
-	n->slen = l->slen;
-
 	/* work our way back up the trie sorting out slen in the key vectors */
-	while (!IS_TRIE(tn)) {
-		n = get_vector(tn);
+	if (!IS_TRIE(tn)) {
+		struct key_vector *n = get_vector(tn);
 
 		/* if the suffix doesn't change then we are done */
 		if (n->slen < l->slen)
 			n->slen = l->slen;
-
-		tn = node_parent(tn);
 	}
 }
 
@@ -1142,9 +1105,6 @@ static struct key_vector *fib_find_node(struct trie *t,
 		n += index;
 
 		index = get_cindex(key, n);
-		n = get_child_rcu(n);
-		if (!n)
-			break;
 
 		/* This bit of code is a bit tricky but it combines multiple
 		 * checks into a single check.  The prefix consists of the
@@ -1160,7 +1120,11 @@ static struct key_vector *fib_find_node(struct trie *t,
 			return NULL;
 
 		/* keep searching until we find a perfect match leaf or NULL */
-	} while (IS_TNODE(n));
+		if (IS_LEAF(n))
+			break;
+
+		n = get_child_rcu(n);
+	} while (n);
 
 	return n;
 }
@@ -1203,18 +1167,13 @@ static struct fib_table *fib_insert_node(struct net *net, struct trie *t,
 					 struct key_vector *tp,
 					 struct fib_alias *new, t_key key)
 {
-	struct key_vector *tn, *l, *n;
+	struct key_vector *tn, *n;
 
 	/* allocate the new parent that must be replaced */
 	tn = vector_clone(tp);
 	if (!tn)
 		return NULL;
 
-	/* allocate the new leaf we will insert */
-	l = leaf_new(key, new);
-	if (!l)
-		goto noleaf;
-
 	/* retrieve child from parent node */
 	n = tp + get_index(key, tp);
 
@@ -1241,14 +1200,12 @@ static struct fib_table *fib_insert_node(struct net *net, struct trie *t,
 	}
 
 	/* Case 3: n is NULL, and will just insert a new leaf */
-	leaf_init(n, key, l);
+	leaf_init(n, key, new);
 
 	vector_replace(net, tp, tn);
 
 	return trie_rebalance(net, t, n);
 notnode:
-	node_free(l);
-noleaf:
 	vector_free(tn);
 
 	return NULL;
@@ -1266,6 +1223,9 @@ static struct fib_table *fib_insert_alias(struct net *net, struct trie *t,
 	if (!fa) {
 		struct fib_alias *last;
 
+		if (hlist_empty(&l->leaf) && !IS_TRIE(tp))
+			empty_child_dec(tp);
+
 		hlist_for_each_entry(last, &l->leaf, fa_list) {
 			if (new->fa_slen < last->fa_slen)
 				break;
@@ -1284,7 +1244,7 @@ static struct fib_table *fib_insert_alias(struct net *net, struct trie *t,
 		leaf_push_suffix(tp, l);
 	}
 
-	return table_info(t->kv);
+	return trie_rebalance(net, t, tp);
 }
 
 /* Caller must hold RTNL. */
@@ -1456,25 +1416,20 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	struct trie_use_stats __percpu *stats = t->stats;
 #endif
 	const t_key key = ntohl(flp->daddr);
-	struct key_vector *n, *pn, *tn;
-	unsigned long cindex;
+	struct key_vector *pn, *n, *l = t->kv;
+	unsigned long cindex, index = 0;
 	struct fib_alias *fa;
 
-	pn = t->kv;
-	cindex = 0;
-
-	tn = pn;
-	n = get_child_rcu(tn);
-	if (!n)
-		return -EAGAIN;
-
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->gets);
 #endif
+	pn = l;
+	cindex = index;
 
 	/* Step 1: Travel to the longest prefix match in the trie */
 	for (;;) {
-		unsigned long index = get_cindex(key, tn);
+		n = l + index;
+		index = get_cindex(key, n);
 
 		/* This bit of code is a bit tricky but it combines multiple
 		 * checks into a single check.  The prefix consists of the
@@ -1489,6 +1444,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		if (index & (~0ul << n->bits))
 			break;
 
+		/* grab the pointers for the next object */
+		l = get_child_rcu(n);
+		if (!l)
+			goto backtrace;
+
 		/* we have found a leaf. Prefixes have already been compared */
 		if (IS_LEAF(n))
 			goto found;
@@ -1496,43 +1456,19 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		/* only record pn and cindex if we are going to be chopping
 		 * bits later.  Otherwise we are just wasting cycles.
 		 */
-		if (tn->slen > tn->pos) {
-			pn = n;
+		if (n->slen > n->pos) {
+			pn = l;
 			cindex = index;
 		}
-
-		tn = n + index;
-
-		/* verify there is a tnode to go with the key vector */
-		n = get_child_rcu(tn);
-		if (unlikely(!n))
-			goto backtrace;
 	}
 
 	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
 	for (;;) {
-		/* This test verifies that none of the bits that differ
-		 * between the key and the prefix exist in the region of
-		 * the lsb and higher in the prefix.
-		 */
-		if (unlikely(prefix_mismatch(key, tn)) || (tn->slen <= tn->pos))
-			goto backtrace;
-
-		/* exit out and process leaf */
-		if (unlikely(IS_LEAF(n)))
-			break;
-
-		/* Don't bother recording parent info.  Since we are in
-		 * prefix match mode we will have to come back to wherever
-		 * we started this traversal anyway
-		 */
-
-		tn = n;
-
-		while ((n = get_child_rcu(tn)) == NULL) {
+		/* grab the pointers for the next object */
+		while ((l = get_child_rcu(n)) == NULL) {
 backtrace:
 #ifdef CONFIG_IP_FIB_TRIE_STATS
-			if (!n)
+			if (!l)
 				this_cpu_inc(stats->null_node_hit);
 #endif
 			/* If we are at cindex 0 there are no more bits for
@@ -1561,24 +1497,46 @@ backtrace:
 			cindex &= cindex - 1;
 
 			/* grab pointer for next child node */
-			tn = pn + cindex;
+			n = pn + cindex;
 		}
-	}
 
+		/* This test verifies that none of the bits that differ
+		 * between the key and the prefix exist in the region of
+		 * the lsb and higher in the prefix.
+		 */
+		if (unlikely(prefix_mismatch(key, n)) || (n->slen <= n->pos))
+			goto backtrace;
+
+		/* exit out and process leaf */
+		if (unlikely(IS_LEAF(n)))
+			break;
+
+		/* Don't bother recording parent info.  Since we are in
+		 * prefix match mode we will have to come back to wherever
+		 * we started this traversal anyway
+		 */
+
+		n = l;
+	}
 found:
 	/* Step 3: Process the leaf, if that fails fall back to backtracing */
-	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
-		struct fib_info *fi = fa->fa_info;
+	fa = hlist_entry(&l->leaf.first, typeof(*fa), fa_list.next);
+	hlist_for_each_entry_from_rcu(fa, fa_list) {
+		struct fib_info *fi;
 		int nhsel, err;
 
-		if (((key ^ n->key) >> fa->fa_slen) &&
-		    (fa->fa_slen != KEYLENGTH))
+		if (((unsigned long)(key ^ n->key) >> fa->fa_slen) &&
+		    ((BITS_PER_LONG > KEYLENGTH) ||
+		     (fa->fa_slen != KEYLENGTH)))
 			continue;
 		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
 			continue;
+
+		fi = fa->fa_info;
+
 		if (fi->fib_dead)
 			continue;
-		if (fa->fa_info->fib_scope < flp->flowi4_scope)
+		if (fi->fib_scope < flp->flowi4_scope)
 			continue;
 		fib_alias_accessed(fa);
 		err = fib_props[fa->fa_type].error;
@@ -1636,9 +1594,13 @@ static void fib_remove_alias(struct net *net, struct trie *t,
 	 * out parent suffix lengths as a part of trie_rebalance
 	 */
 	if (hlist_empty(&l->leaf)) {
-		drop_child(tp, l->key);
-		node_free(l);
-		trie_rebalance(net, t, tp);
+		l->slen = 0;
+
+		if (!IS_TRIE(tp)) {
+			empty_child_inc(tp);
+			trie_rebalance(net, t, tp);
+		}
+
 		return;
 	}
 
@@ -1723,51 +1685,55 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 /* Scan for the next leaf starting at the provided key value */
 static struct key_vector *leaf_walk_rcu(struct key_vector **pn, t_key key)
 {
-	struct key_vector *tn, *n = *pn;
-	unsigned long cindex;
+	struct key_vector *l, *n, *tn = *pn;
+	unsigned long cindex = get_index(key, tn);
 
 	/* this loop is meant to try and find the key in the trie */
-	do {
-		/* record parent and next child index */
-		tn = n;
-		cindex = get_index(key, tn);
+	while (cindex < child_length(tn)) {
+		n = tn + cindex++;
 
-		if (cindex >> tn->bits)
-			break;
-
-		/* descend into the next child */
-		n = get_child_rcu(tn + cindex++);
-		if (!n)
+		l = get_child_rcu(n);
+		if (!l)
 			break;
 
 		/* guarantee forward progress on the keys */
-		if (IS_LEAF(n) && (n->key >= key))
+		if (IS_LEAF(n)) {
+			if (n->key < key)
+				break;
 			goto found;
-	} while (IS_TNODE(n));
+		}
+
+		/* record parent and next child index */
+		tn = l;
+		cindex = get_index(key, tn);
+	}
 
 	/* this loop will search for the next leaf with a greater key */
 	while (!IS_TRIE(tn)) {
+		t_key pkey;
+
 		/* if we exhausted the parent node we will need to climb */
-		if (cindex >> tn->bits) {
-			t_key pkey = tn->key;
+		while (cindex < child_length(tn)) {
+			/* grab the next available node */
+			n = tn + cindex++;
 
-			tn = node_parent_rcu(tn);
-			cindex = get_index(pkey, tn) + 1;
-			continue;
-		}
+			l = get_child_rcu(n);
+			if (!l)
+				continue;
 
-		/* grab the next available node */
-		n = get_child_rcu(tn + cindex++);
-		if (!n)
-			continue;
+			/* no need to compare keys since we bumped the index */
+			if (IS_LEAF(n))
+				goto found;
 
-		/* no need to compare keys since we bumped the index */
-		if (IS_LEAF(n))
-			goto found;
+			/* Rescan start scanning in new node */
+			tn = l;
+			cindex = 0;
+		}
 
-		/* Rescan start scanning in new node */
-		tn = n;
-		cindex = 0;
+		pkey = tn->key;
+
+		tn = node_parent_rcu(tn);
+		cindex = get_index(pkey, tn) + 1;
 	}
 
 	*pn = tn;
@@ -1790,7 +1756,6 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 
 	/* walk trie in reverse order */
 	for (;;) {
-		unsigned char slen = 0;
 		struct key_vector *n;
 
 		if (!(cindex--)) {
@@ -1807,42 +1772,47 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 			continue;
 		}
 
-		/* grab the next available node */
-		n = get_child(pn + cindex);
-		if (!n)
-			continue;
+		/* locate key vector within the array */
+		n = pn + cindex;
 
 		if (IS_TNODE(n)) {
+			/* grab the next available node */
+			n = get_child(n);
+
 			/* record pn and cindex for leaf walking */
-			pn = n;
-			cindex = 1ul << n->bits;
+			if (n) {
+				pn = n;
+				cindex = child_length(n);
+			}
 
 			continue;
 		}
 
-		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
-			struct fib_info *fi = fa->fa_info;
+		if (!hlist_empty(&n->leaf)) {
+			unsigned char slen = 0;
 
-			if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
-				hlist_del_rcu(&fa->fa_list);
-				fib_release_info(fa->fa_info);
-				alias_free_mem_rcu(fa);
-				found++;
+			hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+				struct fib_info *fi = fa->fa_info;
 
-				continue;
-			}
+				if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+					hlist_del_rcu(&fa->fa_list);
+					fib_release_info(fa->fa_info);
+					alias_free_mem_rcu(fa);
+					found++;
 
-			slen = fa->fa_slen;
-		}
+					continue;
+				}
+
+				/* track suffix length of non-flushed leaves */
+				slen = fa->fa_slen;
+			}
 
-		/* update leaf slen */
-		n->slen = slen;
+			/* reset slen and update tnode */
+			n->slen = slen;
 
-		if (hlist_empty(&n->leaf)) {
-			drop_child(pn, n->key);
-			node_free(n);
-		} else {
-			leaf_pull_suffix(pn, n);
+			/* update parent status */
+			if (hlist_empty(&n->leaf) && !IS_TRIE(pn))
+				empty_child_inc(pn);
 		}
 	}
 
@@ -1945,11 +1915,6 @@ void __init fib_trie_init(void)
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
 					  sizeof(struct fib_alias),
 					  0, SLAB_PANIC, NULL);
-
-	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
-					   sizeof(struct tnode) +
-					   sizeof(struct key_vector),
-					   0, SLAB_PANIC, NULL);
 }
 
 struct fib_table *fib_trie_table(u32 id)
@@ -1999,17 +1964,22 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
 
 	while (!IS_TRIE(pn)) {
 		while (cindex < child_length(pn)) {
-			struct key_vector *n = get_child_rcu(pn + cindex++);
-
-			if (!n)
-				continue;
+			struct key_vector *n = pn + cindex++;
 
 			if (IS_LEAF(n)) {
+				if (hlist_empty(&n->leaf))
+					continue;
+
 				iter->tnode = pn;
 				iter->index = cindex;
 			} else {
+				struct key_vector *tnode = get_child_rcu(n);
+
+				if (!tnode)
+					continue;
+
 				/* push down one level */
-				iter->tnode = n;
+				iter->tnode = tnode;
 				iter->index = 0;
 				++iter->depth;
 			}
@@ -2034,26 +2004,30 @@ static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
 static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
 					     struct trie *t)
 {
-	struct key_vector *n, *pn = t->kv;
+	struct key_vector *pn = t->kv;
 
 	if (!t)
 		return NULL;
 
-	n = rcu_dereference(pn->tnode);
-	if (!n)
-		return NULL;
+	if (IS_TNODE(pn)) {
+		struct key_vector *n = get_child_rcu(pn);
+
+		if (!n)
+			return NULL;
 
-	if (IS_TNODE(n)) {
 		iter->tnode = n;
 		iter->index = 0;
 		iter->depth = 1;
 	} else {
+		if (hlist_empty(&pn->leaf))
+			return NULL;
+
 		iter->tnode = pn;
 		iter->index = 0;
 		iter->depth = 0;
 	}
 
-	return n;
+	return pn;
 }
 
 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
@@ -2079,7 +2053,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
 			s->tnodes++;
 			if (n->bits < MAX_STAT_DEPTH)
 				s->nodesizes[n->bits]++;
-			s->nullpointers += tn_info(n)->empty_children;
+			s->nullpointers += tn_info(iter.tnode)->empty_children;
 		}
 	}
 	rcu_read_unlock();
@@ -2124,7 +2098,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 	seq_printf(seq, "\tPointers: %u\n", pointers);
 
 	bytes += sizeof(struct key_vector) * pointers;
-	seq_printf(seq, "Empty vectors: %u\n", stat->nullpointers);
+	seq_printf(seq, "Empty leaves: %u\n", stat->nullpointers);
 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
 }
 
@@ -2177,7 +2151,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq,
 		   "Basic info: size of leaf:"
 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
-		   TNODE_SIZE(1), TNODE_SIZE(0));
+		   sizeof(struct key_vector), TNODE_SIZE(0));
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2345,17 +2319,17 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
 	const struct fib_trie_iter *iter = seq->private;
 	struct key_vector *n = v;
 
-	if (IS_TRIE(node_parent_rcu(n)))
+	if (IS_TRIE(n))
 		fib_table_print(seq, iter->tb);
 
 	if (IS_TNODE(n)) {
-		__be32 prf = htonl((n->key >> n->tn_pos) << n->tn_pos);
+		__be32 prf = htonl(n->key);
 
 		seq_indent(seq, iter->depth-1);
 		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
-			   &prf, KEYLENGTH - n->tn_pos - n->bits, n->bits,
-			   tn_info(n)->full_children,
-			   tn_info(n)->empty_children);
+			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+			   tn_info(iter->tnode)->full_children,
+			   tn_info(iter->tnode)->empty_children);
 	} else {
 		__be32 val = htonl(n->key);
 		struct fib_alias *fa;

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ