Date:	Mon, 15 Mar 2010 14:20:10 +0200
From:	Timo Teras <timo.teras@....fi>
To:	netdev@...r.kernel.org
Cc:	Timo Teras <timo.teras@....fi>,
	Herbert Xu <herbert@...dor.apana.org.au>
Subject: [PATCH] xfrm: cache bundle lookup results in flow cache

Instead of doing an O(n) xfrm_find_bundle() call per packet, cache
the previous lookup results in the flow cache. The flow cache is
updated to be per-netns and more generic.

The flow cache no longer holds a reference (which was not really
used in the first place, as it depended on garbage collection);
this is now explicit. Cache validity is maintained as follows:
- On policy insert, the whole cache is invalidated by incrementing
  the generation id. No synchronization is required, as genid checks
  make sure no old objects are dereferenced.
- On policy removal from the lists, the object is marked deleted,
  which invalidates the cached policy pointer.
- Policy object deletion requires explicit synchronization to remove
  stale pointers before the policy objects can actually be freed.
  xfrm_policy_gc_task() synchronizes the cache.
- Bundle creation and expiry are reflected in the xfrm_bundle_ok()
  check before any bundle from the cache is used.
- Bundle deletion is done by incrementing policy->bundles_genid and
  synchronizing with the other CPUs so that no stale bundle pointers
  are left. After this the bundle objects can be safely deleted.

Basic testing was done on a 2.6.32-based kernel. This gives a speedup
of several orders of magnitude on the transmit path.

Signed-off-by: Timo Teras <timo.teras@....fi>
Cc: Herbert Xu <herbert@...dor.apana.org.au>
---
 include/net/flow.h               |   39 ++++-
 include/net/netns/xfrm.h         |    4 +
 include/net/xfrm.h               |    1 +
 net/core/flow.c                  |  342 ++++++++++++++++++--------------------
 net/ipv6/inet6_connection_sock.c |    6 +-
 net/xfrm/xfrm_policy.c           |  271 +++++++++++++++++++++---------
 6 files changed, 394 insertions(+), 269 deletions(-)
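
For reviewers who want the genid scheme from the description spelled
out, here is a minimal sketch of the validation idea. The struct and
function names below are illustrative only, not code from this patch;
the actual check is xfrm_flow_cache_entry_validate() in xfrm_policy.c,
and the real genids are atomic_t.

/*
 * Sketch: a cached entry snapshots the generation ids it was created
 * under, and is treated as stale once either id has moved on.
 */
struct example_cache_entry {
	void *policy;			/* cached policy pointer */
	void *bundle;			/* cached bundle (xfrm_dst) pointer */
	unsigned int policy_genid;	/* snapshot of net->xfrm.policy_genid */
	unsigned int bundles_genid;	/* snapshot of policy->bundles_genid */
};

static int example_entry_is_valid(const struct example_cache_entry *e,
				  unsigned int cur_policy_genid,
				  unsigned int cur_bundles_genid)
{
	if (e->policy_genid != cur_policy_genid)
		return 0;	/* a policy was added/removed: drop everything */
	if (e->bundles_genid != cur_bundles_genid)
		return 0;	/* bundles were flushed: drop the cached bundle */
	return 1;
}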

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..814a9d2 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -8,6 +8,9 @@
 #define _NET_FLOW_H
 
 #include <linux/in6.h>
+#include <linux/notifier.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 
 struct flowi {
@@ -86,13 +89,37 @@ struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
-extern void flow_cache_flush(void);
-extern atomic_t flow_cache_genid;
+struct flow_cache_percpu;
+struct flow_cache_entry;
+
+struct flow_cache {
+	u32				hash_shift;
+	u32				order;
+	struct flow_cache_percpu *	percpu;
+	struct notifier_block		hotcpu_notifier;
+	int				low_watermark;
+	int				high_watermark;
+	struct timer_list		rnd_timer;
+	struct kmem_cache *		flow_cachep;
+};
+
+struct flow_cache_entry {
+	struct flow_cache_entry	*next;
+	struct flowi		key;
+	u16			family;
+	u8			dir;
+};
+
+extern struct flow_cache_entry *flow_cache_lookup(
+	struct flow_cache *cache, struct flowi *key,
+	u16 family, u8 dir);
+extern void flow_cache_entry_put(struct flow_cache_entry *fce);
+
+void flow_cache_flush(struct flow_cache *fc,
+		      void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce));
+extern int flow_cache_init(struct flow_cache *cache, size_t entry_size);
+extern void flow_cache_fini(struct flow_cache *cache);
 
 static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2)
 {
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 74f119a..1b223c9 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -42,6 +42,10 @@ struct netns_xfrm {
 	struct xfrm_policy_hash	policy_bydst[XFRM_POLICY_MAX * 2];
 	unsigned int		policy_count[XFRM_POLICY_MAX * 2];
 	struct work_struct	policy_hash_work;
+	atomic_t		policy_genid;
+	struct hlist_head	policy_gc_list;
+	struct work_struct	policy_gc_work;
+	struct flow_cache	flow_cache;
 
 	struct dst_ops		xfrm4_dst_ops;
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..f469b9b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -488,6 +488,7 @@ struct xfrm_policy {
 	struct xfrm_lifetime_cfg lft;
 	struct xfrm_lifetime_cur curlft;
 	struct dst_entry       *bundles;
+	atomic_t		bundles_genid;
 	struct xfrm_policy_walk_entry walk;
 	u8			type;
 	u8			action;
diff --git a/net/core/flow.c b/net/core/flow.c
index 9601587..e3782c2 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -25,114 +25,85 @@
 #include <asm/atomic.h>
 #include <linux/security.h>
 
-struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
-};
-
-atomic_t flow_cache_genid = ATOMIC_INIT(0);
-
-static u32 flow_hash_shift;
-#define flow_hash_size	(1 << flow_hash_shift)
-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
-
-#define flow_table(cpu) (per_cpu(flow_tables, cpu))
-
-static struct kmem_cache *flow_cachep __read_mostly;
 
-static int flow_lwm, flow_hwm;
-
-struct flow_percpu_info {
-	int hash_rnd_recalc;
-	u32 hash_rnd;
-	int count;
+struct flow_cache_percpu {
+	struct flow_cache_entry **	hash_table;
+	int				hash_count;
+	u32				hash_rnd;
+	int				hash_rnd_recalc;
+	struct tasklet_struct		flush_tasklet;
 };
-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 };
-
-#define flow_hash_rnd_recalc(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd_recalc)
-#define flow_hash_rnd(cpu) \
-	(per_cpu(flow_hash_info, cpu).hash_rnd)
-#define flow_count(cpu) \
-	(per_cpu(flow_hash_info, cpu).count)
-
-static struct timer_list flow_hash_rnd_timer;
-
-#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ)
 
 struct flow_flush_info {
-	atomic_t cpuleft;
-	struct completion completion;
+	void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce);
+	struct flow_cache *		cache;
+	atomic_t			cpuleft;
+	struct completion		completion;
 };
-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL };
 
-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu))
+#define flow_cache_hash_size(cache)	(1 << (cache)->hash_shift)
+#define FLOW_HASH_RND_PERIOD		(10 * 60 * HZ)
 
 static void flow_cache_new_hashrnd(unsigned long arg)
 {
+	struct flow_cache *fc = (struct flow_cache *) arg;
 	int i;
 
 	for_each_possible_cpu(i)
-		flow_hash_rnd_recalc(i) = 1;
+		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
 
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 }
 
-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle)
-{
-	if (fle->object)
-		atomic_dec(fle->object_ref);
-	kmem_cache_free(flow_cachep, fle);
-	flow_count(cpu)--;
-}
-
-static void __flow_cache_shrink(int cpu, int shrink_to)
+static void __flow_cache_shrink(struct flow_cache *fc,
+				struct flow_cache_percpu *fcp,
+				int shrink_to)
 {
 	struct flow_cache_entry *fle, **flp;
 	int i;
 
-	for (i = 0; i < flow_hash_size; i++) {
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		int k = 0;
 
-		flp = &flow_table(cpu)[i];
+		flp = &fcp->hash_table[i];
 		while ((fle = *flp) != NULL && k < shrink_to) {
 			k++;
 			flp = &fle->next;
 		}
 		while ((fle = *flp) != NULL) {
 			*flp = fle->next;
-			flow_entry_kill(cpu, fle);
+
+			kmem_cache_free(fc->flow_cachep, fle);
+			fcp->hash_count--;
 		}
 	}
 }
 
-static void flow_cache_shrink(int cpu)
+static void flow_cache_shrink(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	int shrink_to = flow_lwm / flow_hash_size;
+	int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
 
-	__flow_cache_shrink(cpu, shrink_to);
+	__flow_cache_shrink(fc, fcp, shrink_to);
 }
 
-static void flow_new_hash_rnd(int cpu)
+static void flow_new_hash_rnd(struct flow_cache *fc,
+			      struct flow_cache_percpu *fcp)
 {
-	get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32));
-	flow_hash_rnd_recalc(cpu) = 0;
-
-	__flow_cache_shrink(cpu, 0);
+	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
+	fcp->hash_rnd_recalc = 0;
+	__flow_cache_shrink(fc, fcp, 0);
 }
 
-static u32 flow_hash_code(struct flowi *key, int cpu)
+static u32 flow_hash_code(struct flow_cache *fc,
+			  struct flow_cache_percpu *fcp,
+			  struct flowi *key)
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) &
-		(flow_hash_size - 1));
+	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1));
 }
 
 #if (BITS_PER_LONG == 64)
@@ -165,128 +136,100 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_entry *flow_cache_lookup(struct flow_cache *fc,
+					   struct flowi *key,
+					   u16 family, u8 dir)
 {
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_percpu *fcp;
 	unsigned int hash;
-	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
-	if (!flow_table(cpu))
+	if (!fcp->hash_table)
 		goto nocache;
 
-	if (flow_hash_rnd_recalc(cpu))
-		flow_new_hash_rnd(cpu);
-	hash = flow_hash_code(key, cpu);
+	if (fcp->hash_rnd_recalc)
+		flow_new_hash_rnd(fc, fcp);
+
+	hash = flow_hash_code(fc, fcp, key);
 
-	head = &flow_table(cpu)[hash];
+	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
 		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
-			break;
-		}
-	}
-
-	if (!fle) {
-		if (flow_count(cpu) > flow_hwm)
-			flow_cache_shrink(cpu);
-
-		fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
-		if (fle) {
-			fle->next = *head;
-			*head = fle;
-			fle->family = family;
-			fle->dir = dir;
-			memcpy(&fle->key, key, sizeof(*key));
-			fle->object = NULL;
-			flow_count(cpu)++;
+			return fle;
 		}
 	}
 
-nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
+	if (fcp->hash_count > fc->high_watermark)
+		flow_cache_shrink(fc, fcp);
 
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
+	fle = kmem_cache_zalloc(fc->flow_cachep, GFP_ATOMIC);
+	if (!fle)
+		goto nocache;
 
-			if (fle->object)
-				atomic_dec(fle->object_ref);
+	fle->next = *head;
+	*head = fle;
+	fle->family = family;
+	fle->dir = dir;
+	memcpy(&fle->key, key, sizeof(*key));
+	fcp->hash_count++;
+	return fle;
 
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
-		}
-		local_bh_enable();
+nocache:
+	local_bh_enable();
+	return NULL;
+}
 
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
-	}
+void flow_cache_entry_put(struct flow_cache_entry *fce)
+{
+	local_bh_enable();
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
 {
-	struct flow_flush_info *info = (void *)data;
+	struct flow_flush_info *info = (void *) data;
+	struct flow_cache *fc = (void *) info->cache;
+	struct flow_cache_percpu *fcp;
 	int i;
-	int cpu;
 
-	cpu = smp_processor_id();
-	for (i = 0; i < flow_hash_size; i++) {
-		struct flow_cache_entry *fle;
+	if (info->flush == NULL)
+		goto done;
 
-		fle = flow_table(cpu)[i];
-		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
-				continue;
+	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	for (i = 0; i < flow_cache_hash_size(fc); i++) {
+		struct flow_cache_entry *fle;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
-		}
+		fle = fcp->hash_table[i];
+		for (; fle; fle = fle->next)
+			info->flush(fc, fle);
 	}
 
+done:
 	if (atomic_dec_and_test(&info->cpuleft))
 		complete(&info->completion);
 }
 
-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__));
 static void flow_cache_flush_per_cpu(void *data)
 {
 	struct flow_flush_info *info = data;
-	int cpu;
 	struct tasklet_struct *tasklet;
+	int cpu;
 
 	cpu = smp_processor_id();
-
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet->data = (unsigned long)info;
+	tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet;
+	tasklet->data = (unsigned long) data;
 	tasklet_schedule(tasklet);
 }
 
-void flow_cache_flush(void)
+void flow_cache_flush(struct flow_cache *fc,
+		      void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce))
 {
 	struct flow_flush_info info;
 	static DEFINE_MUTEX(flow_flush_sem);
@@ -294,6 +237,8 @@ void flow_cache_flush(void)
 	/* Don't want cpus going down or up during this. */
 	get_online_cpus();
 	mutex_lock(&flow_flush_sem);
+	info.cache = fc;
+	info.flush = flush;
 	atomic_set(&info.cpuleft, num_online_cpus());
 	init_completion(&info.completion);
 
@@ -307,62 +252,99 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
-static void __init flow_cache_cpu_prepare(int cpu)
+static void __init flow_cache_cpu_prepare(struct flow_cache *fc,
+					  struct flow_cache_percpu *fcp)
+{
+	fcp->hash_table = (struct flow_cache_entry **)
+		__get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order);
+	fcp->hash_rnd_recalc = 1;
+	fcp->hash_count = 0;
+
+	tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
+}
+
+static int __cpuinit flow_cache_cpu(struct notifier_block *nfb,
+				    unsigned long action,
+				    void *hcpu)
+{
+	struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier);
+	int cpu = (unsigned long) hcpu;
+	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		flow_cache_cpu_prepare(fc, fcp);
+		if (!fcp->hash_table)
+			return NOTIFY_BAD;
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		if (fcp->hash_table) {
+			__flow_cache_shrink(fc, fcp, 0);
+			free_pages((unsigned long) fcp->hash_table, fc->order);
+			fcp->hash_table = NULL;
+		}
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+int flow_cache_init(struct flow_cache *fc, size_t entry_size)
 {
-	struct tasklet_struct *tasklet;
 	unsigned long order;
+	int i, r;
+
+	BUG_ON(entry_size < sizeof(struct flow_cache_entry));
+	fc->flow_cachep = kmem_cache_create("flow_cache",
+					entry_size,
+					0, SLAB_PANIC,
+					NULL);
+	fc->hash_shift = 10;
+	fc->low_watermark = 2 * flow_cache_hash_size(fc);
+	fc->high_watermark = 4 * flow_cache_hash_size(fc);
+	fc->percpu = alloc_percpu(struct flow_cache_percpu);
 
 	for (order = 0;
 	     (PAGE_SIZE << order) <
-		     (sizeof(struct flow_cache_entry *)*flow_hash_size);
+		(sizeof(struct flow_cache_entry *) * flow_cache_hash_size(fc));
 	     order++)
 		/* NOTHING */;
+	fc->order = order;
 
-	flow_table(cpu) = (struct flow_cache_entry **)
-		__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-	if (!flow_table(cpu))
-		panic("NET: failed to allocate flow cache order %lu\n", order);
+	setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc);
+	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
+	add_timer(&fc->rnd_timer);
 
-	flow_hash_rnd_recalc(cpu) = 1;
-	flow_count(cpu) = 0;
+	for_each_online_cpu(i) {
+		r = flow_cache_cpu(&fc->hotcpu_notifier,
+				   CPU_UP_PREPARE, (void*) i);
+		if (r != NOTIFY_OK)
+			panic("NET: failed to allocate flow cache order %lu\n", order);
+	}
 
-	tasklet = flow_flush_tasklet(cpu);
-	tasklet_init(tasklet, flow_cache_flush_tasklet, 0);
-}
+	fc->hotcpu_notifier = (struct notifier_block){
+		.notifier_call = flow_cache_cpu,
+	};
+	register_hotcpu_notifier(&fc->hotcpu_notifier);
 
-static int flow_cache_cpu(struct notifier_block *nfb,
-			  unsigned long action,
-			  void *hcpu)
-{
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
-		__flow_cache_shrink((unsigned long)hcpu, 0);
-	return NOTIFY_OK;
+	return 0;
 }
 
-static int __init flow_cache_init(void)
+void flow_cache_fini(struct flow_cache *fc)
 {
 	int i;
 
-	flow_cachep = kmem_cache_create("flow_cache",
-					sizeof(struct flow_cache_entry),
-					0, SLAB_PANIC,
-					NULL);
-	flow_hash_shift = 10;
-	flow_lwm = 2 * flow_hash_size;
-	flow_hwm = 4 * flow_hash_size;
-
-	setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0);
-	flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
-	add_timer(&flow_hash_rnd_timer);
+	del_timer(&fc->rnd_timer);
+	unregister_hotcpu_notifier(&fc->hotcpu_notifier);
 
 	for_each_possible_cpu(i)
-		flow_cache_cpu_prepare(i);
+		flow_cache_cpu(&fc->hotcpu_notifier, CPU_DEAD, (void*) i);
 
-	hotcpu_notifier(flow_cache_cpu, 0);
-	return 0;
+	free_percpu(fc->percpu);
+	kmem_cache_destroy(fc->flow_cachep);
 }
 
-module_init(flow_cache_init);
-
-EXPORT_SYMBOL(flow_cache_genid);
 EXPORT_SYMBOL(flow_cache_lookup);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 3516e6f..588ba76 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -151,8 +151,9 @@ void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
 
 #ifdef CONFIG_XFRM
 	{
+		struct net *net = sock_net(sk);
 		struct rt6_info *rt = (struct rt6_info  *)dst;
-		rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid);
+		rt->rt6i_flow_cache_genid = atomic_read(&net->xfrm.policy_genid);
 	}
 #endif
 }
@@ -166,8 +167,9 @@ struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
 
 #ifdef CONFIG_XFRM
 	if (dst) {
+		struct net *net = sock_net(sk);
 		struct rt6_info *rt = (struct rt6_info *)dst;
-		if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) {
+		if (rt->rt6i_flow_cache_genid != atomic_read(&net->xfrm.policy_genid)) {
 			__sk_dst_reset(sk);
 			dst = NULL;
 		}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 843e066..228b813 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -44,7 +44,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
-static HLIST_HEAD(xfrm_policy_gc_list);
 static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
@@ -53,6 +52,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst);
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
+static int stale_bundle(struct dst_entry *dst);
 
 static inline int
 __xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl)
@@ -216,6 +216,35 @@ expired:
 	xfrm_pol_put(xp);
 }
 
+struct xfrm_flow_cache_entry {
+	struct flow_cache_entry fce;
+	struct xfrm_policy *policy;
+	struct xfrm_dst *dst;
+	u32 policy_genid, bundles_genid;
+};
+#define XFRM_CACHE_NO_POLICY ((struct xfrm_policy *) -1)
+
+void xfrm_flow_cache_entry_validate(struct flow_cache *fc,
+				    struct flow_cache_entry *fce)
+{
+	struct net *net = container_of(fc, struct net, xfrm.flow_cache);
+	struct xfrm_flow_cache_entry *xfc =
+		container_of(fce, struct xfrm_flow_cache_entry, fce);
+
+	if (xfc->policy_genid != atomic_read(&net->xfrm.policy_genid))
+		goto invalid;
+	if (xfc->policy == NULL || xfc->policy == XFRM_CACHE_NO_POLICY)
+		return;
+	if (xfc->policy->walk.dead)
+		goto invalid;
+	if (xfc->bundles_genid != atomic_read(&xfc->policy->bundles_genid))
+		goto invalid_dst;
+	return;
+invalid:
+	xfc->policy = NULL;
+invalid_dst:
+	xfc->dst = NULL;
+}
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -269,27 +298,26 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
 static void xfrm_policy_gc_task(struct work_struct *work)
 {
+	struct net *net = container_of(work, struct net, xfrm.policy_gc_work);
 	struct xfrm_policy *policy;
 	struct hlist_node *entry, *tmp;
 	struct hlist_head gc_list;
 
 	spin_lock_bh(&xfrm_policy_gc_lock);
-	gc_list.first = xfrm_policy_gc_list.first;
-	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
+	gc_list.first = net->xfrm.policy_gc_list.first;
+	INIT_HLIST_HEAD(&net->xfrm.policy_gc_list);
 	spin_unlock_bh(&xfrm_policy_gc_lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, xfrm_flow_cache_entry_validate);
+
 	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
 		xfrm_policy_gc_kill(policy);
 }
-static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
 
 /* Rule must be locked. Release descentant resources, announce
  * entry dead. The rule must be unlinked from lists to the moment.
@@ -297,6 +325,7 @@ static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task);
 
 static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
+	struct net *net = xp_net(policy);
 	int dead;
 
 	write_lock_bh(&policy->lock);
@@ -310,10 +339,10 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 	}
 
 	spin_lock_bh(&xfrm_policy_gc_lock);
-	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
+	hlist_add_head(&policy->bydst, &net->xfrm.policy_gc_list);
 	spin_unlock_bh(&xfrm_policy_gc_lock);
 
-	schedule_work(&xfrm_policy_gc_work);
+	schedule_work(&net->xfrm.policy_gc_work);
 }
 
 static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
@@ -588,7 +617,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		hlist_add_head(&policy->bydst, chain);
 	xfrm_pol_hold(policy);
 	net->xfrm.policy_count[dir]++;
-	atomic_inc(&flow_cache_genid);
+	atomic_inc(&net->xfrm.policy_genid);
 	if (delpol)
 		__xfrm_policy_unlink(delpol, dir);
 	policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir);
@@ -621,11 +650,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 			gc_list = dst;
 
 			policy->bundles = NULL;
+			atomic_inc(&policy->bundles_genid);
 		}
 		write_unlock(&policy->lock);
 	}
 	read_unlock_bh(&xfrm_policy_lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, NULL);
 	while (gc_list) {
 		struct dst_entry *dst = gc_list;
 
@@ -672,7 +703,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	write_unlock_bh(&xfrm_policy_lock);
 
 	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+		atomic_inc(&net->xfrm.policy_genid);
 		xfrm_policy_kill(ret);
 	}
 	return ret;
@@ -714,7 +745,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	write_unlock_bh(&xfrm_policy_lock);
 
 	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+		atomic_inc(&net->xfrm.policy_genid);
 		xfrm_policy_kill(ret);
 	}
 	return ret;
@@ -835,7 +866,7 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
+	atomic_inc(&net->xfrm.policy_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -989,32 +1020,18 @@ fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct xfrm_policy *xfrm_policy_lookup(
+		struct net *net, struct flowi *fl,
+		u16 family, u8 dir)
 {
+#ifdef CONFIG_XFRM_SUB_POLICY
 	struct xfrm_policy *pol;
-	int err = 0;
 
-#ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (pol != NULL)
+		return pol;
 #endif
-	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1100,12 +1117,14 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 
 int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 {
+	struct net *net = xp_net(pol);
+
 	write_lock_bh(&xfrm_policy_lock);
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
 		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
+			atomic_inc(&net->xfrm.policy_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1545,13 +1564,34 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
 #endif
 }
 
-static int stale_bundle(struct dst_entry *dst);
-
 /* Main function: finds/creates a bundle for given flow.
  *
  * At the moment we eat a raw IP route. Mostly to speed up lookups
  * on interfaces with disabled IPsec.
  */
+
+static void xfrm_flow_cache_update(struct net *net, struct flowi *key,
+				   u16 family, u8 dir,
+				   struct xfrm_policy *pol,
+				   struct xfrm_dst *dst)
+{
+	struct flow_cache_entry *fce;
+	struct xfrm_flow_cache_entry *xf;
+
+	fce = flow_cache_lookup(&net->xfrm.flow_cache,
+				key, family, dir);
+	if (fce == NULL)
+		return;
+
+	xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
+	xf->policy_genid = atomic_read(&net->xfrm.policy_genid);
+	xf->policy = pol;
+	if (dst != NULL)
+		xf->bundles_genid = atomic_read(&pol->bundles_genid);
+	xf->dst = dst;
+	flow_cache_entry_put(fce);
+}
+
 int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 		  struct sock *sk, int flags)
 {
@@ -1570,8 +1610,10 @@ int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 
 restart:
-	genid = atomic_read(&flow_cache_genid);
+	family = dst_orig->ops->family;
+	genid = atomic_read(&net->xfrm.policy_genid);
 	policy = NULL;
+	dst = NULL;
 	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
 		pols[pi] = NULL;
 	npols = 0;
@@ -1588,24 +1630,51 @@ restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_entry *fce;
+		struct xfrm_flow_cache_entry *xf;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
-			goto dropdst;
+		fce = flow_cache_lookup(&net->xfrm.flow_cache,
+					fl, family, dir);
+		if (fce == NULL)
+			goto no_cache;
+
+		xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
+		xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
+		if (xf->policy != NULL) {
+			policy = xf->policy;
+			if (policy != XFRM_CACHE_NO_POLICY)
+				xfrm_pol_hold(policy);
+			if (xf->dst != NULL)
+				dst = dst_clone((struct dst_entry *) xf->dst);
+		}
+		flow_cache_entry_put(fce);
+		if (policy == XFRM_CACHE_NO_POLICY)
+			goto nopol;
+		if (dst && !xfrm_bundle_ok(policy, (struct xfrm_dst *) dst, fl, family, 0)) {
+			dst_release(dst);
+			dst = NULL;
 		}
 	}
+no_cache:
+	if (!policy) {
+		policy = xfrm_policy_lookup(net, fl, family, dir);
+		if (!policy) {
+			xfrm_flow_cache_update(
+				net, fl, family, dir,
+				XFRM_CACHE_NO_POLICY, NULL);
+			goto nopol;
+		}
+	}
+	if (IS_ERR(policy)) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+		goto dropdst;
+	}
 
-	if (!policy)
-		goto nopol;
-
-	family = dst_orig->ops->family;
 	pols[0] = policy;
 	npols ++;
 	xfrm_nr += pols[0]->xfrm_nr;
@@ -1616,6 +1685,9 @@ restart:
 
 	policy->curlft.use_time = get_seconds();
 
+	if (dst)
+		goto dst_found;
+
 	switch (policy->action) {
 	default:
 	case XFRM_POLICY_BLOCK:
@@ -1626,18 +1698,11 @@ restart:
 
 	case XFRM_POLICY_ALLOW:
 #ifndef CONFIG_XFRM_SUB_POLICY
-		if (policy->xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
-			return 0;
-		}
+		if (policy->xfrm_nr == 0)
+			goto no_transform;
 #endif
 
-		/* Try to find matching bundle.
-		 *
-		 * LATER: help from flow cache. It is optional, this
-		 * is required only for output policy.
-		 */
+		/* Try to find matching bundle the hard way. */
 		dst = xfrm_find_bundle(fl, policy, family);
 		if (IS_ERR(dst)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
@@ -1677,12 +1742,8 @@ restart:
 		 * they are searched. See above not-transformed bypass
 		 * is surrounded by non-sub policy configuration, too.
 		 */
-		if (xfrm_nr == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
-
+		if (xfrm_nr == 0)
+			goto no_transform;
 #endif
 		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
 
@@ -1713,7 +1774,7 @@ restart:
 					goto error;
 				}
 				if (nx == -EAGAIN ||
-				    genid != atomic_read(&flow_cache_genid)) {
+				    genid != atomic_read(&net->xfrm.policy_genid)) {
 					xfrm_pols_put(pols, npols);
 					goto restart;
 				}
@@ -1724,11 +1785,8 @@ restart:
 				goto error;
 			}
 		}
-		if (nx == 0) {
-			/* Flow passes not transformed. */
-			xfrm_pols_put(pols, npols);
-			return 0;
-		}
+		if (nx == 0)
+			goto no_transform;
 
 		dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
 		err = PTR_ERR(dst);
@@ -1777,6 +1835,9 @@ restart:
 		dst_hold(dst);
 		write_unlock_bh(&policy->lock);
 	}
+	xfrm_flow_cache_update(net, fl, family, dir,
+			       policy, (struct xfrm_dst *) dst);
+dst_found:
 	*dst_p = dst;
 	dst_release(dst_orig);
 	xfrm_pols_put(pols, npols);
@@ -1794,7 +1855,12 @@ nopol:
 	if (flags & XFRM_LOOKUP_ICMP)
 		goto dropdst;
 	return 0;
+no_transform:
+	/* Flow passes not transformed. */
+	xfrm_pols_put(pols, npols);
+	return 0;
 }
+
 EXPORT_SYMBOL(__xfrm_lookup);
 
 int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
@@ -1952,10 +2018,35 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
-
+	if (!pol) {
+		struct flow_cache_entry *fce;
+		struct xfrm_flow_cache_entry *xf;
+
+		fce = flow_cache_lookup(&net->xfrm.flow_cache,
+					&fl, family, dir);
+		if (fce != NULL) {
+			xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
+			xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
+			if (xf->policy != NULL) {
+				pol = xf->policy;
+				if (pol != XFRM_CACHE_NO_POLICY)
+					xfrm_pol_hold(pol);
+				else
+					pol = NULL;
+			} else {
+				pol = xfrm_policy_lookup(net, &fl, family, dir);
+				if (!IS_ERR(pol)) {
+					if (pol)
+						xf->policy = pol;
+					else
+						xf->policy = XFRM_CACHE_NO_POLICY;
+				}
+				xf->dst = NULL;
+				xf->policy_genid = atomic_read(&net->xfrm.policy_genid);
+			}
+			flow_cache_entry_put(fce);
+		}
+	}
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
 		return 0;
@@ -2153,6 +2244,7 @@ static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_ent
 			dstp = &dst->next;
 		}
 	}
+	atomic_inc(&pol->bundles_genid);
 	write_unlock(&pol->lock);
 }
 
@@ -2180,6 +2272,7 @@ static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
 	}
 	read_unlock_bh(&xfrm_policy_lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, NULL);
 	while (gc_list) {
 		struct dst_entry *dst = gc_list;
 		gc_list = dst->next;
@@ -2498,6 +2591,9 @@ static int __net_init xfrm_policy_init(struct net *net)
 
 	INIT_LIST_HEAD(&net->xfrm.policy_all);
 	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
+	INIT_HLIST_HEAD(&net->xfrm.policy_gc_list);
+	INIT_WORK(&net->xfrm.policy_gc_work, xfrm_policy_gc_task);
+	flow_cache_init(&net->xfrm.flow_cache, sizeof(struct xfrm_flow_cache_entry));
 	if (net_eq(net, &init_net))
 		register_netdevice_notifier(&xfrm_dev_notifier);
 	return 0;
@@ -2531,7 +2627,7 @@ static void xfrm_policy_fini(struct net *net)
 	audit_info.sessionid = -1;
 	audit_info.secid = 0;
 	xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info);
-	flush_work(&xfrm_policy_gc_work);
+	flush_work(&net->xfrm.policy_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.policy_all));
 
@@ -2549,6 +2645,8 @@ static void xfrm_policy_fini(struct net *net)
 	sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head);
 	WARN_ON(!hlist_empty(net->xfrm.policy_byidx));
 	xfrm_hash_free(net->xfrm.policy_byidx, sz);
+
+	flow_cache_fini(&net->xfrm.flow_cache);
 }
 
 static int __net_init xfrm_net_init(struct net *net)
@@ -2756,8 +2854,9 @@ static int migrate_tmpl_match(struct xfrm_migrate *m, struct xfrm_tmpl *t)
 static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       struct xfrm_migrate *m, int num_migrate)
 {
+	struct net *net = xp_net(pol);
 	struct xfrm_migrate *mp;
-	struct dst_entry *dst;
+	struct dst_entry *gc_list = NULL, *tail;
 	int i, j, n = 0;
 
 	write_lock_bh(&pol->lock);
@@ -2782,15 +2881,25 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
 			       sizeof(pol->xfrm_vec[i].saddr));
 			pol->xfrm_vec[i].encap_family = mp->new_family;
 			/* flush bundles */
-			while ((dst = pol->bundles) != NULL) {
-				pol->bundles = dst->next;
-				dst_free(dst);
-			}
+			tail = pol->bundles;
+			while (tail->next)
+				tail = tail->next;
+			tail->next = gc_list;
+			gc_list = pol->bundles;
+			pol->bundles = NULL;
+			atomic_inc(&pol->bundles_genid);
 		}
 	}
-
 	write_unlock_bh(&pol->lock);
 
+	flow_cache_flush(&net->xfrm.flow_cache, NULL);
+	while (gc_list) {
+		struct dst_entry *dst = gc_list;
+
+		gc_list = dst->next;
+		dst_free(dst);
+	}
+
 	if (!n)
 		return -ENODATA;
 
-- 
1.6.3.3

