netdev - [PATCH 5/6] bpf: hash: avoid to call kmalloc() in eBPF prog

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1450178464-27721-6-git-send-email-tom.leiming@gmail.com>
Date:	Tue, 15 Dec 2015 19:21:03 +0800
From:	Ming Lei <tom.leiming@...il.com>
To:	linux-kernel@...r.kernel.org, Alexei Starovoitov <ast@...nel.org>
Cc:	"David S. Miller" <davem@...emloft.net>, netdev@...r.kernel.org,
	Ming Lei <tom.leiming@...il.com>
Subject: [PATCH 5/6] bpf: hash: avoid to call kmalloc() in eBPF prog

kmalloc() is often a bit time-consuming, also
one atomic counter has to be used to track the total
allocated elements, which is also not good.

This patch pre-allocates element pool in htab_map_alloc(),
then use percpu_ida to allocate one slot from the pool,
then the runtime allocation/freeing cost can be decreased.

>From my test, at least 10% fio throughput is improved in block
I/O test when tools/biolatency of bcc(iovisor) is running.

Signed-off-by: Ming Lei <tom.leiming@...il.com>
---
 kernel/bpf/hashtab.c | 204 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 167 insertions(+), 37 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 8543fea..c1600c3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,23 +13,165 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
+#include <linux/percpu_ida.h>
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	union {
+		struct hlist_node hash_node;
+
+		/* used after deleted from hash */
+		struct bpf_htab *htab;
+	};
+	struct rcu_head rcu;
+	u32 hash;
+	u32 tag;
+	char key[0] __aligned(8);
+};
 
 struct bpf_htab {
 	struct bpf_map map;
 	struct hlist_head *buckets;
-	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
-};
 
-/* each htab element is struct htab_elem + key + value */
-struct htab_elem {
-	struct hlist_node hash_node;
-	struct rcu_head rcu;
-	u32 hash;
-	char key[0] __aligned(8);
+	struct list_head page_list;
+	struct htab_elem **elems;
+	struct percpu_ida elems_pool;
 };
 
+static size_t order_to_size(unsigned int order)
+{
+	return (size_t)PAGE_SIZE << order;
+}
+
+/* Called from syscall, and the code is borrowed from blk_mq */
+static int htab_pre_alloc_elems(struct bpf_htab *htab)
+{
+	const unsigned max_order = 4;
+	unsigned elem_size = htab->elem_size, i;
+	unsigned nr_entries = htab->map.max_entries;
+	size_t left = nr_entries * elem_size;
+
+	htab->elems = kzalloc(nr_entries * sizeof(struct htab_elem *),
+			      GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+	if (!htab->elems)
+		goto fail;
+
+	INIT_LIST_HEAD(&htab->page_list);
+
+	for (i = 0; i < nr_entries; ) {
+		int this_order = max_order;
+		struct page *page;
+		int j, to_do;
+		void *p;
+
+		while (left < order_to_size(this_order - 1) && this_order)
+			this_order--;
+
+		do {
+			page = alloc_pages(GFP_KERNEL | __GFP_NOWARN |
+					   __GFP_NORETRY | __GFP_ZERO,
+					   this_order);
+			if (page)
+				break;
+			if (!this_order--)
+				break;
+			if (order_to_size(this_order) < elem_size)
+				break;
+		} while (1);
+
+		if (!page)
+			goto fail;
+
+		page->private = this_order;
+		list_add_tail(&page->lru, &htab->page_list);
+
+		p = page_address(page);
+
+		to_do = min_t(unsigned,
+			      order_to_size(this_order) / elem_size,
+			      nr_entries - i);
+		left -= to_do * elem_size;
+
+		for (j = 0; j < to_do; j++) {
+			htab->elems[i] = p;
+			p += elem_size;
+			i++;
+		}
+	}
+	return 0;
+
+fail:
+	kfree(htab->elems);
+	return -ENOMEM;
+}
+
+static void htab_destroy_elems(struct bpf_htab *htab)
+{
+	struct page *page;
+
+	while (!list_empty(&htab->page_list)) {
+		page = list_first_entry(&htab->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		__free_pages(page, page->private);
+	}
+
+	kfree(htab->elems);
+}
+
+static int htab_init_elems_allocator(struct bpf_htab *htab)
+{
+	int ret = htab_pre_alloc_elems(htab);
+
+	if (ret)
+		return ret;
+
+	ret = percpu_ida_init(&htab->elems_pool, htab->map.max_entries);
+	if (ret)
+		htab_destroy_elems(htab);
+	return ret;
+}
+
+static void htab_deinit_elems_allocator(struct bpf_htab *htab)
+{
+	htab_destroy_elems(htab);
+	percpu_ida_destroy(&htab->elems_pool);
+}
+
+static struct htab_elem *htab_alloc_elem(struct bpf_htab *htab)
+{
+	int tag = percpu_ida_alloc(&htab->elems_pool, TASK_RUNNING);
+	struct htab_elem *elem;
+
+	if (tag < 0)
+		return NULL;
+
+	elem = htab->elems[tag];
+	elem->tag = tag;
+	return elem;
+}
+
+static void htab_free_elem(struct bpf_htab *htab, struct htab_elem *elem)
+{
+	percpu_ida_free(&htab->elems_pool, elem->tag);
+}
+
+static void htab_free_elem_cb(struct rcu_head *head)
+{
+	struct htab_elem *elem = container_of(head, struct htab_elem, rcu);
+
+	htab_free_elem(elem->htab, elem);
+}
+
+static void htab_free_elem_rcu(struct bpf_htab *htab,
+			       struct htab_elem *elem)
+{
+	hlist_del_rcu_lock(&elem->hash_node);
+	elem->htab = htab;
+	call_rcu(&elem->rcu, htab_free_elem_cb);
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -72,9 +214,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
-	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8) +
-			  htab->map.value_size;
+	htab->elem_size = round_up(sizeof(struct htab_elem) +
+				   round_up(htab->map.key_size, 8) +
+				   htab->map.value_size,
+				   cache_line_size());
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -104,10 +247,14 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	for (i = 0; i < htab->n_buckets; i++)
 		INIT_HLIST_HEAD(&htab->buckets[i]);
 
-	atomic_set(&htab->count, 0);
+	err = htab_init_elems_allocator(htab);
+	if (err)
+		goto free_buckets;
 
 	return &htab->map;
 
+free_buckets:
+	kvfree(htab->buckets);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
@@ -242,9 +389,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
 	/* allocate new element outside of lock */
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	l_new = htab_alloc_elem(htab);
 	if (!l_new)
-		return -ENOMEM;
+		return -E2BIG;
 
 	key_size = map->key_size;
 
@@ -261,14 +408,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	l_old = lookup_elem_raw(hlist_get_head_lock(head, &h), l_new->hash,
 			key, key_size);
 
-	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		ret = -E2BIG;
-		goto err;
-	}
-
 	if (l_old && map_flags == BPF_NOEXIST) {
 		/* elem already exists */
 		ret = -EEXIST;
@@ -285,12 +424,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	 * search will find it before old elem
 	 */
 	hlist_add_head_rcu_lock(&l_new->hash_node, head);
-	if (l_old) {
-		hlist_del_rcu_lock(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
-	} else {
-		atomic_inc(&htab->count);
-	}
+	if (l_old)
+		htab_free_elem_rcu(htab, l_old);
 	bit_spin_unlock(HLIST_LOCK_BIT, (unsigned long *)&head->first);
 	raw_local_irq_restore(flags);
 
@@ -298,7 +433,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 err:
 	bit_spin_unlock(HLIST_LOCK_BIT, (unsigned long *)&head->first);
 	raw_local_irq_restore(flags);
-	kfree(l_new);
+	htab_free_elem(htab, l_new);
 	return ret;
 }
 
@@ -324,11 +459,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	bit_spin_lock(HLIST_LOCK_BIT, (unsigned long *)&head->first);
 
 	l = lookup_elem_raw(hlist_get_head_lock(head, &h), hash, key, key_size);
-
 	if (l) {
-		hlist_del_rcu_lock(&l->hash_node);
-		atomic_dec(&htab->count);
-		kfree_rcu(l, rcu);
+		htab_free_elem_rcu(htab, l);
 		ret = 0;
 	}
 
@@ -349,11 +481,8 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		head = hlist_get_head_lock(head, &h);
 
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
+		hlist_for_each_entry_safe(l, n, head, hash_node)
 			hlist_del_rcu(&l->hash_node);
-			atomic_dec(&htab->count);
-			kfree(l);
-		}
 	}
 }
 
@@ -373,6 +502,7 @@ static void htab_map_free(struct bpf_map *map)
 	 * executed. It's ok. Proceed to free residual elements and map itself
 	 */
 	delete_all_elements(htab);
+	htab_deinit_elems_allocator(htab);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html