[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7b976f6d-84a0-0f12-67fb-7873c9d14792@fb.com>
Date: Mon, 22 Jan 2018 16:05:23 -0800
From: Yonghong Song <yhs@...com>
To: Eric Dumazet <eric.dumazet@...il.com>, <ast@...com>,
<daniel@...earbox.net>, <netdev@...r.kernel.org>
CC: <kernel-team@...com>
Subject: Re: [PATCH bpf-next 1/2] bpf: implement MAP_GET_NEXT_KEY command for
LPM_TRIE map
On 1/22/18 11:28 AM, Eric Dumazet wrote:
> On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote:
>> Current LPM_TRIE map type does not implement MAP_GET_NEXT_KEY
>> command. This command is handy when users want to enumerate
>> keys. Otherwise, a different map which supports key
>> enumeration may be required to store the keys. If the
>> map data is sparse and all map data are to be deleted without
>> closing file descriptor, using MAP_GET_NEXT_KEY to find
>> all keys is much faster than enumerating all key space.
>>
>> This patch implements MAP_GET_NEXT_KEY command for LPM_TRIE map.
>> If user provided key pointer is NULL or the key does not have
>> an exact match in the trie, the first key will be returned.
>> Otherwise, the next key will be returned.
>>
>> In this implementation, key enumeration follows a postorder
>> traversal of the internal trie. More specific keys
>> will be returned before less specific ones, given
>> a sequence of MAP_GET_NEXT_KEY syscalls.
>>
>> Signed-off-by: Yonghong Song <yhs@...com>
>> ---
>> kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++--
>> 1 file changed, 93 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 584e022..d7ea962 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -591,9 +591,100 @@ static void trie_free(struct bpf_map *map)
>> raw_spin_unlock(&trie->lock);
>> }
>>
>> -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
>> +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
>> {
>> - return -ENOTSUPP;
>> + struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
>> + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
>> + struct lpm_trie_node *node, *next_node = NULL, *parent;
>> + struct lpm_trie_node **node_stack = NULL;
>> + struct lpm_trie_node __rcu **root;
>> + int err = 0, stack_ptr = -1;
>> + unsigned int next_bit;
>> + size_t matchlen;
>> +
>> + /* The get_next_key follows postorder. For the 4 node example in
>> + * the top of this file, the trie_get_next_key() returns the following
>> + * one after another:
>> + * 192.168.0.0/24
>> + * 192.168.1.0/24
>> + * 192.168.128.0/24
>> + * 192.168.0.0/16
>> + *
>> + * The idea is to return more specific keys before less specific ones.
>> + */
>> +
>> + /* Empty trie */
>> + if (!rcu_dereference(trie->root))
>> + return -ENOENT;
>> +
>> + /* For invalid key, find the leftmost node in the trie */
>> + if (!key || key->prefixlen > trie->max_prefixlen) {
>> + root = &trie->root;
>> + goto find_leftmost;
>> + }
>> +
>> + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
>> + GFP_USER | __GFP_NOWARN);
>
> It is illegal to use a blocking kmalloc() while holding RCU.
>
> CONFIG_DEBUG_ATOMIC_SLEEP=y is your friend
Thanks Eric for spotting the problem. Will fix the issue soon.
>
>> + if (!node_stack)
>> + return -ENOMEM;
>> +
>> + /* Try to find the exact node for the given key */
>> + for (node = rcu_dereference(trie->root); node;) {
>> + node_stack[++stack_ptr] = node;
>> + matchlen = longest_prefix_match(trie, node, key);
>> + if (node->prefixlen != matchlen ||
>> + node->prefixlen == key->prefixlen)
>> + break;
>> +
>> + next_bit = extract_bit(key->data, node->prefixlen);
>> + node = rcu_dereference(node->child[next_bit]);
>> + }
>> + if (!node || node->prefixlen != key->prefixlen ||
>> + (node->flags & LPM_TREE_NODE_FLAG_IM)) {
>> + root = &trie->root;
>> + goto find_leftmost;
>> + }
>> +
>> + /* The node with the exactly-matching key has been found,
>> + * find the first node in postorder after the matched node.
>> + */
>> + node = node_stack[stack_ptr];
>> + while (stack_ptr > 0) {
>> + parent = node_stack[stack_ptr - 1];
>> + if (rcu_dereference(parent->child[0]) == node &&
>> + rcu_dereference(parent->child[1])) {
>> + root = &parent->child[1];
>> + goto find_leftmost;
>> + }
>> + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
>> + next_node = parent;
>> + goto do_copy;
>> + }
>> +
>> + node = parent;
>> + stack_ptr--;
>> + }
>> +
>> + /* did not find anything */
>> + err = -ENOENT;
>> + goto free_stack;
>> +
>> +find_leftmost:
>> +	/* Find the leftmost non-intermediate node, all intermediate nodes
>> +	 * have exactly two children, so this function will never return NULL.
>> + */
>> + for (node = rcu_dereference(*root); node;) {
>> + if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
>> + next_node = node;
>> + node = rcu_dereference(node->child[0]);
>> + }
>> +do_copy:
>> + next_key->prefixlen = next_node->prefixlen;
>> + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
>> + next_node->data, trie->data_size);
>> +free_stack:
>> + kfree(node_stack);
>> + return err;
>> }
>>
>> const struct bpf_map_ops trie_map_ops = {
Powered by blists - more mailing lists