linux-kernel - Re: [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <27B81458-17C5-4840-9679-D1F5FBF6E805@fb.com>
Date:   Tue, 25 Sep 2018 18:54:33 +0000
From:   Song Liu <songliubraving@...com>
To:     Roman Gushchin <guro@...com>
CC:     "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Kernel Team <Kernel-team@...com>,
        Daniel Borkmann <daniel@...earbox.net>,
        "Alexei Starovoitov" <ast@...nel.org>
Subject: Re: [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local
 storage



> On Sep 25, 2018, at 8:21 AM, Roman Gushchin <guro@...com> wrote:
> 
> This commit introduced per-cpu cgroup local storage.
> 
> Per-cpu cgroup local storage is very similar to simple cgroup storage
> (let's call it shared), except all the data is per-cpu.
> 
> The main goal of per-cpu variant is to implement super fast
> counters (e.g. packet counters), which don't require neither
> lookups, neither atomic operations.
> 
> From userspace's point of view, accessing a per-cpu cgroup storage
> is similar to other per-cpu map types (e.g. per-cpu hashmaps and
> arrays).
> 
> Writing to a per-cpu cgroup storage is not atomic, but is performed
> by copying longs, so some minimal atomicity is here, exactly
> as with other per-cpu maps.
> 
> Signed-off-by: Roman Gushchin <guro@...com>
> Cc: Daniel Borkmann <daniel@...earbox.net>
> Cc: Alexei Starovoitov <ast@...nel.org>
> ---
> include/linux/bpf-cgroup.h |  20 ++++-
> include/linux/bpf.h        |   1 +
> include/linux/bpf_types.h  |   1 +
> include/uapi/linux/bpf.h   |   1 +
> kernel/bpf/helpers.c       |   8 +-
> kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> kernel/bpf/syscall.c       |  11 ++-
> kernel/bpf/verifier.c      |  15 +++-
> 8 files changed, 177 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 7e0c9a1d48b7..9bd907657f9b 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> };
> 
> struct bpf_cgroup_storage {
> -	struct bpf_storage_buffer *buf;
> +	union {
> +		struct bpf_storage_buffer *buf;
> +		char __percpu *percpu_buf;

"char *" here looks weird. Did you mean to use "void *"?

> +	};
> 	struct bpf_cgroup_storage_map *map;
> 	struct bpf_cgroup_storage_key key;
> 	struct list_head list;
> @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> 	struct bpf_map *map)
> {
> +	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> +		return BPF_CGROUP_STORAGE_PERCPU;
> +
> 	return BPF_CGROUP_STORAGE_SHARED;
> }
> 
> @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
> 
> +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> +				     void *value, u64 flags);
> +
> /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
> ({									      \
> @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> 	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> static inline void bpf_cgroup_storage_free(
> 	struct bpf_cgroup_storage *storage) {}
> +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> +						 void *value) {
> +	return 0;
> +}
> +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> +					void *key, void *value, u64 flags) {
> +	return 0;
> +}
> 
> #define cgroup_bpf_enabled (0)
> #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index b457fbe7b70b..018299a595c8 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -274,6 +274,7 @@ struct bpf_prog_offload {
> 
> enum bpf_cgroup_storage_type {
> 	BPF_CGROUP_STORAGE_SHARED,
> +	BPF_CGROUP_STORAGE_PERCPU,
> 	__BPF_CGROUP_STORAGE_MAX
> };
> 
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index c9bd6fb765b0..5432f4c9f50e 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> #endif
> #ifdef CONFIG_CGROUP_BPF
> BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> #endif
> BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index aa5ccd2385ed..e2070d819e04 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -127,6 +127,7 @@ enum bpf_map_type {
> 	BPF_MAP_TYPE_SOCKHASH,
> 	BPF_MAP_TYPE_CGROUP_STORAGE,
> 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> +	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> };
> 
> enum bpf_prog_type {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index e42f8789b7ea..1f21ef1c4ad3 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> 	 */
> 	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> 	struct bpf_cgroup_storage *storage;
> +	void *ptr = NULL;

Not necessary to initialize to NULL. 

> 
> 	storage = this_cpu_read(bpf_cgroup_storage[stype]);
> 
> -	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> +	if (stype == BPF_CGROUP_STORAGE_SHARED)
> +		ptr = &READ_ONCE(storage->buf)->data[0];
> +	else
> +		ptr = this_cpu_ptr(storage->percpu_buf);
> +
> +	return (unsigned long)ptr;
> }
> 
> const struct bpf_func_proto bpf_get_local_storage_proto = {
> diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> index 6742292fb39e..d991355b5b46 100644
> --- a/kernel/bpf/local_storage.c
> +++ b/kernel/bpf/local_storage.c
> @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> 	return 0;
> }
> 
> +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> +				   void *value)
> +{
> +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> +	struct bpf_cgroup_storage_key *key = _key;
> +	struct bpf_cgroup_storage *storage;
> +	int cpu, off = 0;
> +	u32 size;
> +
> +	rcu_read_lock();
> +	storage = cgroup_storage_lookup(map, key, false);
> +	if (!storage) {
> +		rcu_read_unlock();
> +		return -ENOENT;
> +	}
> +
> +	/* per_cpu areas are zero-filled and bpf programs can only
> +	 * access 'value_size' of them, so copying rounded areas
> +	 * will not leak any kernel data
> +	 */
> +	size = round_up(_map->value_size, 8);
> +	for_each_possible_cpu(cpu) {
> +		bpf_long_memcpy(value + off,
> +				per_cpu_ptr(storage->percpu_buf, cpu), size);
> +		off += size;
> +	}
> +	rcu_read_unlock();
> +	return 0;
> +}
> +
> +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> +				     void *value, u64 map_flags)
> +{
> +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> +	struct bpf_cgroup_storage_key *key = _key;
> +	struct bpf_cgroup_storage *storage;
> +	int cpu, off = 0;
> +	u32 size;
> +
> +	if (unlikely(map_flags & BPF_EXIST))
> +		return -EINVAL;
> +
> +	rcu_read_lock();
> +	storage = cgroup_storage_lookup(map, key, false);
> +	if (!storage) {
> +		rcu_read_unlock();
> +		return -ENOENT;
> +	}
> +
> +	/* the user space will provide round_up(value_size, 8) bytes that
> +	 * will be copied into per-cpu area. bpf programs can only access
> +	 * value_size of it. During lookup the same extra bytes will be
> +	 * returned or zeros which were zero-filled by percpu_alloc,
> +	 * so no kernel data leaks possible
> +	 */
> +	size = round_up(_map->value_size, 8);
> +	for_each_possible_cpu(cpu) {
> +		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> +				value + off, size);
> +		off += size;
> +	}
> +	rcu_read_unlock();
> +	return 0;
> +}
> +
> static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
> 				       void *_next_key)
> {
> @@ -292,55 +357,98 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
> {
> 	struct bpf_cgroup_storage *storage;
> 	struct bpf_map *map;
> +	gfp_t flags;
> +	size_t size;
> 	u32 pages;
> 
> 	map = prog->aux->cgroup_storage[stype];
> 	if (!map)
> 		return NULL;
> 
> -	pages = round_up(sizeof(struct bpf_cgroup_storage) +
> -			 sizeof(struct bpf_storage_buffer) +
> -			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> +	if (stype == BPF_CGROUP_STORAGE_SHARED) {
> +		size = sizeof(struct bpf_storage_buffer) + map->value_size;
> +		pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
> +				 PAGE_SIZE) >> PAGE_SHIFT;
> +	} else {
> +		size = map->value_size;
> +		pages = round_up(round_up(size, 8) * num_possible_cpus(),
> +				 PAGE_SIZE) >> PAGE_SHIFT;
> +	}
> +
> 	if (bpf_map_charge_memlock(map, pages))
> 		return ERR_PTR(-EPERM);
> 
> 	storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
> 			       __GFP_ZERO | GFP_USER, map->numa_node);
> -	if (!storage) {
> -		bpf_map_uncharge_memlock(map, pages);
> -		return ERR_PTR(-ENOMEM);
> -	}
> +	if (!storage)
> +		goto enomem;
> 
> -	storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
> -				    map->value_size, __GFP_ZERO | GFP_USER,
> -				    map->numa_node);
> -	if (!storage->buf) {
> -		bpf_map_uncharge_memlock(map, pages);
> -		kfree(storage);
> -		return ERR_PTR(-ENOMEM);
> +	flags = __GFP_ZERO | GFP_USER;
> +
> +	if (stype == BPF_CGROUP_STORAGE_SHARED) {
> +		storage->buf = kmalloc_node(size, flags, map->numa_node);
> +		if (!storage->buf)
> +			goto enomem;
> +	} else {
> +		storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
> +		if (!storage->percpu_buf)
> +			goto enomem;
> 	}
> 
> 	storage->map = (struct bpf_cgroup_storage_map *)map;
> 
> 	return storage;
> +
> +enomem:
> +	bpf_map_uncharge_memlock(map, pages);
> +	kfree(storage);
> +	return ERR_PTR(-ENOMEM);
> +}
> +
> +static void free_cgroup_storage_rcu(struct rcu_head *rcu)

Maybe rename as free_shared_cgroup_storage_rcu()?

> +{
> +	struct bpf_cgroup_storage *storage =
> +		container_of(rcu, struct bpf_cgroup_storage, rcu);
> +
> +	kfree(storage->buf);
> +	kfree(storage);
> +}
> +
> +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
> +{
> +	struct bpf_cgroup_storage *storage =
> +		container_of(rcu, struct bpf_cgroup_storage, rcu);
> +
> +	free_percpu(storage->percpu_buf);
> +	kfree(storage);
> }
> 
> void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
> {
> -	u32 pages;
> +	enum bpf_cgroup_storage_type stype;
> 	struct bpf_map *map;
> +	u32 pages;
> 
> 	if (!storage)
> 		return;
> 
> 	map = &storage->map->map;
> -	pages = round_up(sizeof(struct bpf_cgroup_storage) +
> -			 sizeof(struct bpf_storage_buffer) +
> -			 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> +	stype = cgroup_storage_type(map);
> +	if (stype == BPF_CGROUP_STORAGE_SHARED)
> +		pages = round_up(sizeof(struct bpf_cgroup_storage) +
> +				 sizeof(struct bpf_storage_buffer) +
> +				 map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> +	else
> +		pages = round_up(round_up(map->value_size, 8) *
> +				 num_possible_cpus(),
> +				 PAGE_SIZE) >> PAGE_SHIFT;
> +
> 	bpf_map_uncharge_memlock(map, pages);
> 
> -	kfree_rcu(storage->buf, rcu);
> -	kfree_rcu(storage, rcu);
> +	if (stype == BPF_CGROUP_STORAGE_SHARED)
> +		call_rcu(&storage->rcu, free_cgroup_storage_rcu);
> +	else
> +		call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
> }
> 
> void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8c91d2b41b1e..5742df21598c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
> 
> 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
> -	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
> +	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
> +	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
> 	else if (IS_FD_MAP(map))
> 		value_size = sizeof(u32);
> @@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
> 		err = bpf_percpu_hash_copy(map, key, value);
> 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
> 		err = bpf_percpu_array_copy(map, key, value);
> +	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
> +		err = bpf_percpu_cgroup_storage_copy(map, key, value);
> 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
> 		err = bpf_stackmap_copy(map, key, value);
> 	} else if (IS_FD_ARRAY(map)) {
> @@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
> 
> 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
> -	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
> +	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
> +	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
> 	else
> 		value_size = map->value_size;
> @@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
> 		err = bpf_percpu_hash_update(map, key, value, attr->flags);
> 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
> 		err = bpf_percpu_array_update(map, key, value, attr->flags);
> +	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
> +		err = bpf_percpu_cgroup_storage_update(map, key, value,
> +						       attr->flags);
> 	} else if (IS_FD_ARRAY(map)) {
> 		rcu_read_lock();
> 		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index e75f36de91d6..d94073deb68a 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> 			goto error;
> 		break;
> 	case BPF_MAP_TYPE_CGROUP_STORAGE:
> +	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
> 		if (func_id != BPF_FUNC_get_local_storage)
> 			goto error;
> 		break;
> @@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> 			goto error;
> 		break;
> 	case BPF_FUNC_get_local_storage:
> -		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
> +		if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
> +		    map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> 			goto error;
> 		break;
> 	case BPF_FUNC_sk_select_reuseport:
> @@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
> 	return 0;
> }
> 
> +static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
> +{
> +	return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
> +		map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
> +}
> +
> /* look for pseudo eBPF instructions that access map FDs and
>  * replace them with actual map pointers
>  */
> @@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
> 			}
> 			env->used_maps[env->used_map_cnt++] = map;
> 
> -			if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
> +			if (bpf_map_is_cgroup_storage(map) &&
> 			    bpf_cgroup_storage_assign(env->prog, map)) {
> -				verbose(env,
> -					"only one cgroup storage is allowed\n");
> +				verbose(env, "only one cgroup storage of each type is allowed\n");
> 				fdput(f);
> 				return -EBUSY;
> 			}
> -- 
> 2.17.1
>