linux-kernel - Re: [PATCH v3 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20180928100302.GB9018@castle.DHCP.thefacebook.com>
Date:   Fri, 28 Sep 2018 11:03:03 +0100
From:   Roman Gushchin <guro@...com>
To:     Alexei Starovoitov <alexei.starovoitov@...il.com>
CC:     <netdev@...r.kernel.org>, Song Liu <songliubraving@...com>,
        <linux-kernel@...r.kernel.org>, <kernel-team@...com>,
        Daniel Borkmann <daniel@...earbox.net>,
        Alexei Starovoitov <ast@...nel.org>
Subject: Re: [PATCH v3 bpf-next 03/10] bpf: introduce per-cpu cgroup local
 storage

On Fri, Sep 28, 2018 at 10:45:31AM +0200, Alexei Starovoitov wrote:
> On Wed, Sep 26, 2018 at 12:33:19PM +0100, Roman Gushchin wrote:
> > This commit introduces per-cpu cgroup local storage.
> > 
> > Per-cpu cgroup local storage is very similar to simple cgroup storage
> > (let's call it shared), except all the data is per-cpu.
> > 
> > The main goal of the per-cpu variant is to implement super fast
> > counters (e.g. packet counters), which require neither
> > lookups nor atomic operations.
> > 
> > From userspace's point of view, accessing a per-cpu cgroup storage
> > is similar to other per-cpu map types (e.g. per-cpu hashmaps and
> > arrays).
> > 
> > Writing to a per-cpu cgroup storage is not atomic, but is performed
> > by copying longs, so some minimal atomicity is here, exactly
> > as with other per-cpu maps.
> > 
> > Signed-off-by: Roman Gushchin <guro@...com>
> > Cc: Daniel Borkmann <daniel@...earbox.net>
> > Cc: Alexei Starovoitov <ast@...nel.org>
> > ---
> >  include/linux/bpf-cgroup.h |  20 ++++-
> >  include/linux/bpf.h        |   1 +
> >  include/linux/bpf_types.h  |   1 +
> >  include/uapi/linux/bpf.h   |   1 +
> >  kernel/bpf/helpers.c       |   8 +-
> >  kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> >  kernel/bpf/syscall.c       |  11 ++-
> >  kernel/bpf/verifier.c      |  15 +++-
> >  8 files changed, 177 insertions(+), 28 deletions(-)
> > 
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index 7e0c9a1d48b7..588dd5f0bd85 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> >  };
> >  
> >  struct bpf_cgroup_storage {
> > -	struct bpf_storage_buffer *buf;
> > +	union {
> > +		struct bpf_storage_buffer *buf;
> > +		void __percpu *percpu_buf;
> > +	};
> >  	struct bpf_cgroup_storage_map *map;
> >  	struct bpf_cgroup_storage_key key;
> >  	struct list_head list;
> > @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> >  static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> >  	struct bpf_map *map)
> >  {
> > +	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> > +		return BPF_CGROUP_STORAGE_PERCPU;
> > +
> >  	return BPF_CGROUP_STORAGE_SHARED;
> >  }
> >  
> > @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> >  int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> >  void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
> >  
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > +				     void *value, u64 flags);
> > +
> >  /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> >  #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
> >  ({									      \
> > @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> >  	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> >  static inline void bpf_cgroup_storage_free(
> >  	struct bpf_cgroup_storage *storage) {}
> > +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> > +						 void *value) {
> > +	return 0;
> > +}
> > +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> > +					void *key, void *value, u64 flags) {
> > +	return 0;
> > +}
> >  
> >  #define cgroup_bpf_enabled (0)
> >  #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index b457fbe7b70b..018299a595c8 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -274,6 +274,7 @@ struct bpf_prog_offload {
> >  
> >  enum bpf_cgroup_storage_type {
> >  	BPF_CGROUP_STORAGE_SHARED,
> > +	BPF_CGROUP_STORAGE_PERCPU,
> >  	__BPF_CGROUP_STORAGE_MAX
> >  };
> >  
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index c9bd6fb765b0..5432f4c9f50e 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> >  #endif
> >  #ifdef CONFIG_CGROUP_BPF
> >  BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> > +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> >  #endif
> >  BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> >  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index aa5ccd2385ed..e2070d819e04 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -127,6 +127,7 @@ enum bpf_map_type {
> >  	BPF_MAP_TYPE_SOCKHASH,
> >  	BPF_MAP_TYPE_CGROUP_STORAGE,
> >  	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> > +	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> >  };
> >  
> >  enum bpf_prog_type {
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index e42f8789b7ea..6502115e8f55 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> >  	 */
> >  	enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> >  	struct bpf_cgroup_storage *storage;
> > +	void *ptr;
> >  
> >  	storage = this_cpu_read(bpf_cgroup_storage[stype]);
> >  
> > -	return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> > +	if (stype == BPF_CGROUP_STORAGE_SHARED)
> > +		ptr = &READ_ONCE(storage->buf)->data[0];
> > +	else
> > +		ptr = this_cpu_ptr(storage->percpu_buf);
> > +
> > +	return (unsigned long)ptr;
> >  }
> >  
> >  const struct bpf_func_proto bpf_get_local_storage_proto = {
> > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> > index 6742292fb39e..c739f6dcc3c2 100644
> > --- a/kernel/bpf/local_storage.c
> > +++ b/kernel/bpf/local_storage.c
> > @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> >  	return 0;
> >  }
> >  
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> > +				   void *value)
> > +{
> > +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > +	struct bpf_cgroup_storage_key *key = _key;
> > +	struct bpf_cgroup_storage *storage;
> > +	int cpu, off = 0;
> > +	u32 size;
> > +
> > +	rcu_read_lock();
> > +	storage = cgroup_storage_lookup(map, key, false);
> > +	if (!storage) {
> > +		rcu_read_unlock();
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* per_cpu areas are zero-filled and bpf programs can only
> > +	 * access 'value_size' of them, so copying rounded areas
> > +	 * will not leak any kernel data
> > +	 */
> > +	size = round_up(_map->value_size, 8);
> > +	for_each_possible_cpu(cpu) {
> > +		bpf_long_memcpy(value + off,
> > +				per_cpu_ptr(storage->percpu_buf, cpu), size);
> > +		off += size;
> > +	}
> > +	rcu_read_unlock();
> > +	return 0;
> > +}
> > +
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> > +				     void *value, u64 map_flags)
> > +{
> > +	struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > +	struct bpf_cgroup_storage_key *key = _key;
> > +	struct bpf_cgroup_storage *storage;
> > +	int cpu, off = 0;
> > +	u32 size;
> > +
> > +	if (unlikely(map_flags & BPF_EXIST))
> > +		return -EINVAL;
> 
> that should have been BPF_NOEXIST ?

Yeah, or maybe even better s/&/!=/ ?
It's probably better to require the BPF_EXIST flag to update a cgroup storage?
Agree? If so, let me fix this for both shared and per-cpu versions in
a follow-up patch.

> 
> > +
> > +	rcu_read_lock();
> > +	storage = cgroup_storage_lookup(map, key, false);
> > +	if (!storage) {
> > +		rcu_read_unlock();
> > +		return -ENOENT;
> > +	}
> > +
> > +	/* the user space will provide round_up(value_size, 8) bytes that
> > +	 * will be copied into per-cpu area. bpf programs can only access
> > +	 * value_size of it. During lookup the same extra bytes will be
> > +	 * returned or zeros which were zero-filled by percpu_alloc,
> > +	 * so no kernel data leaks possible
> > +	 */
> > +	size = round_up(_map->value_size, 8);
> > +	for_each_possible_cpu(cpu) {
> > +		bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> > +				value + off, size);
> > +		off += size;
> > +	}
> > +	rcu_read_unlock();
> 
> storage_update and storage_copy look essentially the same
> with the only difference that src/dst swap.
> Would it be possible to combine them ?
> Not sure whether #define template would look better.

I'll try to refactor this and the next one in a follow-up patch.

Thanks!