Message-ID: <CACVXFVPAR52e01vVqtZ5LF5j7zscjAVgOccW1erj-a2VYScJrg@mail.gmail.com>
Date:	Wed, 13 Jan 2016 10:42:49 +0800
From:	Ming Lei <tom.leiming@...il.com>
To:	Martin KaFai Lau <kafai@...com>
Cc:	Network Development <netdev@...r.kernel.org>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Alexei Starovoitov <alexei.starovoitov@...il.com>,
	FB Kernel Team <kernel-team@...com>
Subject: Re: [PATCH v2 net-next 3/4] bpf: bpf_htab: Add syscall to iterate
 percpu value of a key

On Tue, Jan 12, 2016 at 4:20 PM, Martin KaFai Lau <kafai@...com> wrote:
> Add map_lookup_percpu_elem() syscall to lookup value
> of a particular CPU.
>
> The user is expected to loop through all CPU values by:
> for (cpu = 0; cpu < sysconf(_SC_NPROCESSORS_CONF); cpu++) {
>         if (!bpf_map_lookup_percpu_elem(map, &key, &value, cpu))

Thinking about it further, the above approach cannot work well: the
iteration may take a long time to collect the value from each CPU, so
by the time the loop finishes the accumulated total may already be
quite stale.

Also, each lookup is a syscall, and a syscall is often a preemption
point, which widens that window further.

>                 total_value += value;
> }
>
> * If the cpu is offline, errno == ENXIO
> * If the key is deleted before the iteration is done,
>   errno == ENOENT.
>
> Signed-off-by: Martin KaFai Lau <kafai@...com>
> ---
>  include/uapi/linux/bpf.h |  2 ++
>  kernel/bpf/syscall.c     | 93 ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 95 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 43ae40c..2c73c09 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -73,6 +73,7 @@ enum bpf_cmd {
>         BPF_PROG_LOAD,
>         BPF_OBJ_PIN,
>         BPF_OBJ_GET,
> +       BPF_MAP_LOOKUP_PERCPU_ELEM,
>  };
>
>  enum bpf_map_type {
> @@ -115,6 +116,7 @@ union bpf_attr {
>                         __aligned_u64 next_key;
>                 };
>                 __u64           flags;
> +               __u32           cpu;
>         };
>
>         struct { /* anonymous struct used by BPF_PROG_LOAD command */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 6373970..ba1172b 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -289,6 +289,96 @@ err_put:
>         return err;
>  }
>
> +/* last field in 'union bpf_attr' used by this command */
> +#define BPF_MAP_LOOKUP_PERCPU_ELEM_LAST_FIELD cpu
> +
> +struct percpu_map_value_info {
> +       struct bpf_map *map;
> +       void *key;
> +       void *value;
> +       bool found;
> +};
> +
> +static void percpu_map_lookup_value(void *i)
> +{
> +       struct percpu_map_value_info *info = (struct percpu_map_value_info *)i;
> +       struct bpf_map *map = info->map;
> +       void *ptr;
> +
> +       rcu_read_lock();
> +       ptr = map->ops->map_lookup_elem(map, info->key);
> +       if (ptr) {
> +               memcpy(info->value, ptr, map->value_size);
> +               info->found = true;
> +       } else {
> +               info->found = false;
> +       }
> +       rcu_read_unlock();
> +}
> +
> +static int map_lookup_percpu_elem(union bpf_attr *attr)
> +{
> +       void __user *ukey = u64_to_ptr(attr->key);
> +       void __user *uvalue = u64_to_ptr(attr->value);
> +       u32 __user ucpu = attr->cpu;
> +       int ufd = attr->map_fd;
> +       struct percpu_map_value_info value_info;
> +       struct bpf_map *map;
> +       void *key, *value;
> +       struct fd f;
> +       int err;
> +
> +       if (CHECK_ATTR(BPF_MAP_LOOKUP_PERCPU_ELEM) ||
> +           ucpu >= num_possible_cpus())
> +               return -EINVAL;
> +
> +       f = fdget(ufd);
> +       map = __bpf_map_get(f);
> +       if (IS_ERR(map))
> +               return PTR_ERR(map);
> +
> +       err = -ENOMEM;
> +       key = kmalloc(map->key_size, GFP_USER);

The kmalloc may trigger memory reclaim and take dozens of seconds
or more.

> +       if (!key)
> +               goto err_put;
> +
> +       err = -EFAULT;
> +       if (copy_from_user(key, ukey, map->key_size) != 0)
> +               goto free_key;
> +
> +       err = -ENOMEM;
> +       value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);

Same as above.

> +       if (!value)
> +               goto free_key;
> +
> +       value_info.map = map;
> +       value_info.key = key;
> +       value_info.value = value;
> +
> +       err = smp_call_function_single(ucpu, percpu_map_lookup_value,
> +                                      &value_info, 1);

It is slow too.

> +       if (err)
> +               goto free_value;
> +
> +       err = -ENOENT;
> +       if (!value_info.found)
> +               goto free_value;
> +
> +       err = -EFAULT;
> +       if (copy_to_user(uvalue, value, map->value_size) != 0)
> +               goto free_value;

A page fault may be triggered here as well.

> +
> +       err = 0;
> +
> +free_value:
> +       kfree(value);
> +free_key:
> +       kfree(key);
> +err_put:
> +       fdput(f);
> +       return err;
> +}
> +
>  #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
>
>  static int map_update_elem(union bpf_attr *attr)
> @@ -792,6 +882,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>         case BPF_OBJ_GET:
>                 err = bpf_obj_get(&attr);
>                 break;
> +       case BPF_MAP_LOOKUP_PERCPU_ELEM:
> +               err = map_lookup_percpu_elem(&attr);
> +               break;
>         default:
>                 err = -EINVAL;
>                 break;

So I don't think it is a good idea to retrieve the value from one CPU
per system call and then accumulate the results in userspace.

One approach I thought of is to define a function (or something like it)

 handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)

in the BPF kernel code for collecting the value from each CPU and
accumulating it into 'val_total'; in most situations the function can
be implemented without a loop. The kernel can then call this function
directly for each CPU, and the total value can be returned to
userspace with one single syscall.
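
To make that a bit more concrete, below is a rough sketch. The
handle_cpu_value() hook and the sum_percpu_values() driver are purely
illustrative (nothing like this exists yet); the point is only that
the per-CPU accumulation can be a tiny, loop-free callback while the
kernel walks the CPUs itself and copies out just the total:

/*
 * Purely illustrative sketch of the draft idea above; the names and
 * calling convention are hypothetical, not an existing kernel or BPF
 * API. Example for a map whose value is a simple traffic counter.
 */
#include <stdint.h>

typedef uint32_t u32;
typedef uint64_t u64;

struct traffic_count {
	u64 packets;
	u64 bytes;
};

/* Per-map aggregation hook: no loop needed, just fold this CPU's
 * copy of the value into the running total. */
static void handle_cpu_value(u32 cpu, void *val_cpu, void *val_total)
{
	const struct traffic_count *c = val_cpu;
	struct traffic_count *t = val_total;

	(void)cpu;		/* cpu id unused for a plain sum */
	t->packets += c->packets;
	t->bytes   += c->bytes;
}

/* Stand-in for the kernel-side driver: walk the per-CPU copies under
 * one syscall and hand back only the accumulated result. */
static void sum_percpu_values(const struct traffic_count *percpu_vals,
			      unsigned int nr_cpus,
			      struct traffic_count *total)
{
	unsigned int cpu;

	total->packets = 0;
	total->bytes = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		handle_cpu_value(cpu, (void *)&percpu_vals[cpu], total);
}

Userspace would then see only the already-accumulated value, so both
the staleness problem and the per-CPU syscall overhead go away.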

Alexei and anyone else, could you comment on this draft idea for the
percpu map?

Thanks,
Ming Lei
