Message-ID: <CAEf4BzbySjaBQSMTET=HGD_K748GOXZZQ7zMhgtEqE-JgJGbdQ@mail.gmail.com>
Date: Wed, 6 May 2020 00:30:03 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Yonghong Song <yhs@...com>
Cc: Andrii Nakryiko <andriin@...com>, bpf <bpf@...r.kernel.org>,
Martin KaFai Lau <kafai@...com>,
Networking <netdev@...r.kernel.org>,
Alexei Starovoitov <ast@...com>,
Daniel Borkmann <daniel@...earbox.net>,
Kernel Team <kernel-team@...com>
Subject: Re: [PATCH bpf-next v2 11/20] bpf: add task and task/file iterator targets
On Sun, May 3, 2020 at 11:28 PM Yonghong Song <yhs@...com> wrote:
>
> Only the tasks belonging to "current" pid namespace
> are enumerated.
>
> For task/file target, the bpf program will have access to
> struct task_struct *task
> u32 fd
> struct file *file
> where fd/file is an open file for the task.
>
> Signed-off-by: Yonghong Song <yhs@...com>
> ---
I might be missing some subtleties with task refcounting for the
task_file iterator; I've asked a few questions below...
> kernel/bpf/Makefile | 2 +-
> kernel/bpf/task_iter.c | 336 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 337 insertions(+), 1 deletion(-)
> create mode 100644 kernel/bpf/task_iter.c
>
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index b2b5eefc5254..37b2d8620153 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -2,7 +2,7 @@
> obj-y := core.o
> CFLAGS_core.o += $(call cc-disable-warning, override-init)
>
> -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o
> +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
> obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
> obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
> obj-$(CONFIG_BPF_SYSCALL) += disasm.o
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> new file mode 100644
> index 000000000000..1ca258f6e9f4
> --- /dev/null
> +++ b/kernel/bpf/task_iter.c
> @@ -0,0 +1,336 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2020 Facebook */
> +
> +#include <linux/init.h>
> +#include <linux/namei.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/fs.h>
> +#include <linux/fdtable.h>
> +#include <linux/filter.h>
> +
> +struct bpf_iter_seq_task_common {
> + struct pid_namespace *ns;
> +};
> +
> +struct bpf_iter_seq_task_info {
> + struct bpf_iter_seq_task_common common;
You have a comment below in init_seq_pidns() saying that common is
supposed to be the very first field, but I think it's more important
and appropriate here, so that whoever adds anything to this struct
knows that the order of fields matters.
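E.g., something along these lines (just to illustrate, exact wording
up to you):

struct bpf_iter_seq_task_info {
        /* The first field must be struct bpf_iter_seq_task_common,
         * the pid namespace setup code (init_seq_pidns()) relies on
         * this layout.
         */
        struct bpf_iter_seq_task_common common;
        struct task_struct *task;
        u32 id;
};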
> + struct task_struct *task;
> + u32 id;
> +};
> +
[...]
> +static int __task_seq_show(struct seq_file *seq, void *v, bool in_stop)
> +{
> + struct bpf_iter_meta meta;
> + struct bpf_iter__task ctx;
> + struct bpf_prog *prog;
> + int ret = 0;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, in_stop);
> + if (prog) {
nit: `if (!prog) return 0;` here would reduce the nesting level below
(see the sketch after the function)
> + meta.seq = seq;
> + ctx.meta = &meta;
> + ctx.task = v;
> + ret = bpf_iter_run_prog(prog, &ctx);
> + }
> +
> + return 0;
return **ret**; ?
> +}
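Combining both nits above, the body could look something like this
(untested):

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, in_stop);
        if (!prog)
                return 0;

        ctx.meta = &meta;
        ctx.task = v;
        return bpf_iter_run_prog(prog, &ctx);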
> +
[...]
> +
> +static struct file *task_file_seq_get_next(struct pid_namespace *ns, u32 *id,
> + int *fd, struct task_struct **task,
> + struct files_struct **fstruct)
> +{
> + struct files_struct *files;
> + struct task_struct *tk;
> + u32 sid = *id;
> + int sfd;
> +
> + /* If this function returns a non-NULL file object,
> + * it held a reference to the files_struct and file.
> + * Otherwise, it does not hold any reference.
> + */
> +again:
> + if (*fstruct) {
> + files = *fstruct;
> + sfd = *fd;
> + } else {
> + tk = task_seq_get_next(ns, &sid);
> + if (!tk)
> + return NULL;
> +
> + files = get_files_struct(tk);
> + put_task_struct(tk);
task is put here, but it is still used below... is there some
additional hidden refcounting involved? If not, see the sketch after
the else block below for what I would have expected.
> + if (!files) {
> + sid = ++(*id);
> + *fd = 0;
> + goto again;
> + }
> + *fstruct = files;
> + *task = tk;
> + if (sid == *id) {
> + sfd = *fd;
> + } else {
> + *id = sid;
> + sfd = 0;
> + }
> + }
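If there is no hidden refcounting, this is roughly what I would have
expected for the task/files part (untested sketch, just to illustrate
the concern):

        tk = task_seq_get_next(ns, &sid);
        if (!tk)
                return NULL;

        files = get_files_struct(tk);
        if (!files) {
                /* done with this task, drop the reference before
                 * moving on to the next one
                 */
                put_task_struct(tk);
                sid = ++(*id);
                *fd = 0;
                goto again;
        }
        /* keep the task reference for as long as *task points at it;
         * put_task_struct() would then happen when advancing to the
         * next task and in the seq_file stop/cleanup path
         */
        *fstruct = files;
        *task = tk;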
> +
> + rcu_read_lock();
> + for (; sfd < files_fdtable(files)->max_fds; sfd++) {
files_fdtable() does rcu_dereference() on each iteration; would it be
better to just cache files_fdtable(files)->max_fds in a local
variable? It's unlikely that there will be many iterations, but
still...
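I.e., something like (loop body unchanged):

        unsigned int max_fds;

        rcu_read_lock();
        max_fds = files_fdtable(files)->max_fds; /* rcu_dereference() once */
        for (; sfd < max_fds; sfd++) {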
> + struct file *f;
> +
> + f = fcheck_files(files, sfd);
> + if (!f)
> + continue;
> + *fd = sfd;
> + get_file(f);
> + rcu_read_unlock();
> + return f;
> + }
> +
> + /* the current task is done, go to the next task */
> + rcu_read_unlock();
> + put_files_struct(files);
> + *fstruct = NULL;
*task = NULL; for completeness?
> + sid = ++(*id);
> + *fd = 0;
> + goto again;
> +}
> +
> +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct bpf_iter_seq_task_file_info *info = seq->private;
> + struct files_struct *files = NULL;
> + struct task_struct *task = NULL;
> + struct file *file;
> + u32 id = info->id;
> + int fd = info->fd;
> +
> + file = task_file_seq_get_next(info->common.ns, &id, &fd, &task, &files);
> + if (!file) {
> + info->files = NULL;
what about info->task here?
> + return NULL;
> + }
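I.e., probably:

        if (!file) {
                info->files = NULL;
                info->task = NULL;
                return NULL;
        }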
> +
> + ++*pos;
> + info->id = id;
> + info->fd = fd;
> + info->task = task;
> + info->files = files;
> +
> + return file;
> +}
> +
[...]
> +
> +struct bpf_iter__task_file {
> + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> + __bpf_md_ptr(struct task_struct *, task);
> + u32 fd;
nit: this sort of works by accident (due to all other fields being
8-byte-aligned pointers); shall we add __attribute__((aligned(8)))?
> + __bpf_md_ptr(struct file *, file);
> +};
> +
[...]