[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <6592573c-8cb3-8cb8-2030-f88a3d974400@fb.com>
Date: Thu, 4 Nov 2021 09:46:16 -0700
From: Yonghong Song <yhs@...com>
To: Song Liu <songliubraving@...com>, <bpf@...r.kernel.org>,
<netdev@...r.kernel.org>
CC: <ast@...nel.org>, <daniel@...earbox.net>, <andrii@...nel.org>,
<kernel-team@...com>, <kpsingh@...nel.org>
Subject: Re: [PATCH v2 bpf-next 1/2] bpf: introduce helper bpf_find_vma
On 11/4/21 12:00 AM, Song Liu wrote:
> In some profiler use cases, it is necessary to map an address to the
> backing file, e.g., a shared library. bpf_find_vma helper provides a
> flexible way to achieve this. bpf_find_vma maps an address of a task to
> the vma (vm_area_struct) for this address, and feeds the vma to a callback
> BPF function. The callback function is necessary here, as we need to
> ensure mmap_sem is unlocked.
>
> It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem
> safely when irqs are disabled, we use the same mechanism as stackmap with
> build_id. Specifically, when irqs are disabled, the unlock is postponed
> in an irq_work. Refactor stackmap.c so that the irq_work is shared among
> bpf_find_vma and stackmap helpers.
>
> Signed-off-by: Song Liu <songliubraving@...com>
> ---
> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 20 ++++++++++
> kernel/bpf/mmap_unlock_work.h | 65 +++++++++++++++++++++++++++++++
> kernel/bpf/stackmap.c | 71 ++++++++--------------------------
> kernel/bpf/task_iter.c | 49 +++++++++++++++++++++++
> kernel/bpf/verifier.c | 36 +++++++++++++++++
> kernel/trace/bpf_trace.c | 2 +
> tools/include/uapi/linux/bpf.h | 20 ++++++++++
> 8 files changed, 209 insertions(+), 55 deletions(-)
> create mode 100644 kernel/bpf/mmap_unlock_work.h
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 2be6dfd68df99..df3410bff4b06 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2157,6 +2157,7 @@ extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
> extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
> extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
> extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
> +extern const struct bpf_func_proto bpf_find_vma_proto;
>
> const struct bpf_func_proto *tracing_prog_func_proto(
> enum bpf_func_id func_id, const struct bpf_prog *prog);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ba5af15e25f5c..22fa7b74de451 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -4938,6 +4938,25 @@ union bpf_attr {
> * **-ENOENT** if symbol is not found.
> *
> * **-EPERM** if caller does not have permission to obtain kernel address.
> + *
> + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
> + * Description
> + * Find vma of *task* that contains *addr*, call *callback_fn*
> + * function with *task*, *vma*, and *callback_ctx*.
> + * The *callback_fn* should be a static function and
> + * the *callback_ctx* should be a pointer to the stack.
> + * The *flags* is used to control certain aspects of the helper.
> + * Currently, the *flags* must be 0.
> + *
> + * The expected callback signature is
> + *
> + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*ctx);
ctx => callback_ctx
This would make it clearer what this 'ctx' refers to.
> + *
> + * Return
> + * 0 on success.
> + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
> + *		**-EBUSY** if the helper fails to trylock *mmap_lock*.
> + * **-EINVAL** for invalid **flags**.
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -5120,6 +5139,7 @@ union bpf_attr {
> FN(trace_vprintk), \
> FN(skc_to_unix_sock), \
> FN(kallsyms_lookup_name), \
> + FN(find_vma), \
> /* */
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
[...]
> +
> static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> u64 *ips, u32 trace_nr, bool user)
> {
> - int i;
> + struct mmap_unlock_irq_work *work = NULL;
> + bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
> struct vm_area_struct *vma;
> - bool irq_work_busy = false;
> - struct stack_map_irq_work *work = NULL;
> -
> - if (irqs_disabled()) {
> - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
> - work = this_cpu_ptr(&up_read_work);
> - if (irq_work_is_busy(&work->irq_work)) {
> - /* cannot queue more up_read, fallback */
> - irq_work_busy = true;
> - }
> - } else {
> - /*
> - * PREEMPT_RT does not allow to trylock mmap sem in
> - * interrupt disabled context. Force the fallback code.
> - */
> - irq_work_busy = true;
> - }
> - }
> + int i;
I think moving 'int i' is unnecessary here since there is no
functionality change.
>
> - /*
> - * We cannot do up_read() when the irq is disabled, because of
> - * risk to deadlock with rq_lock. To do build_id lookup when the
> - * irqs are disabled, we need to run up_read() in irq_work. We use
> - * a percpu variable to do the irq_work. If the irq_work is
> - * already used by another lookup, we fall back to report ips.
> - *
> - * Same fallback is used for kernel stack (!user) on a stackmap
> - * with build_id.
> + /* If the irq_work is in use, fall back to report ips. Same
> + * fallback is used for kernel stack (!user) on a stackmap with
> + * build_id.
> */
> if (!user || !current || !current->mm || irq_work_busy ||
> !mmap_read_trylock(current->mm)) {
[...]
> +
> + irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
> +
> + if (irq_work_busy || !mmap_read_trylock(mm))
> + return -EBUSY;
> +
> + vma = find_vma(mm, start);
> +
> + if (vma && vma->vm_start <= start && vma->vm_end > start) {
> + callback_fn((u64)(long)task, (u64)(long)vma,
> + (u64)(long)callback_ctx, 0, 0);
> + ret = 0;
> + }
> + bpf_mmap_unlock_mm(work, mm);
> + return ret;
> +}
> +
> +BTF_ID_LIST_SINGLE(btf_find_vma_ids, struct, task_struct)
We have global btf_task_struct_ids, maybe reuse it?
> +
> +const struct bpf_func_proto bpf_find_vma_proto = {
> + .func = bpf_find_vma,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_BTF_ID,
> + .arg1_btf_id = &btf_find_vma_ids[0],
> + .arg2_type = ARG_ANYTHING,
> + .arg3_type = ARG_PTR_TO_FUNC,
> + .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
> + .arg5_type = ARG_ANYTHING,
> +};
> +
> static int __init task_iter_init(void)
> {
> int ret;
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f0dca726ebfde..a65526112924a 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6132,6 +6132,35 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
> return 0;
> }
>
> +BTF_ID_LIST_SINGLE(btf_set_find_vma_ids, struct, vm_area_struct)
In task_iter.c, we have
BTF_ID_LIST(btf_task_file_ids)
BTF_ID(struct, file)
BTF_ID(struct, vm_area_struct)
Maybe it is worthwhile to separate them so we can put vm_area_struct as
global to be reused.
> +
> +static int set_find_vma_callback_state(struct bpf_verifier_env *env,
> + struct bpf_func_state *caller,
> + struct bpf_func_state *callee,
> + int insn_idx)
> +{
> + /* bpf_find_vma(struct task_struct *task, u64 start,
start => addr ?
> + * void *callback_fn, void *callback_ctx, u64 flags)
> + * (callback_fn)(struct task_struct *task,
> + * struct vm_area_struct *vma, void *ctx);
ctx => callback_ctx ?
> + */
> + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
> +
> + callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
> + __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
> + callee->regs[BPF_REG_2].btf = btf_vmlinux;
> + callee->regs[BPF_REG_2].btf_id = btf_set_find_vma_ids[0];
> +
> + /* pointer to stack or null */
> + callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
> +
> + /* unused */
> + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
> + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
> + callee->in_callback_fn = true;
> + return 0;
> +}
> +
[...]
Powered by blists - more mailing lists