[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9b23b2c6-28b2-3ab3-4e8b-1fa0c926c4d2@fb.com>
Date: Mon, 14 Jun 2021 09:51:37 -0700
From: Yonghong Song <yhs@...com>
To: Alexei Starovoitov <alexei.starovoitov@...il.com>,
<davem@...emloft.net>
CC: <daniel@...earbox.net>, <andrii@...nel.org>,
<netdev@...r.kernel.org>, <bpf@...r.kernel.org>,
<kernel-team@...com>
Subject: Re: [PATCH v2 bpf-next 1/3] bpf: Introduce bpf_timer
On 6/10/21 9:24 PM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@...nel.org>
>
> Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded
> in hash/array/lru maps as regular field and helpers to operate on it:
>
> // Initialize the timer to call 'callback_fn' static function
> // First 4 bits of 'flags' specify clockid.
> // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
> long bpf_timer_init(struct bpf_timer *timer, void *callback_fn, int flags);
>
> // Start the timer and set its expiration 'nsec' nanoseconds from the current time.
> long bpf_timer_start(struct bpf_timer *timer, u64 nsec);
>
> // Cancel the timer and wait for callback_fn to finish if it was running.
> long bpf_timer_cancel(struct bpf_timer *timer);
>
> Here is how BPF program might look like:
> struct map_elem {
> int counter;
> struct bpf_timer timer;
> };
>
> struct {
> __uint(type, BPF_MAP_TYPE_HASH);
> __uint(max_entries, 1000);
> __type(key, int);
> __type(value, struct map_elem);
> } hmap SEC(".maps");
>
> static int timer_cb(void *map, int *key, struct map_elem *val);
> /* val points to particular map element that contains bpf_timer. */
>
> SEC("fentry/bpf_fentry_test1")
> int BPF_PROG(test1, int a)
> {
> struct map_elem *val;
> int key = 0;
>
> val = bpf_map_lookup_elem(&hmap, &key);
> if (val) {
> bpf_timer_init(&val->timer, timer_cb, CLOCK_REALTIME);
> bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */);
> }
> }
>
> This patch adds helper implementations that rely on hrtimers
> to call bpf functions as timers expire.
> The following patch adds necessary safety checks.
>
> Only programs with CAP_BPF are allowed to use bpf_timer.
>
> The amount of timers used by the program is constrained by
> the memcg recorded at map creation time.
>
> The bpf_timer_init() helper is receiving hidden 'map' and 'prog' arguments
> supplied by the verifier. The prog pointer is needed to do refcnting of bpf
> program to make sure that program doesn't get freed while timer is armed.
>
> The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel
> and free the timer if given map element had it allocated.
> "bpftool map update" command can be used to cancel timers.
>
> Signed-off-by: Alexei Starovoitov <ast@...nel.org>
> ---
> include/linux/bpf.h | 2 +
> include/uapi/linux/bpf.h | 40 ++++++
> kernel/bpf/helpers.c | 227 +++++++++++++++++++++++++++++++++
> kernel/bpf/verifier.c | 109 ++++++++++++++++
> kernel/trace/bpf_trace.c | 2 +-
> scripts/bpf_doc.py | 2 +
> tools/include/uapi/linux/bpf.h | 40 ++++++
> 7 files changed, 421 insertions(+), 1 deletion(-)
>
[...]
> +
> +static enum hrtimer_restart bpf_timer_cb(struct hrtimer *timer)
> +{
> + struct bpf_hrtimer *t = container_of(timer, struct bpf_hrtimer, timer);
> + struct bpf_prog *prog = t->prog;
> + struct bpf_map *map = t->map;
> + void *key;
> + u32 idx;
> + int ret;
> +
> + /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
> + * cannot be preempted by another bpf_timer_cb() on the same cpu.
> + * Remember the timer this callback is servicing to prevent
> + * deadlock if callback_fn() calls bpf_timer_cancel() on the same timer.
> + */
> + this_cpu_write(hrtimer_running, t);
> + if (map->map_type == BPF_MAP_TYPE_ARRAY) {
> + struct bpf_array *array = container_of(map, struct bpf_array, map);
> +
> + /* compute the key */
> + idx = ((char *)t->value - array->value) / array->elem_size;
> + key = &idx;
> + } else { /* hash or lru */
> + key = t->value - round_up(map->key_size, 8);
> + }
> +
> + ret = BPF_CAST_CALL(t->callback_fn)((u64)(long)map,
> + (u64)(long)key,
> + (u64)(long)t->value, 0, 0);
> + WARN_ON(ret != 0); /* Next patch disallows 1 in the verifier */
I didn't find where the next patch disallows a callback return value of 1
in the verifier. If we indeed disallow a return value of 1 in the verifier,
we don't need the WARN_ON here. Did I miss anything?
> +
> + /* The bpf function finished executed. Drop the prog refcnt.
> + * It could reach zero here and trigger free of bpf_prog
> + * and subsequent free of the maps that were holding timers.
> + * If callback_fn called bpf_timer_start on this timer
> + * the prog refcnt will be > 0.
> + *
> + * If callback_fn deleted map element the 't' could have been freed,
> + * hence t->prog deref is done earlier.
> + */
> + bpf_prog_put(prog);
> + this_cpu_write(hrtimer_running, NULL);
> + return HRTIMER_NORESTART;
> +}
> +
> +BPF_CALL_5(bpf_timer_init, struct bpf_timer_kern *, timer, void *, cb, int, flags,
> + struct bpf_map *, map, struct bpf_prog *, prog)
> +{
> + clockid_t clockid = flags & (MAX_CLOCKS - 1);
> + struct bpf_hrtimer *t;
> + int ret = 0;
> +
> + BUILD_BUG_ON(MAX_CLOCKS != 16);
> + if (flags >= MAX_CLOCKS ||
> + /* similar to timerfd except _ALARM variants are not supported */
> + (clockid != CLOCK_MONOTONIC &&
> + clockid != CLOCK_REALTIME &&
> + clockid != CLOCK_BOOTTIME))
> + return -EINVAL;
> + ____bpf_spin_lock(&timer->lock);
> + t = timer->timer;
> + if (t) {
> + ret = -EBUSY;
> + goto out;
> + }
> + /* allocate hrtimer via map_kmalloc to use memcg accounting */
> + t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, NUMA_NO_NODE);
> + if (!t) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + t->callback_fn = cb;
> + t->value = (void *)timer /* - offset of bpf_timer inside elem */;
> + t->map = map;
> + t->prog = prog;
> + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
> + t->timer.function = bpf_timer_cb;
> + timer->timer = t;
> +out:
> + ____bpf_spin_unlock(&timer->lock);
> + return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_timer_init_proto = {
> + .func = bpf_timer_init,
> + .gpl_only = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_TIMER,
> + .arg2_type = ARG_PTR_TO_FUNC,
> + .arg3_type = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs)
> +{
> + struct bpf_hrtimer *t;
> + int ret = 0;
> +
> + ____bpf_spin_lock(&timer->lock);
> + t = timer->timer;
> + if (!t) {
> + ret = -EINVAL;
> + goto out;
> + }
> + if (!hrtimer_active(&t->timer) || hrtimer_callback_running(&t->timer))
> + /* If the timer wasn't active or callback already executing
> + * bump the prog refcnt to keep it alive until
> + * callback is invoked (again).
> + */
> + bpf_prog_inc(t->prog);
I am not 100% sure, but could we have a race condition here?
cpu 1: running bpf_timer_start() helper call
cpu 2: doing hrtimer work (calling callback etc.)
Is it possible that
!hrtimer_active(&t->timer) || hrtimer_callback_running(&t->timer)
may be true at the check, and then, right before bpf_prog_inc(t->prog),
it becomes false? If hrtimer_callback_running() returned true, it is
possible that the callback function has since finished and dropped the
reference count for t->prog, so we could already have entered the body
of __bpf_prog_put()?
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic64_dec_and_test(&prog->aux->refcnt)) {
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
__bpf_prog_put_noref(prog, true);
}
}
> + hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
> +out:
> + ____bpf_spin_unlock(&timer->lock);
> + return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_timer_start_proto = {
> + .func = bpf_timer_start,
> + .gpl_only = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_TIMER,
> + .arg2_type = ARG_ANYTHING,
> +};
> +
> +BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
> +{
> + struct bpf_hrtimer *t;
> + int ret = 0;
> +
> + ____bpf_spin_lock(&timer->lock);
> + t = timer->timer;
> + if (!t) {
> + ret = -EINVAL;
> + goto out;
> + }
> + if (this_cpu_read(hrtimer_running) == t) {
> + /* If bpf callback_fn is trying to bpf_timer_cancel()
> + * its own timer the hrtimer_cancel() will deadlock
> + * since it waits for callback_fn to finish
> + */
> + ret = -EDEADLK;
> + goto out;
> + }
> + /* Cancel the timer and wait for associated callback to finish
> + * if it was running.
> + */
> + if (hrtimer_cancel(&t->timer) == 1) {
Again, could we have a race here between the bpf program and hrtimer_cancel()?
> + /* If the timer was active then drop the prog refcnt,
> + * since callback will not be invoked.
> + */
> + bpf_prog_put(t->prog);
> + ret = 1;
> + }
> +out:
> + ____bpf_spin_unlock(&timer->lock);
> + return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_timer_cancel_proto = {
> + .func = bpf_timer_cancel,
> + .gpl_only = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_TIMER,
> +};
> +
> +/* This function is called by delete_element in htab and lru maps
> + * and by map_free for array, lru, htab maps.
> + */
> +void bpf_timer_cancel_and_free(void *val)
> +{
> + struct bpf_timer_kern *timer = val;
> + struct bpf_hrtimer *t;
> +
> + ____bpf_spin_lock(&timer->lock);
> + t = timer->timer;
> + if (!t)
> + goto out;
> + /* Cancel the timer and wait for callback to complete if it was
> + * running. Only individual delete_element in htab or lru maps can
> + * return 1 from hrtimer_cancel.
> + * The whole map is destroyed when its refcnt reaches zero.
> + * That happens after bpf prog refcnt reaches zero.
> + * bpf prog refcnt will not reach zero until all timers are executed.
> + * So when maps are destroyed hrtimer_cancel will surely return 0.
> + * In such case t->prog is a pointer to freed memory.
> + *
> + * When htab or lru is deleting individual element check that
> + * bpf_map_delete_elem() isn't trying to delete elem with running timer.
> + * In such case don't call hrtimer_cancel() (since it will deadlock)
> + * and don't call hrtimer_try_to_cancel() (since it will just return -1).
> + * Instead free the timer and set timer->timer = NULL.
> + * The subsequent bpf_timer_start/cancel() helpers won't be able to use it.
> + * In preallocated maps it's safe to do timer->timer = NULL.
> + * The memory could be reused for another element while current timer
> + * callback can still do bpf_timer_init() on it.
> + * In non-preallocated maps timer->timer = NULL will happen after
> + * callback completes, since prog execution is an RCU critical section.
> + */
> + if (this_cpu_read(hrtimer_running) != t &&
> + hrtimer_cancel(&t->timer) == 1)
> + bpf_prog_put(t->prog);
> + kfree(t);
> + timer->timer = NULL;
> +out:
> + ____bpf_spin_unlock(&timer->lock);
> +}
> +
> const struct bpf_func_proto bpf_get_current_task_proto __weak;
> const struct bpf_func_proto bpf_probe_read_user_proto __weak;
> const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
> @@ -1051,6 +1272,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
> return &bpf_per_cpu_ptr_proto;
> case BPF_FUNC_this_cpu_ptr:
> return &bpf_this_cpu_ptr_proto;
> + case BPF_FUNC_timer_init:
> + return &bpf_timer_init_proto;
> + case BPF_FUNC_timer_start:
> + return &bpf_timer_start_proto;
> + case BPF_FUNC_timer_cancel:
> + return &bpf_timer_cancel_proto;
> default:
> break;
> }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1de4b8c6ee42..44ec9760b562 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -4656,6 +4656,35 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
> return 0;
> }
>
> +static int process_timer_func(struct bpf_verifier_env *env, int regno,
> + struct bpf_call_arg_meta *meta)
> +{
> + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno];
> + bool is_const = tnum_is_const(reg->var_off);
> + struct bpf_map *map = reg->map_ptr;
> + u64 val = reg->var_off.value;
> +
> + if (!is_const) {
> + verbose(env,
> + "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
> + regno);
> + return -EINVAL;
> + }
> + if (!map->btf) {
> + verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
> + map->name);
> + return -EINVAL;
> + }
> + if (val) {
> + /* This restriction will be removed in the next patch */
> + verbose(env, "bpf_timer field can only be first in the map value element\n");
> + return -EINVAL;
> + }
> + WARN_ON(meta->map_ptr);
Could you explain when this could happen?
> + meta->map_ptr = map;
> + return 0;
> +}
> +
> static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
> {
> return type == ARG_PTR_TO_MEM ||
> @@ -4788,6 +4817,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
> static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
> static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
> static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
> +static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
>
[...]
Powered by blists - more mailing lists