Message-ID: <CAMB2axM1TVw05jZsFe7TsKKRN8jw=YOwu-+rA9bOAkOiCPyFqQ@mail.gmail.com>
Date: Tue, 23 Jan 2024 21:22:07 -0800
From: Amery Hung <ameryhung@...il.com>
To: Martin KaFai Lau <martin.lau@...ux.dev>
Cc: bpf@...r.kernel.org, yangpeihao@...u.edu.cn, toke@...hat.com,
jhs@...atatu.com, jiri@...nulli.us, sdf@...gle.com, xiyou.wangcong@...il.com,
yepeilin.cs@...il.com, netdev@...r.kernel.org
Subject: Re: [RFC PATCH v7 1/8] net_sched: Introduce eBPF based Qdisc
On Tue, Jan 23, 2024 at 3:51 PM Martin KaFai Lau <martin.lau@...ux.dev> wrote:
>
> On 1/17/24 1:56 PM, Amery Hung wrote:
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 0bb92414c036..df280bbb7c0d 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -997,6 +997,7 @@ enum bpf_prog_type {
> > BPF_PROG_TYPE_SK_LOOKUP,
> > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > BPF_PROG_TYPE_NETFILTER,
> > + BPF_PROG_TYPE_QDISC,
> > };
> >
> > enum bpf_attach_type {
> > @@ -1056,6 +1057,8 @@ enum bpf_attach_type {
> > BPF_CGROUP_UNIX_GETSOCKNAME,
> > BPF_NETKIT_PRIMARY,
> > BPF_NETKIT_PEER,
> > + BPF_QDISC_ENQUEUE,
> > + BPF_QDISC_DEQUEUE,
> > __MAX_BPF_ATTACH_TYPE
> > };
> >
> > @@ -7357,4 +7360,22 @@ struct bpf_iter_num {
> > __u64 __opaque[1];
> > } __attribute__((aligned(8)));
> >
> > +struct bpf_qdisc_ctx {
> > + __bpf_md_ptr(struct sk_buff *, skb);
> > + __u32 classid;
> > + __u64 expire;
> > + __u64 delta_ns;
> > +};
> > +
> > +enum {
> > + SCH_BPF_QUEUED,
> > + SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
> > + SCH_BPF_DROP,
> > + SCH_BPF_CN,
> > + SCH_BPF_THROTTLE,
> > + SCH_BPF_PASS,
> > + SCH_BPF_BYPASS,
> > + SCH_BPF_STOLEN,
> > +};
> > +
> > #endif /* _UAPI__LINUX_BPF_H__ */
>
> [ ... ]
>
> > +static bool tc_qdisc_is_valid_access(int off, int size,
> > + enum bpf_access_type type,
> > + const struct bpf_prog *prog,
> > + struct bpf_insn_access_aux *info)
> > +{
> > + struct btf *btf;
> > +
> > + if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
> > + return false;
> > +
> > + switch (off) {
> > + case offsetof(struct bpf_qdisc_ctx, skb):
> > + if (type == BPF_WRITE ||
> > + size != sizeof_field(struct bpf_qdisc_ctx, skb))
> > + return false;
> > +
> > + if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
> > + return false;
> > +
> > + btf = bpf_get_btf_vmlinux();
> > + if (IS_ERR_OR_NULL(btf))
> > + return false;
> > +
> > + info->btf = btf;
> > + info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
> > + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
> > + return true;
> > + case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
> > + return size == sizeof_field(struct bpf_qdisc_ctx, classid);
> > + case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
> > + return size == sizeof_field(struct bpf_qdisc_ctx, expire);
> > + case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
> > + return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
> > + default:
> > + return false;
> > + }
> > +
> > + return false;
> > +}
> > +
>
> [ ... ]
>
> > +static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > + struct sk_buff **to_free)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + unsigned int len = qdisc_pkt_len(skb);
> > + struct bpf_qdisc_ctx ctx = {};
> > + int res = NET_XMIT_SUCCESS;
> > + struct sch_bpf_class *cl;
> > + struct bpf_prog *enqueue;
> > +
> > + enqueue = rcu_dereference(q->enqueue_prog.prog);
> > + if (!enqueue)
> > + return NET_XMIT_DROP;
> > +
> > + ctx.skb = skb;
> > + ctx.classid = sch->handle;
> > + res = bpf_prog_run(enqueue, &ctx);
> > + switch (res) {
> > + case SCH_BPF_THROTTLE:
> > + qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > + qdisc_qstats_overlimit(sch);
> > + fallthrough;
> > + case SCH_BPF_QUEUED:
> > + qdisc_qstats_backlog_inc(sch, skb);
> > + return NET_XMIT_SUCCESS;
> > + case SCH_BPF_BYPASS:
> > + qdisc_qstats_drop(sch);
> > + __qdisc_drop(skb, to_free);
> > + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> > + case SCH_BPF_STOLEN:
> > + __qdisc_drop(skb, to_free);
> > + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> > + case SCH_BPF_CN:
> > + return NET_XMIT_CN;
> > + case SCH_BPF_PASS:
> > + break;
> > + default:
> > + return qdisc_drop(skb, sch, to_free);
> > + }
> > +
> > + cl = sch_bpf_find(sch, ctx.classid);
> > + if (!cl || !cl->qdisc)
> > + return qdisc_drop(skb, sch, to_free);
> > +
> > + res = qdisc_enqueue(skb, cl->qdisc, to_free);
> > + if (res != NET_XMIT_SUCCESS) {
> > + if (net_xmit_drop_count(res)) {
> > + qdisc_qstats_drop(sch);
> > + cl->drops++;
> > + }
> > + return res;
> > + }
> > +
> > + sch->qstats.backlog += len;
> > + sch->q.qlen++;
> > + return res;
> > +}
> > +
> > +DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
> > +
> > +static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + struct bpf_qdisc_ctx ctx = {};
> > + struct sk_buff *skb = NULL;
> > + struct bpf_prog *dequeue;
> > + struct sch_bpf_class *cl;
> > + int res;
> > +
> > + dequeue = rcu_dereference(q->dequeue_prog.prog);
> > + if (!dequeue)
> > + return NULL;
> > +
> > + __this_cpu_write(bpf_skb_dequeue, NULL);
> > + ctx.classid = sch->handle;
> > + res = bpf_prog_run(dequeue, &ctx);
> > + switch (res) {
> > + case SCH_BPF_DEQUEUED:
> > + skb = __this_cpu_read(bpf_skb_dequeue);
> > + qdisc_bstats_update(sch, skb);
> > + qdisc_qstats_backlog_dec(sch, skb);
> > + break;
> > + case SCH_BPF_THROTTLE:
> > + qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > + qdisc_qstats_overlimit(sch);
> > + cl = sch_bpf_find(sch, ctx.classid);
> > + if (cl)
> > + cl->overlimits++;
> > + return NULL;
> > + case SCH_BPF_PASS:
> > + cl = sch_bpf_find(sch, ctx.classid);
> > + if (!cl || !cl->qdisc)
> > + return NULL;
> > + skb = qdisc_dequeue_peeked(cl->qdisc);
> > + if (skb) {
> > + bstats_update(&cl->bstats, skb);
> > + qdisc_bstats_update(sch, skb);
> > + qdisc_qstats_backlog_dec(sch, skb);
> > + sch->q.qlen--;
> > + }
> > + break;
> > + }
> > +
> > + return skb;
> > +}
>
> [ ... ]
>
> > +static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
> > + struct netlink_ext_ack *extack)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + int err;
> > +
> > + qdisc_watchdog_init(&q->watchdog, sch);
> > + if (opt) {
> > + err = sch_bpf_change(sch, opt, extack);
> > + if (err)
> > + return err;
> > + }
> > +
> > + err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> > + if (err)
> > + return err;
> > +
> > + return qdisc_class_hash_init(&q->clhash);
> > +}
> > +
> > +static void sch_bpf_reset(struct Qdisc *sch)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + struct sch_bpf_class *cl;
> > + unsigned int i;
> > +
> > + for (i = 0; i < q->clhash.hashsize; i++) {
> > + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
> > + if (cl->qdisc)
> > + qdisc_reset(cl->qdisc);
> > + }
> > + }
> > +
> > + qdisc_watchdog_cancel(&q->watchdog);
> > +}
> > +
>
> [ ... ]
>
> > +static const struct Qdisc_class_ops sch_bpf_class_ops = {
> > + .graft = sch_bpf_graft,
> > + .leaf = sch_bpf_leaf,
> > + .find = sch_bpf_search,
> > + .change = sch_bpf_change_class,
> > + .delete = sch_bpf_delete,
> > + .tcf_block = sch_bpf_tcf_block,
> > + .bind_tcf = sch_bpf_bind,
> > + .unbind_tcf = sch_bpf_unbind,
> > + .dump = sch_bpf_dump_class,
> > + .dump_stats = sch_bpf_dump_class_stats,
> > + .walk = sch_bpf_walk,
> > +};
> > +
> > +static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
> > + .cl_ops = &sch_bpf_class_ops,
> > + .id = "bpf",
> > + .priv_size = sizeof(struct bpf_sched_data),
> > + .enqueue = sch_bpf_enqueue,
> > + .dequeue = sch_bpf_dequeue,
>
> I looked at the high level of the patchset. The major ops that it wants to make
> programmable in bpf are ".enqueue" and ".dequeue" (plus ".init" and ".reset" in
> patch 4 and patch 5).
>
> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
> uapi. It is no longer an acceptable way to add new bpf extensions.
>
> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
>
Partially. If using struct_ops, I think we'll need another structure
like the following in the bpf qdisc, implemented in bpf via
struct_ops:

struct bpf_qdisc_ops {
	int (*enqueue)(struct sk_buff *skb);
	void (*dequeue)(void);
	void (*init)(void);
	void (*reset)(void);
};

Then, Qdisc_ops will wrap around these to handle things that cannot be
implemented in bpf (e.g., sch_tree_lock, returning an skb ptr).
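As a very rough sketch of that wrapping (hypothetical plumbing; assume
q->bpf_ops points at the loaded struct_ops), the enqueue wrapper could
look like:

static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free)
{
	struct bpf_sched_data *q = qdisc_priv(sch);
	int ret;

	/* the bpf-implemented op decides what to do with the skb */
	ret = q->bpf_ops->enqueue(skb);
	if (ret != NET_XMIT_SUCCESS)
		return qdisc_drop(skb, sch, to_free);

	/* bookkeeping the bpf side cannot do on its own */
	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}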
> If other ops (like ".dump", ".dump_stats"...) do not have a good use case to
> be programmable in bpf, they can stay with the kernel implementation for now,
> and userspace would only be allowed to load a bpf Qdisc_ops with
> .enqueue/.dequeue/.init/.reset implemented.
>
> You mentioned in the cover letter that:
> "Current struct_ops attachment model does not seem to support replacing only
> functions of a specific instance of a module, but I might be wrong."
>
> I assumed you meant allowing bpf to replace only "some" ops of the Qdisc_ops? Yes,
> it can be done through the struct_ops's ".init_member". Take a look at
> bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for
> ".dump" (for example) when loading the bpf Qdisc_ops.
>
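(For reference, a rough sketch of what such an ->init_member could look
like, loosely modeled on bpf_tcp_ca_init_member; the function itself is
hypothetical, while sch_bpf_dump is the kernel implementation from this
patch:

static int bpf_qdisc_init_member(const struct btf_type *t,
				 const struct btf_member *member,
				 void *kdata, const void *udata)
{
	struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata;
	u32 moff = __btf_member_bit_offset(t, member) / 8;

	switch (moff) {
	case offsetof(struct Qdisc_ops, dump):
		/* keep the kernel implementation for .dump */
		ops->dump = sch_bpf_dump;
		return 1;	/* member fully initialized here */
	}

	return 0;	/* fall back to the bpf-provided member */
}
)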
I have no problem with partially replacing a struct, which, as you
mentioned, has been demonstrated by congestion control and sched_ext.
What I am not sure about is the ability to create multiple bpf qdiscs,
where each has a different ".enqueue", ".dequeue", and so on. I like
the struct_ops approach and would love to try it if struct_ops
supports this.
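For example, something like the following is what I would want to load
(hypothetical; it assumes struct_ops allows registering more than one
instance of the same ops type, which is exactly the part I am unsure
about):

/* qdisc A: simple fifo-style callbacks */
SEC(".struct_ops")
struct bpf_qdisc_ops fifo_ops = {
	.enqueue = (void *)fifo_enqueue,
	.dequeue = (void *)fifo_dequeue,
};

/* qdisc B: different callbacks, loaded at the same time */
SEC(".struct_ops")
struct bpf_qdisc_ops prio_ops = {
	.enqueue = (void *)prio_enqueue,
	.dequeue = (void *)prio_dequeue,
};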
Thanks,
Amery
> > + .peek = qdisc_peek_dequeued,
> > + .init = sch_bpf_init,
> > + .reset = sch_bpf_reset,
> > + .destroy = sch_bpf_destroy,
> > + .change = sch_bpf_change,
> > + .dump = sch_bpf_dump,
> > + .dump_stats = sch_bpf_dump_stats,
> > + .owner = THIS_MODULE,
> > +};
> > +
> > +static int __init sch_bpf_mod_init(void)
> > +{
> > + return register_qdisc(&sch_bpf_qdisc_ops);
> > +}
> > +
> > +static void __exit sch_bpf_mod_exit(void)
> > +{
> > + unregister_qdisc(&sch_bpf_qdisc_ops);
> > +}
>