Message-ID: <CAMB2axM1TVw05jZsFe7TsKKRN8jw=YOwu-+rA9bOAkOiCPyFqQ@mail.gmail.com>
Date: Tue, 23 Jan 2024 21:22:07 -0800
From: Amery Hung <ameryhung@...il.com>
To: Martin KaFai Lau <martin.lau@...ux.dev>
Cc: bpf@...r.kernel.org, yangpeihao@...u.edu.cn, toke@...hat.com,
jhs@...atatu.com, jiri@...nulli.us, sdf@...gle.com, xiyou.wangcong@...il.com,
yepeilin.cs@...il.com, netdev@...r.kernel.org
Subject: Re: [RFC PATCH v7 1/8] net_sched: Introduce eBPF based Qdisc
On Tue, Jan 23, 2024 at 3:51 PM Martin KaFai Lau <martin.lau@...ux.dev> wrote:
>
> On 1/17/24 1:56 PM, Amery Hung wrote:
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 0bb92414c036..df280bbb7c0d 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -997,6 +997,7 @@ enum bpf_prog_type {
> > BPF_PROG_TYPE_SK_LOOKUP,
> > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> > BPF_PROG_TYPE_NETFILTER,
> > + BPF_PROG_TYPE_QDISC,
> > };
> >
> > enum bpf_attach_type {
> > @@ -1056,6 +1057,8 @@ enum bpf_attach_type {
> > BPF_CGROUP_UNIX_GETSOCKNAME,
> > BPF_NETKIT_PRIMARY,
> > BPF_NETKIT_PEER,
> > + BPF_QDISC_ENQUEUE,
> > + BPF_QDISC_DEQUEUE,
> > __MAX_BPF_ATTACH_TYPE
> > };
> >
> > @@ -7357,4 +7360,22 @@ struct bpf_iter_num {
> > __u64 __opaque[1];
> > } __attribute__((aligned(8)));
> >
> > +struct bpf_qdisc_ctx {
> > + __bpf_md_ptr(struct sk_buff *, skb);
> > + __u32 classid;
> > + __u64 expire;
> > + __u64 delta_ns;
> > +};
> > +
> > +enum {
> > + SCH_BPF_QUEUED,
> > + SCH_BPF_DEQUEUED = SCH_BPF_QUEUED,
> > + SCH_BPF_DROP,
> > + SCH_BPF_CN,
> > + SCH_BPF_THROTTLE,
> > + SCH_BPF_PASS,
> > + SCH_BPF_BYPASS,
> > + SCH_BPF_STOLEN,
> > +};
> > +
> > #endif /* _UAPI__LINUX_BPF_H__ */
>
> [ ... ]
>
> > +static bool tc_qdisc_is_valid_access(int off, int size,
> > + enum bpf_access_type type,
> > + const struct bpf_prog *prog,
> > + struct bpf_insn_access_aux *info)
> > +{
> > + struct btf *btf;
> > +
> > + if (off < 0 || off >= sizeof(struct bpf_qdisc_ctx))
> > + return false;
> > +
> > + switch (off) {
> > + case offsetof(struct bpf_qdisc_ctx, skb):
> > + if (type == BPF_WRITE ||
> > + size != sizeof_field(struct bpf_qdisc_ctx, skb))
> > + return false;
> > +
> > + if (prog->expected_attach_type != BPF_QDISC_ENQUEUE)
> > + return false;
> > +
> > + btf = bpf_get_btf_vmlinux();
> > + if (IS_ERR_OR_NULL(btf))
> > + return false;
> > +
> > + info->btf = btf;
> > + info->btf_id = tc_qdisc_ctx_access_btf_ids[0];
> > + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
> > + return true;
> > + case bpf_ctx_range(struct bpf_qdisc_ctx, classid):
> > + return size == sizeof_field(struct bpf_qdisc_ctx, classid);
> > + case bpf_ctx_range(struct bpf_qdisc_ctx, expire):
> > + return size == sizeof_field(struct bpf_qdisc_ctx, expire);
> > + case bpf_ctx_range(struct bpf_qdisc_ctx, delta_ns):
> > + return size == sizeof_field(struct bpf_qdisc_ctx, delta_ns);
> > + default:
> > + return false;
> > + }
> > +
> > + return false;
> > +}
> > +
>
> [ ... ]
>
> > +static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> > + struct sk_buff **to_free)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + unsigned int len = qdisc_pkt_len(skb);
> > + struct bpf_qdisc_ctx ctx = {};
> > + int res = NET_XMIT_SUCCESS;
> > + struct sch_bpf_class *cl;
> > + struct bpf_prog *enqueue;
> > +
> > + enqueue = rcu_dereference(q->enqueue_prog.prog);
> > + if (!enqueue)
> > + return NET_XMIT_DROP;
> > +
> > + ctx.skb = skb;
> > + ctx.classid = sch->handle;
> > + res = bpf_prog_run(enqueue, &ctx);
> > + switch (res) {
> > + case SCH_BPF_THROTTLE:
> > + qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > + qdisc_qstats_overlimit(sch);
> > + fallthrough;
> > + case SCH_BPF_QUEUED:
> > + qdisc_qstats_backlog_inc(sch, skb);
> > + return NET_XMIT_SUCCESS;
> > + case SCH_BPF_BYPASS:
> > + qdisc_qstats_drop(sch);
> > + __qdisc_drop(skb, to_free);
> > + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> > + case SCH_BPF_STOLEN:
> > + __qdisc_drop(skb, to_free);
> > + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> > + case SCH_BPF_CN:
> > + return NET_XMIT_CN;
> > + case SCH_BPF_PASS:
> > + break;
> > + default:
> > + return qdisc_drop(skb, sch, to_free);
> > + }
> > +
> > + cl = sch_bpf_find(sch, ctx.classid);
> > + if (!cl || !cl->qdisc)
> > + return qdisc_drop(skb, sch, to_free);
> > +
> > + res = qdisc_enqueue(skb, cl->qdisc, to_free);
> > + if (res != NET_XMIT_SUCCESS) {
> > + if (net_xmit_drop_count(res)) {
> > + qdisc_qstats_drop(sch);
> > + cl->drops++;
> > + }
> > + return res;
> > + }
> > +
> > + sch->qstats.backlog += len;
> > + sch->q.qlen++;
> > + return res;
> > +}
> > +
> > +DEFINE_PER_CPU(struct sk_buff*, bpf_skb_dequeue);
> > +
> > +static struct sk_buff *sch_bpf_dequeue(struct Qdisc *sch)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + struct bpf_qdisc_ctx ctx = {};
> > + struct sk_buff *skb = NULL;
> > + struct bpf_prog *dequeue;
> > + struct sch_bpf_class *cl;
> > + int res;
> > +
> > + dequeue = rcu_dereference(q->dequeue_prog.prog);
> > + if (!dequeue)
> > + return NULL;
> > +
> > + __this_cpu_write(bpf_skb_dequeue, NULL);
> > + ctx.classid = sch->handle;
> > + res = bpf_prog_run(dequeue, &ctx);
> > + switch (res) {
> > + case SCH_BPF_DEQUEUED:
> > + skb = __this_cpu_read(bpf_skb_dequeue);
> > + qdisc_bstats_update(sch, skb);
> > + qdisc_qstats_backlog_dec(sch, skb);
> > + break;
> > + case SCH_BPF_THROTTLE:
> > + qdisc_watchdog_schedule_range_ns(&q->watchdog, ctx.expire, ctx.delta_ns);
> > + qdisc_qstats_overlimit(sch);
> > + cl = sch_bpf_find(sch, ctx.classid);
> > + if (cl)
> > + cl->overlimits++;
> > + return NULL;
> > + case SCH_BPF_PASS:
> > + cl = sch_bpf_find(sch, ctx.classid);
> > + if (!cl || !cl->qdisc)
> > + return NULL;
> > + skb = qdisc_dequeue_peeked(cl->qdisc);
> > + if (skb) {
> > + bstats_update(&cl->bstats, skb);
> > + qdisc_bstats_update(sch, skb);
> > + qdisc_qstats_backlog_dec(sch, skb);
> > + sch->q.qlen--;
> > + }
> > + break;
> > + }
> > +
> > + return skb;
> > +}
>
> [ ... ]
>
> > +static int sch_bpf_init(struct Qdisc *sch, struct nlattr *opt,
> > + struct netlink_ext_ack *extack)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + int err;
> > +
> > + qdisc_watchdog_init(&q->watchdog, sch);
> > + if (opt) {
> > + err = sch_bpf_change(sch, opt, extack);
> > + if (err)
> > + return err;
> > + }
> > +
> > + err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> > + if (err)
> > + return err;
> > +
> > + return qdisc_class_hash_init(&q->clhash);
> > +}
> > +
> > +static void sch_bpf_reset(struct Qdisc *sch)
> > +{
> > + struct bpf_sched_data *q = qdisc_priv(sch);
> > + struct sch_bpf_class *cl;
> > + unsigned int i;
> > +
> > + for (i = 0; i < q->clhash.hashsize; i++) {
> > + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
> > + if (cl->qdisc)
> > + qdisc_reset(cl->qdisc);
> > + }
> > + }
> > +
> > + qdisc_watchdog_cancel(&q->watchdog);
> > +}
> > +
>
> [ ... ]
>
> > +static const struct Qdisc_class_ops sch_bpf_class_ops = {
> > + .graft = sch_bpf_graft,
> > + .leaf = sch_bpf_leaf,
> > + .find = sch_bpf_search,
> > + .change = sch_bpf_change_class,
> > + .delete = sch_bpf_delete,
> > + .tcf_block = sch_bpf_tcf_block,
> > + .bind_tcf = sch_bpf_bind,
> > + .unbind_tcf = sch_bpf_unbind,
> > + .dump = sch_bpf_dump_class,
> > + .dump_stats = sch_bpf_dump_class_stats,
> > + .walk = sch_bpf_walk,
> > +};
> > +
> > +static struct Qdisc_ops sch_bpf_qdisc_ops __read_mostly = {
> > + .cl_ops = &sch_bpf_class_ops,
> > + .id = "bpf",
> > + .priv_size = sizeof(struct bpf_sched_data),
> > + .enqueue = sch_bpf_enqueue,
> > + .dequeue = sch_bpf_dequeue,
>
> I looked at the high level of the patchset. The major ops that it wants to make
> programmable in bpf are ".enqueue" and ".dequeue" (plus ".init" and ".reset" in
> patch 4 and patch 5).
>
> This patch adds a new prog type BPF_PROG_TYPE_QDISC, four attach types (each for
> ".enqueue", ".dequeue", ".init", and ".reset"), and a new "bpf_qdisc_ctx" in the
> uapi. It is no longer an acceptable way to add new bpf extensions.
>
> Can the ".enqueue", ".dequeue", ".init", and ".reset" be completely implemented
> in bpf (with the help of new kfuncs if needed)? Then a struct_ops for Qdisc_ops
> can be created. The bpf Qdisc_ops can be loaded through the existing struct_ops api.
>
Partially. If using struct_ops, I think we'll need another structure
like the following in the bpf qdisc, implemented in bpf via
struct_ops:

struct bpf_qdisc_ops {
	int (*enqueue)(struct sk_buff *skb);
	void (*dequeue)(void);
	void (*init)(void);
	void (*reset)(void);
};

Then, Qdisc_ops will wrap around these to handle things that cannot be
implemented in bpf (e.g., sch_tree_lock, returning an skb ptr).
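As a very rough sketch of that wrapping (hypothetical plumbing; assume
q->bpf_ops points at the loaded struct_ops), the enqueue wrapper could
look like:

static int sch_bpf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free)
{
	struct bpf_sched_data *q = qdisc_priv(sch);
	int ret;

	/* the bpf-implemented op decides what to do with the skb */
	ret = q->bpf_ops->enqueue(skb);
	if (ret != NET_XMIT_SUCCESS)
		return qdisc_drop(skb, sch, to_free);

	/* bookkeeping the bpf side cannot do on its own */
	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}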
> If other ops (like ".dump", ".dump_stats"...) do not have a good use case to
> be programmable in bpf, they can stay with the kernel implementation for now,
> and userspace would only be allowed to load a bpf Qdisc_ops with
> .enqueue/.dequeue/.init/.reset implemented.
>
> You mentioned in the cover letter that:
> "Current struct_ops attachment model does not seem to support replacing only
> functions of a specific instance of a module, but I might be wrong."
>
> I assumed you meant allowing bpf to replace only "some" ops of the Qdisc_ops? Yes,
> it can be done through the struct_ops's ".init_member". Take a look at
> bpf_tcp_ca_init_member. The kernel can assign the kernel implementation for
> ".dump" (for example) when loading the bpf Qdisc_ops.
>
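(For reference, a rough sketch of what such an ->init_member could look
like, loosely modeled on bpf_tcp_ca_init_member; the function itself is
hypothetical, while sch_bpf_dump is the kernel implementation from this
patch:

static int bpf_qdisc_init_member(const struct btf_type *t,
				 const struct btf_member *member,
				 void *kdata, const void *udata)
{
	struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata;
	u32 moff = __btf_member_bit_offset(t, member) / 8;

	switch (moff) {
	case offsetof(struct Qdisc_ops, dump):
		/* keep the kernel implementation for .dump */
		ops->dump = sch_bpf_dump;
		return 1;	/* member fully initialized here */
	}

	return 0;	/* fall back to the bpf-provided member */
}
)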
I have no problem with partially replacing a struct, which, as you
mentioned, has been demonstrated by congestion control and sched_ext.
What I am not sure about is the ability to create multiple bpf qdiscs,
where each has a different ".enqueue", ".dequeue", and so on. I like
the struct_ops approach and would love to try it if struct_ops
supports this.
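For example, something like the following is what I would want to load
(hypothetical; it assumes struct_ops allows registering more than one
instance of the same ops type, which is exactly the part I am unsure
about):

/* qdisc A: simple fifo-style callbacks */
SEC(".struct_ops")
struct bpf_qdisc_ops fifo_ops = {
	.enqueue = (void *)fifo_enqueue,
	.dequeue = (void *)fifo_dequeue,
};

/* qdisc B: different callbacks, loaded at the same time */
SEC(".struct_ops")
struct bpf_qdisc_ops prio_ops = {
	.enqueue = (void *)prio_enqueue,
	.dequeue = (void *)prio_dequeue,
};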
Thanks,
Amery
> > + .peek = qdisc_peek_dequeued,
> > + .init = sch_bpf_init,
> > + .reset = sch_bpf_reset,
> > + .destroy = sch_bpf_destroy,
> > + .change = sch_bpf_change,
> > + .dump = sch_bpf_dump,
> > + .dump_stats = sch_bpf_dump_stats,
> > + .owner = THIS_MODULE,
> > +};
> > +
> > +static int __init sch_bpf_mod_init(void)
> > +{
> > + return register_qdisc(&sch_bpf_qdisc_ops);
> > +}
> > +
> > +static void __exit sch_bpf_mod_exit(void)
> > +{
> > + unregister_qdisc(&sch_bpf_qdisc_ops);
> > +}
>