lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <97f318a1-072d-80c2-7de7-6d0d71ca0b10@iogearbox.net>
Date: Fri, 22 Sep 2023 10:12:29 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: Jamal Hadi Salim <jhs@...atatu.com>
Cc: Victor Nogueira <victor@...atatu.com>, xiyou.wangcong@...il.com,
 jiri@...nulli.us, davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
 pabeni@...hat.com, paulb@...dia.com, netdev@...r.kernel.org,
 kernel@...atatu.com, martin.lau@...ux.dev, bpf@...r.kernel.org
Subject: Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return
 code

On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@...earbox.net> wrote:
>>
>> [ +Martin, bpf ]
>>
>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>>> Currently there is no way to distinguish between an error and a
>>> classification verdict. This patch adds the verdict field as a part of
>>> struct tcf_result. That way, tcf_classify can return a proper
>>> error number when it fails, and we keep the classification result
>>> information encapsulated in struct tcf_result.
>>>
>>> Also add values SKB_DROP_REASON_TC_EGRESS_ERROR and
>>> SKB_DROP_REASON_TC_INGRESS_ERROR to enum skb_drop_reason.
>>> With that we can distinguish between a drop from a processing error versus
>>> a drop from classification.
>>>
>>> Signed-off-by: Victor Nogueira <victor@...atatu.com>
>>> ---
>>>    include/net/dropreason-core.h |  6 +++++
>>>    include/net/sch_generic.h     |  7 ++++++
>>>    net/core/dev.c                | 42 ++++++++++++++++++++++++++---------
>>>    net/sched/cls_api.c           | 38 ++++++++++++++++++++-----------
>>>    net/sched/sch_cake.c          | 32 +++++++++++++-------------
>>>    net/sched/sch_drr.c           | 33 +++++++++++++--------------
>>>    net/sched/sch_ets.c           |  6 +++--
>>>    net/sched/sch_fq_codel.c      | 29 ++++++++++++------------
>>>    net/sched/sch_fq_pie.c        | 28 +++++++++++------------
>>>    net/sched/sch_hfsc.c          |  6 +++--
>>>    net/sched/sch_htb.c           |  6 +++--
>>>    net/sched/sch_multiq.c        |  6 +++--
>>>    net/sched/sch_prio.c          |  7 ++++--
>>>    net/sched/sch_qfq.c           | 34 +++++++++++++---------------
>>>    net/sched/sch_sfb.c           | 29 ++++++++++++------------
>>>    net/sched/sch_sfq.c           | 28 +++++++++++------------
>>>    16 files changed, 195 insertions(+), 142 deletions(-)
>>>
>>> diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
>>> index a587e83fc169..b1c069c8e7f2 100644
>>> --- a/include/net/dropreason-core.h
>>> +++ b/include/net/dropreason-core.h
>>> @@ -80,6 +80,8 @@
>>>        FN(IPV6_NDISC_BAD_OPTIONS)      \
>>>        FN(IPV6_NDISC_NS_OTHERHOST)     \
>>>        FN(QUEUE_PURGE)                 \
>>> +     FN(TC_EGRESS_ERROR)             \
>>> +     FN(TC_INGRESS_ERROR)            \
>>>        FNe(MAX)
>>>
>>>    /**
>>> @@ -345,6 +347,10 @@ enum skb_drop_reason {
>>>        SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
>>>        /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
>>>        SKB_DROP_REASON_QUEUE_PURGE,
>>> +     /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
>>> +     SKB_DROP_REASON_TC_EGRESS_ERROR,
>>> +     /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
>>> +     SKB_DROP_REASON_TC_INGRESS_ERROR,
>>>        /**
>>>         * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
>>>         * shouldn't be used as a real 'reason' - only for tracing code gen
>>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>>> index f232512505f8..9a3f71d2545e 100644
>>> --- a/include/net/sch_generic.h
>>> +++ b/include/net/sch_generic.h
>>> @@ -326,6 +326,7 @@ struct Qdisc_ops {
>>>
>>>
>>>    struct tcf_result {
>>> +     u32 verdict;
>>>        union {
>>>                struct {
>>>                        unsigned long   class;
>>> @@ -336,6 +337,12 @@ struct tcf_result {
>>>        };
>>>    };
>>>
>>> +static inline void tcf_result_set_verdict(struct tcf_result *res,
>>> +                                       const u32 verdict)
>>> +{
>>> +     res->verdict = verdict;
>>> +}
>>> +
>>>    struct tcf_chain;
>>>
>>>    struct tcf_proto_ops {
>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>> index ccff2b6ef958..1450f4741d9b 100644
>>> --- a/net/core/dev.c
>>> +++ b/net/core/dev.c
>>> @@ -3910,31 +3910,39 @@ EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
>>>    #endif /* CONFIG_NET_EGRESS */
>>>
>>>    #ifdef CONFIG_NET_XGRESS
>>> -static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
>>> +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
>>> +               struct tcf_result *res)
>>>    {
>>> -     int ret = TC_ACT_UNSPEC;
>>> +     int ret = 0;
>>>    #ifdef CONFIG_NET_CLS_ACT
>>>        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
>>> -     struct tcf_result res;
>>>
>>> -     if (!miniq)
>>> +     if (!miniq) {
>>> +             tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>>>                return ret;
>>> +     }
>>>
>>>        tc_skb_cb(skb)->mru = 0;
>>>        tc_skb_cb(skb)->post_ct = false;
>>>
>>>        mini_qdisc_bstats_cpu_update(miniq, skb);
>>> -     ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
>>> +     ret = tcf_classify(skb, miniq->block, miniq->filter_list, res, false);
>>> +     if (ret < 0) {
>>> +             mini_qdisc_qstats_cpu_drop(miniq);
>>> +             return ret;
>>> +     }
>>>        /* Only tcf related quirks below. */
>>> -     switch (ret) {
>>> +     switch (res->verdict) {
>>>        case TC_ACT_SHOT:
>>>                mini_qdisc_qstats_cpu_drop(miniq);
>>>                break;
>>>        case TC_ACT_OK:
>>>        case TC_ACT_RECLASSIFY:
>>> -             skb->tc_index = TC_H_MIN(res.classid);
>>> +             skb->tc_index = TC_H_MIN(res->classid);
>>>                break;
>>>        }
>>> +#else
>>> +     tcf_result_set_verdict(res, TC_ACT_UNSPEC);
>>>    #endif /* CONFIG_NET_CLS_ACT */
>>>        return ret;
>>>    }
>>> @@ -3977,6 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>>                   struct net_device *orig_dev, bool *another)
>>>    {
>>>        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
>>> +     struct tcf_result res = {0};
>>>        int sch_ret;
>>>
>>>        if (!entry)
>>> @@ -3994,9 +4003,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
>>>                if (sch_ret != TC_ACT_UNSPEC)
>>>                        goto ingress_verdict;
>>>        }
>>> -     sch_ret = tc_run(tcx_entry(entry), skb);
>>> +     sch_ret = tc_run(tcx_entry(entry), skb, &res);
>>> +     if (sch_ret < 0) {
>>> +             kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
>>> +             *ret = NET_RX_DROP;
>>> +             return NULL;
>>> +     }
>>>    ingress_verdict:
>>> -     switch (sch_ret) {
>>> +     switch (res.verdict) {
>>
>> This breaks tcx, please move all this logic into tc_run(). No changes to sch_handle_ingress()
>> or sch_handle_egress should be necessary, you can then just remap the return code to TC_ACT_SHOT
>> in such case.
> 
> I think it is valuable to have a good reason code like
> SKB_DROP_REASON_TC_XXX_ERROR to disambiguate between errors vs
> verdicts in the case of tc_run() variant.
> For tcx_run(), does this look ok (for consistency)?:
> 
> if (static_branch_unlikely(&tcx_needed_key)) {
>                  sch_ret = tcx_run(entry, skb, true);
>                  if (sch_ret != TC_ACT_UNSPEC) {
>                          res.verdict = sch_ret;
>                          goto ingress_verdict;
>                  }
> }

In the above case we don't have 'internal' errors which you want to trace, so I would
also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
every packet.

I was more thinking like something below could be a better choice. I presume your main
goal is to trace where these errors originated in the first place, so it might even be
useful to capture the actual return code as well.

Then you can use perf script, bpf and whatnot to gather further insights into what
happened while being less invasive and avoiding the need to extend struct tcf_result.

This would be quite similar to trace_xdp_exception() as well, and I think you can guarantee
that in fast path all errors are < TC_ACT_UNSPEC anyway.

diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..4089d195144d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3925,6 +3925,10 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)

  	mini_qdisc_bstats_cpu_update(miniq, skb);
  	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+	if (unlikely(ret < TC_ACT_UNSPEC)) {
+		trace_tc_exception(skb->dev, skb->tc_at_ingress, ret);
+		ret = TC_ACT_SHOT;
+	}
  	/* Only tcf related quirks below. */
  	switch (ret) {
  	case TC_ACT_SHOT:

Best,
Daniel

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ