[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <cb4db95b-89ff-02ef-f36f-7a8b0edc5863@iogearbox.net>
Date: Tue, 3 Oct 2023 11:00:30 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: Jamal Hadi Salim <jhs@...atatu.com>
Cc: Victor Nogueira <victor@...atatu.com>, xiyou.wangcong@...il.com,
jiri@...nulli.us, davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
pabeni@...hat.com, paulb@...dia.com, netdev@...r.kernel.org,
kernel@...atatu.com, martin.lau@...ux.dev, bpf@...r.kernel.org
Subject: Re: [PATCH net-next 1/1] net/sched: Disambiguate verdict from return
code
On 10/2/23 9:54 PM, Jamal Hadi Salim wrote:
> On Fri, Sep 29, 2023 at 11:48 AM Daniel Borkmann <daniel@...earbox.net> wrote:
>> On 9/26/23 1:01 AM, Jamal Hadi Salim wrote:
>>> On Fri, Sep 22, 2023 at 4:12 AM Daniel Borkmann <daniel@...earbox.net> wrote:
>>>> On 9/20/23 1:20 AM, Jamal Hadi Salim wrote:
>>>>> On Tue, Sep 19, 2023 at 6:15 PM Daniel Borkmann <daniel@...earbox.net> wrote:
>>>>>> On 9/19/23 4:59 PM, Victor Nogueira wrote:
>> [...]
>>>>
>>>> In the above case we don't have 'internal' errors which you want to trace, so I would
>>>> also love to avoid the cost of zeroing struct tcf_result res which should be 3x 8b for
>>>> every packet.
>>>
>>> We can move the zeroing inside tc_run() but we declare it in the same
>>> spot as we do right now. You will still need to set res.verdict as
>>> above.
>>> Would that work for you?
>>
>> What I'm not following is that with the below you can avoid the unnecessary
>> fast path cost (which is only for corner case which is almost never hit) and
>> get even better visibility. Are you saying it doesn't work?
>
> I am probably missing something:
> -1/UNSPEC is a legit errno. And the main motivation here for this
> patch is to disambiguate if it was -EPERM vs UNSPEC
> Maybe that is what you are calling a "corner case"?
Yes, but what is the use-case to ever return a -EPERM from the fast-path? This can
be audited for the code in the tree and therefore avoided so that you never run into
this problem.
> There are two options in my mind right now (since you are guaranteed
> in tcx_run you will never return anything below UNSPEC):
> 1) we just have the switch statement invocation inside an inline
> function and you can pass it sch_ret (for tcx case) and we'll pass it
> res.verdit for tc_run() case.
> 2) is something is we leave tcx_run alone and we have something along
> the lines of:
>
> --------------
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 1450f4741d9b..93613bce647c 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3985,7 +3985,7 @@ sch_handle_ingress(struct sk_buff *skb, struct
> packet_type **pt_prev, int *ret,
> struct net_device *orig_dev, bool *another)
> {
> struct bpf_mprog_entry *entry =
> rcu_dereference_bh(skb->dev->tcx_ingress);
> - struct tcf_result res = {0};
> + struct tcf_result res;
> int sch_ret;
>
> if (!entry)
> @@ -4003,14 +4003,16 @@ sch_handle_ingress(struct sk_buff *skb, struct
> packet_type **pt_prev, int *ret,
> if (sch_ret != TC_ACT_UNSPEC)
> goto ingress_verdict;
> }
> +
> + res.verdict = 0;
> sch_ret = tc_run(tcx_entry(entry), skb, &res);
> if (sch_ret < 0) {
> kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS_ERROR);
> *ret = NET_RX_DROP;
> return NULL;
> }
> + sch_ret = res.verdict;
> ingress_verdict:
> - switch (res.verdict) {
> + switch (sch_ret) {
> case TC_ACT_REDIRECT:
> /* skb_mac_header check was done by BPF, so we can
> safely
> * push the L2 header back before redirecting to another
> -----------
>
> on the drop reason - our thinking is to support drop_watch alongside
> tracepoint given kfree_skb_reason exists already; if i am not mistaken
> what you suggested would require us to create a new tracepoint?
So if the only thing you really care about is the different drop reason for
kfree_skb_reason, then I still don't follow why you need to drag this into
struct tcf_result. This can be done in a much simpler and more efficient way
like the following:
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index a587e83fc169..b1c069c8e7f2 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -80,6 +80,8 @@
FN(IPV6_NDISC_BAD_OPTIONS) \
FN(IPV6_NDISC_NS_OTHERHOST) \
FN(QUEUE_PURGE) \
+ FN(TC_EGRESS_ERROR) \
+ FN(TC_INGRESS_ERROR) \
FNe(MAX)
/**
@@ -345,6 +347,10 @@ enum skb_drop_reason {
SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
/** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */
SKB_DROP_REASON_QUEUE_PURGE,
+ /** @SKB_DROP_REASON_TC_EGRESS_ERROR: dropped in TC egress HOOK due to error */
+ SKB_DROP_REASON_TC_EGRESS_ERROR,
+ /** @SKB_DROP_REASON_TC_INGRESS_ERROR: dropped in TC ingress HOOK due to error */
+ SKB_DROP_REASON_TC_INGRESS_ERROR,
/**
* @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
* shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index f308e8268651..cd2444dd3745 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -10,6 +10,7 @@
/* TC action not accessible from user space */
#define TC_ACT_CONSUMED (TC_ACT_VALUE_MAX + 1)
+#define TC_ACT_ABORT (TC_ACT_VALUE_MAX + 2)
/* Basic packet classifier frontend definitions. */
diff --git a/net/core/dev.c b/net/core/dev.c
index 85df22f05c38..3abb4d71c170 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4011,7 +4011,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
*ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ case TC_ACT_ABORT:
+ kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
+ SKB_DROP_REASON_TC_INGRESS :
+ SKB_DROP_REASON_TC_INGRESS_ERROR);
*ret = NET_RX_DROP;
return NULL;
/* used by tc_run */
@@ -4054,7 +4057,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
*ret = NET_XMIT_SUCCESS;
return NULL;
case TC_ACT_SHOT:
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ case TC_ACT_ABORT:
+ kfree_skb_reason(skb, likely(sch_ret == TC_ACT_SHOT) ?
+ SKB_DROP_REASON_TC_EGRESS :
+ SKB_DROP_REASON_TC_EGRESS_ERROR);
*ret = NET_XMIT_DROP;
return NULL;
/* used by tc_run */
Then you just return the internal TC_ACT_ABORT code for internal 'exceptions',
and you'll get the same result to make it observable for dropwatch.
Thanks,
Daniel
Powered by blists - more mailing lists