[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2b4f1d6045d8885cf70a113f194795cf3e1ef453.1643981839.git.gnault@redhat.com>
Date: Fri, 4 Feb 2022 14:58:11 +0100
From: Guillaume Nault <gnault@...hat.com>
To: David Miller <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>
Cc: netdev@...r.kernel.org,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
David Ahern <dsahern@...nel.org>,
Toke Høiland-Jørgensen <toke@...hat.com>,
Shuah Khan <shuah@...nel.org>, linux-kselftest@...r.kernel.org,
Russell Strong <russell@...ong.id.au>,
Dave Taht <dave.taht@...il.com>
Subject: [PATCH net-next 1/4] ipv6: Define dscp_t and stop taking ECN bits
into account in fib6-rules
Define a dscp_t type and its appropriate helpers that ensure ECN bits
are not taken into account when handling DSCP.
Use this new type to replace the tclass field of struct fib6_rule, so
that fib6-rules don't get influenced by ECN bits anymore.
Before this patch, fib6-rules didn't make any distinction between the
DSCP and ECN bits. Therefore, rules specifying a DSCP (tos or dsfield
options in iproute2) stopped working as soon as a packet had at least one
of its ECN bits set (as a workaround, one could create four rules for
each DSCP value to match, one for each possible ECN value).
After this patch, fib6-rules only compare the DSCP bits. ECN doesn't
influence the result anymore. Also, the dsfield (tos) value of a fib6-rule
must now have its ECN bits cleared, or the rule will be rejected.
Signed-off-by: Guillaume Nault <gnault@...hat.com>
---
include/net/inet_dscp.h | 57 +++++++++++++++++++
include/net/ipv6.h | 6 ++
net/ipv6/fib6_rules.c | 19 +++++--
tools/testing/selftests/net/fib_rule_tests.sh | 30 +++++++++-
4 files changed, 105 insertions(+), 7 deletions(-)
create mode 100644 include/net/inet_dscp.h
diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h
new file mode 100644
index 000000000000..72f250dffada
--- /dev/null
+++ b/include/net/inet_dscp.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * inet_dscp.h: helpers for handling differentiated services codepoints (DSCP)
+ *
+ * DSCP is defined in RFC 2474:
+ *
+ * 0 1 2 3 4 5 6 7
+ * +---+---+---+---+---+---+---+---+
+ * | DSCP | CU |
+ * +---+---+---+---+---+---+---+---+
+ *
+ * DSCP: differentiated services codepoint
+ * CU: currently unused
+ *
+ * The whole DSCP + CU bits form the DS field.
+ * The DS field is also commonly called TOS or Traffic Class (for IPv6).
+ *
+ * Note: the CU bits are now used for Explicit Congestion Notification
+ * (RFC 3168).
+ */
+
+#ifndef _INET_DSCP_H
+#define _INET_DSCP_H
+
+#include <linux/types.h>
+
+/* Special type for storing DSCP values.
+ *
+ * A dscp_t variable stores a DS field with the CU (ECN) bits cleared.
+ * Using dscp_t allows strict separation of DSCP and ECN bits, thus avoiding
+ * bugs where ECN bits are erroneously taken into account during FIB lookups
+ * or policy routing.
+ *
+ * Note: to get the real DSCP value contained in a dscp_t variable one would
+ * have to do a bit shift after calling inet_dscp_to_dsfield(). We could have
+ * a helper for that, but there are currently no users.
+ */
+typedef u8 __bitwise dscp_t;
+
+#define INET_DSCP_MASK 0xfc
+
+static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield)
+{
+ return (__force dscp_t)(dsfield & INET_DSCP_MASK);
+}
+
+static inline __u8 inet_dscp_to_dsfield(dscp_t dscp)
+{
+ return (__force __u8)dscp;
+}
+
+static inline bool inet_validate_dscp(__u8 val)
+{
+ return !(val & ~INET_DSCP_MASK);
+}
+
+#endif /* _INET_DSCP_H */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 082f30256f59..3d898eb6df9c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -18,6 +18,7 @@
#include <net/ndisc.h>
#include <net/flow.h>
#include <net/flow_dissector.h>
+#include <net/inet_dscp.h>
#include <net/snmp.h>
#include <net/netns/hash.h>
@@ -975,6 +976,11 @@ static inline u8 ip6_tclass(__be32 flowinfo)
return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT;
}
+static inline dscp_t ip6_dscp(__be32 flowinfo)
+{
+ return inet_dsfield_to_dscp(ip6_tclass(flowinfo));
+}
+
static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
{
return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index ec029c86ae06..e2a7b0059669 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -16,6 +16,7 @@
#include <linux/indirect_call_wrapper.h>
#include <net/fib_rules.h>
+#include <net/inet_dscp.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
@@ -25,14 +26,14 @@ struct fib6_rule {
struct fib_rule common;
struct rt6key src;
struct rt6key dst;
- u8 tclass;
+ dscp_t dscp;
};
static bool fib6_rule_matchall(const struct fib_rule *rule)
{
struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
- if (r->dst.plen || r->src.plen || r->tclass)
+ if (r->dst.plen || r->src.plen || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -323,7 +324,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
return 0;
}
- if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
+ if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
@@ -349,6 +350,13 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ rule6->dscp = inet_dsfield_to_dscp(frh->tos);
+
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid table");
@@ -369,7 +377,6 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule6->src.plen = frh->src_len;
rule6->dst.plen = frh->dst_len;
- rule6->tclass = frh->tos;
if (fib_rule_requires_fldissect(rule))
net->ipv6.fib6_rules_require_fldissect++;
@@ -402,7 +409,7 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
return 0;
- if (frh->tos && (rule6->tclass != frh->tos))
+ if (frh->tos && inet_dscp_to_dsfield(rule6->dscp) != frh->tos)
return 0;
if (frh->src_len &&
@@ -423,7 +430,7 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule6->dst.plen;
frh->src_len = rule6->src.plen;
- frh->tos = rule6->tclass;
+ frh->tos = inet_dscp_to_dsfield(rule6->dscp);
if ((rule6->dst.plen &&
nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
index 3b0489910422..d7a9ab3be1d3 100755
--- a/tools/testing/selftests/net/fib_rule_tests.sh
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -114,10 +114,25 @@ fib_rule6_test_match_n_redirect()
log_test $? 0 "rule6 del by pref: $description"
}
+fib_rule6_test_reject()
+{
+ local match="$1"
+ local rc
+
+ $IP -6 rule add $match table $RTABLE 2>/dev/null
+ rc=$?
+ log_test $rc 2 "rule6 check: $match"
+
+ if [ $rc -eq 0 ]; then
+ $IP -6 rule del $match table $RTABLE
+ fi
+}
+
fib_rule6_test()
{
local getmatch
local match
+ local cnt
# setup the fib rule redirect route
$IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
@@ -128,8 +143,21 @@ fib_rule6_test()
match="from $SRC_IP6 iif $DEV"
fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+ # Reject dsfield (tos) options which have ECN bits set
+ for cnt in $(seq 1 3); do
+ match="dsfield $cnt"
+ fib_rule6_test_reject "$match"
+ done
+
+ # Don't take ECN bits into account when matching on dsfield
match="tos 0x10"
- fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+ for cnt in "0x10" "0x11" "0x12" "0x13"; do
+ # Using option 'tos' instead of 'dsfield' as old iproute2
+ # versions don't support 'dsfield' in ip rule show.
+ getmatch="tos $cnt"
+ fib_rule6_test_match_n_redirect "$match" "$getmatch" \
+ "$getmatch redirect to table"
+ done
match="fwmark 0x64"
getmatch="mark 0x64"
--
2.21.3
Powered by blists - more mailing lists