[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190319194929.10798-2-ldir@darbyshire-bryant.me.uk>
Date: Tue, 19 Mar 2019 19:49:58 +0000
From: Kevin 'ldir' Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
To: "netdev@...r.kernel.org" <netdev@...r.kernel.org>
CC: "jiri@...nulli.us" <jiri@...nulli.us>,
"xiyou.wangcong@...il.com" <xiyou.wangcong@...il.com>,
"jhs@...atatu.com" <jhs@...atatu.com>,
Kevin 'ldir' Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
Subject: [PATCH 1/1] net: sched: Introduce conndscp action
Conndscp is a new tc filter action module. It is designed to copy DSCPs
to conntrack marks and the reverse operation of conntrack mark contained
DSCPs to the diffserv field of suitable skbs.
The feature is intended for use and has been found useful for restoring
ingress classifications based on egress classifications across links
that bleach or otherwise change DSCP, typically home ISP Internet links.
Restoring DSCP on ingress on the WAN link allows qdiscs such as CAKE to
shape inbound packets according to policies that are easier to implement
on egress.
Ingress classification is traditionally a challenging task since
iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT
lookups, hence are unable to see internal IPv4 addresses as used on the
typical home masquerading gateway.
conndscp understands the following parameters:
mask - a 32 bit mask of at least 6 contiguous bits where conndscp will
place the DSCP in conntrack mark. The DSCP is left-shifted by the
number of unset lower bits of the mask before storing into the mark
field.
statemask - a 32 bit mask of (usually) 1 bit length, outside the area
specified by mask. This represents a conditional operation flag - get
will only store the DSCP if the flag is unset. set will only restore
the DSCP if the flag is set. This is useful to implement a 'one shot'
iptables based classification where the 'complicated' iptables rules are
only run once to classify the connection on initial (egress) packet and
subsequent packets are all marked/restored with the same DSCP. A mask
of zero disables the conditional behaviour.
mode - get/set/both - get stores the DSCP into the mark, set restores
the DSCP into the diffserv field from the mark, both 'gets' the mark and
then 'sets' it in that order.
optional parameters:
zone - conntrack zone
control - action related control (reclassify | pipe | drop | continue |
ok | goto chain <CHAIN_INDEX>
Signed-off-by: Kevin Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
---
include/net/tc_act/tc_conndscp.h | 19 ++
include/uapi/linux/tc_act/tc_conndscp.h | 33 +++
net/sched/Kconfig | 13 +
net/sched/Makefile | 1 +
net/sched/act_conndscp.c | 333 ++++++++++++++++++++++
tools/testing/selftests/tc-testing/config | 1 +
6 files changed, 400 insertions(+)
create mode 100644 include/net/tc_act/tc_conndscp.h
create mode 100644 include/uapi/linux/tc_act/tc_conndscp.h
create mode 100644 net/sched/act_conndscp.c
diff --git a/include/net/tc_act/tc_conndscp.h b/include/net/tc_act/tc_conndscp.h
new file mode 100644
index 000000000000..4cb328fc487d
--- /dev/null
+++ b/include/net/tc_act/tc_conndscp.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CONNDSCP_H
+#define __NET_TC_CONNDSCP_H
+
+#include <net/act_api.h>
+
+struct tcf_conndscp_info {
+ struct tc_action common;
+ struct net *net;
+ u16 zone;
+ u32 mask;
+ u32 statemask;
+ u8 mode;
+ u8 maskshift;
+};
+
+#define to_conndscp(a) ((struct tcf_conndscp_info *)a)
+
+#endif /* __NET_TC_CONNDSCP_H */
diff --git a/include/uapi/linux/tc_act/tc_conndscp.h b/include/uapi/linux/tc_act/tc_conndscp.h
new file mode 100644
index 000000000000..0897b5d6b0ce
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_conndscp.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __UAPI_TC_CONNDSCP_H
+#define __UAPI_TC_CONNDSCP_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_CONNDSCP 27
+
+struct tc_conndscp {
+ tc_gen;
+ __u16 zone;
+ __u32 mask;
+ __u32 statemask;
+ __u8 mode;
+ __u8 maskshift;
+};
+
+enum {
+ TCA_CONNDSCP_UNSPEC,
+ TCA_CONNDSCP_PARMS,
+ TCA_CONNDSCP_TM,
+ TCA_CONNDSCP_PAD,
+ __TCA_CONNDSCP_MAX
+};
+#define TCA_CONNDSCP_MAX (__TCA_CONNDSCP_MAX - 1)
+
+enum {
+ CONNDSCP_FLAG_GETDSCP = BIT(0),
+ CONNDSCP_FLAG_SETDSCP = BIT(1)
+};
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 1b9afdee5ba9..f43788b9d332 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -865,6 +865,19 @@ config NET_ACT_BPF
To compile this code as a module, choose M here: the
module will be called act_bpf.
+config NET_ACT_CONNDSCP
+ tristate "DSCP to Netfilter Connection Mark Store/Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ ---help---
+ Say Y here to allow storing of DSCP into conn mark
+ and vice verca
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_connmark.
+
config NET_ACT_CONNMARK
tristate "Netfilter Connection Mark Retriever"
depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..b78198944618 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
+obj-$(CONFIG_NET_ACT_CONNDSCP) += act_conndscp.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
diff --git a/net/sched/act_conndscp.c b/net/sched/act_conndscp.c
new file mode 100644
index 000000000000..8ee87e2ab814
--- /dev/null
+++ b/net/sched/act_conndscp.c
@@ -0,0 +1,333 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* net/sched/act_conndscp.c netfilter conndscp dscp<->connmark action
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <uapi/linux/tc_act/tc_conndscp.h>
+#include <net/tc_act/tc_conndscp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static unsigned int conndscp_net_id;
+static struct tc_action_ops act_conndscp_ops;
+
+static void tcf_conndscp_get(struct nf_conn *ct, struct tcf_conndscp_info *ca,
+ struct sk_buff *skb, int proto)
+{
+ u32 newmark;
+ u8 dscp;
+
+ /* mark does not contain DSCP so store DSCP bits into c->mark */
+ switch (proto) {
+ case NFPROTO_IPV4:
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+ break;
+ case NFPROTO_IPV6:
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+ break;
+ default:
+ dscp = 0;
+ break;
+ }
+ newmark = ct->mark & ~(ca->mask | ca->statemask);
+ newmark |= (dscp << ca->maskshift) | ca->statemask;
+ if (ct->mark != newmark) {
+ /* using requeues stats to count how many connmark updates */
+ ca->tcf_qstats.requeues++;
+ ct->mark = newmark;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+}
+
+static void tcf_conndscp_set(struct nf_conn *ct, struct tcf_conndscp_info *ca,
+ struct sk_buff *skb, int proto)
+{
+ u8 newdscp;
+
+ newdscp = (((ct->mark & ca->mask) >> ca->maskshift) << 2) &
+ ~INET_ECN_MASK;
+
+ /* mark contains DSCP so restore DSCP bits from c->mark into diffserv */
+ /* using overlimits stats to count how many DSCP updates */
+ switch (proto) {
+ case NFPROTO_IPV4:
+ if ((ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK) !=
+ newdscp) {
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK,
+ newdscp);
+ ca->tcf_qstats.overlimits++;
+ }
+ break;
+ case NFPROTO_IPV6:
+ if ((ipv6_get_dsfield(ipv6_hdr(skb)) &
+ ~INET_ECN_MASK) != newdscp) {
+ ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK,
+ newdscp);
+ ca->tcf_qstats.overlimits++;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static int tcf_conndscp_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ const struct nf_conntrack_tuple_hash *thash;
+ struct nf_conntrack_tuple tuple;
+ enum ip_conntrack_info ctinfo;
+ struct tcf_conndscp_info *ca = to_conndscp(a);
+ struct nf_conntrack_zone zone;
+ struct nf_conn *ct;
+ int proto;
+
+ spin_lock(&ca->tcf_lock);
+ tcf_lastuse_update(&ca->tcf_tm);
+ bstats_update(&ca->tcf_bstats, skb);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ if (skb->len < sizeof(struct iphdr))
+ goto out;
+
+ proto = NFPROTO_IPV4;
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (skb->len < sizeof(struct ipv6hdr))
+ goto out;
+
+ proto = NFPROTO_IPV6;
+ } else {
+ goto out;
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ if (ca->mode & CONNDSCP_FLAG_SETDSCP &&
+ (!ca->statemask || (ct->mark & ca->statemask)))
+ tcf_conndscp_set(ct, ca, skb, proto);
+ else if (ca->mode & CONNDSCP_FLAG_GETDSCP &&
+ (!ca->statemask || !(ct->mark & ca->statemask)))
+ tcf_conndscp_get(ct, ca, skb, proto);
+ goto out;
+ }
+
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+ proto, ca->net, &tuple))
+ goto out;
+
+ zone.id = ca->zone;
+ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+ thash = nf_conntrack_find_get(ca->net, &zone, &tuple);
+ if (!thash)
+ goto out;
+
+ ct = nf_ct_tuplehash_to_ctrack(thash);
+ if (ca->mode & CONNDSCP_FLAG_SETDSCP &&
+ (!ca->statemask || (ct->mark & ca->statemask)))
+ tcf_conndscp_set(ct, ca, skb, proto);
+ else if (ca->mode & CONNDSCP_FLAG_GETDSCP &&
+ (!ca->statemask || !(ct->mark & ca->statemask)))
+ tcf_conndscp_get(ct, ca, skb, proto);
+ nf_ct_put(ct);
+
+out:
+ spin_unlock(&ca->tcf_lock);
+ return ca->tcf_action;
+}
+
+static const struct nla_policy conndscp_policy[TCA_CONNDSCP_MAX + 1] = {
+ [TCA_CONNDSCP_PARMS] = { .len = sizeof(struct tc_conndscp) },
+};
+
+static void conndscp_parmset(struct tcf_conndscp_info *ci,
+ struct tc_conndscp *parm)
+{
+ ci->tcf_action = parm->action;
+ ci->zone = parm->zone;
+ ci->mask = parm->mask;
+ ci->maskshift = ci->mask ? __ffs(ci->mask) : 0;
+ ci->statemask = parm->statemask;
+ ci->mode = parm->mode;
+
+ /* let's not trust userspace entirely */
+ /* need at least contiguous 6 bit mask */
+ if ((0x3f & (ci->mask >> ci->maskshift)) != 0x3f)
+ ci->mode = 0;
+ if (ci->mask & ci->statemask)
+ ci->mode = 0;
+}
+
+static int tcf_conndscp_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, conndscp_net_id);
+ struct nlattr *tb[TCA_CONNDSCP_MAX + 1];
+ struct tcf_conndscp_info *ci;
+ struct tc_conndscp *parm;
+ int ret = 0;
+
+ if (!nla)
+ return -EINVAL;
+
+ ret = nla_parse_nested(tb, TCA_CONNDSCP_MAX, nla, conndscp_policy,
+ NULL);
+ if (ret < 0)
+ return ret;
+
+ if (!tb[TCA_CONNDSCP_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_CONNDSCP_PARMS]);
+
+ ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ if (!ret) {
+ ret = tcf_idr_create(tn, parm->index, est, a,
+ &act_conndscp_ops, bind, false);
+ if (ret)
+ return ret;
+
+ ci = to_conndscp(*a);
+ ci->net = net;
+ conndscp_parmset(ci, parm);
+
+ tcf_idr_insert(tn, *a);
+ ret = ACT_P_CREATED;
+ } else if (ret > 0) {
+ ci = to_conndscp(*a);
+ if (bind)
+ return 0;
+ tcf_idr_release(*a, bind);
+ if (!ovr)
+ return -EEXIST;
+ /* replacing action and zone */
+ spin_lock_bh(&ci->tcf_lock);
+ conndscp_parmset(ci, parm);
+ spin_unlock_bh(&ci->tcf_lock);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static inline int tcf_conndscp_dump(struct sk_buff *skb, struct tc_action *a,
+ int bind, int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_conndscp_info *ci = to_conndscp(a);
+ struct tc_conndscp opt = {
+ .index = ci->tcf_index,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+ };
+ struct tcf_t t;
+
+ spin_lock_bh(&ci->tcf_lock);
+ opt.action = ci->tcf_action;
+ opt.zone = ci->zone;
+ opt.mask = ci->mask;
+ opt.statemask = ci->statemask;
+ opt.mode = ci->mode;
+
+ if (nla_put(skb, TCA_CONNDSCP_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ tcf_tm_dump(&t, &ci->tcf_tm);
+ if (nla_put_64bit(skb, TCA_CONNDSCP_TM, sizeof(t), &t,
+ TCA_CONNDSCP_PAD))
+ goto nla_put_failure;
+ spin_unlock_bh(&ci->tcf_lock);
+
+ return skb->len;
+
+nla_put_failure:
+ spin_unlock_bh(&ci->tcf_lock);
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+static int tcf_conndscp_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ const struct tc_action_ops *ops,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_action_net *tn = net_generic(net, conndscp_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static int tcf_conndscp_search(struct net *net, struct tc_action **a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, conndscp_net_id);
+
+ return tcf_idr_search(tn, a, index);
+}
+
+static struct tc_action_ops act_conndscp_ops = {
+ .kind = "conndscp",
+ .type = TCA_ACT_CONNDSCP,
+ .owner = THIS_MODULE,
+ .act = tcf_conndscp_act,
+ .dump = tcf_conndscp_dump,
+ .init = tcf_conndscp_init,
+ .walk = tcf_conndscp_walker,
+ .lookup = tcf_conndscp_search,
+ .size = sizeof(struct tcf_conndscp_info),
+};
+
+static __net_init int conndscp_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, conndscp_net_id);
+
+ return tc_action_net_init(tn, &act_conndscp_ops);
+}
+
+static void __net_exit conndscp_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, conndscp_net_id);
+}
+
+static struct pernet_operations conndscp_net_ops = {
+ .init = conndscp_init_net,
+ .exit_batch = conndscp_exit_net,
+ .id = &conndscp_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init conndscp_init_module(void)
+{
+ return tcf_register_action(&act_conndscp_ops, &conndscp_net_ops);
+}
+
+static void __exit conndscp_cleanup_module(void)
+{
+ tcf_unregister_action(&act_conndscp_ops, &conndscp_net_ops);
+}
+
+module_init(conndscp_init_module);
+module_exit(conndscp_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>");
+MODULE_DESCRIPTION("DSCP to Connection tracking mark storing/restoring");
+MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index 203302065458..9d1fddcfb887 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -37,6 +37,7 @@ CONFIG_NET_ACT_SKBEDIT=m
CONFIG_NET_ACT_CSUM=m
CONFIG_NET_ACT_VLAN=m
CONFIG_NET_ACT_BPF=m
+CONFIG_NET_ACT_CONNDSCP=m
CONFIG_NET_ACT_CONNMARK=m
CONFIG_NET_ACT_SKBMOD=m
CONFIG_NET_ACT_IFE=m
--
2.17.2 (Apple Git-113)
Powered by blists - more mailing lists