[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190329204438.68777-2-ldir@darbyshire-bryant.me.uk>
Date: Fri, 29 Mar 2019 20:45:12 +0000
From: Kevin 'ldir' Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
To: Kevin 'ldir' Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
CC: "jhs@...atatu.com" <jhs@...atatu.com>,
"jiri@...nulli.us" <jiri@...nulli.us>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"xiyou.wangcong@...il.com" <xiyou.wangcong@...il.com>
Subject: [RFC net-next 1/1] net: sched: Introduce conntrack action
conntrack is a new tc filter action module. It is designed to restore
DSCPs stored in conntrack marks into the diffserv field of IPv4/IPv6 packets.
The feature is intended for, and has been found useful in, restoring
ingress classifications based on egress classifications across links
that bleach or otherwise change DSCP, typically home ISP Internet links.
Restoring DSCP on ingress on the WAN link allows qdiscs such as CAKE to
shape inbound packets according to policies that are easier to implement
on egress.
Ingress classification is traditionally a challenging task since
iptables rules haven't yet run and tc filter/eBPF programs run before NAT
lookups, hence are unable to see internal IPv4 addresses as used on the
typical home masquerading gateway.
It is anticipated that userspace control from tc would support the
following parameters:
mask - a 32 bit mask of at least 6 contiguous bits where conntrack will
place the DSCP in conntrack mark. The DSCP is left-shifted by the
number of unset lower bits of the mask before storing into the mark
field.
statemask - a 32 bit mask of (usually) 1 bit length, outside the area
specified by mask. This represents a conditional operation flag hence
the DSCP is only restored if the flag is set. This is useful to
implement a 'one shot' iptables based classification where the
'complicated' iptables rules are only run once to classify the
connection on initial (egress) packet and subsequent packets are all
marked/restored with the same DSCP. A statemask of zero disables the
conditional behaviour; the conntrack mark will be updated on every DSCP
change.
mode dscp - conntrack at present only understands one mode 'dscp'.
optional parameters:
zone - conntrack zone
control - action related control (reclassify | pipe | drop | continue |
ok | goto chain <CHAIN_INDEX>)
Signed-off-by: Kevin Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
---
include/net/tc_act/tc_conntrack.h | 19 ++
include/uapi/linux/pkt_cls.h | 1 +
include/uapi/linux/tc_act/tc_conntrack.h | 30 ++
net/sched/Kconfig | 13 +
net/sched/Makefile | 1 +
net/sched/act_conntrack.c | 324 ++++++++++++++++++++++
tools/testing/selftests/tc-testing/config | 1 +
7 files changed, 389 insertions(+)
create mode 100644 include/net/tc_act/tc_conntrack.h
create mode 100644 include/uapi/linux/tc_act/tc_conntrack.h
create mode 100644 net/sched/act_conntrack.c
diff --git a/include/net/tc_act/tc_conntrack.h b/include/net/tc_act/tc_conntrack.h
new file mode 100644
index 000000000000..45319e557f90
--- /dev/null
+++ b/include/net/tc_act/tc_conntrack.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_CONNTRACK_H
+#define __NET_TC_CONNTRACK_H
+
+#include <net/act_api.h>
+
+struct tcf_conntrack_info { /* per-instance state of the conntrack action */
+ struct tc_action common; /* generic action state; first member, so to_conntrack() can cast */
+ struct net *net; /* namespace handed to the conntrack lookups */
+ u32 mask; /* connmark bits that hold the stored DSCP */
+ u32 statemask; /* connmark flag bits gating the restore; 0 means always restore */
+ u16 zone; /* conntrack zone id used for the lookup */
+ u8 mode; /* CONNTRACK_FLAG_* bits */
+ u8 maskshift; /* position of the lowest set bit of mask (0 when mask is 0) */
+};
+
+/* Convert a struct tc_action pointer into the struct tcf_conntrack_info that
+ * embeds it as its first member. The argument is parenthesised so that an
+ * expression argument is cast as a whole rather than only its first operand.
+ */
+#define to_conntrack(a) ((struct tcf_conntrack_info *)(a))
+
+#endif /* __NET_TC_CONNTRACK_H */
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 51a0496f78ea..a5a6930084d7 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -105,6 +105,7 @@ enum tca_id {
TCA_ID_IFE = TCA_ACT_IFE,
TCA_ID_SAMPLE = TCA_ACT_SAMPLE,
/* other actions go here */
+ TCA_ID_CONNTRACK,
__TCA_ID_MAX = 255
};
diff --git a/include/uapi/linux/tc_act/tc_conntrack.h b/include/uapi/linux/tc_act/tc_conntrack.h
new file mode 100644
index 000000000000..99c4bfb29c21
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_conntrack.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __UAPI_TC_CONNTRACK_H
+#define __UAPI_TC_CONNTRACK_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+struct tc_conntrack { /* payload of the TCA_CONNTRACK_PARMS attribute */
+ tc_gen; /* common action fields (index, action, refcnt, bindcnt, ...) */
+ __u32 mask; /* connmark bits that hold the stored DSCP */
+ __u32 statemask; /* connmark flag bits gating the restore; 0 means unconditional */
+ __u16 zone; /* conntrack zone id */
+ __u8 mode; /* CONNTRACK_FLAG_* bits */
+ __u8 maskshift; /* derived by the kernel from mask; the value supplied by userspace is not read */
+};
+
+enum { /* netlink attribute types nested inside the action options */
+ TCA_CONNTRACK_UNSPEC,
+ TCA_CONNTRACK_PARMS, /* struct tc_conntrack */
+ TCA_CONNTRACK_TM, /* struct tcf_t timestamps, emitted on dump */
+ TCA_CONNTRACK_PAD, /* padding attribute used to align TCA_CONNTRACK_TM */
+ __TCA_CONNTRACK_MAX
+};
+#define TCA_CONNTRACK_MAX (__TCA_CONNTRACK_MAX - 1) /* highest valid attribute type */
+
+/* Bits for the 'mode' field of struct tc_conntrack.
+ *
+ * Written as a plain shift: the BIT() macro is kernel-internal and is not
+ * defined for userspace programs that include this UAPI header.
+ */
+enum {
+	CONNTRACK_FLAG_SETDSCP = (1 << 0)	/* restore DSCP from the connmark */
+};
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 5c02ad97ef23..848dc1dd3be1 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -876,6 +876,19 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_CONNTRACK
+ tristate "Netfilter Connmark to DSCP Retriever"
+ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
+ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
+ help
+ Say Y here to allow a DSCP stored in the connmark to be restored
+ into the IPv4/IPv6 diffserv field.
+
+ If unsure, say N.
+
+ To compile this code as a module, choose M here: the
+ module will be called act_conntrack.
+
config NET_ACT_SKBMOD
tristate "skb data modification action"
depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8a40431d7b5c..e962dd782046 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_CONNTRACK) += act_conntrack.o
obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
diff --git a/net/sched/act_conntrack.c b/net/sched/act_conntrack.c
new file mode 100644
index 000000000000..c08ff06aae60
--- /dev/null
+++ b/net/sched/act_conntrack.c
@@ -0,0 +1,324 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/* net/sched/act_conntrack.c netfilter conntrack connmark->DSCP action
+ *
+ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <uapi/linux/tc_act/tc_conntrack.h>
+#include <net/tc_act/tc_conntrack.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+static unsigned int conntrack_net_id; /* pernet id, registered through conntrack_net_ops.id */
+static struct tc_action_ops act_conntrack_ops; /* forward declaration; initialised further down */
+
+/* Restore the DSCP held in the connection mark into the packet's diffserv
+ * field.
+ *
+ * @ct:    conntrack entry whose mark carries the stored DSCP
+ * @ca:    action instance; supplies mask/maskshift, and its overlimits
+ *         counter is (re)used to count how many DSCP rewrites happened
+ * @skb:   packet to modify; the caller must make sure the IP header is
+ *         present in the linear data area
+ * @proto: NFPROTO_IPV4 or NFPROTO_IPV6; any other value is ignored
+ *
+ * The ECN bits of the packet are preserved. The header is written only when
+ * the DSCP actually differs, and only after the skb has been made writable,
+ * because a cloned or shared skb must not be modified in place. If the skb
+ * cannot be made writable the packet is left untouched.
+ */
+static void tcf_conntrack_set(struct nf_conn *ct, struct tcf_conntrack_info *ca,
+			      struct sk_buff *skb, int proto)
+{
+	unsigned int wlen = skb_network_offset(skb);
+	u8 newdscp;
+
+	/* take the 6 DSCP bits out of ct->mark and place them above ECN */
+	newdscp = (((ct->mark & ca->mask) >> ca->maskshift) << 2) &
+		  ~INET_ECN_MASK;
+
+	switch (proto) {
+	case NFPROTO_IPV4:
+		if ((ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK) ==
+		    newdscp)
+			break;
+		wlen += sizeof(struct iphdr);
+		if (unlikely(skb_try_make_writable(skb, wlen)))
+			break;
+		/* fetch the header again: making it writable may relocate it */
+		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp);
+		ca->tcf_qstats.overlimits++;
+		break;
+	case NFPROTO_IPV6:
+		if ((ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK) ==
+		    newdscp)
+			break;
+		wlen += sizeof(struct ipv6hdr);
+		if (unlikely(skb_try_make_writable(skb, wlen)))
+			break;
+		/* fetch the header again: making it writable may relocate it */
+		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp);
+		ca->tcf_qstats.overlimits++;
+		break;
+	default:
+		break;
+	}
+}
+
+/* Per-packet entry point of the action.
+ *
+ * @skb: packet being processed
+ * @a:   action instance (embedded in struct tcf_conntrack_info)
+ * @res: classification result; not used by this action
+ *
+ * Updates the last-use time and byte/packet statistics, then, for IPv4 and
+ * IPv6 packets only and only when CONNTRACK_FLAG_SETDSCP is set, finds the
+ * conntrack entry for the packet and restores the DSCP from its mark. When
+ * statemask is non-zero the restore happens only if the mark has at least
+ * one statemask bit set.
+ *
+ * Returns the configured control action (ca->tcf_action) in every case.
+ */
+static int tcf_conntrack_act(struct sk_buff *skb, const struct tc_action *a,
+			     struct tcf_result *res)
+{
+	const struct nf_conntrack_tuple_hash *thash = NULL;
+	struct tcf_conntrack_info *ca = to_conntrack(a);
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_zone zone;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	int proto, hlen;
+
+	spin_lock(&ca->tcf_lock);
+	tcf_lastuse_update(&ca->tcf_tm);
+	bstats_update(&ca->tcf_bstats, skb);
+
+	if (unlikely(!(ca->mode & CONNTRACK_FLAG_SETDSCP)))
+		goto out;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		proto = NFPROTO_IPV4;
+		hlen = sizeof(struct iphdr);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		proto = NFPROTO_IPV6;
+		hlen = sizeof(struct ipv6hdr);
+	} else {
+		goto out;
+	}
+
+	/* skb->len alone says nothing about where the bytes live: make sure
+	 * the whole IP header, starting at the network offset, is in the
+	 * linear area before it is dereferenced.
+	 */
+	if (!pskb_may_pull(skb, skb_network_offset(skb) + hlen))
+		goto out;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct) { /* nothing attached to the skb: look the flow up */
+		if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+				       proto, ca->net, &tuple))
+			goto out;
+		zone.id = ca->zone;
+		zone.dir = NF_CT_DEFAULT_ZONE_DIR;
+
+		thash = nf_conntrack_find_get(ca->net, &zone, &tuple);
+		if (!thash)
+			goto out;
+
+		/* From here on a reference is held; there must be no exit
+		 * path that skips the nf_ct_put() below.
+		 */
+		ct = nf_ct_tuplehash_to_ctrack(thash);
+	}
+
+	if (!ca->statemask || (ct->mark & ca->statemask))
+		tcf_conntrack_set(ct, ca, skb, proto);
+
+	/* only the looked-up entry carries a reference taken here */
+	if (thash)
+		nf_ct_put(ct);
+
+out:
+	spin_unlock(&ca->tcf_lock);
+	return ca->tcf_action;
+}
+
+static const struct nla_policy conntrack_policy[TCA_CONNTRACK_MAX + 1] = { /* netlink attribute policy */
+ [TCA_CONNTRACK_PARMS] = { .len = sizeof(struct tc_conntrack) }, /* struct tc_conntrack */
+};
+
+/* Load user supplied parameters into an action instance.
+ *
+ * @ci:   action instance to update
+ * @parm: parameters received from userspace
+ *
+ * maskshift is derived from the lowest set bit of the mask (0 for an empty
+ * mask). Userspace is not trusted: CONNTRACK_FLAG_SETDSCP is cleared from
+ * the stored mode when the mask does not provide 6 contiguous bits starting
+ * at its lowest set bit, or when mask and statemask overlap.
+ */
+static void conntrack_parmset(struct tcf_conntrack_info *ci,
+			      struct tc_conntrack *parm)
+{
+	u32 dscpmask = parm->mask;
+	u8 shift = dscpmask ? __ffs(dscpmask) : 0;
+	bool too_narrow = ((dscpmask >> shift) & 0x3f) != 0x3f;
+	bool overlapping = (dscpmask & parm->statemask) != 0;
+	u8 mode = parm->mode;
+
+	if (too_narrow || overlapping)
+		mode &= ~CONNTRACK_FLAG_SETDSCP;
+
+	ci->zone = parm->zone;
+	ci->mask = dscpmask;
+	ci->maskshift = shift;
+	ci->statemask = parm->statemask;
+	ci->mode = mode;
+}
+
+/* Create a new conntrack action or replace the parameters of an existing one.
+ *
+ * @net:       network namespace the action lives in
+ * @nla:       nested TCA_CONNTRACK_* attributes; required
+ * @est:       rate estimator attribute handed to tcf_idr_create()
+ * @a:         in/out pointer to the action
+ * @ovr:       non-zero if an existing action may be overwritten
+ * @bind:      non-zero if the action is being bound to a filter
+ * @rtnl_held: not used here
+ * @tp:        filter the action is attached to, used to validate goto chain
+ * @extack:    extended ack for error reporting to userspace
+ *
+ * Returns ACT_P_CREATED when a new action was created, 0 when an existing
+ * one was bound or updated, or a negative errno (-EINVAL for missing
+ * attributes, -EEXIST when the index exists and @ovr is not set, or the
+ * error of the failing helper).
+ */
+static int tcf_conntrack_init(struct net *net, struct nlattr *nla,
+			      struct nlattr *est, struct tc_action **a,
+			      int ovr, int bind, bool rtnl_held,
+			      struct tcf_proto *tp,
+			      struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, conntrack_net_id);
+	struct nlattr *tb[TCA_CONNTRACK_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	struct tcf_conntrack_info *ci;
+	struct tc_conntrack *parm;
+	int ret = 0, err;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "conntrack requires attributes to be passed");
+		return -EINVAL;
+	}
+
+	/* hand extack to the parser so policy violations reach userspace */
+	ret = nla_parse_nested(tb, TCA_CONNTRACK_MAX, nla, conntrack_policy,
+			       extack);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[TCA_CONNTRACK_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_CONNTRACK_PARMS attribute");
+		return -EINVAL;
+	}
+
+	parm = nla_data(tb[TCA_CONNTRACK_PARMS]);
+
+	ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+	if (!ret) {
+		/* index was free: create a fresh action */
+		ret = tcf_idr_create(tn, parm->index, est, a,
+				     &act_conntrack_ops, bind, false);
+		if (ret) {
+			tcf_idr_cleanup(tn, parm->index);
+			return ret;
+		}
+
+		ci = to_conntrack(*a);
+		err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
+					       extack);
+		if (err < 0)
+			goto release_idr;
+		tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+		ci->net = net;
+		conntrack_parmset(ci, parm);
+
+		tcf_idr_insert(tn, *a);
+		ret = ACT_P_CREATED;
+	} else if (ret > 0) {
+		/* index already exists */
+		ci = to_conntrack(*a);
+		if (bind)
+			return 0;
+		if (!ovr) {
+			tcf_idr_release(*a, bind);
+			return -EEXIST;
+		}
+		err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch,
+					       extack);
+		if (err < 0)
+			goto release_idr;
+		/* replace control action and parameters under the lock the
+		 * datapath takes, so a packet never sees a partial update
+		 */
+		spin_lock_bh(&ci->tcf_lock);
+		goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+		conntrack_parmset(ci, parm);
+		spin_unlock_bh(&ci->tcf_lock);
+		/* goto_ch now holds the previous chain, if any */
+		if (goto_ch)
+			tcf_chain_put_by_act(goto_ch);
+		ret = 0;
+	}
+
+	return ret;
+release_idr:
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+/* Dump the action's configuration and timestamps into a netlink message.
+ *
+ * @skb:  message being built
+ * @a:    action instance to dump
+ * @bind: amount subtracted from the reported bind count
+ * @ref:  amount subtracted from the reported reference count
+ *
+ * Emits TCA_CONNTRACK_PARMS (struct tc_conntrack, including the maskshift
+ * the kernel derived from the mask) and TCA_CONNTRACK_TM. Returns skb->len
+ * on success; on failure the message is trimmed back to where it was on
+ * entry and -1 is returned.
+ */
+static int tcf_conntrack_dump(struct sk_buff *skb, struct tc_action *a,
+			      int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_conntrack_info *ci = to_conntrack(a);
+	struct tc_conntrack opt = {
+		.index   = ci->tcf_index,
+		.refcnt  = refcount_read(&ci->tcf_refcnt) - ref,
+		.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	/* snapshot the parameters under the lock used when replacing them */
+	spin_lock_bh(&ci->tcf_lock);
+	opt.action = ci->tcf_action;
+	opt.zone = ci->zone;
+	opt.mask = ci->mask;
+	opt.maskshift = ci->maskshift;
+	opt.statemask = ci->statemask;
+	opt.mode = ci->mode;
+
+	if (nla_put(skb, TCA_CONNTRACK_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &ci->tcf_tm);
+	if (nla_put_64bit(skb, TCA_CONNTRACK_TM, sizeof(t), &t,
+			  TCA_CONNTRACK_PAD))
+		goto nla_put_failure;
+	spin_unlock_bh(&ci->tcf_lock);
+
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&ci->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* .walk callback: forwards to tcf_generic_walker() with this module's
+ * per-namespace action table.
+ */
+static int tcf_conntrack_walker(struct net *net, struct sk_buff *skb,
+				struct netlink_callback *cb, int type,
+				const struct tc_action_ops *ops,
+				struct netlink_ext_ack *extack)
+{
+	return tcf_generic_walker(net_generic(net, conntrack_net_id),
+				  skb, cb, type, ops, extack);
+}
+
+/* .lookup callback: forwards to tcf_idr_search() with this module's
+ * per-namespace action table.
+ */
+static int tcf_conntrack_search(struct net *net, struct tc_action **a,
+				u32 index)
+{
+	return tcf_idr_search(net_generic(net, conntrack_net_id), a, index);
+}
+
+static struct tc_action_ops act_conntrack_ops = { /* callbacks registered in conntrack_init_module() */
+ .kind = "conntrack",
+ .id = TCA_ID_CONNTRACK,
+ .owner = THIS_MODULE,
+ .act = tcf_conntrack_act, /* per-packet handler */
+ .dump = tcf_conntrack_dump, /* netlink dump of one instance */
+ .init = tcf_conntrack_init, /* create or replace an instance */
+ .walk = tcf_conntrack_walker,
+ .lookup = tcf_conntrack_search,
+ .size = sizeof(struct tcf_conntrack_info),
+};
+
+/* Pernet .init callback: sets up this namespace's action table for the
+ * conntrack action and returns the result of tc_action_net_init().
+ */
+static __net_init int conntrack_init_net(struct net *net)
+{
+	return tc_action_net_init(net_generic(net, conntrack_net_id),
+				  &act_conntrack_ops);
+}
+
+static void __net_exit conntrack_exit_net(struct list_head *net_list)
+{
+ tc_action_net_exit(net_list, conntrack_net_id); /* pernet .exit_batch: pass the list on to the generic tc action teardown */
+}
+
+static struct pernet_operations conntrack_net_ops = { /* per-namespace hooks, registered together with the action */
+ .init = conntrack_init_net,
+ .exit_batch = conntrack_exit_net,
+ .id = &conntrack_net_id, /* where the pernet id is stored */
+ .size = sizeof(struct tc_action_net), /* size of the per-namespace area */
+};
+
+static int __init conntrack_init_module(void)
+{
+ return tcf_register_action(&act_conntrack_ops, &conntrack_net_ops); /* module init: register the action ops and pernet ops */
+}
+
+static void __exit conntrack_cleanup_module(void)
+{
+ tcf_unregister_action(&act_conntrack_ops, &conntrack_net_ops); /* module exit: undo the registration done at init */
+}
+
+module_init(conntrack_init_module);
+module_exit(conntrack_cleanup_module);
+MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@...byshire-bryant.me.uk>");
+MODULE_DESCRIPTION("Conntrack mark to DSCP restoring");
+MODULE_LICENSE("GPL");
diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index 203302065458..104256b4fa2a 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -38,6 +38,7 @@ CONFIG_NET_ACT_CSUM=m
CONFIG_NET_ACT_VLAN=m
CONFIG_NET_ACT_BPF=m
CONFIG_NET_ACT_CONNMARK=m
+CONFIG_NET_ACT_CONNTRACK=m
CONFIG_NET_ACT_SKBMOD=m
CONFIG_NET_ACT_IFE=m
CONFIG_NET_ACT_TUNNEL_KEY=m
--
2.20.1 (Apple Git-117)
Powered by blists - more mailing lists