[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180627215950.6719-7-jesus.sanchez-palencia@intel.com>
Date: Wed, 27 Jun 2018 14:59:42 -0700
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@...el.com>
To: netdev@...r.kernel.org
Cc: tglx@...utronix.de, jan.altenberg@...utronix.de,
vinicius.gomes@...el.com, kurt.kanzenbach@...utronix.de,
henrik@...tad.us, richardcochran@...il.com,
levi.pearson@...man.com, ilias.apalodimas@...aro.org,
ivan.khoronzhuk@...aro.org, mlichvar@...hat.com,
willemb@...gle.com, jhs@...atatu.com, xiyou.wangcong@...il.com,
jiri@...nulli.us,
Jesus Sanchez-Palencia <jesus.sanchez-palencia@...el.com>
Subject: [PATCH v1 net-next 06/14] net/sched: Introduce the ETF Qdisc
From: Vinicius Costa Gomes <vinicius.gomes@...el.com>
The ETF (Earliest TxTime First) qdisc uses the information added
earlier in this series (the socket option SO_TXTIME and the new
role of sk_buff->tstamp) to schedule packets transmission based
on absolute time.
For some workloads, just bandwidth enforcement is not enough, and
precise control of the transmission of packets is necessary.
Example:
$ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \
map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0
$ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \
clockid CLOCK_REALTIME
In this example, the Qdisc will provide SW best-effort for the control
of the transmission time to the network adapter, the time stamp in the
socket will be in reference to the clockid CLOCK_REALTIME and packets
will leave the qdisc "delta" (100000) nanoseconds before its transmission
time.
The ETF qdisc will buffer packets sorted by their txtime. It will drop
packets on enqueue() if their skbuff clockid does not match the clock
reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped
if it expires while being enqueued.
The qdisc also supports the SO_TXTIME deadline mode. For this mode, it
will dequeue a packet as soon as possible and change the skb timestamp
to 'now' during etf_dequeue().
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@...el.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@...el.com>
---
include/linux/netdevice.h | 1 +
include/uapi/linux/pkt_sched.h | 17 ++
net/sched/Kconfig | 11 +
net/sched/Makefile | 1 +
net/sched/sch_etf.c | 385 +++++++++++++++++++++++++++++++++
5 files changed, 415 insertions(+)
create mode 100644 net/sched/sch_etf.c
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a15869..7f650bdc6ec3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -793,6 +793,7 @@ enum tc_setup_type {
TC_SETUP_QDISC_RED,
TC_SETUP_QDISC_PRIO,
TC_SETUP_QDISC_MQ,
+ TC_SETUP_QDISC_ETF,
};
/* These structures hold the attributes of bpf state that are being passed
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096ae97b..9d6fd2004a03 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,21 @@ enum {
#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
+/* ETF */
+struct tc_etf_qopt {
+ __s32 delta;
+ __s32 clockid;
+ __u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON BIT(0)
+};
+
+enum {
+ TCA_ETF_UNSPEC,
+ TCA_ETF_PARMS,
+ __TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a01169fb5325..fcc89706745b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -183,6 +183,17 @@ config NET_SCH_CBS
To compile this code as a module, choose M here: the
module will be called sch_cbs.
+config NET_SCH_ETF
+ tristate "Earliest TxTime First (ETF)"
+ help
+ Say Y here if you want to use the Earliest TxTime First (ETF) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_etf.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_etf.
+
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 8811d3804878..9a5a7077d217 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o
+obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
new file mode 100644
index 000000000000..5f01a285f399
--- /dev/null
+++ b/net/sched/sch_etf.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* net/sched/sch_etf.c Earliest TxTime First queueing discipline.
+ *
+ * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@...el.com>
+ * Vinicius Costa Gomes <vinicius.gomes@...el.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rbtree.h>
+#include <linux/skbuff.h>
+#include <linux/posix-timers.h>
+#include <net/netlink.h>
+#include <net/sch_generic.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+
+#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON)
+
+struct etf_sched_data {
+ bool deadline_mode;
+ int clockid;
+ int queue;
+ s32 delta; /* in ns */
+ ktime_t last; /* The txtime of the last skb sent to the netdevice. */
+ struct rb_root head;
+ struct qdisc_watchdog watchdog;
+ ktime_t (*get_time)(void);
+};
+
+static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = {
+ [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) },
+};
+
+static inline int validate_input_params(struct tc_etf_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ /* Check if params comply to the following rules:
+ * * Clockid and delta must be valid.
+ *
+ * * Dynamic clockids are not supported.
+ *
+ * * Delta must be a positive integer.
+ */
+ if (qopt->clockid < 0) {
+ NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported");
+ return -ENOTSUPP;
+ }
+
+ if (qopt->clockid >= MAX_CLOCKS) {
+ NL_SET_ERR_MSG(extack, "Invalid clockid");
+ return -EINVAL;
+ }
+
+ if (qopt->delta < 0) {
+ NL_SET_ERR_MSG(extack, "Delta must be positive");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ ktime_t txtime = nskb->tstamp;
+ struct sock *sk = nskb->sk;
+ ktime_t now;
+
+ if (!sk)
+ return false;
+
+ if (!sock_flag(sk, SOCK_TXTIME))
+ return false;
+
+ /* We don't perform crosstimestamping.
+ * Drop if packet's clockid differs from qdisc's.
+ */
+ if (sk->sk_clockid != q->clockid)
+ return false;
+
+ if ((sk->sk_txtime_flags & SK_TXTIME_DEADLINE_MASK) !=
+ q->deadline_mode)
+ return false;
+
+ now = q->get_time();
+ if (ktime_before(txtime, now) || ktime_before(txtime, q->last))
+ return false;
+
+ return true;
+}
+
+static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p;
+
+ p = rb_first(&q->head);
+ if (!p)
+ return NULL;
+
+ return rb_to_skb(p);
+}
+
+static void reset_watchdog(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb = etf_peek_timesortedlist(sch);
+ ktime_t next;
+
+ if (!skb)
+ return;
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+ qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
+}
+
+static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node **p = &q->head.rb_node, *parent = NULL;
+ ktime_t txtime = nskb->tstamp;
+
+ if (!is_packet_valid(sch, nskb))
+ return qdisc_drop(nskb, sch, to_free);
+
+ while (*p) {
+ struct sk_buff *skb;
+
+ parent = *p;
+ skb = rb_to_skb(parent);
+ if (ktime_after(txtime, skb->tstamp))
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&nskb->rbnode, parent, p);
+ rb_insert_color(&nskb->rbnode, &q->head);
+
+ qdisc_qstats_backlog_inc(sch, nskb);
+ sch->q.qlen++;
+
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return NET_XMIT_SUCCESS;
+}
+
+static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb,
+ bool drop)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ rb_erase(&skb->rbnode, &q->head);
+
+ /* The rbnode field in the skb re-uses these fields, now that
+ * we are done with the rbnode, reset them.
+ */
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ qdisc_qstats_backlog_dec(sch, skb);
+
+ if (drop) {
+ struct sk_buff *to_free = NULL;
+
+ qdisc_drop(skb, sch, &to_free);
+ kfree_skb_list(to_free);
+ qdisc_qstats_overlimit(sch);
+ } else {
+ qdisc_bstats_update(sch, skb);
+
+ q->last = skb->tstamp;
+ }
+
+ sch->q.qlen--;
+}
+
+static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ ktime_t now, next;
+
+ skb = etf_peek_timesortedlist(sch);
+ if (!skb)
+ return NULL;
+
+ now = q->get_time();
+
+ /* Drop if packet has expired while in queue. */
+ /* FIXME: Must return error on the socket's error queue */
+ if (ktime_before(skb->tstamp, now)) {
+ timesortedlist_erase(sch, skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ /* When in deadline mode, dequeue as soon as possible and change the
+ * txtime from deadline to (now + delta).
+ */
+ if (q->deadline_mode) {
+ timesortedlist_erase(sch, skb, false);
+ skb->tstamp = now;
+ goto out;
+ }
+
+ next = ktime_sub_ns(skb->tstamp, q->delta);
+
+ /* Dequeue only if now is within the [txtime - delta, txtime] range. */
+ if (ktime_after(now, next))
+ timesortedlist_erase(sch, skb, false);
+ else
+ skb = NULL;
+
+out:
+ /* Now we may need to re-arm the qdisc watchdog for the next packet. */
+ reset_watchdog(sch);
+
+ return skb;
+}
+
+static int etf_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ struct nlattr *tb[TCA_ETF_MAX + 1];
+ struct tc_etf_qopt *qopt;
+ int err;
+
+ if (!opt) {
+ NL_SET_ERR_MSG(extack,
+ "Missing ETF qdisc options which are mandatory");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_ETF_PARMS]) {
+ NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters");
+ return -EINVAL;
+ }
+
+ qopt = nla_data(tb[TCA_ETF_PARMS]);
+
+ pr_debug("delta %d clockid %d deadline %s\n",
+ qopt->delta, qopt->clockid,
+ DEADLINE_MODE_IS_ON(qopt) ? "on" : "off");
+
+ err = validate_input_params(qopt, extack);
+ if (err < 0)
+ return err;
+
+ q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
+
+ /* Everything went OK, save the parameters used. */
+ q->delta = qopt->delta;
+ q->clockid = qopt->clockid;
+ q->deadline_mode = DEADLINE_MODE_IS_ON(qopt);
+
+ switch (q->clockid) {
+ case CLOCK_REALTIME:
+ q->get_time = ktime_get_real;
+ break;
+ case CLOCK_MONOTONIC:
+ q->get_time = ktime_get;
+ break;
+ case CLOCK_BOOTTIME:
+ q->get_time = ktime_get_boottime;
+ break;
+ case CLOCK_TAI:
+ q->get_time = ktime_get_clocktai;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Clockid is not supported");
+ return -ENOTSUPP;
+ }
+
+ qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid);
+
+ return 0;
+}
+
+static void timesortedlist_clear(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct rb_node *p = rb_first(&q->head);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+
+ rb_erase(&skb->rbnode, &q->head);
+ rtnl_kfree_skbs(skb, skb);
+ sch->q.qlen--;
+ }
+}
+
+static void etf_reset(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+
+ /* No matter which mode we are on, it's safe to clear both lists. */
+ timesortedlist_clear(sch);
+ __qdisc_reset_queue(&sch->q);
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ q->last = 0;
+}
+
+static void etf_destroy(struct Qdisc *sch)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+
+ /* Only cancel watchdog if it's been initialized. */
+ if (q->watchdog.qdisc == sch)
+ qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int etf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct etf_sched_data *q = qdisc_priv(sch);
+ struct tc_etf_qopt opt = { };
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ opt.delta = q->delta;
+ opt.clockid = q->clockid;
+ if (q->deadline_mode)
+ opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+
+ if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ return nla_nest_end(skb, nest);
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -1;
+}
+
+static struct Qdisc_ops etf_qdisc_ops __read_mostly = {
+ .id = "etf",
+ .priv_size = sizeof(struct etf_sched_data),
+ .enqueue = etf_enqueue_timesortedlist,
+ .dequeue = etf_dequeue_timesortedlist,
+ .peek = etf_peek_timesortedlist,
+ .init = etf_init,
+ .reset = etf_reset,
+ .destroy = etf_destroy,
+ .dump = etf_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init etf_module_init(void)
+{
+ return register_qdisc(&etf_qdisc_ops);
+}
+
+static void __exit etf_module_exit(void)
+{
+ unregister_qdisc(&etf_qdisc_ops);
+}
+module_init(etf_module_init)
+module_exit(etf_module_exit)
+MODULE_LICENSE("GPL");
--
2.17.1
Powered by blists - more mailing lists