lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sat, 23 Jun 2018 15:10:32 -0700
From:   Alexander Duyck <alexander.duyck@...il.com>
To:     Nishanth Devarajan <ndev2021@...il.com>
Cc:     Cong Wang <xiyou.wangcong@...il.com>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        Jiri Pirko <jiri@...nulli.us>,
        David Miller <davem@...emloft.net>,
        Netdev <netdev@...r.kernel.org>, doucette@...edu,
        michel@...irati.com.br
Subject: Re: [PATCH v2 net-next] net/sched: add skbprio scheduler

On Sat, Jun 23, 2018 at 1:47 PM, Nishanth Devarajan <ndev2021@...il.com> wrote:
> net/sched: add skbprio scheduler
>
> Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
> according to their skb->priority field. Although Skbprio can be employed in any
> scenario in which a higher skb->priority field means a higher priority packet,
> Skbprio was conceived as a solution for denial-of-service defenses that need to
> route packets with different priorities.

Really this description is not very good. Reading it I was thinking to
myself "why do we need this, prio already does this". It wasn't until
I read through the code that I figured out that you are basically
adding dropping of lower priority frames.

>
> v2
> *Use skb->priority field rather than DS field. Rename queueing discipline as
> SKB Priority Queue (previously Gatekeeper Priority Queue).
>
> *Queueing discipline is made classful to expose Skbprio's internal priority
> queues.
>
> Signed-off-by: Nishanth Devarajan <ndev2021@...il.com>
> Reviewed-by: Sachin Paryani <sachin.paryani@...il.com>
> Reviewed-by: Cody Doucette <doucette@...edu>
> Reviewed-by: Michel Machado <michel@...irati.com.br>
> ---
>  include/uapi/linux/pkt_sched.h |  15 ++
>  net/sched/Kconfig              |  13 ++
>  net/sched/Makefile             |   1 +
>  net/sched/sch_skbprio.c        | 347 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 376 insertions(+)
>  create mode 100644 net/sched/sch_skbprio.c
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 37b5096..6fd07e8 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -124,6 +124,21 @@ struct tc_fifo_qopt {
>         __u32   limit;  /* Queue length: bytes for bfifo, packets for pfifo */
>  };
>
> +/* SKBPRIO section */
> +
> +/*
> + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1).
> + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able
> + * to map one-to-one the DS field of IPv4 and IPv6 headers.
> + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY.
> + */
> +
> +#define SKBPRIO_MAX_PRIORITY 64
> +
> +struct tc_skbprio_qopt {
> +       __u32   limit;          /* Queue length in packets. */
> +};
> +
>  /* PRIO section */
>
>  #define TCQ_PRIO_BANDS 16
> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index a01169f..9ac4b53 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -240,6 +240,19 @@ config NET_SCH_MQPRIO
>
>           If unsure, say N.
>
> +config NET_SCH_SKBPRIO
> +       tristate "SKB priority queue scheduler (SKBPRIO)"
> +       help
> +         Say Y here if you want to use the SKB priority queue
> +         scheduler. This schedules packets according to skb->priority,
> +         which is useful for request packets in DoS mitigation systems such
> +         as Gatekeeper.
> +
> +         To compile this driver as a module, choose M here: the module will
> +         be called sch_skbprio.
> +
> +         If unsure, say N.
> +
>  config NET_SCH_CHOKE
>         tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
>         help
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 8811d38..a4d8893 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM)   += sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)      += sch_drr.o
>  obj-$(CONFIG_NET_SCH_PLUG)     += sch_plug.o
>  obj-$(CONFIG_NET_SCH_MQPRIO)   += sch_mqprio.o
> +obj-$(CONFIG_NET_SCH_SKBPRIO)  += sch_skbprio.o
>  obj-$(CONFIG_NET_SCH_CHOKE)    += sch_choke.o
>  obj-$(CONFIG_NET_SCH_QFQ)      += sch_qfq.o
>  obj-$(CONFIG_NET_SCH_CODEL)    += sch_codel.o
> diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
> new file mode 100644
> index 0000000..5e89446
> --- /dev/null
> +++ b/net/sched/sch_skbprio.c
> @@ -0,0 +1,347 @@
> +/*
> + * net/sched/sch_skbprio.c  SKB Priority Queue.
> + *
> + *             This program is free software; you can redistribute it and/or
> + *             modify it under the terms of the GNU General Public License
> + *             as published by the Free Software Foundation; either version
> + *             2 of the License, or (at your option) any later version.
> + *
> + * Authors:    Nishanth Devarajan, <ndev2021@...il.com>
> + *             Cody Doucette, <doucette@...edu>
> + *             original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
> + */
> +
> +#include <linux/string.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/errno.h>
> +#include <linux/skbuff.h>
> +#include <net/pkt_sched.h>
> +#include <net/sch_generic.h>
> +#include <net/inet_ecn.h>
> +
> +
> +/*       SKB Priority Queue
> + *     =================================
> + *
> + * This qdisc schedules a packet according to skb->priority, where a higher
> + * value places the packet closer to the exit of the queue. When the queue is
> + * full, the lowest priority packet in the queue is dropped to make room for
> + * the packet to be added if it has higher priority. If the packet to be added
> + * has lower priority than all packets in the queue, it is dropped.
> + *
> + * Without the SKB priority queue, queue length limits must be imposed
> + * for individual queues, and there is no easy way to enforce a global queue
> + * length limit across all priorities. With the Skbprio queue, a global
> + * queue length limit can be enforced while not restricting the queue lengths
> + * of individual priorities.
> + *
> + * This is especially useful for a denial-of-service defense system like
> + * Gatekeeper, which prioritizes packets in flows that demonstrate expected
> + * behavior of legitimate users. The queue is flexible to allow any number
> + * of packets of any priority up to the global limit of the scheduler
> + * without risking resource overconsumption by a flood of low priority packets.
> + *
> + * The Gatekeeper codebase is found here:
> + *
> + *             https://github.com/AltraMayor/gatekeeper

Do we really need to be linking to third party projects in the code? I
am not even really sure it is related here since I cannot find
anything that references the queuing discipline.

Also this seems like it should be in the Documentation/networking
folder rather than being stored in the code itself.

> + */
> +
> +struct skbprio_sched_data {
> +       /* Parameters. */
> +       u32 max_limit;
> +
> +       /* Queue state. */
> +       struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
> +       struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
> +       u16 highest_prio;
> +       u16 lowest_prio;
> +};
> +
> +static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
> +{
> +       int prio;
> +
> +       for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
> +               if (!skb_queue_empty(&q->qdiscs[prio]))
> +                       return prio;
> +       }
> +
> +       /* SKB queue is empty, return 0 (default highest priority setting). */
> +       return 0;
> +}
> +
> +static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
> +{
> +       int prio;
> +
> +       for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
> +               if (!skb_queue_empty(&q->qdiscs[prio]))
> +                       return prio;
> +       }
> +
> +       /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
> +        * (default lowest priority setting).
> +        */
> +       return SKBPRIO_MAX_PRIORITY - 1;
> +}
> +
> +static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> +                         struct sk_buff **to_free)
> +{
> +       const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       struct sk_buff_head *qdisc;
> +       struct sk_buff_head *lp_qdisc;
> +       struct sk_buff *to_drop;
> +       u16 prio, lp;
> +
> +       /* Obtain the priority of @skb. */
> +       prio = min(skb->priority, max_priority);
> +
> +       qdisc = &q->qdiscs[prio];
> +       if (sch->q.qlen < q->max_limit) {
> +               __skb_queue_tail(qdisc, skb);
> +               qdisc_qstats_backlog_inc(sch, skb);
> +               q->qstats[prio].backlog += qdisc_pkt_len(skb);
> +
> +               /* Check to update highest and lowest priorities. */
> +               if (prio > q->highest_prio)
> +                       q->highest_prio = prio;
> +
> +               if (prio < q->lowest_prio)
> +                       q->lowest_prio = prio;
> +
> +               sch->q.qlen++;
> +               return NET_XMIT_SUCCESS;
> +       }
> +
> +       /* If this packet has the lowest priority, drop it. */
> +       lp = q->lowest_prio;
> +       if (prio <= lp) {
> +               q->qstats[prio].drops++;
> +               return qdisc_drop(skb, sch, to_free);
> +       }
> +
> +       /* Drop the packet at the tail of the lowest priority qdisc. */
> +       lp_qdisc = &q->qdiscs[lp];
> +       to_drop = __skb_dequeue_tail(lp_qdisc);

Is there a specific reason for dropping tail instead of head? I would
think you might see better latency numbers with this if you were to
focus on dropping older packets from the lowest priority instead of
the newest ones. Then things like TCP would likely be able to respond
more quickly to the congestion effects and would be able to more
accurately determine actual round trip time versus buffer delay.

> +       BUG_ON(!to_drop);
> +       qdisc_qstats_backlog_dec(sch, to_drop);
> +       qdisc_drop(to_drop, sch, to_free);
> +
> +       q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
> +       q->qstats[lp].drops++;
> +
> +       __skb_queue_tail(qdisc, skb);
> +       qdisc_qstats_backlog_inc(sch, skb);
> +       q->qstats[prio].backlog += qdisc_pkt_len(skb);
> +

You could probably just take care of enqueueing first, and then take
care of the dequeue. That way you can drop the references to skb
earlier from the stack or free up whatever register is storing it.

> +       /* Check to update highest and lowest priorities. */
> +       if (skb_queue_empty(lp_qdisc)) {
> +               if (q->lowest_prio == q->highest_prio) {
> +                       BUG_ON(sch->q.qlen);
> +                       q->lowest_prio = prio;
> +                       q->highest_prio = prio;
> +               } else {
> +                       q->lowest_prio = calc_new_low_prio(q);
> +               }
> +       }
> +
> +       if (prio > q->highest_prio)
> +               q->highest_prio = prio;
> +
> +       return NET_XMIT_CN;
> +}
> +
> +static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
> +       struct sk_buff *skb = __skb_dequeue(hpq);
> +
> +       if (unlikely(!skb))
> +               return NULL;
> +
> +       sch->q.qlen--;
> +       qdisc_qstats_backlog_dec(sch, skb);
> +       qdisc_bstats_update(sch, skb);
> +
> +       q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
> +
> +       /* Update highest priority field. */
> +       if (skb_queue_empty(hpq)) {
> +               if (q->lowest_prio == q->highest_prio) {
> +                       BUG_ON(sch->q.qlen);
> +                       q->highest_prio = 0;
> +                       q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
> +               } else {
> +                       q->highest_prio = calc_new_high_prio(q);
> +               }
> +       }
> +       return skb;
> +}
> +
> +static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
> +                       struct netlink_ext_ack *extack)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       struct tc_skbprio_qopt *ctl = nla_data(opt);
> +       const unsigned int min_limit = 1;
> +
> +       if (ctl->limit == (typeof(ctl->limit))-1)
> +               q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
> +       else if (ctl->limit < min_limit ||
> +               ctl->limit > qdisc_dev(sch)->tx_queue_len)
> +               return -EINVAL;
> +       else
> +               q->max_limit = ctl->limit;
> +
> +       return 0;
> +}
> +
> +static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
> +                       struct netlink_ext_ack *extack)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       const unsigned int min_limit = 1;
> +       int prio;
> +
> +       /* Initialise all queues, one for each possible priority. */
> +       for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
> +               __skb_queue_head_init(&q->qdiscs[prio]);
> +
> +       memset(&q->qstats, 0, sizeof(q->qstats));
> +       q->highest_prio = 0;
> +       q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
> +       if (!opt) {
> +               q->max_limit = max(qdisc_dev(sch)->tx_queue_len, min_limit);
> +               return 0;
> +       }
> +       return skbprio_change(sch, opt, extack);
> +}
> +
> +static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       struct tc_skbprio_qopt opt;
> +
> +       opt.limit = q->max_limit;
> +
> +       if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
> +               return -1;
> +
> +       return skb->len;
> +}
> +
> +static void skbprio_reset(struct Qdisc *sch)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       int prio;
> +
> +       sch->qstats.backlog = 0;
> +       sch->q.qlen = 0;
> +
> +       for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
> +               __skb_queue_purge(&q->qdiscs[prio]);
> +
> +       memset(&q->qstats, 0, sizeof(q->qstats));
> +       q->highest_prio = 0;
> +       q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
> +}
> +
> +static void skbprio_destroy(struct Qdisc *sch)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       int prio;
> +
> +       for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
> +               __skb_queue_purge(&q->qdiscs[prio]);
> +}
> +
> +static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
> +{
> +       return NULL;
> +}
> +
> +static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
> +{
> +       return 0;
> +}
> +
> +static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
> +                            struct sk_buff *skb, struct tcmsg *tcm)
> +{
> +       tcm->tcm_handle |= TC_H_MIN(cl);
> +       return 0;
> +}
> +
> +static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
> +                                  struct gnet_dump *d)
> +{
> +       struct skbprio_sched_data *q = qdisc_priv(sch);
> +       if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
> +               q->qstats[cl - 1].qlen) < 0)
> +               return -1;
> +       return 0;
> +}
> +
> +static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
> +{
> +       unsigned int i;
> +
> +       if (arg->stop)
> +               return;
> +
> +       for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
> +               if (arg->count < arg->skip) {
> +                       arg->count++;
> +                       continue;
> +               }
> +               if (arg->fn(sch, i + 1, arg) < 0) {
> +                       arg->stop = 1;
> +                       break;
> +               }
> +               arg->count++;
> +       }
> +}
> +
> +static const struct Qdisc_class_ops skbprio_class_ops = {
> +       .leaf           =       skbprio_leaf,
> +       .find           =       skbprio_find,
> +       .dump           =       skbprio_dump_class,
> +       .dump_stats     =       skbprio_dump_class_stats,
> +       .walk           =       skbprio_walk,
> +};
> +
> +static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
> +       .cl_ops         =       &skbprio_class_ops,
> +       .id             =       "skbprio",
> +       .priv_size      =       sizeof(struct skbprio_sched_data),
> +       .enqueue        =       skbprio_enqueue,
> +       .dequeue        =       skbprio_dequeue,
> +       .peek           =       qdisc_peek_dequeued,
> +       .init           =       skbprio_init,
> +       .reset          =       skbprio_reset,
> +       .change         =       skbprio_change,
> +       .dump           =       skbprio_dump,
> +       .destroy        =       skbprio_destroy,
> +       .owner          =       THIS_MODULE,
> +};
> +
> +static int __init skbprio_module_init(void)
> +{
> +       return register_qdisc(&skbprio_qdisc_ops);
> +}
> +
> +static void __exit skbprio_module_exit(void)
> +{
> +       unregister_qdisc(&skbprio_qdisc_ops);
> +}
> +
> +module_init(skbprio_module_init)
> +module_exit(skbprio_module_exit)
> +
> +MODULE_LICENSE("GPL");
> --
> 1.9.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ