[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1294667210.3491.7.camel@edumazet-laptop>
Date: Mon, 10 Jan 2011 14:46:50 +0100
From: Eric Dumazet <eric.dumazet@...il.com>
To: Stephen Hemminger <shemminger@...tta.com>
Cc: David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [RFC] sched: CHOKe packet scheduler (v0.2)
Le jeudi 06 janvier 2011 à 20:55 -0800, Stephen Hemminger a écrit :
>
> The problem is that large tables of pointers in kernel require either
> contiguous allocation or some indirect table algorithm.
>
Here is a v3 version with an array based queue for O(1) peek_random
complexity.
Could you send the iproute2 patch so that I can test it ?
Thanks !
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index e69de29..ea9db00 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -0,0 +1,388 @@
+/*
+ * net/sched/sch_choke.c CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@...tta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@...il.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+
+/* CHOKe stateless AQM for fair bandwidth allocation
+ =================================================
+
+ CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+ unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+ maintains no flow state. The difference from RED is an additional step
+ during the enqueuing process. If average queue size is over the
+ low threshold (qmin), a packet is chosen at random from the queue.
+ If both the new and chosen packet are from the same flow, both
+ are dropped. Unlike RED, CHOKe is not a "classful" qdisc because it
+ needs to access packets in queue randomly.
+
+ Source:
+ R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+ Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+ IEEE INFOCOM, 2000.
+
+ A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+ Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+struct choke_sched_data {
+ u32 limit;
+ unsigned char flags;
+
+ struct red_parms parms;
+ struct red_stats stats;
+
+ unsigned int head;
+ unsigned int tail;
+ unsigned int holes;
+ unsigned int tab_mask; /* size - 1 */
+ struct sk_buff **tab;
+};
+
+static inline unsigned int choke_len(const struct choke_sched_data *q)
+{
+ return (q->tail - q->head) & q->tab_mask;
+}
+
+/* deliver a random number between 0 and N - 1 */
+static inline u32 random_N(unsigned int N)
+{
+ return reciprocal_divide(random32(), N);
+}
+
+/* Select a packet at random from the queue in O(1) */
+static struct sk_buff *choke_peek_random(struct choke_sched_data *q, unsigned int *pidx)
+{
+ *pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+ return q->tab[*pidx];
+}
+
+
+static inline int use_ecn(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_ECN;
+}
+
+static inline int use_harddrop(const struct choke_sched_data *q)
+{
+ return q->flags & TC_RED_HARDDROP;
+}
+
+static inline void choke_zap_head_holes(struct choke_sched_data *q)
+{
+ while (q->holes && q->tab[q->head] == NULL) {
+ q->head = (q->head + 1) & q->tab_mask;
+ q->holes--;
+ }
+}
+
+static inline void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+ while (q->holes && q->tab[q->tail - 1] == NULL) {
+ q->tail = (q->tail - 1) & q->tab_mask;
+ q->holes--;
+ }
+}
+
+static void choke_drop_by_idx(struct choke_sched_data *q, unsigned int idx)
+{
+ q->tab[idx] = NULL;
+ q->holes++;
+ choke_zap_head_holes(q);
+ choke_zap_tail_holes(q);
+}
+
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct red_parms *p = &q->parms;
+
+ p->qavg = red_calc_qavg(p, choke_len(q) - q->holes);
+ if (red_is_idling(p))
+ red_end_of_idle_period(p);
+
+ if (p->qavg <= p->qth_min)
+ p->qcount = -1;
+ else {
+ struct sk_buff *oskb;
+ unsigned int idx;
+
+ /* Draw a packet at random from queue */
+ oskb = choke_peek_random(q, &idx);
+
+ /* Both packets from same flow ? */
+ if (oskb && skb_get_rxhash(oskb) == skb_get_rxhash(skb)) {
+ /* Drop both packets */
+ choke_drop_by_idx(q, idx);
+ qdisc_drop(oskb, sch);
+ goto congestion_drop;
+ }
+
+ if (p->qavg > p->qth_max) {
+ p->qcount = -1;
+
+ sch->qstats.overlimits++;
+ if (use_harddrop(q) || !use_ecn(q) ||
+ !INET_ECN_set_ce(skb)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.forced_mark++;
+ }
+
+ if (++p->qcount) {
+ if (red_mark_probability(p, p->qavg)) {
+ p->qcount = 0;
+ p->qR = red_random(p);
+
+ sch->qstats.overlimits++;
+ if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ q->stats.prob_mark++;
+ }
+ } else
+ p->qR = red_random(p);
+ }
+
+ /* Admit new packet */
+ if (likely(choke_len(q) < q->limit)) {
+ q->tab[q->tail] = skb;
+ q->tail = (q->tail + 1) & q->tab_mask;
+ sch->qstats.backlog += qdisc_pkt_len(skb);
+ __qdisc_update_bstats(sch, qdisc_pkt_len(skb));
+ return NET_XMIT_SUCCESS;
+ }
+ q->stats.pdrop++;
+ sch->qstats.drops++;
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+
+ congestion_drop:
+ qdisc_drop(skb, sch);
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ if (q->head == q->tail) {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ return NULL;
+ }
+ skb = q->tab[q->head];
+ q->tab[q->head] = NULL; /* not really needed */
+ q->head = (q->head + 1) & q->tab_mask;
+ choke_zap_head_holes(q);
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+
+ return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ unsigned int len;
+
+ len = qdisc_queue_drop(sch);
+
+ if (len > 0)
+ q->stats.other++;
+ else {
+ if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
+
+ return len;
+}
+
+static void choke_reset(struct Qdisc* sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_RED_MAX + 1] = {
+ [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
+ [TCA_RED_STAB] = { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+ if (addr) {
+ if (is_vmalloc_addr(addr))
+ vfree(addr);
+ else
+ kfree(addr);
+ }
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *tb[TCA_RED_MAX + 1];
+ struct tc_red_qopt *ctl;
+ int err;
+ struct sk_buff **old = NULL;
+ unsigned int mask;
+
+ if (opt == NULL)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, TCA_RED_MAX, opt, choke_policy);
+ if (err < 0)
+ return err;
+
+ if (tb[TCA_RED_PARMS] == NULL ||
+ tb[TCA_RED_STAB] == NULL)
+ return -EINVAL;
+
+ ctl = nla_data(tb[TCA_RED_PARMS]);
+
+ mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+ if (mask != q->tab_mask) {
+ struct sk_buff **ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
+ GFP_KERNEL);
+ if (!ntab)
+ ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ if (!ntab)
+ return -ENOMEM;
+ sch_tree_lock(sch);
+ old = q->tab;
+ if (old) {
+ unsigned int tail = 0;
+
+ while (q->head != q->tail) {
+ ntab[tail++] = q->tab[q->head];
+ q->head = (q->head + 1) & q->tab_mask;
+ }
+ q->head = 0;
+ q->tail = tail;
+ }
+ q->tab_mask = mask;
+ q->holes = 0;
+ } else
+ sch_tree_lock(sch);
+ q->flags = ctl->flags;
+ q->limit = ctl->limit;
+
+ red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+ ctl->Plog, ctl->Scell_log,
+ nla_data(tb[TCA_RED_STAB]));
+
+ if (q->head == q->tail)
+ red_end_of_idle_period(&q->parms);
+
+ sch_tree_unlock(sch);
+ choke_free(old);
+ return 0;
+}
+
+static int choke_init(struct Qdisc* sch, struct nlattr *opt)
+{
+ return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct nlattr *opts = NULL;
+ struct tc_red_qopt opt = {
+ .limit = q->limit,
+ .flags = q->flags,
+ .qth_min = q->parms.qth_min >> q->parms.Wlog,
+ .qth_max = q->parms.qth_max >> q->parms.Wlog,
+ .Wlog = q->parms.Wlog,
+ .Plog = q->parms.Plog,
+ .Scell_log = q->parms.Scell_log,
+ };
+
+ sch->q.qlen = choke_len(q) - q->holes;
+ opts = nla_nest_start(skb, TCA_OPTIONS);
+ if (opts == NULL)
+ goto nla_put_failure;
+
+ NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
+ return nla_nest_end(skb, opts);
+
+nla_put_failure:
+ nla_nest_cancel(skb, opts);
+ return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+ struct tc_red_xstats st = {
+ .early = q->stats.prob_drop + q->stats.forced_drop,
+ .pdrop = q->stats.pdrop,
+ .other = q->stats.other,
+ .marked = q->stats.prob_mark + q->stats.forced_mark,
+ };
+
+ return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+ struct choke_sched_data *q = qdisc_priv(sch);
+
+ choke_free(q->tab);
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+ .id = "choke",
+ .priv_size = sizeof(struct choke_sched_data),
+
+ .enqueue = choke_enqueue,
+ .dequeue = choke_dequeue,
+ .peek = qdisc_peek_head,
+ .drop = choke_drop,
+ .init = choke_init,
+ .destroy = choke_destroy,
+ .reset = choke_reset,
+ .change = choke_change,
+ .dump = choke_dump,
+ .dump_stats = choke_dump_stats,
+ .owner = THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+ return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+ unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists