Message-ID: <CAA93jw7024Ejncj_Gjg9jcQF=HTERkc0dSCrVKrv7XJfELryow@mail.gmail.com>
Date: Fri, 6 Jan 2012 17:56:51 +0100
From: Dave Taht <dave.taht@...il.com>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: David Miller <davem@...emloft.net>,
netdev <netdev@...r.kernel.org>,
Stephen Hemminger <shemminger@...tta.com>,
Kathleen Nichols <nichols@...lere.com>,
Jim Gettys <jg@...edesktop.org>
Subject: Re: [PATCH] net_sched: sfq: add optional RED on top of SFQ
On Fri, Jan 6, 2012 at 5:31 PM, Eric Dumazet <eric.dumazet@...il.com> wrote:
> Adds optional Random Early Detection (RED) on each SFQ flow queue.
netperf -t TCP_RR is useful here.
-t TCP_MAERTS will be interesting, too.
Maybe a simultaneous ping, to watch latency under load?
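Something along these lines, say - just a sketch, with $SERVER standing
in for wherever netserver runs:

    # 64 bulk flows, one request/response flow, ping alongside
    for i in $(seq 64); do
        netperf -H $SERVER -t TCP_STREAM -l 60 > /dev/null &
    done
    netperf -H $SERVER -t TCP_RR -l 60 &
    ping -c 60 $SERVER

(TCP_MAERTS is TCP_STREAM with the data flowing the other way, from the
netserver side back to the sender, which would exercise the qdisc on the
reverse path.)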
> Traditional SFQ limits the packet count per flow, while RED also permits
> controlling the number of bytes per flow, and adds ECN capability as well.
>
> 1) We don't handle idle time management in this RED implementation,
> since each 'new flow' begins with a null qavg. We really want to address
> backlogged flows.
>
> 2) If headdrop is selected, we try to ECN-mark the first packet in the
> queue instead of the packet currently being enqueued. This gives faster
> feedback to TCP flows than traditional RED [ which marks the last packet
> in the queue ]
>
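(So, if I read sfq_enqueue() right, the order of preference for an
over-threshold ECN flow is: CE-mark the packet at the head of the slot
queue, else CE-mark the arriving packet - e.g. when the head isn't
ECT-capable - and only then drop. Nice.)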
> Example of use :
>
> tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
> limit 3000 headdrop flows 512 divisor 16384 \
> redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
>
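(Decoding the knobs, if I read the units right: a hard cap of 100000
bytes queued per flow, qavg thresholds at 8000 and 60000 bytes, marking
probability ramping up to 0.20 at the max threshold, ECN enabled.)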
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> ewma 6 min 8000b max 60000b probability 0.2 ecn
> prob_mark 0 prob_mark_head 4876 prob_drop 6131
> forced_mark 0 forced_mark_head 0 forced_drop 0
> Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
> requeues 0)
> rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
>
> In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN-enabled
> flows, we can see that the number of CE-marked packets is smaller than
> the number of drops (for the non-ECN flows).
>
> If the same test is run without RED, we can see that the backlog is much
> bigger.
>
> qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
> flows 512/16384 divisor 16384
> Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
> rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
>
>
> Signed-off-by: Eric Dumazet <eric.dumazet@...il.com>
> CC: Stephen Hemminger <shemminger@...tta.com>
> CC: Dave Taht <dave.taht@...il.com>
> ---
> include/linux/pkt_sched.h | 20 ++++
> include/net/red.h | 3
> net/sched/sch_sfq.c | 146 ++++++++++++++++++++++++++++++++----
> 3 files changed, 152 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 8f1b928..0d5b793 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -162,10 +162,30 @@ struct tc_sfq_qopt {
> unsigned flows; /* Maximal number of flows */
> };
>
> +struct tc_sfqred_stats {
> + __u32 prob_drop; /* Early drops, below max threshold */
> + __u32 forced_drop; /* Early drops, after max threshold */
> + __u32 prob_mark; /* Marked packets, below max threshold */
> + __u32 forced_mark; /* Marked packets, after max threshold */
> + __u32 prob_mark_head; /* Marked packets at head, below max threshold */
> + __u32 forced_mark_head;/* Marked packets at head, after max threshold */
> +};
> +
> struct tc_sfq_qopt_v1 {
> struct tc_sfq_qopt v0;
> unsigned int depth; /* max number of packets per flow */
> unsigned int headdrop;
> +/* SFQRED parameters */
> + __u32 limit; /* HARD maximal flow queue length (bytes) */
> + __u32 qth_min; /* Min average length threshold (bytes) */
> + __u32 qth_max; /* Max average length threshold (bytes) */
> + unsigned char Wlog; /* log(W) */
> + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */
> + unsigned char Scell_log; /* cell size for idle damping */
> + unsigned char flags;
> + __u32 max_P; /* probability, high resolution */
> +/* SFQRED stats */
> + struct tc_sfqred_stats stats;
> };
>
>
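(For reference, my reading of red.h: the per-flow average follows the
usual RED EWMA, in unscaled terms roughly

    qavg = qavg + (backlog - qavg) / 2^Wlog

and the mark/drop probability ramps linearly from 0 at qth_min up to
max_P at qth_max, max_P being a high-resolution fixed-point probability.)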
> diff --git a/include/net/red.h b/include/net/red.h
> index baab385..28068ec 100644
> --- a/include/net/red.h
> +++ b/include/net/red.h
> @@ -199,7 +199,8 @@ static inline void red_set_parms(struct red_parms *p,
> p->Scell_log = Scell_log;
> p->Scell_max = (255 << Scell_log);
>
> - memcpy(p->Stab, stab, sizeof(p->Stab));
> + if (stab)
> + memcpy(p->Stab, stab, sizeof(p->Stab));
> }
>
> static inline int red_is_idling(const struct red_vars *v)
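(Allowing a NULL stab here looks right: the Stab[] table is only
consulted by the idle-time damping path, which per (1) above SFQ never
takes.)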
> diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
> index 0a79640..67494ae 100644
> --- a/net/sched/sch_sfq.c
> +++ b/net/sched/sch_sfq.c
> @@ -24,6 +24,7 @@
> #include <net/netlink.h>
> #include <net/pkt_sched.h>
> #include <net/flow_keys.h>
> +#include <net/red.h>
>
>
> /* Stochastic Fairness Queuing algorithm.
> @@ -108,24 +109,30 @@ struct sfq_slot {
> struct sfq_head dep; /* anchor in dep[] chains */
> unsigned short hash; /* hash value (index in ht[]) */
> short allot; /* credit for this slot */
> +
> + unsigned int backlog;
> + struct red_vars vars;
> };
>
> struct sfq_sched_data {
> /* frequently used fields */
> int limit; /* limit of total number of packets in this qdisc */
> unsigned int divisor; /* number of slots in hash table */
> - unsigned int maxflows; /* number of flows in flows array */
> - int headdrop;
> - int maxdepth; /* limit of packets per flow */
> + u8 headdrop;
> + u8 maxdepth; /* limit of packets per flow */
>
> u32 perturbation;
> - struct tcf_proto *filter_list;
> - sfq_index cur_depth; /* depth of longest slot */
> + u8 cur_depth; /* depth of longest slot */
> + u8 flags;
> unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
> - struct sfq_slot *tail; /* current slot in round */
> + struct tcf_proto *filter_list;
> sfq_index *ht; /* Hash table ('divisor' slots) */
> struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
>
> + struct red_parms *red_parms;
> + struct tc_sfqred_stats stats;
> + struct sfq_slot *tail; /* current slot in round */
> +
> struct sfq_head dep[SFQ_MAX_DEPTH + 1];
> /* Linked lists of slots, indexed by depth
> * dep[0] : list of unused flows
> @@ -133,6 +140,7 @@ struct sfq_sched_data {
> * dep[X] : list of flows with X packets
> */
>
> + unsigned int maxflows; /* number of flows in flows array */
> int perturb_period;
> unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
> struct timer_list perturb_timer;
> @@ -321,6 +329,7 @@ static unsigned int sfq_drop(struct Qdisc *sch)
> drop:
> skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
> len = qdisc_pkt_len(skb);
> + slot->backlog -= len;
> sfq_dec(q, x);
> kfree_skb(skb);
> sch->q.qlen--;
> @@ -341,6 +350,23 @@ drop:
> return 0;
> }
>
> +/* Is ECN parameter configured */
> +static int sfq_prob_mark(const struct sfq_sched_data *q)
> +{
> + return q->flags & TC_RED_ECN;
> +}
> +
> +/* Should packets over max threshold just be marked */
> +static int sfq_hard_mark(const struct sfq_sched_data *q)
> +{
> + return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
> +}
> +
> +static int sfq_headdrop(const struct sfq_sched_data *q)
> +{
> + return q->headdrop;
> +}
> +
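(So TC_RED_ECN alone marks both below and above qth_max, while
TC_RED_ECN | TC_RED_HARDDROP marks below but drops above - same
semantics as plain RED, if I remember them right.)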
> static int
> sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> {
> @@ -349,6 +375,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> sfq_index x, qlen;
> struct sfq_slot *slot;
> int uninitialized_var(ret);
> + struct sk_buff *head;
> + int delta;
>
> hash = sfq_classify(skb, sch, &ret);
> if (hash == 0) {
> @@ -368,24 +396,75 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> q->ht[hash] = x;
> slot = &q->slots[x];
> slot->hash = hash;
> + slot->backlog = 0; /* should already be 0 anyway... */
> + red_set_vars(&slot->vars);
> + goto enqueue;
> }
> + if (q->red_parms) {
> + slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
> + &slot->vars,
> + slot->backlog);
> + switch (red_action(q->red_parms,
> + &slot->vars,
> + slot->vars.qavg)) {
> + case RED_DONT_MARK:
> + break;
>
> - if (slot->qlen >= q->maxdepth) {
> - struct sk_buff *head;
> + case RED_PROB_MARK:
> + sch->qstats.overlimits++;
> + if (sfq_prob_mark(q)) {
> + /* We know we have at least one packet in queue */
> + if (sfq_headdrop(q) &&
> + INET_ECN_set_ce(slot->skblist_next)) {
> + q->stats.prob_mark_head++;
> + break;
> + }
> + if (INET_ECN_set_ce(skb)) {
> + q->stats.prob_mark++;
> + break;
> + }
> + }
> + q->stats.prob_drop++;
> + goto congestion_drop;
> +
> + case RED_HARD_MARK:
> + sch->qstats.overlimits++;
> + if (sfq_hard_mark(q)) {
> + /* We know we have at least one packet in queue */
> + if (sfq_headdrop(q) &&
> + INET_ECN_set_ce(slot->skblist_next)) {
> + q->stats.forced_mark_head++;
> + break;
> + }
> + if (INET_ECN_set_ce(skb)) {
> + q->stats.forced_mark++;
> + break;
> + }
> + }
> + q->stats.forced_drop++;
> + goto congestion_drop;
> + }
> + }
>
> - if (!q->headdrop)
> + if (slot->qlen >= q->maxdepth) {
> +congestion_drop:
> + if (!sfq_headdrop(q))
> return qdisc_drop(skb, sch);
>
> + /* We know we have at least one packet in queue */
> head = slot_dequeue_head(slot);
> - sch->qstats.backlog -= qdisc_pkt_len(head);
> + delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
> + sch->qstats.backlog -= delta;
> + slot->backlog -= delta;
> qdisc_drop(head, sch);
>
> - sch->qstats.backlog += qdisc_pkt_len(skb);
> slot_queue_add(slot, skb);
> return NET_XMIT_CN;
> }
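(The delta accounting is neat: dropping the head while enqueueing skb
changes the backlog by len(head) - len(skb), so a single subtraction
keeps both sch->qstats.backlog and slot->backlog straight.)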
>
> +enqueue:
> sch->qstats.backlog += qdisc_pkt_len(skb);
> + slot->backlog += qdisc_pkt_len(skb);
> slot_queue_add(slot, skb);
> sfq_inc(q, x);
> if (slot->qlen == 1) { /* The flow is new */
> @@ -396,6 +475,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> slot->next = q->tail->next;
> q->tail->next = x;
> }
> + /* We could use a bigger initial quantum for new flows */
> slot->allot = q->scaled_quantum;
> }
> if (++sch->q.qlen <= q->limit)
> @@ -439,7 +519,7 @@ next_slot:
> qdisc_bstats_update(sch, skb);
> sch->q.qlen--;
> sch->qstats.backlog -= qdisc_pkt_len(skb);
> -
> + slot->backlog -= qdisc_pkt_len(skb);
> /* Is the slot empty? */
> if (slot->qlen == 0) {
> q->ht[slot->hash] = SFQ_EMPTY_SLOT;
> @@ -490,6 +570,8 @@ static void sfq_rehash(struct Qdisc *sch)
> sfq_dec(q, i);
> __skb_queue_tail(&list, skb);
> }
> + slot->backlog = 0;
> + red_set_vars(&slot->vars);
> q->ht[slot->hash] = SFQ_EMPTY_SLOT;
> }
> q->tail = NULL;
> @@ -514,6 +596,11 @@ drop: sch->qstats.backlog -= qdisc_pkt_len(skb);
> if (slot->qlen >= q->maxdepth)
> goto drop;
> slot_queue_add(slot, skb);
> + if (q->red_parms)
> + slot->vars.qavg = red_calc_qavg(q->red_parms,
> + &slot->vars,
> + slot->backlog);
> + slot->backlog += qdisc_pkt_len(skb);
> sfq_inc(q, x);
> if (slot->qlen == 1) { /* The flow is new */
> if (q->tail == NULL) { /* It is the first flow */
> @@ -552,6 +639,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
> struct tc_sfq_qopt *ctl = nla_data(opt);
> struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
> unsigned int qlen;
> + struct red_parms *p = NULL;
>
> if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
> return -EINVAL;
> @@ -560,7 +648,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
> if (ctl->divisor &&
> (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
> return -EINVAL;
> -
> + if (ctl_v1 && ctl_v1->qth_min) {
> + p = kmalloc(sizeof(*p), GFP_KERNEL);
> + if (!p)
> + return -ENOMEM;
> + }
> sch_tree_lock(sch);
> if (ctl->quantum) {
> q->quantum = ctl->quantum;
> @@ -576,6 +668,16 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
> if (ctl_v1) {
> if (ctl_v1->depth)
> q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
> + if (p) {
> + swap(q->red_parms, p);
> + red_set_parms(q->red_parms,
> + ctl_v1->qth_min, ctl_v1->qth_max,
> + ctl_v1->Wlog,
> + ctl_v1->Plog, ctl_v1->Scell_log,
> + NULL,
> + ctl_v1->max_P);
> + }
> + q->flags = ctl_v1->flags;
> q->headdrop = ctl_v1->headdrop;
> }
> if (ctl->limit) {
> @@ -594,6 +696,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
> q->perturbation = net_random();
> }
> sch_tree_unlock(sch);
> + kfree(p);
> return 0;
> }
>
> @@ -625,6 +728,7 @@ static void sfq_destroy(struct Qdisc *sch)
> del_timer_sync(&q->perturb_timer);
> sfq_free(q->ht);
> sfq_free(q->slots);
> + kfree(q->red_parms);
> }
>
> static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
> @@ -683,6 +787,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
> struct sfq_sched_data *q = qdisc_priv(sch);
> unsigned char *b = skb_tail_pointer(skb);
> struct tc_sfq_qopt_v1 opt;
> + struct red_parms *p = q->red_parms;
>
> memset(&opt, 0, sizeof(opt));
> opt.v0.quantum = q->quantum;
> @@ -693,6 +798,17 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
> opt.depth = q->maxdepth;
> opt.headdrop = q->headdrop;
>
> + if (p) {
> + opt.qth_min = p->qth_min >> p->Wlog;
> + opt.qth_max = p->qth_max >> p->Wlog;
> + opt.Wlog = p->Wlog;
> + opt.Plog = p->Plog;
> + opt.Scell_log = p->Scell_log;
> + opt.max_P = p->max_P;
> + }
> + memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
> + opt.flags = q->flags;
> +
> NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
>
> return skb->len;
> @@ -747,15 +863,13 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
> sfq_index idx = q->ht[cl - 1];
> struct gnet_stats_queue qs = { 0 };
> struct tc_sfq_xstats xstats = { 0 };
> - struct sk_buff *skb;
>
> if (idx != SFQ_EMPTY_SLOT) {
> const struct sfq_slot *slot = &q->slots[idx];
>
> xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
> qs.qlen = slot->qlen;
> - slot_queue_walk(slot, skb)
> - qs.backlog += qdisc_pkt_len(skb);
> + qs.backlog = slot->backlog;
> }
> if (gnet_stats_copy_queue(d, &qs) < 0)
> return -1;
>
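One nice side effect: with slot->backlog cached, per-flow backlog no
longer needs a walk of the whole slot queue, so

    tc -s class show dev $DEV

should get cheaper when backlogs are large.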
>
--
Dave Täht
SKYPE: davetaht
US Tel: 1-239-829-5608
FR Tel: 0638645374
http://www.bufferbloat.net