[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1294207345.3420.43.camel@edumazet-laptop>
Date: Wed, 05 Jan 2011 07:02:25 +0100
From: Eric Dumazet <eric.dumazet@...il.com>
To: Stephen Hemminger <shemminger@...tta.com>
Cc: David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [RFC] sched: CHOKe packet scheduler
Le mardi 04 janvier 2011 à 16:29 -0800, Stephen Hemminger a écrit :
> This implements the CHOKe packet scheduler based on the existing
> Linux RED scheduler based on the algorithm described in the paper.
> Configuration is the same as RED; only the name changes.
>
> The core idea is:
> For every packet arrival:
> Calculate Qave
> if (Qave < minth) {
> Queue the new packet
> }
> Else {
> Select randomly a packet from the queue for their flow id
> Compare arriving packet with a randomly selected packet.
> If they have the same flow id {
> Drop both the packets
> }
> Else {
> if (Qave ≥ maxth) {
You mean if (Qave is less than maxth)?
> Calculate the dropping probability pa
> Drop the packet with probability pa
> }
> Else {
> Drop the new packet
> }
> }
> }
>
> This an early access version.
>
No ECN support at all? Even RED supports it :)
> Signed-off-by: Stephen Hemminger <shemminger@...tta.com>
>
> ---
> net/sched/Kconfig | 11 +
> net/sched/Makefile | 1
> net/sched/sch_choke.c | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 376 insertions(+)
>
> --- a/net/sched/Kconfig 2011-01-04 16:25:18.000000000 -0800
> +++ b/net/sched/Kconfig 2011-01-04 16:26:02.335973715 -0800
> @@ -205,6 +205,17 @@ config NET_SCH_DRR
>
> If unsure, say N.
>
> +config NET_SCH_CHOKE
> + tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
> + help
> + Say Y here if you want to use the CHOKe packet scheduler (CHOose
> + and Keep for responsive flows, CHOose and Kill for unresponsive
> + flows). This is a variation of RED which trys to penalize flows
> + that monopolize the queue.
> +
> + To compile this code as a module, choose M here: the
> + module will be called sch_choke.
> +
> config NET_SCH_INGRESS
> tristate "Ingress Qdisc"
> depends on NET_CLS_ACT
> --- a/net/sched/Makefile 2011-01-04 16:25:18.000000000 -0800
> +++ b/net/sched/Makefile 2011-01-04 16:26:16.048938937 -0800
> @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_mult
> obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
> obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
> obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
> +obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
> obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
> obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
> obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ b/net/sched/sch_choke.c 2011-01-04 16:25:33.913971468 -0800
> @@ -0,0 +1,364 @@
> +/*
> + * net/sched/sch_choke.c CHOKE scheduler
> + *
> + * Copyright (c) 2011 Stephen Hemminger <shemminger@...tta.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/skbuff.h>
> +#include <linux/ipv6.h>
> +#include <linux/jhash.h>
> +#include <net/pkt_sched.h>
> +#include <net/ip.h>
> +#include <net/red.h>
> +#include <net/ipv6.h>
> +
> +/* CHOKe stateless AQM for fair bandwidth allocation
> + =================================================
> +
> + Source:
> + R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
> + Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
> + IEEE INFOCOM, 2000.
> +
> + A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
> + Characteristics", IEEE/ACM Transactions on Networking, 2004
> +
> + ADVANTAGE:
> + - Penalizes unfair flows
> + - Random drop provide gradual feedback
> +
> + DRAWBACKS:
> + - Small queue for single flow
> + - Can be gamed by opening lots of connections
> + - Hard to get correct paremeters (same problem as RED)
Big packets are really unfair. Must disable TSO/GRO :)
> +
> + */
> +
> +struct choke_sched_data
> +{
> + u32 limit;
> + unsigned char flags;
> +
> + struct red_parms parms;
> + struct red_stats stats;
> +};
> +
> +/* Select a packet at random from the list.
> + * Same caveats as skb_peek.
> + */
> +static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
> +{
> + struct sk_buff *skb = list->next;
> + unsigned int idx = net_random() % list->qlen;
> +
> + while (skb && idx-- > 0)
> + skb = skb->next;
Ouch... A linked list (using the skb anchors) is not an appropriate data
structure. That's too many cache line misses.
Maybe use an array of q->limit skb pointers instead?
> +
> + return skb;
> +}
> +
> +/* Given IP header and size find src/dst port pair */
> +static inline u32 get_ports(const void *hdr, size_t hdr_size, int offset)
> +{
> + return *(u32 *)(hdr + hdr_size + offset);
> +}
> +
> +
> +static bool same_flow(struct sk_buff *nskb, const struct sk_buff *oskb)
> +{
> + if (nskb->protocol != oskb->protocol)
> + return false;
> +
> + switch (nskb->protocol) {
> + case htons(ETH_P_IP):
> + {
> + const struct iphdr *iph1, *iph2;
> + int poff;
> +
> + if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
> + return false;
Why isn't it necessary to also do the may_pull test on oskb?
> +
> + iph1 = ip_hdr(nskb);
> + iph2 = ip_hdr(oskb);
> +
> + if (iph1->protocol != iph2->protocol ||
> + iph1->daddr != iph2->daddr ||
> + iph1->saddr != iph2->saddr)
> + return false;
> +
> + /* Be hostile to new fragmented packets */
> + if (iph1->frag_off & htons(IP_MF|IP_OFFSET))
> + return true;
> +
> + if (iph2->frag_off & htons(IP_MF|IP_OFFSET))
> + return false;
> +
> + poff = proto_ports_offset(iph1->protocol);
> + if (poff >= 0 &&
> + pskb_network_may_pull(nskb, iph1->ihl * 4 + 4 + poff)) {
> + iph1 = ip_hdr(nskb);
> +
> + return get_ports(iph1, iph1->ihl * 4, poff)
> + == get_ports(iph2, iph2->ihl * 4, poff);
> + }
> +
> + return false;
> + }
> +
> + case htons(ETH_P_IPV6):
> + {
> + const struct ipv6hdr *iph1, *iph2;
> + int poff;
> +
> + if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
> + return false;
Same question here: why is there no may_pull test on oskb?
> +
> + iph1 = ipv6_hdr(nskb);
> + iph2 = ipv6_hdr(oskb);
> +
> + if (iph1->nexthdr != iph2->nexthdr ||
> + ipv6_addr_cmp(&iph1->daddr, &iph2->daddr) != 0 ||
> + ipv6_addr_cmp(&iph1->saddr, &iph2->saddr) != 0)
> + return false;
> +
> + poff = proto_ports_offset(iph1->nexthdr);
> + if (poff >= 0 &&
> + pskb_network_may_pull(nskb, sizeof(*iph1) + 4 + poff)) {
> + iph1 = ipv6_hdr(nskb);
> +
> + return get_ports(iph1, sizeof(*iph1), poff)
> + == get_ports(iph2, sizeof(*iph2), poff);
> + }
> + return false;
> + }
> + default:
> + return false;
> + }
> +
> +}
> +
> +/*
> + * Decide what to do with new packet based on queue size.
> + * returns 1 if packet should be admitted
> + * 0 if packet should be dropped
> + */
> +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> + struct choke_sched_data *q = qdisc_priv(sch);
> + struct red_parms *p = &q->parms;
> +
> + p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
> + if (red_is_idling(p))
> + red_end_of_idle_period(p);
> +
> + if (p->qavg <= p->qth_min)
> + p->qcount = -1;
> + else {
> + struct sk_buff *oskb;
> +
> + /* Draw a packet at random from queue */
> + oskb = skb_peek_random(&sch->q);
> +
> + /* Both packets from same flow? */
> + if (same_flow(skb, oskb)) {
> + /* Drop both packets */
> + __skb_unlink(oskb, &sch->q);
> + qdisc_drop(oskb, sch);
> + goto congestion_drop;
> + }
> +
> + if (p->qavg > p->qth_max) {
OK, maybe the CHOKe paper used: if (p->qavg >= p->qth_max)?
> + p->qcount = -1;
> +
> + sch->qstats.overlimits++;
> + q->stats.forced_drop++;
> + goto congestion_drop;
> + }
> +
> + if (++p->qcount) {
> + if (red_mark_probability(p, p->qavg)) {
> + p->qcount = 0;
> + p->qR = red_random(p);
> +
> + sch->qstats.overlimits++;
> + q->stats.prob_drop++;
> + goto congestion_drop;
> + }
> + } else
> + p->qR = red_random(p);
> + }
> +
> + /* Admit new packet */
> + if (likely(skb_queue_len(&sch->q) < q->limit))
> + return qdisc_enqueue_tail(skb, sch);
> +
> + q->stats.pdrop++;
> + sch->qstats.drops++;
> + kfree_skb(skb);
> + return NET_XMIT_DROP;
> +
> + congestion_drop:
> + qdisc_drop(skb, sch);
> + return NET_XMIT_CN;
> +}
> +
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists