netdev - Re: [RFC] sched: CHOKe packet scheduler

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1294207345.3420.43.camel@edumazet-laptop>
Date:	Wed, 05 Jan 2011 07:02:25 +0100
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Stephen Hemminger <shemminger@...tta.com>
Cc:	David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [RFC] sched: CHOKe packet scheduler

Le mardi 04 janvier 2011 à 16:29 -0800, Stephen Hemminger a écrit :
> This implements the CHOKe packet scheduler. It is built on the existing
> Linux RED scheduler and follows the algorithm described in the paper.
> Configuration is the same as RED; only the name changes.
> 
> The core idea is:
>   For every packet arrival:
>   	Calculate Qave
> 	if (Qave < minth) {
> 	   Queue the new packet
> 	}
> 	Else {
> 	     Select randomly a packet from the queue for their flow id
> 	     Compare arriving packet with a randomly selected packet.
> 	     If they have the same flow id {
> 	     	Drop both the packets
> 	     }
> 	     Else {
> 	     	  if (Qave ≥ maxth) {

you mean if (Qave is less than maxth) ?

> 		     Calculate the dropping probability pa
> 		     Drop the packet with probability pa
> 		  }
> 		  Else {
> 		     Drop the new packet
> 		  }
> 	     }
>        }
> 
> This is an early access version.
> 

No ECN support at all ? even RED supports it :)

> Signed-off-by: Stephen Hemminger <shemminger@...tta.com>
> 
> ---
>  net/sched/Kconfig     |   11 +
>  net/sched/Makefile    |    1 
>  net/sched/sch_choke.c |  364 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 376 insertions(+)
> 
> --- a/net/sched/Kconfig	2011-01-04 16:25:18.000000000 -0800
> +++ b/net/sched/Kconfig	2011-01-04 16:26:02.335973715 -0800
> @@ -205,6 +205,17 @@ config NET_SCH_DRR
>  
>  	  If unsure, say N.
>  
> +config NET_SCH_CHOKE
> +	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
> +	help
> +	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
> +	  and Keep for responsive flows, CHOose and Kill for unresponsive
> +	  flows). This is a variation of RED which tries to penalize flows
> +	  that monopolize the queue.
> +
> +	  To compile this code as a module, choose M here: the
> +	  module will be called sch_choke.
> +
>  config NET_SCH_INGRESS
>  	tristate "Ingress Qdisc"
>  	depends on NET_CLS_ACT
> --- a/net/sched/Makefile	2011-01-04 16:25:18.000000000 -0800
> +++ b/net/sched/Makefile	2011-01-04 16:26:16.048938937 -0800
> @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
>  obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
>  obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
> +obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
>  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
>  obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ b/net/sched/sch_choke.c	2011-01-04 16:25:33.913971468 -0800
> @@ -0,0 +1,364 @@
> +/*
> + * net/sched/sch_choke.c	CHOKE scheduler
> + *
> + * Copyright (c) 2011 Stephen Hemminger <shemminger@...tta.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/skbuff.h>
> +#include <linux/ipv6.h>
> +#include <linux/jhash.h>
> +#include <net/pkt_sched.h>
> +#include <net/ip.h>
> +#include <net/red.h>
> +#include <net/ipv6.h>
> +
> +/*	CHOKe stateless AQM for fair bandwidth allocation
> +        =================================================
> +
> +	Source:
> +	R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
> +	Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
> +	IEEE INFOCOM, 2000.
> +
> +	A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
> +	Characteristics", IEEE/ACM Transactions on Networking, 2004
> +
> +	ADVANTAGE:
> +	- Penalizes unfair flows
> +	- Random drops provide gradual feedback
> +
> +	DRAWBACKS:
> +	- Small queue for single flow
> +	- Can be gamed by opening lots of connections
> +	- Hard to get correct parameters (same problem as RED)

	Big packets are really unfair. Must disable TSO/GRO :)

> +
> + */
> +
> +struct choke_sched_data
> +{
> +	u32		  limit;
> +	unsigned char	  flags;
> +
> +	struct red_parms  parms;
> +	struct red_stats  stats;
> +};
> +
> +/* Select a packet at random from the list.
> + * Same caveats as skb_peek.
> + */
> +static struct sk_buff *skb_peek_random(struct sk_buff_head *list)
> +{
> +	struct sk_buff *skb = list->next;
> +	unsigned int idx = net_random() % list->qlen;
> +
> +	while (skb && idx-- > 0)
> +		skb = skb->next;

Ouch... A linked list (using the skb anchors) is not an appropriate data
structure. That's too many cache line misses.

Maybe using a q->limit array of skb pointers ?

> +
> +	return skb;
> +}
> +
> +/* Given IP header and size find src/dst port pair */
> +static inline u32 get_ports(const void *hdr, size_t hdr_size, int offset)
> +{
> +	return *(u32 *)(hdr + hdr_size + offset);
> +}
> +
> +
> +static bool same_flow(struct sk_buff *nskb, const struct sk_buff *oskb)
> +{
> +	if (nskb->protocol != oskb->protocol)
> +		return false;
> +
> +	switch (nskb->protocol) {
> +	case htons(ETH_P_IP):
> +	{
> +		const struct iphdr *iph1, *iph2;
> +		int poff;
> +
> +		if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
> +			return false;

Why isn't it necessary to also do the may_pull test on oskb?

> +
> +		iph1 = ip_hdr(nskb);
> +		iph2 = ip_hdr(oskb);
> +
> +		if (iph1->protocol != iph2->protocol ||
> +		    iph1->daddr != iph2->daddr ||
> +		    iph1->saddr != iph2->saddr)
> +			return false;
> +
> +		/* Be hostile to new fragmented packets */
> +		if (iph1->frag_off & htons(IP_MF|IP_OFFSET))
> +			return true;
> +
> +		if (iph2->frag_off & htons(IP_MF|IP_OFFSET))
> +			return false;
> +
> +		poff = proto_ports_offset(iph1->protocol);
> +		if (poff >= 0 &&
> +		    pskb_network_may_pull(nskb, iph1->ihl * 4 + 4 + poff)) {
> +			iph1 = ip_hdr(nskb);
> +
> +			return get_ports(iph1, iph1->ihl * 4, poff)
> +				== get_ports(iph2, iph2->ihl * 4, poff);
> +		}
> +
> +		return false;
> +	}
> +
> +	case htons(ETH_P_IPV6):
> +	{
> +		const struct ipv6hdr *iph1, *iph2;
> +		int poff;
> +
> +		if (!pskb_network_may_pull(nskb, sizeof(*iph1)))
> +			return false;

same here.

> +
> +		iph1 = ipv6_hdr(nskb);
> +		iph2 = ipv6_hdr(oskb);
> +
> +		if (iph1->nexthdr != iph2->nexthdr ||
> +		    ipv6_addr_cmp(&iph1->daddr, &iph2->daddr) != 0 ||
> +		    ipv6_addr_cmp(&iph1->saddr, &iph2->saddr) != 0)
> +			return false;
> +
> +		poff = proto_ports_offset(iph1->nexthdr);
> +		if (poff >= 0 &&
> +		    pskb_network_may_pull(nskb, sizeof(*iph1) + 4 + poff)) {
> +			iph1 = ipv6_hdr(nskb);
> +
> +			return get_ports(iph1, sizeof(*iph1), poff)
> +				== get_ports(iph2, sizeof(*iph2), poff);
> +		}
> +		return false;
> +	}
> +	default:
> +		return false;
> +	}
> +
> +}
> +
> +/*
> + * Decide what to do with new packet based on queue size.
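> + * Returns the qdisc_enqueue_tail() result if the packet is admitted,
> + *         NET_XMIT_DROP if the queue is already at its limit,
> + *         NET_XMIT_CN if the packet is dropped because of congestion.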
> + */
> +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct red_parms *p = &q->parms;
> +
> +	p->qavg = red_calc_qavg(p, skb_queue_len(&sch->q));
> +	if (red_is_idling(p))
> +		red_end_of_idle_period(p);
> +
> +	if (p->qavg <= p->qth_min)
> +		p->qcount = -1;
> +	else {
> +		struct sk_buff *oskb;
> +
> +		/* Draw a packet at random from queue */
> +		oskb = skb_peek_random(&sch->q);
> +
> +		/* Both packets from same flow? */
> +		if (same_flow(skb, oskb)) {
> +			/* Drop both packets */
> +			__skb_unlink(oskb, &sch->q);
> +			qdisc_drop(oskb, sch);
> +			goto congestion_drop;
> +		}
> +
> +		if (p->qavg > p->qth_max) {

ok, maybe CHOKe paper used : if (p->qavg >= p->qth_max)  ?

> +			p->qcount = -1;
> +
> +			sch->qstats.overlimits++;
> +			q->stats.forced_drop++;
> +			goto congestion_drop;
> +		}
> +
> +		if (++p->qcount) {
> +			if (red_mark_probability(p, p->qavg)) {
> +				p->qcount = 0;
> +				p->qR = red_random(p);
> +
> +				sch->qstats.overlimits++;
> +				q->stats.prob_drop++;
> +				goto congestion_drop;
> +			}
> +		} else
> +			p->qR = red_random(p);
> +	}
> +
> +	/* Admit new packet */
> +	if (likely(skb_queue_len(&sch->q) < q->limit))
> +		return qdisc_enqueue_tail(skb, sch);
> +
> +	q->stats.pdrop++;
> +	sch->qstats.drops++;
> +	kfree_skb(skb);
> +	return NET_XMIT_DROP;
> +
> + congestion_drop:
> +	qdisc_drop(skb, sch);
> +	return NET_XMIT_CN;
> +}
> +

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html