netdev - Re: [PATCH] CHOKe flow scheduler (0.7)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1294941621.3570.419.camel@edumazet-laptop>
Date:	Thu, 13 Jan 2011 19:00:21 +0100
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Stephen Hemminger <shemminger@...tta.com>
Cc:	David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: Re: [PATCH] CHOKe flow scheduler (0.7)

Le jeudi 13 janvier 2011 à 09:27 -0800, Stephen Hemminger a écrit :
> This implements the CHOKe packet scheduler. It is built on the existing
> Linux RED scheduler and follows the algorithm described in the paper.
> 
> The core idea is:
>   For every packet arrival:
>   	Calculate Qave
> 	if (Qave < minth) 
> 	     Queue the new packet
> 	else 
> 	     Select randomly a packet from the queue 
> 	     if (both packets from same flow)
> 	     then Drop both the packets
> 	     else if (Qave > maxth)
> 	          Drop packet
> 	     else
> 	       	  Admit packet with probability p (same as RED)
> 
> See also:
>   Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
>    queue management scheme for approximating fair bandwidth allocation", 
>   Proceedings of INFOCOM'2000, March 2000.
> 
> Signed-off-by: Stephen Hemminger <shemminger@...tta.com>
> 
> ---
> Patch versions
> 0.3 (Eric) uses table for queue.
> 0.4 allows classification with TC filters
>     fixes crash when peek_random() finds a hole
> 0.5 (Eric) that fixes qlen with holes and peek
> 0.7 change to use separate params / stats than RED
>     account for drops in backlog
> 
> Almost ready, still need to make sure API (netlink) is right
> 
> 
>  net/sched/Kconfig     |   11 +
>  net/sched/Makefile    |    1 
>  net/sched/sch_choke.c |  536 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 548 insertions(+)
> 
> --- a/net/sched/Kconfig	2011-01-12 17:44:05.747500044 -0800
> +++ b/net/sched/Kconfig	2011-01-12 17:44:53.167735188 -0800
> @@ -205,6 +205,17 @@ config NET_SCH_DRR
>  
>  	  If unsure, say N.
>  
> +config NET_SCH_CHOKE
> +	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
> +	help
> +	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
> +	  and Keep for responsive flows, CHOose and Kill for unresponsive
> +	  flows). This is a variation of RED which tries to penalize flows
> +	  that monopolize the queue.
> +
> +	  To compile this code as a module, choose M here: the
> +	  module will be called sch_choke.
> +
>  config NET_SCH_INGRESS
>  	tristate "Ingress Qdisc"
>  	depends on NET_CLS_ACT
> --- a/net/sched/Makefile	2011-01-12 17:44:05.767500135 -0800
> +++ b/net/sched/Makefile	2011-01-12 17:44:53.167735188 -0800
> @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
>  obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
>  obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
> +obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
>  obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
>  obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ b/net/sched/sch_choke.c	2011-01-12 17:45:07.227806180 -0800
> @@ -0,0 +1,556 @@
> +/*
> + * net/sched/sch_choke.c	CHOKE scheduler
> + *
> + * Copyright (c) 2011 Stephen Hemminger <shemminger@...tta.com>
> + * Copyright (c) 2011 Eric Dumazet <eric.dumazet@...il.com>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/module.h>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/skbuff.h>
> +#include <linux/reciprocal_div.h>
> +#include <net/pkt_sched.h>
> +#include <net/inet_ecn.h>
> +#include <net/red.h>
> +
> +/*	CHOKe stateless AQM for fair bandwidth allocation
> +        =================================================
> +
> +   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
> +   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
> +   maintains no flow state. The difference from RED is an additional step
> +   during the enqueuing process. If average queue size is over the
> +   low threshold (qmin), a packet is chosen at random from the queue.
> +   If both the new and chosen packet are from the same flow, both
> +   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
> +   needs to access packets in queue randomly. It has a minimal class
> +   interface to allow overriding the builtin flow classifier with
> +   filters.
> +
> +   Source:
> +   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
> +   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
> +   IEEE INFOCOM, 2000.
> +
> +   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
> +   Characteristics", IEEE/ACM Transactions on Networking, 2004
> +
> + */
> +
> +/* Upper bound on size of sk_buff table */
> +#define CHOKE_MAX_QUEUE	(128*1024 - 1)
> +
> +struct choke_sched_data {
> +/* Parameters */
> +	u32		 limit;
> +	unsigned char	 flags;
> +
> +	struct red_parms parms;
> +
> +/* Variables */
> +	struct tcf_proto *filter_list;
> +	struct {
> +		u32	prob_drop;	/* Early probability drops */
> +		u32	prob_mark;	/* Early probability marks */
> +		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
> +		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
> +		u32	pdrop;          /* Drops due to queue limits */
> +		u32	other;          /* Drops due to drop() calls */
> +		u32	matched;	/* Drops to flow match */
> +	} stats;
> +
> +	unsigned int	 head;
> +	unsigned int	 tail;
> +	unsigned int	 holes;
> +	unsigned int	 tab_mask; /* size - 1 */
> +
> +	struct sk_buff **tab;
> +};
> +
> +static inline unsigned int choke_len(const struct choke_sched_data *q)
> +{
> +	return (q->tail - q->head) & q->tab_mask;
> +}
> +
> +/* deliver a random number between 0 and N - 1 */
> +static inline u32 random_N(unsigned int N)
> +{
> +	return reciprocal_divide(random32(), N);
> +}
> +
> +/* Select a packet at random from the queue in O(1) and handle holes */
> +static struct sk_buff *choke_peek_random(struct choke_sched_data *q,
> +					 unsigned int *pidx)
> +{
> +	struct sk_buff *skb;
> +	int retrys = 3;
> +
> +	do {
> +		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
> +		skb = q->tab[*pidx];
> +		if (skb)
> +			return skb;
> +	} while (--retrys > 0);
> +
> +	/* queue has lots of holes; use the head, which is known to exist */
> +	return q->tab[*pidx = q->head];
> +}
> +
> +/* Is ECN parameter configured */
> +static inline int use_ecn(const struct choke_sched_data *q)
> +{
> +	return q->flags & TC_RED_ECN;
> +}
> +
> +/* Should packets over max just be dropped (versus marked) */
> +static inline int use_harddrop(const struct choke_sched_data *q)
> +{
> +	return q->flags & TC_RED_HARDDROP;
> +}
> +
> +/* Move head pointer forward to skip over holes */
> +static void choke_zap_head_holes(struct choke_sched_data *q)
> +{
> +	while (q->holes && q->tab[q->head] == NULL) {
> +		q->head = (q->head + 1) & q->tab_mask;
> +		q->holes--;
> +	}
> +}
> +
> +/* Move tail pointer backwards to reuse holes */
> +static void choke_zap_tail_holes(struct choke_sched_data *q)
> +{
> +	while (q->holes && q->tab[q->tail - 1] == NULL) {
> +		q->tail = (q->tail - 1) & q->tab_mask;
> +		q->holes--;
> +	}
> +}
> +
> +/* Drop packet from queue array by creating a "hole" */
> +static void choke_drop_by_idx(struct choke_sched_data *q, unsigned int idx)
> +{
> +	q->tab[idx] = NULL;
> +	q->holes++;
> +
> +	if (idx == q->head)
> +		choke_zap_head_holes(q);
> +	if (idx == q->tail)
> +		choke_zap_tail_holes(q);
> +}
> +
> +/* Classify flow using either:
> +   1. pre-existing classification result in skb
> +   2. fast internal classification
> +   3. use TC filter based classification
> +*/
> +static inline unsigned int choke_classify(struct sk_buff *skb,
> +					  struct Qdisc *sch, int *qerr)
> +
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct tcf_result res;
> +	int result;
> +
> +	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> +
> +	if (TC_H_MAJ(skb->priority) == sch->handle &&
> +	    TC_H_MIN(skb->priority) > 0)
> +		return TC_H_MIN(skb->priority);
> +
> +	if (!q->filter_list)
> +		return skb_get_rxhash(skb);
> +
> +	result = tc_classify(skb, q->filter_list, &res);
> +	if (result >= 0) {
> +#ifdef CONFIG_NET_CLS_ACT
> +		switch (result) {
> +		case TC_ACT_STOLEN:
> +		case TC_ACT_QUEUED:
> +			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> +		case TC_ACT_SHOT:
> +			return 0;
> +		}
> +#endif
> +		return TC_H_MIN(res.classid);
> +	}
> +
> +	return 0;
> +}
> +
> +static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct red_parms *p = &q->parms;
> +	unsigned int hash;
> +	int uninitialized_var(ret);
> +
> +	hash = choke_classify(skb, sch, &ret);
> +	if (!hash) {
> +		/* Packet was eaten by filter */
> +		if (ret & __NET_XMIT_BYPASS)
> +			sch->qstats.drops++;
> +		kfree_skb(skb);
> +		return ret;
> +	}
> +
> +	/* Maybe add hash as field in struct qdisc_skb_cb? */
> +	*(unsigned int *)(qdisc_skb_cb(skb)->data) = hash;
> +
> +	/* Compute average queue usage (see RED) */
> +	p->qavg = red_calc_qavg(p, choke_len(q) - q->holes);
> +	if (red_is_idling(p))
> +		red_end_of_idle_period(p);
> +
> +	/* Is queue small? */
> +	if (p->qavg <= p->qth_min)
> +		p->qcount = -1;
> +	else {
> +		struct sk_buff *oskb;
> +		unsigned int idx;
> +
> +		/* Draw a packet at random from queue */
> +		oskb = choke_peek_random(q, &idx);
> +
> +		/* Both packets from same flow ? */
> +		if (*(unsigned int *)(qdisc_skb_cb(oskb)->data) == hash) {
> +			/* Drop both packets */
> +			q->stats.matched++;
> +			choke_drop_by_idx(q, idx);
> +			qdisc_drop(oskb, sch);

I feel we should add : sch->q.qlen--;

> +			goto congestion_drop;
> +		}
> +
> +		/* Queue is large, always mark/drop */
> +		if (p->qavg > p->qth_max) {
> +			p->qcount = -1;
> +
> +			sch->qstats.overlimits++;
> +			if (use_harddrop(q) || !use_ecn(q) ||
> +			    !INET_ECN_set_ce(skb)) {
> +				q->stats.forced_drop++;
> +				goto congestion_drop;
> +			}
> +
> +			q->stats.forced_mark++;
> +		} else if (++p->qcount) {
> +			if (red_mark_probability(p, p->qavg)) {
> +				p->qcount = 0;
> +				p->qR = red_random(p);
> +
> +				sch->qstats.overlimits++;
> +				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
> +					q->stats.prob_drop++;
> +					goto congestion_drop;
> +				}
> +
> +				q->stats.prob_mark++;
> +			}
> +		} else
> +			p->qR = red_random(p);
> +	}
> +
> +	/* Admit new packet */
> +	if (likely(choke_len(q) < q->limit)) {
> +
> +		q->tab[q->tail] = skb;
> +		q->tail = (q->tail + 1) & q->tab_mask;
> +
> +		sch->qstats.backlog += qdisc_pkt_len(skb);
> +		qdisc_update_bstats(sch, skb);
> +		sch->q.qlen = choke_len(q) - q->holes;
	or : sch->q.qlen++;

(If sch->q.qlen is kept up to date, as per the comment above)

> +		return NET_XMIT_SUCCESS;
> +	}
> +
> +	q->stats.pdrop++;
> +	sch->qstats.drops++;
> +	kfree_skb(skb);
> +	return NET_XMIT_DROP;
> +
> + congestion_drop:
> +	qdisc_drop(skb, sch);
> +	return NET_XMIT_CN;
> +}
> +
> +static struct sk_buff *choke_dequeue(struct Qdisc *sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	struct sk_buff *skb;
> +
> +	if (q->head == q->tail) {
> +		if (!red_is_idling(&q->parms))
> +			red_start_of_idle_period(&q->parms);
> +		return NULL;
> +	}
> +	skb = q->tab[q->head];
> +	q->tab[q->head] = NULL; /* not really needed */
> +	q->head = (q->head + 1) & q->tab_mask;
> +	choke_zap_head_holes(q);
> +	sch->qstats.backlog -= qdisc_pkt_len(skb);
> +	sch->q.qlen = choke_len(q) - q->holes;

	sch->q.qlen--;

> +
> +	return skb;
> +}
> +
> +static unsigned int choke_drop(struct Qdisc *sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +	unsigned int len;
> +
> +	len = qdisc_queue_drop(sch);
> +	if (len > 0)
> +		q->stats.other++;
> +	else {
> +		if (!red_is_idling(&q->parms))
> +			red_start_of_idle_period(&q->parms);
> +	}
> +
> +	return len;
> +}
> +
> +static void choke_reset(struct Qdisc* sch)
> +{
> +	struct choke_sched_data *q = qdisc_priv(sch);
> +
> +	red_restart(&q->parms);
> +}
> +
> +static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
> +	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
> +	[TCA_CHOKE_STAB]	= { .len = 256 },

RED_STAB_SIZE ?


Thanks !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html