lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200112173624.5f7b754b@cakuba>
Date:   Sun, 12 Jan 2020 17:36:24 -0800
From:   Jakub Kicinski <kuba@...nel.org>
To:     gautamramk@...il.com
Cc:     netdev@...r.kernel.org,
        "Mohit P. Tahiliani" <tahiliani@...k.edu.in>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        "David S . Miller" <davem@...emloft.net>,
        Dave Taht <dave.taht@...il.com>,
        Toke Høiland-Jørgensen <toke@...hat.com>,
        Leslie Monis <lesliemonis@...il.com>,
        "Sachin D . Patil" <sdp.sachin@...il.com>,
        "V . Saicharan" <vsaicharan1998@...il.com>,
        Mohit Bhasi <mohitbhasi1998@...il.com>
Subject: Re: [PATCH net-next v3 2/2] net: sched: add Flow Queue PIE packet
 scheduler

On Fri, 10 Jan 2020 11:56:57 +0530, gautamramk@...il.com wrote:
> From: "Mohit P. Tahiliani" <tahiliani@...k.edu.in>
> 
> Principles:
>   - Packets are classified on flows.
>   - This is a Stochastic model (as we use a hash, several flows might
>                                 be hashed on the same slot)
>   - Each flow has a PIE managed queue.
>   - Flows are linked onto two (Round Robin) lists,
>     so that new flows have priority on old ones.
>   - For a given flow, packets are not reordered.
>   - Drops during enqueue only.
>   - ECN capability is off by default.
>   - ECN threshold is at 10% by default.
>   - Uses timestamps to calculate queue delay by default.
> 
> Usage:
> tc qdisc ... fq_pie [ limit PACKETS ] [ flows NUMBER ]
>                     [ alpha NUMBER ] [ beta NUMBER ]
>                     [ target TIME us ] [ tupdate TIME us ]
>                     [ memory_limit BYTES ] [ quantum BYTES ]
>                     [ ecnprob PERCENTAGE ] [ [no]ecn ]
>                     [ [no]bytemode ] [ [no_]dq_rate_estimator ]
> 
> defaults:
>   limit: 10240 packets, flows: 1024
>   alpha: 1/8, beta : 5/4
>   target: 15 ms, tupdate: 15 ms (in jiffies)
>   memory_limit: 32 Mb, quantum: device MTU
>   ecnprob: 10%, ecn: off
>   bytemode: off, dq_rate_estimator: off

Some reviews below, but hopefully someone who knows more about qdiscs
will still review :)

> diff --git a/net/sched/Kconfig b/net/sched/Kconfig
> index b1e7ec726958..93e480069a52 100644
> --- a/net/sched/Kconfig
> +++ b/net/sched/Kconfig
> @@ -366,6 +366,18 @@ config NET_SCH_PIE
>  
>  	  If unsure, say N.
>  
> +config NET_SCH_FQ_PIE
> +	depends on NET_SCH_PIE
> +	tristate "Flow Queue Proportional Integral controller Enhanced (FQ-PIE)"
> +	help
> +	  Say Y here if you want to use the Flow Queue Proportional Integral
> +	  controller Enhanced (FQ-PIE) packet scheduling algorithm.
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called sch_fq_pie.

Worth mentioning the RFC here?

> +	  If unsure, say N.
> +
>  config NET_SCH_INGRESS
>  	tristate "Ingress/classifier-action Qdisc"
>  	depends on NET_CLS_ACT

> +/* Flow Queue PIE
> + *
> + * Principles:
> + *   - Packets are classified on flows.
> + *   - This is a Stochastic model (as we use a hash, several flows might
> + *                                 be hashed on the same slot)

s/on/to/ ?

> + *   - Each flow has a PIE managed queue.
> + *   - Flows are linked onto two (Round Robin) lists,
> + *     so that new flows have priority on old ones.
> + *   - For a given flow, packets are not reordered.
> + *   - Drops during enqueue only.
> + *   - ECN capability is off by default.
> + *   - ECN threshold is at 10% by default.
> + *   - Uses timestamps to calculate queue delay by default.
> + */

> +static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
> +			 struct netlink_ext_ack *extack)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *tb[TCA_FQ_PIE_MAX + 1];
> +	unsigned int len_dropped = 0;
> +	unsigned int num_dropped = 0;
> +	unsigned int qlen;

net/sched/sch_fq_pie.c:275:15: warning: variable ‘qlen’ set but not used [-Wunused-but-set-variable]
  275 |  unsigned int qlen;
      |               ^~~~

> +	int err;
> +
> +	if (!opt)
> +		return -EINVAL;
> +
> +	err = nla_parse_nested_deprecated(tb, TCA_FQ_PIE_MAX, opt,
> +					  fq_pie_policy, NULL);

Please use the non-deprecated version, and pass extack in.

> +	if (err < 0)
> +		return err;
> +
> +	sch_tree_lock(sch);
> +	if (tb[TCA_FQ_PIE_LIMIT]) {
> +		u32 limit = nla_get_u32(tb[TCA_FQ_PIE_LIMIT]);
> +
> +		q->p_params.limit = limit;
> +		sch->limit = limit;
> +	}
> +	if (tb[TCA_FQ_PIE_FLOWS]) {
> +		if (q->flows)
> +			return -EINVAL;
> +		q->flows_cnt = nla_get_u32(tb[TCA_FQ_PIE_FLOWS]);
> +		if (!q->flows_cnt ||
> +		    q->flows_cnt > 65536)
> +			return -EINVAL;

Gotta release the tree lock here. Please also consider adding extack
messages for these error conditions so users understand what went wrong.

> +	}
> +
> +	if (tb[TCA_FQ_PIE_ALPHA])
> +		q->p_params.alpha = nla_get_u32(tb[TCA_FQ_PIE_ALPHA]);
> +
> +	if (tb[TCA_FQ_PIE_BETA])
> +		q->p_params.beta = nla_get_u32(tb[TCA_FQ_PIE_BETA]);
> +
> +	/* convert from microseconds to pschedtime */
> +	if (tb[TCA_FQ_PIE_TARGET]) {
> +		/* target is in us */
> +		u32 target = nla_get_u32(tb[TCA_FQ_PIE_TARGET]);
> +
> +		/* convert to pschedtime */
> +		q->p_params.target =
> +		PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);

looks misaligned

> +	}
> +
> +	/* tupdate is in jiffies */
> +	if (tb[TCA_FQ_PIE_TUPDATE])
> +		q->p_params.tupdate =
> +		usecs_to_jiffies(nla_get_u32(tb[TCA_FQ_PIE_TUPDATE]));

ditto, and in couple other places, the continuation line should be more
indented than the beginning one

> +	if (tb[TCA_FQ_PIE_MEMORY_LIMIT])
> +		q->memory_limit = nla_get_u32(tb[TCA_FQ_PIE_MEMORY_LIMIT]);
> +	if (tb[TCA_FQ_PIE_QUANTUM])
> +		q->quantum = nla_get_u32(tb[TCA_FQ_PIE_QUANTUM]);
> +
> +	if (tb[TCA_FQ_PIE_ECN_PROB])
> +		q->ecn_prob = nla_get_u32(tb[TCA_FQ_PIE_ECN_PROB]);
> +
> +	if (tb[TCA_FQ_PIE_ECN])
> +		q->p_params.ecn = nla_get_u32(tb[TCA_FQ_PIE_ECN]);
> +
> +	if (tb[TCA_FQ_PIE_BYTEMODE])
> +		q->p_params.bytemode = nla_get_u32(tb[TCA_FQ_PIE_BYTEMODE]);
> +
> +	if (tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR])
> +		q->p_params.dq_rate_estimator =
> +		nla_get_u32(tb[TCA_FQ_PIE_DQ_RATE_ESTIMATOR]);
> +
> +	/* Drop excess packets if new limit is lower */
> +	qlen = sch->q.qlen;
> +	while (sch->q.qlen > sch->limit) {
> +		struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
> +
> +		kfree_skb(skb);
> +		len_dropped += qdisc_pkt_len(skb);
> +		num_dropped += 1;
> +	}
> +	qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped);
> +
> +	sch_tree_unlock(sch);
> +	return 0;
> +}
> +
> +static void fq_pie_timer(struct timer_list *t)
> +{
> +	struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer);
> +	struct Qdisc *sch = q->sch;
> +	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
> +	u16 idx;

Could you order variable declaration lines longest to shortest, 
if initialization depends on order and would break such order 
please move it out to the body of the function (applies to all
functions).

> +	spin_lock(root_lock);
> +
> +	for (idx = 0; idx < q->flows_cnt; idx++)
> +		pie_calculate_probability(&q->p_params, &q->flows[idx].vars,
> +					  q->flows[idx].backlog);
> +
> +	/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
> +	if (q->p_params.tupdate)
> +		mod_timer(&q->adapt_timer, jiffies + q->p_params.tupdate);
> +	spin_unlock(root_lock);
> +}
> +
> +static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt,
> +		       struct netlink_ext_ack *extack)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	int err;
> +	u16 idx;
> +
> +	pie_params_init(&q->p_params);
> +	sch->limit = 10 * 1024;
> +	q->p_params.limit = sch->limit;
> +	q->quantum = psched_mtu(qdisc_dev(sch));
> +	q->sch = sch;
> +	q->ecn_prob = 10;
> +	q->flows_cnt = 1024;
> +	q->memory_limit = 32 << 20;

SZ_32M ?

> +
> +	INIT_LIST_HEAD(&q->new_flows);
> +	INIT_LIST_HEAD(&q->old_flows);
> +
> +	timer_setup(&q->adapt_timer, fq_pie_timer, 0);
> +	mod_timer(&q->adapt_timer, jiffies + HZ / 2);

The timer should probably not be armed until flows are allocated

> +	if (opt) {
> +		int err = fq_pie_change(sch, opt, extack);

There is an err variable in outer scope already

> +		if (err)
> +			return err;
> +	}
> +
> +	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
> +	if (err)
> +		goto init_failure;
> +
> +	if (!q->flows) {

Can qdisc ->init really called multiple times for this to ever be
non-NULL?

> +		q->flows = kvcalloc(q->flows_cnt,
> +				    sizeof(struct fq_pie_flow),
> +				    GFP_KERNEL);
> +		if (!q->flows) {
> +			err = -ENOMEM;
> +			goto init_failure;
> +		}
> +		for (idx = 0; idx < q->flows_cnt; idx++) {
> +			struct fq_pie_flow *flow = q->flows + idx;
> +
> +			INIT_LIST_HEAD(&flow->flowchain);
> +			pie_vars_init(&flow->vars);
> +		}
> +	}
> +	return 0;
> +
> +init_failure:
> +	q->flows_cnt = 0;
> +
> +	return err;
> +}
> +
> +static int fq_pie_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +	struct nlattr *opts;
> +
> +	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
> +	if (!opts)
> +		goto nla_put_failure;

return -EMSGSIZE;

> +
> +	/* convert target from pschedtime to us */
> +	if (nla_put_u32(skb, TCA_FQ_PIE_LIMIT, sch->limit) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_FLOWS, q->flows_cnt) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_ALPHA, q->p_params.alpha) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_BETA, q->p_params.beta) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_TARGET,
> +			((u32)PSCHED_TICKS2NS(q->p_params.target)) /
> +			NSEC_PER_USEC) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_TUPDATE,
> +			jiffies_to_usecs(q->p_params.tupdate)) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_MEMORY_LIMIT, q->memory_limit) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_QUANTUM, q->quantum) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_ECN_PROB, q->ecn_prob) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_ECN, q->p_params.ecn) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_BYTEMODE, q->p_params.bytemode) ||
> +	    nla_put_u32(skb, TCA_FQ_PIE_DQ_RATE_ESTIMATOR,
> +			q->p_params.dq_rate_estimator))
> +		goto nla_put_failure;
> +
> +	return nla_nest_end(skb, opts);
> +
> +nla_put_failure:

nla_nest_cancel(skb, opts);

> +	return -1;

return -EMSGSIZE;

> +}

> +static void fq_pie_destroy(struct Qdisc *sch)
> +{
> +	struct fq_pie_sched_data *q = qdisc_priv(sch);
> +
> +	kvfree(q->flows);
> +	del_timer_sync(&q->adapt_timer);

You probably want to del_timer_sync before freeing the flows

> +}

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ