Message-ID: <CAK6E8=cwpq0MVK6fcrOa=wTX237xQiVP2xzh0aX-AtuSmzt-sw@mail.gmail.com>
Date: Fri, 31 Aug 2012 18:37:05 -0700
From: Yuchung Cheng <ycheng@...gle.com>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: codel@...ts.bufferbloat.net, Dave Taht <dave.taht@...il.com>,
Kathleen Nichols <nichols@...lere.com>,
netdev <netdev@...r.kernel.org>,
Nandita Dukkipati <nanditad@...gle.com>,
Tomas Hruby <thruby@...gle.com>
Subject: Re: [RFC v2] fq_codel : interval servo on hosts
On Fri, Aug 31, 2012 at 6:57 AM, Eric Dumazet <eric.dumazet@...il.com> wrote:
> On Fri, 2012-08-31 at 06:50 -0700, Eric Dumazet wrote:
>> On Thu, 2012-08-30 at 23:55 -0700, Eric Dumazet wrote:
>> > For locally generated TCP traffic (host), we can override the 100 ms
>> > interval value with the more accurate RTT estimate maintained by the
>> > TCP stack (tp->srtt).
>> >
>> > Datacenter workloads benefit from the shorter feedback loop (say, if
>> > the RTT is below 1 ms, we can react 100 times faster to congestion).
>> >
>> > Idea from Yuchung Cheng.
>> >
>>
>> The Linux patch would be the following:
>>
>> I'll do tests next week, but I am sending a raw patch right now if
>> anybody wants to try it.
>>
>> Presumably we want to adjust target as well.
>>
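If target is servoed too, one possibility (purely a sketch, not part of
this patch) is to keep the stock codel ratio of target = 5% of interval,
i.e. interval / 20, with some floor. A small userspace model of that
idea; servo_target() and the 1 ms floor are made up for illustration,
and CODEL_SHIFT == 10 is assumed as in include/net/codel.h:

/* Sketch only: target follows the servoed interval at the stock codel
 * ratio (5 ms target for the default 100 ms interval), with a
 * hypothetical 1 ms floor.
 */
#include <stdio.h>

#define CODEL_SHIFT	10	/* codel time unit: nanoseconds >> 10 */
#define NSEC_PER_USEC	1000ULL
#define NSEC_PER_MSEC	1000000ULL
#define MS2TIME(a)	((unsigned int)(((a) * NSEC_PER_MSEC) >> CODEL_SHIFT))

typedef unsigned int codel_time_t;

static codel_time_t servo_target(codel_time_t interval)
{
	codel_time_t target = interval / 20;	/* 5% of interval */
	codel_time_t min_target = MS2TIME(1);	/* hypothetical floor */

	return target > min_target ? target : min_target;
}

int main(void)
{
	unsigned int rtt_ms[] = { 100, 10, 1 };
	unsigned int i;

	for (i = 0; i < sizeof(rtt_ms) / sizeof(rtt_ms[0]); i++) {
		codel_time_t interval = MS2TIME(rtt_ms[i]);
		codel_time_t target = servo_target(interval);

		printf("interval %3u ms -> target %llu us\n", rtt_ms[i],
		       ((unsigned long long)target << CODEL_SHIFT) /
		       NSEC_PER_USEC);
	}
	return 0;
}

Whether a fixed floor is right, or whether it should instead track the
transmission time of one MTU-sized packet at the link rate, is left
open here.
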
>> To get more precise srtt values in the datacenter, we might avoid the
>> 'one jiffy slack' on small values in tcp_rtt_estimator(), where we force
>> m to be 1 before the scaling by 8:
>>
>>	if (m == 0)
>>		m = 1;
>>
>> We would only need to force the least significant bit of srtt to be
>> set, so that a valid estimate never reads as zero.
>>
Just curious: tp->srtt is a very rough estimator; e.g., delayed ACKs
can easily add 40-200 ms of fuzziness. Will this affect short flows?
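
To see what forcing only the least significant bit of srtt would buy,
here is a small userspace model (a sketch that ignores the mdev/rttvar
side of tcp_rtt_estimator(); HZ == 1000 is assumed) comparing the
current one-jiffy floor on m with LSB-only forcing, for a flow whose
RTT rounds down to 0 jiffies:

/* Sketch: srtt update (in jiffies, scaled by 8) for a sub-jiffy RTT,
 * with the current "force m to 1 jiffy" slack vs. forcing only the
 * least significant bit of srtt.
 */
#include <stdio.h>

#define HZ 1000		/* assumed jiffy rate for the example */

int main(void)
{
	unsigned int srtt_floor = 0;	/* variant A: m forced to 1 jiffy */
	unsigned int srtt_lsb = 0;	/* variant B: only srtt's LSB forced */
	unsigned int m = 0;		/* measured RTT rounds down to 0 jiffies */
	int i;

	for (i = 0; i < 20; i++) {
		unsigned int mf = m ? m : 1;	/* the one-jiffy slack */

		if (srtt_floor)
			srtt_floor += mf - (srtt_floor >> 3); /* 7/8 old + 1/8 new */
		else
			srtt_floor = mf << 3;		/* first sample */

		if (srtt_lsb)
			srtt_lsb += m - (srtt_lsb >> 3);
		else
			srtt_lsb = m << 3;
		srtt_lsb |= 1;		/* keep srtt non zero, 1/8 jiffy cost */
	}
	printf("one-jiffy floor: srtt = %u/8 jiffies (%u us)\n",
	       srtt_floor, srtt_floor * (1000000 / HZ) / 8);
	printf("LSB only       : srtt = %u/8 jiffies (%u us)\n",
	       srtt_lsb, srtt_lsb * (1000000 / HZ) / 8);
	return 0;
}

With HZ=1000 the floored estimate can never drop below 1 ms (8/8
jiffies), while the LSB-only variant bottoms out at 1/8 jiffy (125 us
here), which is the range a datacenter servo cares about. It does not
help with the delayed-ACK fuzziness above, though.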
>
> Hmm, I also need to properly init default_interval after
> codel_params_init(&q->cparams):
>
> net/sched/sch_fq_codel.c | 24 ++++++++++++++++++++++--
> 1 file changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
> index 9fc1c62..f04ff6a 100644
> --- a/net/sched/sch_fq_codel.c
> +++ b/net/sched/sch_fq_codel.c
> @@ -25,6 +25,7 @@
>  #include <net/pkt_sched.h>
>  #include <net/flow_keys.h>
>  #include <net/codel.h>
> +#include <linux/tcp.h>
>
>  /* Fair Queue CoDel.
>   *
> @@ -59,6 +60,7 @@ struct fq_codel_sched_data {
>  	u32		perturbation;	/* hash perturbation */
>  	u32		quantum;	/* psched_mtu(qdisc_dev(sch)); */
>  	struct codel_params cparams;
> +	codel_time_t	default_interval;
>  	struct codel_stats cstats;
>  	u32		drop_overlimit;
>  	u32		new_flow_count;
> @@ -211,6 +213,14 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
>  	return NET_XMIT_SUCCESS;
>  }
>
> +/* Given TCP srtt evaluation, return codel interval.
> + * srtt is given in jiffies, scaled by 8.
> + */
> +static codel_time_t tcp_srtt_to_codel(unsigned int srtt)
> +{
> +	return srtt * ((NSEC_PER_SEC >> (CODEL_SHIFT + 3)) / HZ);
> +}
> +
>  /* This is the specific function called from codel_dequeue()
>   * to dequeue a packet from queue. Note: backlog is handled in
>   * codel, we dont need to reduce it here.
> @@ -220,12 +230,21 @@ static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
>  	struct fq_codel_sched_data *q = qdisc_priv(sch);
>  	struct fq_codel_flow *flow;
>  	struct sk_buff *skb = NULL;
> +	struct sock *sk;
>
>  	flow = container_of(vars, struct fq_codel_flow, cvars);
>  	if (flow->head) {
>  		skb = dequeue_head(flow);
>  		q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
>  		sch->q.qlen--;
> +		sk = skb->sk;
> +		q->cparams.interval = q->default_interval;
> +		if (sk && sk->sk_protocol == IPPROTO_TCP) {
> +			u32 srtt = tcp_sk(sk)->srtt;
> +
> +			if (srtt)
> +				q->cparams.interval = tcp_srtt_to_codel(srtt);
> +		}
>  	}
>  	return skb;
>  }
> @@ -330,7 +349,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
>  	if (tb[TCA_FQ_CODEL_INTERVAL]) {
>  		u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
>
> -		q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
> +		q->default_interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
>  	}
>
>  	if (tb[TCA_FQ_CODEL_LIMIT])
> @@ -395,6 +414,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
>  	INIT_LIST_HEAD(&q->new_flows);
>  	INIT_LIST_HEAD(&q->old_flows);
>  	codel_params_init(&q->cparams);
> +	q->default_interval = q->cparams.interval;
>  	codel_stats_init(&q->cstats);
>  	q->cparams.ecn = true;
>
> @@ -441,7 +461,7 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
>  	    nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
>  			sch->limit) ||
>  	    nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
> -			codel_time_to_us(q->cparams.interval)) ||
> +			codel_time_to_us(q->default_interval)) ||
>  	    nla_put_u32(skb, TCA_FQ_CODEL_ECN,
>  			q->cparams.ecn) ||
>  	    nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
>
>
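
As a sanity check on the unit conversion in tcp_srtt_to_codel() above
(srtt is in jiffies scaled by 8, codel_time_t is in nanoseconds >>
CODEL_SHIFT), here is a small userspace program; HZ == 1000 and
CODEL_SHIFT == 10 (as in include/net/codel.h) are assumed, and
codel_time_to_us() is a local reimplementation of the codel.h helper:

/* Sketch: check that tcp_srtt_to_codel() maps srtt (jiffies << 3) to
 * codel time units (nanoseconds >> CODEL_SHIFT) as expected.
 */
#include <stdio.h>

#define CODEL_SHIFT	10		/* as in include/net/codel.h */
#define NSEC_PER_SEC	1000000000ULL
#define NSEC_PER_USEC	1000ULL
#define HZ		1000		/* assumed for the example */

typedef unsigned int codel_time_t;

/* same formula as the helper added by the patch */
static codel_time_t tcp_srtt_to_codel(unsigned int srtt)
{
	return srtt * ((NSEC_PER_SEC >> (CODEL_SHIFT + 3)) / HZ);
}

static unsigned long long codel_time_to_us(codel_time_t t)
{
	return ((unsigned long long)t << CODEL_SHIFT) / NSEC_PER_USEC;
}

int main(void)
{
	/* srtt = 8 is one jiffy, i.e. 1 ms at HZ=1000 */
	printf("srtt =   8 -> interval %llu us\n",
	       codel_time_to_us(tcp_srtt_to_codel(8)));
	/* srtt = 800 is 100 jiffies, i.e. the 100 ms codel default */
	printf("srtt = 800 -> interval %llu us\n",
	       codel_time_to_us(tcp_srtt_to_codel(800)));
	return 0;
}

A 1 ms srtt (8) maps to ~1 ms of interval and 100 ms (800) stays ~100 ms,
with well under 0.1% error from the integer divide by HZ.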