Message-ID: <20090906200409.GB8833@ami.dom.local>
Date: Sun, 6 Sep 2009 22:04:09 +0200
From: Jarek Poplawski <jarkao2@...il.com>
To: Patrick McHardy <kaber@...sh.net>
Cc: netdev@...r.kernel.org
Subject: Re: net_sched 07/07: add classful multiqueue dummy scheduler
Patrick McHardy wrote, On 09/04/2009 06:41 PM:
> commit f114d0f02c9e72fea7bbc4d28a113946183fc65f
> Author: Patrick McHardy <kaber@...sh.net>
> Date: Fri Sep 4 18:25:04 2009 +0200
>
> net_sched: add classful multiqueue dummy scheduler
>
> This patch adds a classful dummy scheduler which can be used as root qdisc
> for multiqueue devices and exposes each device queue as a child class.
>
> This allows to address queues individually and graft them similar to regular
> classes. Additionally it presents an accumulated view of the statistics of
> all real root qdiscs in the dummy root.
>
> Two new callbacks are added to the qdisc_ops and qdisc_class_ops:
>
> - cl_ops->select_queue selects the tx queue number for new child classes.
>
> - qdisc_ops->attach() overrides root qdisc device grafting to attach
> non-shared qdiscs to the queues.
>
> Signed-off-by: Patrick McHardy <kaber@...sh.net>
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index a92dc62..9c69585 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -80,6 +80,7 @@ struct Qdisc
> struct Qdisc_class_ops
> {
> /* Child qdisc manipulation */
> + unsigned int (*select_queue)(struct Qdisc *, struct tcmsg *);
> int (*graft)(struct Qdisc *, unsigned long cl,
> struct Qdisc *, struct Qdisc **);
> struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl);
> @@ -122,6 +123,7 @@ struct Qdisc_ops
> void (*reset)(struct Qdisc *);
> void (*destroy)(struct Qdisc *);
> int (*change)(struct Qdisc *, struct nlattr *arg);
> + void (*attach)(struct Qdisc *);
It's probably a matter of taste, but I wonder why these two methods,
used by only one qdisc in at most two places, couldn't be plain
functions instead (maybe even static in the case of select_queue)?
(And this mq sched could be detected with some flag instead of via
->attach, I guess.)
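Something along these lines, e.g. (a completely untested sketch; the
TCQ_F_MQROOT flag name below is made up just for illustration):

	/* set by mq_init() on the root qdisc: */
	sch->flags |= TCQ_F_MQROOT;

	/* then in qdisc_graft(), instead of testing ->attach: */
	if (new && (new->flags & TCQ_F_MQROOT)) {
		/* plain (exported) function instead of qdisc_ops->attach() */
		mq_attach(new);
		num_q = 0;
	}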
>
> int (*dump)(struct Qdisc *, struct sk_buff *);
> int (*dump_stats)(struct Qdisc *, struct gnet_dump *);
> @@ -255,6 +257,8 @@ static inline void sch_tree_unlock(struct Qdisc *q)
>
> extern struct Qdisc noop_qdisc;
> extern struct Qdisc_ops noop_qdisc_ops;
> +extern struct Qdisc_ops pfifo_fast_ops;
> +extern struct Qdisc_ops mq_qdisc_ops;
>
> struct Qdisc_class_common
> {
> diff --git a/net/sched/Makefile b/net/sched/Makefile
> index 54d950c..f14e71b 100644
> --- a/net/sched/Makefile
> +++ b/net/sched/Makefile
> @@ -2,7 +2,7 @@
> # Makefile for the Linux Traffic Control Unit.
> #
>
> -obj-y := sch_generic.o
> +obj-y := sch_generic.o sch_mq.o
>
> obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
> obj-$(CONFIG_NET_CLS) += cls_api.o
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index d71f12b..2a78d54 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -678,6 +678,11 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> if (dev->flags & IFF_UP)
> dev_deactivate(dev);
>
> + if (new && new->ops->attach) {
> + new->ops->attach(new);
> + num_q = 0;
> + }
> +
Actually, I wonder if it wouldn't be cleaner to let the loop below
replace all the qdiscs with noops, like in the qdisc delete case, and
do this attaching in one place only (dev_activate).
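I mean roughly something like this in dev_activate() (untested, only
to show where the attaching would go; it assumes ->attach can cope
with being called from here instead of from qdisc_graft()):

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);
	else if (dev->qdisc->ops->attach)
		dev->qdisc->ops->attach(dev->qdisc);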
> for (i = 0; i < num_q; i++) {
> struct netdev_queue *dev_queue = &dev->rx_queue;
>
> @@ -692,7 +697,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
> }
>
> notify_and_destroy(skb, n, classid, dev->qdisc, new);
> - if (new)
> + if (new && !new->ops->attach)
> atomic_inc(&new->refcnt);
> dev->qdisc = new ? : &noop_qdisc;
>
> @@ -1095,10 +1100,16 @@ create_n_graft:
> q = qdisc_create(dev, &dev->rx_queue,
> tcm->tcm_parent, tcm->tcm_parent,
> tca, &err);
> - else
> - q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
> + else {
> + unsigned int ntx = 0;
> +
> + if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
> + ntx = p->ops->cl_ops->select_queue(p, tcm);
So, this 'if' could probably be made shorter with a common helper
function, but the main point is: this probably works only for qdiscs
with mq as their direct parent, not for ones attached lower in the
hierarchy.
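For the first point, maybe something like this (just a sketch; the
helper name is invented):

static unsigned int qdisc_select_queue(struct Qdisc *p, struct tcmsg *tcm)
{
	if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
		return p->ops->cl_ops->select_queue(p, tcm);
	return 0;
}

so the branch above becomes just "ntx = qdisc_select_queue(p, tcm);".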
> +
> + q = qdisc_create(dev, netdev_get_tx_queue(dev, ntx),
> tcm->tcm_parent, tcm->tcm_handle,
> tca, &err);
> + }
> if (q == NULL) {
> if (err == -EAGAIN)
> goto replay;
> @@ -1674,6 +1685,7 @@ static int __init pktsched_init(void)
> {
> register_qdisc(&pfifo_qdisc_ops);
> register_qdisc(&bfifo_qdisc_ops);
> + register_qdisc(&mq_qdisc_ops);
> proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
>
> rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index e7c47ce..4ae6aa5 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -514,7 +514,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
> return 0;
> }
>
> -static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
> +struct Qdisc_ops pfifo_fast_ops __read_mostly = {
> .id = "pfifo_fast",
> .priv_size = sizeof(struct pfifo_fast_priv),
> .enqueue = pfifo_fast_enqueue,
> @@ -670,6 +670,26 @@ static void attach_one_default_qdisc(struct net_device *dev,
> dev_queue->qdisc_sleeping = qdisc;
> }
>
> +static void attach_default_qdiscs(struct net_device *dev)
> +{
> + struct netdev_queue *txq;
> + struct Qdisc *qdisc;
> +
> + txq = netdev_get_tx_queue(dev, 0);
> +
> + if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
> + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
> + dev->qdisc = txq->qdisc_sleeping;
> + atomic_inc(&dev->qdisc->refcnt);
> + } else {
> + qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
> + if (qdisc) {
> + qdisc->ops->attach(qdisc);
> + dev->qdisc = qdisc;
> + }
> + }
> +}
> +
> static void transition_one_qdisc(struct net_device *dev,
> struct netdev_queue *dev_queue,
> void *_need_watchdog)
> @@ -689,7 +709,6 @@ static void transition_one_qdisc(struct net_device *dev,
>
> void dev_activate(struct net_device *dev)
> {
> - struct netdev_queue *txq;
> int need_watchdog;
>
> /* No queueing discipline is attached to device;
> @@ -698,13 +717,8 @@ void dev_activate(struct net_device *dev)
> virtual interfaces
> */
>
> - if (dev->qdisc == &noop_qdisc) {
> - netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
> -
> - txq = netdev_get_tx_queue(dev, 0);
> - dev->qdisc = txq->qdisc_sleeping;
> - atomic_inc(&dev->qdisc->refcnt);
> - }
> + if (dev->qdisc == &noop_qdisc)
> + attach_default_qdiscs(dev);
>
> if (!netif_carrier_ok(dev))
> /* Delay activation until next carrier-on event */
> diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
> new file mode 100644
> index 0000000..5e453fd
> --- /dev/null
> +++ b/net/sched/sch_mq.c
> @@ -0,0 +1,234 @@
> +/*
> + * net/sched/sch_mq.c Classful multiqueue dummy scheduler
> + *
> + * Copyright (c) 2009 Patrick McHardy <kaber@...sh.net>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + */
> +
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/string.h>
> +#include <linux/errno.h>
> +#include <linux/skbuff.h>
> +#include <net/netlink.h>
> +#include <net/pkt_sched.h>
> +
> +struct mq_sched {
> + struct Qdisc **qdiscs;
> +};
> +
> +static void mq_destroy(struct Qdisc *sch)
> +{
> + struct net_device *dev = qdisc_dev(sch);
> + struct mq_sched *priv = qdisc_priv(sch);
> + unsigned int ntx;
> +
> + if (!priv->qdiscs)
> + return;
> + for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
> + qdisc_destroy(priv->qdiscs[ntx]);
> + kfree(priv->qdiscs);
> +}
> +
> +static int mq_init(struct Qdisc *sch, struct nlattr *opt)
> +{
> + struct net_device *dev = qdisc_dev(sch);
> + struct mq_sched *priv = qdisc_priv(sch);
> + struct netdev_queue *dev_queue;
> + struct Qdisc *qdisc;
> + unsigned int ntx;
> +
> + if (sch->parent != TC_H_ROOT)
> + return -EOPNOTSUPP;
> +
> + if (!netif_is_multiqueue(dev))
> + return -EOPNOTSUPP;
> +
> + /* pre-allocate qdiscs, attachment can't fail */
> + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
> + GFP_KERNEL);
I guess we could avoid this allocation altogether, or at least do it
in one step with the current ->attach.
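E.g. something like this (an untested sketch; the obvious downside is
that an allocation failure could then only be skipped over inside
->attach, which is probably why you pre-allocate here):

static void mq_attach(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct Qdisc *qdisc, *old;
	unsigned int ntx;

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);

		/* create and graft per-queue qdiscs directly, without
		 * the priv->qdiscs array */
		qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops,
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(ntx + 1)));
		if (qdisc == NULL)
			break;
		qdisc->flags |= TCQ_F_CAN_BYPASS;
		old = dev_graft_qdisc(dev_queue, qdisc);
		if (old)
			qdisc_destroy(old);
	}
}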
> + if (priv->qdiscs == NULL)
> + return -ENOMEM;
> +
> + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
> + dev_queue = netdev_get_tx_queue(dev, ntx);
> + qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops,
> + TC_H_MAKE(TC_H_MAJ(sch->handle),
> + TC_H_MIN(ntx + 1)));
As I wrote in my comment on 05/07, I wonder if we really can't
achieve this with the old TC_H_ROOT parentid, plus maybe some mapping
done only while dumping to userspace. Another possibility would be a
new kind of root parent (mqroot?) to say precisely where a new qdisc
should be added.
> + if (qdisc == NULL)
> + goto err;
> + qdisc->flags |= TCQ_F_CAN_BYPASS;
> + priv->qdiscs[ntx] = qdisc;
> + }
> +
> + return 0;
> +
> +err:
> + mq_destroy(sch);
> + return -ENOMEM;
> +}
> +
> +static void mq_attach(struct Qdisc *sch)
> +{
> + struct net_device *dev = qdisc_dev(sch);
> + struct mq_sched *priv = qdisc_priv(sch);
> + struct Qdisc *qdisc;
> + unsigned int ntx;
> +
> + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
> + qdisc = priv->qdiscs[ntx];
> + qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
> + if (qdisc)
> + qdisc_destroy(qdisc);
> + }
> + kfree(priv->qdiscs);
> + priv->qdiscs = NULL;
> +}
> +
> +static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
> +{
> + struct net_device *dev = qdisc_dev(sch);
> + struct Qdisc *qdisc;
> + unsigned int ntx;
> +
> + sch->q.qlen = 0;
> + memset(&sch->bstats, 0, sizeof(sch->bstats));
> + memset(&sch->qstats, 0, sizeof(sch->qstats));
> +
> + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
> + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
> + spin_lock_bh(qdisc_lock(qdisc));
> + sch->q.qlen += qdisc->q.qlen;
> + sch->bstats.bytes += qdisc->bstats.bytes;
> + sch->bstats.packets += qdisc->bstats.packets;
> + sch->qstats.qlen += qdisc->qstats.qlen;
As in Christoph's case, we should probably use q.qlen here instead.
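I.e. something like:

	sch->qstats.qlen += qdisc->q.qlen;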
Thanks,
Jarek P.
> + sch->qstats.backlog += qdisc->qstats.backlog;
> + sch->qstats.drops += qdisc->qstats.drops;
> + sch->qstats.requeues += qdisc->qstats.requeues;
> + sch->qstats.overlimits += qdisc->qstats.overlimits;
> + spin_unlock_bh(qdisc_lock(qdisc));
> + }
> + return 0;
> +}
> +
> +static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
> +{
> + struct net_device *dev = qdisc_dev(sch);
> + unsigned long ntx = cl - 1;
> +
> + if (ntx >= dev->num_tx_queues)
> + return NULL;
> + return netdev_get_tx_queue(dev, ntx);
> +}
> +
> +static unsigned int mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
> +{
> + unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
> +
> + if (!mq_queue_get(sch, ntx))
> + return 0;
> + return ntx - 1;
> +}
> +
> +static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
> + struct Qdisc **old)
> +{
> + struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
> + struct net_device *dev = qdisc_dev(sch);
> +
> + if (dev->flags & IFF_UP)
> + dev_deactivate(dev);
> +
> + *old = dev_graft_qdisc(dev_queue, new);
> +
> + if (dev->flags & IFF_UP)
> + dev_activate(dev);
> + return 0;
> +}
> +
> +static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
> +{
> + struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
> +
> + return dev_queue->qdisc_sleeping;
> +}
> +
> +static unsigned long mq_get(struct Qdisc *sch, u32 classid)
> +{
> + unsigned int ntx = TC_H_MIN(classid);
> +
> + if (!mq_queue_get(sch, ntx))
> + return 0;
> + return ntx;
> +}
> +
> +static void mq_put(struct Qdisc *sch, unsigned long cl)
> +{
> + return;
> +}
> +
> +static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
> + struct sk_buff *skb, struct tcmsg *tcm)
> +{
> + struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
> +
> + tcm->tcm_parent = TC_H_ROOT;
> + tcm->tcm_handle |= TC_H_MIN(cl);
> + tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
> + return 0;
> +}
> +
> +static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
> + struct gnet_dump *d)
> +{
> + struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
> +
> + sch = dev_queue->qdisc_sleeping;
> + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
> + gnet_stats_copy_queue(d, &sch->qstats) < 0)
> + return -1;
> + return 0;
> +}
> +
> +static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
> +{
> + struct net_device *dev = qdisc_dev(sch);
> + unsigned int ntx;
> +
> + if (arg->stop)
> + return;
> +
> + arg->count = arg->skip;
> + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
> + if (arg->fn(sch, ntx + 1, arg) < 0) {
> + arg->stop = 1;
> + break;
> + }
> + arg->count++;
> + }
> +}
> +
> +static const struct Qdisc_class_ops mq_class_ops = {
> + .select_queue = mq_select_queue,
> + .graft = mq_graft,
> + .leaf = mq_leaf,
> + .get = mq_get,
> + .put = mq_put,
> + .walk = mq_walk,
> + .dump = mq_dump_class,
> + .dump_stats = mq_dump_class_stats,
> +};
> +
> +struct Qdisc_ops mq_qdisc_ops __read_mostly = {
> + .cl_ops = &mq_class_ops,
> + .id = "mq",
> + .priv_size = sizeof(struct mq_sched),
> + .init = mq_init,
> + .destroy = mq_destroy,
> + .attach = mq_attach,
> + .dump = mq_dump,
> + .owner = THIS_MODULE,
> +};
> --
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html