[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20070604214045.1524.18254.stgit@localhost.localdomain>
Date: Mon, 04 Jun 2007 14:40:45 -0700
From: PJ Waskiewicz <peter.p.waskiewicz.jr@...el.com>
To: davem@...emloft.net
Cc: netdev@...r.kernel.org, jeff@...zik.org, auke-jan.h.kok@...el.com
Subject: [PATCH] NET: Multiqueue network device support.
API added to support multiple hardware queues on an ethernet device.
Round-robin scheduler added (sch_rr) to provide a no-scheduling policy
qdisc for hardware with multiple queues.
Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@...el.com>
---
include/linux/etherdevice.h | 3
include/linux/netdevice.h | 62 +++++
include/linux/pkt_sched.h | 11 +
include/linux/skbuff.h | 2
net/core/dev.c | 27 ++
net/core/skbuff.c | 3
net/ethernet/eth.c | 9 -
net/sched/Kconfig | 22 ++
net/sched/Makefile | 1
net/sched/sch_generic.c | 4
net/sched/sch_prio.c | 66 +++++-
net/sched/sch_rr.c | 516 +++++++++++++++++++++++++++++++++++++++++++
12 files changed, 706 insertions(+), 20 deletions(-)
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 071c67a..283e687 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -39,7 +39,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev
extern int eth_header_cache(struct neighbour *neigh,
struct hh_cache *hh);
-extern struct net_device *alloc_etherdev(int sizeof_priv);
+extern struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count);
+#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
static inline void eth_copy_and_sum (struct sk_buff *dest,
const unsigned char *src,
int len, int base)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f671cd2..376a0d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -108,6 +108,14 @@ struct wireless_dev;
#define MAX_HEADER (LL_MAX_HEADER + 48)
#endif
+/* Control block for one egress (TX) subqueue of a net_device. */
+struct net_device_subqueue
+{
+ /* Give a control state for each queue. This struct may contain
+ * per-queue locks in the future.
+ * The netif_*_subqueue() helpers set, clear and test the
+ * __LINK_STATE_XOFF bit in this word.
+ */
+ unsigned long state;
+};
+
/*
* Network device statistics. Akin to the 2.0 ether stats but
* with byte counters.
@@ -325,6 +333,7 @@ struct net_device
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
+#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
@@ -540,6 +549,10 @@ struct net_device
struct device dev;
/* space for optional statistics and wireless sysfs groups */
struct attribute_group *sysfs_groups[3];
+
+ /* The TX queue control structures */
+ struct net_device_subqueue *egress_subqueue;
+ int egress_subqueue_count;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
@@ -702,6 +715,48 @@ static inline int netif_running(const struct net_device *dev)
return test_bit(__LINK_STATE_START, &dev->state);
}
+/*
+ * Routines to manage the subqueues on a device. We only need start
+ * stop, and a check if it's stopped. All other device management is
+ * done at the overall netdevice level.
+ * Also test the device if we're multiqueue.
+ */
+/* Clear __LINK_STATE_XOFF on subqueue @queue_index of @dev.
+ * @queue_index is used as an array index without a range check.
+ */
+static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
+{
+ clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+/* Set __LINK_STATE_XOFF on subqueue @queue_index of @dev.
+ * With CONFIG_NETPOLL_TRAP, does nothing while netpoll_trap() is non-zero.
+ * @queue_index is used as an array index without a range check.
+ */
+static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state);
+}
+
+/* Return non-zero when __LINK_STATE_XOFF is set on subqueue
+ * @queue_index of @dev, zero otherwise.
+ * @queue_index is used as an array index without a range check.
+ */
+static inline int netif_subqueue_stopped(const struct net_device *dev,
+ u16 queue_index)
+{
+ return test_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state);
+}
+
+/* Clear __LINK_STATE_XOFF on subqueue @queue_index of @dev and, only if
+ * the bit was previously set, call __netif_schedule() on the device.
+ * With CONFIG_NETPOLL_TRAP, does nothing while netpoll_trap() is non-zero.
+ */
+static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
+{
+#ifdef CONFIG_NETPOLL_TRAP
+ if (netpoll_trap())
+ return;
+#endif
+ if (test_and_clear_bit(__LINK_STATE_XOFF,
+ &dev->egress_subqueue[queue_index].state))
+ __netif_schedule(dev);
+}
+
+/* Return 1 when @dev advertises NETIF_F_MULTI_QUEUE in its feature
+ * flags, 0 otherwise (the double negation normalizes the masked value).
+ */
+static inline int netif_is_multiqueue(const struct net_device *dev)
+{
+ return (!!(NETIF_F_MULTI_QUEUE & dev->features));
+}
/* Use this variant when it is known for sure that it
* is executing from interrupt context.
@@ -995,8 +1050,11 @@ static inline void netif_tx_disable(struct net_device *dev)
extern void ether_setup(struct net_device *dev);
/* Support for loadable net-drivers */
-extern struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *));
+extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *),
+ int queue_count);
+#define alloc_netdev(sizeof_priv, name, setup) \
+ alloc_netdev_mq(sizeof_priv, name, setup, 1)
extern int register_netdev(struct net_device *dev);
extern void unregister_netdev(struct net_device *dev);
/* Functions used for multicast support */
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index d10f353..0d1adaf 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -22,6 +22,7 @@
#define TC_PRIO_CONTROL 7
#define TC_PRIO_MAX 15
+#define TC_RR_MAX 15
/* Generic queue statistics, available for all the elements.
Particular schedulers may have also their private records.
@@ -90,6 +91,16 @@ struct tc_fifo_qopt
__u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */
};
+/* RR section */
+#define TCQ_RR_BANDS 16
+#define TCQ_MIN_RR_BANDS 2
+
+/* Option block for the RR qdisc; sch_rr.c parses it in rr_tune() and
+ * fills it in rr_dump().
+ */
+struct tc_rr_qopt
+{
+ int bands; /* Number of bands */
+ __u8 priomap[TC_RR_MAX+1]; /* Map: Linux priority -> RR band */
+};
+
/* PRIO section */
#define TCQ_PRIO_BANDS 16
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e7367c7..8bcd870 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -215,6 +215,7 @@ typedef unsigned char *sk_buff_data_t;
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ip_summed: Driver fed us an IP checksum
+ * @queue_mapping: Queue mapping for multiqueue devices
* @priority: Packet queueing priority
* @users: User count - see {datagram,tcp}.c
* @protocol: Packet protocol from driver
@@ -269,6 +270,7 @@ struct sk_buff {
__u16 csum_offset;
};
};
+ __u16 queue_mapping;
__u32 priority;
__u8 local_df:1,
cloned:1,
diff --git a/net/core/dev.c b/net/core/dev.c
index 4317c1b..27c90e1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1477,6 +1477,8 @@ gso:
spin_lock(&dev->queue_lock);
q = dev->qdisc;
if (q->enqueue) {
+ /* reset queue_mapping to zero */
+ skb->queue_mapping = 0;
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
@@ -3273,16 +3275,18 @@ static struct net_device_stats *internal_stats(struct net_device *dev)
}
/**
- * alloc_netdev - allocate network device
+ * alloc_netdev_mq - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
+ * @queue_count: the number of subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
+ * and performs basic initialization. Also allocates subqueue structs
+ * for each queue on the device.
*/
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
- void (*setup)(struct net_device *))
+struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
+ void (*setup)(struct net_device *), int queue_count)
{
void *p;
struct net_device *dev;
@@ -3307,12 +3311,23 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name,
if (sizeof_priv)
dev->priv = netdev_priv(dev);
+ alloc_size = (sizeof(struct net_device_subqueue) * queue_count);
+
+ p = kzalloc(alloc_size, GFP_KERNEL);
+ if (!p) {
+ printk(KERN_ERR "alloc_netdev: Unable to allocate queues.\n");
+ return NULL;
+ }
+
+ dev->egress_subqueue = p;
+ dev->egress_subqueue_count = queue_count;
+
dev->get_stats = internal_stats;
setup(dev);
strcpy(dev->name, name);
return dev;
}
-EXPORT_SYMBOL(alloc_netdev);
+EXPORT_SYMBOL(alloc_netdev_mq);
/**
* free_netdev - free network device
@@ -3326,6 +3341,7 @@ void free_netdev(struct net_device *dev)
{
#ifdef CONFIG_SYSFS
/* Compatibility with error handling in drivers */
+ kfree((char *)dev->egress_subqueue);
if (dev->reg_state == NETREG_UNINITIALIZED) {
kfree((char *)dev - dev->padded);
return;
@@ -3337,6 +3353,7 @@ void free_netdev(struct net_device *dev)
/* will free via device release */
put_device(&dev->dev);
#else
+ kfree((char *)dev->egress_subqueue);
kfree((char *)dev - dev->padded);
#endif
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1422573..0528cf3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -418,6 +418,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->nohdr = 0;
C(pkt_type);
C(ip_summed);
+ C(queue_mapping);
C(priority);
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
C(ipvs_property);
@@ -459,6 +460,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->sk = NULL;
new->dev = old->dev;
+ new->queue_mapping = old->queue_mapping;
new->priority = old->priority;
new->protocol = old->protocol;
new->dst = dst_clone(old->dst);
@@ -1926,6 +1928,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
tail = nskb;
nskb->dev = skb->dev;
+ nskb->queue_mapping = skb->queue_mapping;
nskb->priority = skb->priority;
nskb->protocol = skb->protocol;
nskb->dst = dst_clone(skb->dst);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 0ac2524..87a509c 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev)
EXPORT_SYMBOL(ether_setup);
/**
- * alloc_etherdev - Allocates and sets up an Ethernet device
+ * alloc_etherdev_mq - Allocates and sets up an Ethernet device
* @sizeof_priv: Size of additional driver-private structure to be allocated
* for this Ethernet device
+ * @queue_count: The number of queues this device has.
*
* Fill in the fields of the device structure with Ethernet-generic
* values. Basically does everything except registering the device.
@@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup);
* this private data area.
*/
-struct net_device *alloc_etherdev(int sizeof_priv)
+struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count)
{
- return alloc_netdev(sizeof_priv, "eth%d", ether_setup);
+ return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count);
}
-EXPORT_SYMBOL(alloc_etherdev);
+EXPORT_SYMBOL(alloc_etherdev_mq);
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 475df84..a532554 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -111,6 +111,28 @@ config NET_SCH_PRIO
To compile this code as a module, choose M here: the
module will be called sch_prio.
+config NET_SCH_PRIO_MQ
+ bool "Multiple hardware queue support for PRIO"
+ depends on NET_SCH_PRIO
+ ---help---
+ Say Y here if you want to allow the PRIO qdisc to assign
+ flows to multiple hardware queues on an ethernet device. This
+ will still work on devices with 1 queue.
+
+ Consider this scheduler for devices that do not use
+ hardware-based scheduling policies. Otherwise, use NET_SCH_RR.
+
+ Most people will say N here.
+
+config NET_SCH_RR
+ tristate "Multi Band Round Robin Queuing (RR)"
+ ---help---
+ Say Y here if you want to use an n-band round robin packet
+ scheduler.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_rr.
+
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 020767a..d3ed44e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
+obj-$(CONFIG_NET_SCH_RR) += sch_rr.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f28bb2d..b9dc2a6 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -123,7 +123,8 @@ static inline int qdisc_restart(struct net_device *dev)
/* And release queue */
spin_unlock(&dev->queue_lock);
- if (!netif_queue_stopped(dev)) {
+ if (!netif_queue_stopped(dev) &&
+ !netif_subqueue_stopped(dev, skb->queue_mapping)) {
int ret;
ret = dev_hard_start_xmit(skb, dev);
@@ -141,7 +142,6 @@ static inline int qdisc_restart(struct net_device *dev)
goto collision;
}
}
-
/* NETDEV_TX_BUSY - we need to requeue */
/* Release the driver */
if (!nolock) {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 269a6e1..c78dba4 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -43,6 +43,7 @@ struct prio_sched_data
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
+ u16 band2queue[TC_PRIO_MAX + 1];
};
@@ -70,13 +71,26 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
#endif
if (TC_H_MAJ(band))
band = 0;
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ skb->queue_mapping =
+ q->band2queue[q->prio2band[band&TC_PRIO_MAX]];
+#endif
+
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
- if (band > q->bands)
+ if (band > q->bands) {
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ skb->queue_mapping = q->band2queue[q->prio2band[0]];
+#endif
return q->queues[q->prio2band[0]];
+ }
+
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ skb->queue_mapping = q->band2queue[band];
+#endif
return q->queues[band];
}
@@ -144,12 +158,22 @@ prio_dequeue(struct Qdisc* sch)
struct Qdisc *qdisc;
for (prio = 0; prio < q->bands; prio++) {
- qdisc = q->queues[prio];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ /* Check if the target subqueue is available before
+ * pulling an skb. This way we avoid excessive requeues
+ * for slower queues.
+ */
+ if (!netif_subqueue_stopped(sch->dev, q->band2queue[prio])) {
+#endif
+ qdisc = q->queues[prio];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+#ifdef CONFIG_NET_SCH_PRIO_MQ
}
+#endif
}
return NULL;
@@ -200,6 +224,10 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt *qopt = RTA_DATA(opt);
int i;
+ int queue;
+ int qmapoffset;
+ int offset;
+ int mod;
if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
return -EINVAL;
@@ -242,6 +270,32 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt)
}
}
}
+#ifdef CONFIG_NET_SCH_PRIO_MQ
+ /* setup queue to band mapping */
+ if (q->bands < sch->dev->egress_subqueue_count) {
+ qmapoffset = 1;
+ mod = sch->dev->egress_subqueue_count;
+ } else {
+ mod = q->bands % sch->dev->egress_subqueue_count;
+ qmapoffset = q->bands / sch->dev->egress_subqueue_count
+ + ((mod) ? 1 : 0);
+ }
+
+ queue = 0;
+ offset = 0;
+ for (i = 0; i < q->bands; i++) {
+ q->band2queue[i] = queue;
+ if ( ((i + 1) - offset) == qmapoffset) {
+ queue++;
+ offset += qmapoffset;
+ if (mod)
+ mod--;
+ qmapoffset = q->bands /
+ sch->dev->egress_subqueue_count +
+ ((mod) ? 1 : 0);
+ }
+ }
+#endif
return 0;
}
diff --git a/net/sched/sch_rr.c b/net/sched/sch_rr.c
new file mode 100644
index 0000000..ce9f237
--- /dev/null
+++ b/net/sched/sch_rr.c
@@ -0,0 +1,516 @@
+/*
+ * net/sched/sch_rr.c Simple n-band round-robin scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The core part of this qdisc is based on sch_prio. ->dequeue() is where
+ * this scheduler functionally differs.
+ *
+ * Author: PJ Waskiewicz, <peter.p.waskiewicz.jr@...el.com>
+ *
+ * Original Authors (from PRIO): Alexey Kuznetsov, <kuznet@....inr.ac.ru>
+ * Fixes: 19990609: J Hadi Salim <hadi@...telnetworks.com>:
+ * Init -- EINVAL when opt undefined
+ */
+
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/* Private per-instance state of the RR qdisc (sized via rr_qdisc_ops.priv_size). */
+struct rr_sched_data
+{
+ int bands; /* number of active bands, set by rr_tune() */
+ int curband; /* band rr_dequeue() tries next; reset to 0 by rr_tune() */
+ struct tcf_proto *filter_list; /* classifier chain used by rr_classify() */
+ u8 prio2band[TC_RR_MAX + 1]; /* skb priority -> band index */
+ struct Qdisc *queues[TCQ_RR_BANDS]; /* child qdisc per band; rr_init() fills with &noop_qdisc */
+ u16 band2queue[TC_RR_MAX + 1]; /* band index -> device egress subqueue index */
+};
+
+
+/*
+ * rr_classify - choose the band (child qdisc) for a packet.
+ * @skb:  packet being queued; on success skb->queue_mapping is set to the
+ *        egress subqueue the chosen band maps to
+ * @sch:  this RR qdisc
+ * @qerr: out parameter; NET_XMIT_BYPASS by default, NET_XMIT_SUCCESS when
+ *        a classifier action stole or queued the packet
+ *
+ * If skb->priority already names a class of this qdisc it is used
+ * directly; otherwise the filter chain is consulted, and the priority map
+ * is the fallback.
+ *
+ * Returns the child qdisc to use, or NULL (CONFIG_NET_CLS_ACT only) when a
+ * classifier action consumed or shot the packet.
+ */
+static struct Qdisc *rr_classify(struct sk_buff *skb, struct Qdisc *sch,
+				 int *qerr)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	u32 band = skb->priority;
+	struct tcf_result res;
+
+	*qerr = NET_XMIT_BYPASS;
+	if (TC_H_MAJ(skb->priority) != sch->handle) {
+#ifdef CONFIG_NET_CLS_ACT
+		int err = tc_classify(skb, q->filter_list, &res);
+
+		switch (err) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS;
+			/* fall through */
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+
+		/* tc_classify() reports "no filter matched" with a negative
+		 * value and leaves res untouched in that case, so fall back
+		 * to the priority map rather than read res.classid.
+		 */
+		if (!q->filter_list || err < 0) {
+#else
+		if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) {
+#endif
+			if (TC_H_MAJ(band))
+				band = 0;
+			band = q->prio2band[band & TC_RR_MAX];
+			skb->queue_mapping = q->band2queue[band];
+			return q->queues[band];
+		}
+		band = res.classid;
+	}
+
+	/* Class minors are 1-based, bands are 0-based.  A minor of 0 wraps
+	 * to a huge unsigned value and is caught by the range check below.
+	 */
+	band = TC_H_MIN(band) - 1;
+
+	/* Valid bands are 0 .. bands - 1; band == bands must be rejected too,
+	 * otherwise queues[] and band2queue[] are indexed one past the
+	 * configured range (and past the arrays when bands is at maximum).
+	 */
+	if (band >= q->bands)
+		band = q->prio2band[0];
+
+	skb->queue_mapping = q->band2queue[band];
+	return q->queues[band];
+}
+
+/*
+ * rr_enqueue - classify @skb and enqueue it on the selected band.
+ *
+ * Returns NET_XMIT_SUCCESS after updating byte/packet/length counters, or
+ * the failing verdict after counting a drop.  With CONFIG_NET_CLS_ACT a
+ * packet for which rr_classify() returns NULL is freed here.
+ */
+static int rr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	int ret;
+	struct Qdisc *child = rr_classify(skb, sch, &ret);
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (!child) {
+		/* only a bypass verdict counts as a drop */
+		if (ret == NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	ret = child->enqueue(skb, child);
+	if (ret != NET_XMIT_SUCCESS) {
+		sch->qstats.drops++;
+		return ret;
+	}
+
+	sch->bstats.bytes += skb->len;
+	sch->bstats.packets++;
+	sch->q.qlen++;
+	return NET_XMIT_SUCCESS;
+}
+
+
+/*
+ * rr_requeue - put a previously dequeued @skb back on its band.
+ *
+ * The packet is classified again to find its band.  Returns 0 on success
+ * (queue length and requeue counter are bumped) or NET_XMIT_DROP when the
+ * child refuses it.  With CONFIG_NET_CLS_ACT a packet for which
+ * rr_classify() returns NULL is freed and the classifier verdict returned.
+ */
+static int rr_requeue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	int ret;
+	struct Qdisc *child = rr_classify(skb, sch, &ret);
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (!child) {
+		if (ret == NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+#endif
+
+	if (child->ops->requeue(skb, child) != NET_XMIT_SUCCESS) {
+		sch->qstats.drops++;
+		return NET_XMIT_DROP;
+	}
+
+	sch->q.qlen++;
+	sch->qstats.requeues++;
+	return 0;
+}
+
+
+/*
+ * rr_dequeue - pull the next packet, visiting bands round-robin.
+ *
+ * Starting at q->curband, each band is tried at most once per call.  A
+ * band whose egress subqueue is stopped is skipped without touching its
+ * child qdisc, which avoids dequeue/requeue churn.  The cursor always
+ * moves on to the following band, whether or not a packet was found.
+ *
+ * Returns the packet, or NULL when no eligible band had one.
+ */
+static struct sk_buff *rr_dequeue(struct Qdisc* sch)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	int pass;
+
+	for (pass = 0; pass < q->bands; pass++) {
+		int band = q->curband;
+		struct sk_buff *skb = NULL;
+
+		if (!netif_subqueue_stopped(sch->dev, q->band2queue[band])) {
+			struct Qdisc *child = q->queues[band];
+
+			skb = child->dequeue(child);
+		}
+
+		/* advance the cursor, wrapping after the last band */
+		if (++q->curband >= q->bands)
+			q->curband = 0;
+
+		if (skb) {
+			sch->q.qlen--;
+			return skb;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * rr_drop - drop one packet, searching from the highest band downwards.
+ *
+ * Children without a ->drop operation are skipped.  Returns the non-zero
+ * value reported by the first child that dropped something, or 0 when
+ * nothing could be dropped.
+ */
+static unsigned int rr_drop(struct Qdisc* sch)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	int band = q->bands;
+
+	while (--band >= 0) {
+		struct Qdisc *child = q->queues[band];
+		unsigned int len;
+
+		if (!child->ops->drop)
+			continue;
+
+		len = child->ops->drop(child);
+		if (len) {
+			sch->q.qlen--;
+			return len;
+		}
+	}
+	return 0;
+}
+
+
+/* rr_reset - reset every active band's child qdisc and zero our length. */
+static void rr_reset(struct Qdisc* sch)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	int i = 0;
+
+	while (i < q->bands) {
+		qdisc_reset(q->queues[i]);
+		i++;
+	}
+	sch->q.qlen = 0;
+}
+
+/* rr_destroy - tear down the filter chain, then every active band's child. */
+static void rr_destroy(struct Qdisc* sch)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	int i = 0;
+
+	tcf_destroy_chain(q->filter_list);
+	while (i < q->bands) {
+		qdisc_destroy(q->queues[i]);
+		i++;
+	}
+}
+
+/*
+ * rr_map_bands - spread @bands bands over @nqueues hardware queues.
+ * @band2queue: table to fill; entries 0 .. @bands - 1 are written
+ * @bands:      number of bands to map
+ * @nqueues:    number of egress subqueues, must be >= 1
+ *
+ * Bands are assigned in order, as evenly as possible: each queue receives
+ * bands / nqueues consecutive bands and the first bands % nqueues queues
+ * receive one more.  With fewer bands than queues every band gets its own
+ * queue.  Example: 5 bands on 2 queues -> 0, 0, 0, 1, 1.
+ */
+static void rr_map_bands(u16 *band2queue, int bands, int nqueues)
+{
+	int base = bands / nqueues;
+	int extra = bands % nqueues;
+	int band = 0;
+	int queue;
+
+	for (queue = 0; band < bands; queue++) {
+		int left = base + (queue < extra ? 1 : 0);
+
+		while (left-- > 0)
+			band2queue[band++] = queue;
+	}
+}
+
+/*
+ * rr_tune - apply a tc_rr_qopt configuration to the qdisc.
+ * @sch: this RR qdisc
+ * @opt: netlink attribute carrying a struct tc_rr_qopt
+ *
+ * Validates the option block, installs the band count and priority map,
+ * destroys children of bands that are no longer used, creates default
+ * pfifo children for newly used bands, and rebuilds the band to hardware
+ * queue table.
+ *
+ * Returns 0 on success, or -EINVAL when the attribute is too short, the
+ * band count is outside TCQ_MIN_RR_BANDS..TCQ_RR_BANDS, a priomap entry
+ * names a non-existent band, or the device reports no egress subqueues.
+ * Failure to create a default child is not reported; that band keeps
+ * &noop_qdisc.
+ */
+static int rr_tune(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	struct tc_rr_qopt *qopt = RTA_DATA(opt);
+	int i;
+
+	if (opt->rta_len < RTA_LENGTH(sizeof(*qopt)))
+		return -EINVAL;
+	if (qopt->bands > TCQ_RR_BANDS || qopt->bands < TCQ_MIN_RR_BANDS)
+		return -EINVAL;
+
+	for (i = 0; i <= TC_RR_MAX; i++) {
+		if (qopt->priomap[i] >= qopt->bands)
+			return -EINVAL;
+	}
+
+	/* The mapping below divides by the subqueue count; refuse a device
+	 * without subqueues before any state is modified.
+	 */
+	if (sch->dev->egress_subqueue_count < 1)
+		return -EINVAL;
+
+	sch_tree_lock(sch);
+	q->bands = qopt->bands;
+	memcpy(q->prio2band, qopt->priomap, sizeof(q->prio2band));
+	q->curband = 0;
+
+	/* release children of bands beyond the new band count */
+	for (i = q->bands; i < TCQ_RR_BANDS; i++) {
+		struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc);
+		if (child != &noop_qdisc) {
+			qdisc_tree_decrease_qlen(child, child->q.qlen);
+			qdisc_destroy(child);
+		}
+	}
+	sch_tree_unlock(sch);
+
+	/* give every active band that still has the placeholder a pfifo */
+	for (i = 0; i < q->bands; i++) {
+		if (q->queues[i] == &noop_qdisc) {
+			struct Qdisc *child;
+			child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
+						  TC_H_MAKE(sch->handle, i + 1));
+			if (child) {
+				sch_tree_lock(sch);
+				child = xchg(&q->queues[i], child);
+
+				if (child != &noop_qdisc) {
+					qdisc_tree_decrease_qlen(child,
+								 child->q.qlen);
+					qdisc_destroy(child);
+				}
+				sch_tree_unlock(sch);
+			}
+		}
+	}
+
+	/* setup queue to band mapping - best effort to map into available
+	 * hardware queues
+	 */
+	rr_map_bands(q->band2queue, q->bands,
+		     sch->dev->egress_subqueue_count);
+
+	return 0;
+}
+
+/*
+ * rr_init - initialise a new RR qdisc.
+ *
+ * Every band slot starts out as &noop_qdisc; the supplied options are then
+ * applied through rr_tune().  Returns -EINVAL when no options are given,
+ * otherwise the result of rr_tune().
+ */
+static int rr_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	int band;
+
+	for (band = 0; band < TCQ_RR_BANDS; band++)
+		q->queues[band] = &noop_qdisc;
+
+	if (!opt)
+		return -EINVAL;
+
+	return rr_tune(sch, opt);
+}
+
+/*
+ * rr_dump - append the current configuration to @skb as TCA_OPTIONS.
+ *
+ * Returns skb->len on success.  RTA_PUT() jumps to rtattr_failure when the
+ * attribute cannot be added; the message is then trimmed back to where it
+ * ended on entry and -1 is returned.
+ */
+static int rr_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	unsigned char *tail = skb_tail_pointer(skb);
+	struct tc_rr_qopt opt;
+
+	opt.bands = q->bands;
+	memcpy(opt.priomap, q->prio2band, sizeof(opt.priomap));
+
+	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+	return skb->len;
+
+rtattr_failure:
+	nlmsg_trim(skb, tail);
+	return -1;
+}
+
+/*
+ * rr_graft - replace the child qdisc of class @arg with @new.
+ *
+ * A NULL @new installs &noop_qdisc.  The previous child is handed back
+ * through @old after its queue length has been subtracted from the tree
+ * and it has been reset.  Returns 0, or -EINVAL for a class outside
+ * 1 .. bands.
+ */
+static int rr_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+		    struct Qdisc **old)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+	struct Qdisc *prev;
+
+	if (band >= q->bands)
+		return -EINVAL;
+
+	if (!new)
+		new = &noop_qdisc;
+
+	sch_tree_lock(sch);
+	prev = q->queues[band];
+	*old = prev;
+	q->queues[band] = new;
+	qdisc_tree_decrease_qlen(prev, prev->q.qlen);
+	qdisc_reset(prev);
+	sch_tree_unlock(sch);
+
+	return 0;
+}
+
+/* rr_leaf - return the child qdisc of class @arg, or NULL if out of range. */
+static struct Qdisc *rr_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	unsigned long band = arg - 1;
+
+	return (band >= q->bands) ? NULL : q->queues[band];
+}
+
+/*
+ * rr_get - translate @classid into an internal class handle.
+ *
+ * The handle is the 1-based band number taken from the minor part of
+ * @classid; 0 is returned when that number is not in 1 .. bands.
+ */
+static unsigned long rr_get(struct Qdisc *sch, u32 classid)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	unsigned long minor = TC_H_MIN(classid);
+
+	return (minor - 1 >= q->bands) ? 0 : minor;
+}
+
+/* Installed as .bind_tcf in rr_class_ops: @parent is ignored and the
+ * lookup is delegated to rr_get().
+ */
+static unsigned long rr_bind(struct Qdisc *sch, unsigned long parent,
+ u32 classid)
+{
+ return rr_get(sch, classid);
+}
+
+
+/* Intentionally empty; installed as both .put and .unbind_tcf in
+ * rr_class_ops.
+ */
+static void rr_put(struct Qdisc *q, unsigned long cl)
+{
+ return;
+}
+
+/*
+ * rr_change - class "change" operation.
+ *
+ * Classes carry no tunable parameters, so this only validates the handle
+ * passed in through @arg.  Returns 0 for a class in 1 .. bands and -ENOENT
+ * otherwise.
+ */
+static int rr_change(struct Qdisc *sch, u32 handle, u32 parent,
+		     struct rtattr **tca, unsigned long *arg)
+{
+	unsigned long cl = *arg;
+	struct rr_sched_data *q = qdisc_priv(sch);
+
+	/* cl is 1-based: cl - 1 == bands is already out of range, matching
+	 * the checks in rr_get(), rr_leaf() and rr_graft().  cl == 0 wraps
+	 * to a huge value and is rejected as well.
+	 */
+	if (cl - 1 >= q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+/*
+ * rr_delete - class "delete" operation.
+ *
+ * Nothing is actually removed; the handle is only validated.  Returns 0
+ * for a class in 1 .. bands and -ENOENT otherwise.
+ */
+static int rr_delete(struct Qdisc *sch, unsigned long cl)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+
+	/* cl is 1-based, so cl - 1 == bands is one past the last class */
+	if (cl - 1 >= q->bands)
+		return -ENOENT;
+	return 0;
+}
+
+
+/*
+ * rr_dump_class - fill @tcm with the identity of class @cl.
+ *
+ * ORs the class minor into tcm_handle and, when the band has a child
+ * qdisc, reports that child's handle in tcm_info.  Returns 0, or -ENOENT
+ * for a class outside 1 .. bands.
+ */
+static int rr_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+
+	/* cl is 1-based; letting cl - 1 == bands through would read
+	 * queues[] one slot past the configured bands.
+	 */
+	if (cl - 1 >= q->bands)
+		return -ENOENT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	if (q->queues[cl - 1])
+		tcm->tcm_info = q->queues[cl - 1]->handle;
+	return 0;
+}
+
+/*
+ * rr_dump_class_stats - copy the basic and queue statistics of class
+ * @cl's child qdisc into @d.
+ *
+ * @cl is used as a 1-based index without a range check.  Returns 0 on
+ * success and -1 when either copy fails.
+ */
+static int rr_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+			       struct gnet_dump *d)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	struct Qdisc *child = q->queues[cl - 1];
+
+	if (gnet_stats_copy_basic(d, &child->bstats) < 0)
+		return -1;
+	if (gnet_stats_copy_queue(d, &child->qstats) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * rr_walk - invoke arg->fn for every class, honouring the walker's
+ * skip/count bookkeeping.
+ *
+ * The first arg->skip classes are counted but not visited.  A negative
+ * return from the callback sets arg->stop and ends the walk without
+ * counting that class.
+ */
+static void rr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+	int i;
+
+	if (arg->stop)
+		return;
+
+	for (i = 0; i < q->bands; i++) {
+		if (arg->count >= arg->skip &&
+		    arg->fn(sch, i + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+/*
+ * rr_find_tcf - return the address of the qdisc's filter chain head.
+ *
+ * Only a @cl of 0 yields the chain; any non-zero class gets NULL.
+ */
+static struct tcf_proto **rr_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct rr_sched_data *q = qdisc_priv(sch);
+
+	return cl ? NULL : &q->filter_list;
+}
+
+/* Class-level operations of the RR qdisc; referenced from
+ * rr_qdisc_ops.cl_ops. rr_put serves as both .put and .unbind_tcf.
+ */
+static struct Qdisc_class_ops rr_class_ops = {
+ .graft = rr_graft,
+ .leaf = rr_leaf,
+ .get = rr_get,
+ .put = rr_put,
+ .change = rr_change,
+ .delete = rr_delete,
+ .walk = rr_walk,
+ .tcf_chain = rr_find_tcf,
+ .bind_tcf = rr_bind,
+ .unbind_tcf = rr_put,
+ .dump = rr_dump_class,
+ .dump_stats = rr_dump_class_stats,
+};
+
+/* Qdisc operations table registered under the id "rr" by
+ * rr_module_init(). rr_tune() handles both initial setup (via rr_init)
+ * and later .change requests.
+ */
+static struct Qdisc_ops rr_qdisc_ops = {
+ .next = NULL,
+ .cl_ops = &rr_class_ops,
+ .id = "rr",
+ .priv_size = sizeof(struct rr_sched_data),
+ .enqueue = rr_enqueue,
+ .dequeue = rr_dequeue,
+ .requeue = rr_requeue,
+ .drop = rr_drop,
+ .init = rr_init,
+ .reset = rr_reset,
+ .destroy = rr_destroy,
+ .change = rr_tune,
+ .dump = rr_dump,
+ .owner = THIS_MODULE,
+};
+
+/* Module entry point: registers rr_qdisc_ops and returns the result of
+ * register_qdisc().
+ */
+static int __init rr_module_init(void)
+{
+ return register_qdisc(&rr_qdisc_ops);
+}
+
+/* Module exit point: unregisters rr_qdisc_ops. */
+static void __exit rr_module_exit(void)
+{
+ unregister_qdisc(&rr_qdisc_ops);
+}
+
+module_init(rr_module_init)
+module_exit(rr_module_exit)
+
+MODULE_LICENSE("GPL");
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists