[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f4eaac32-204e-259d-b69b-c2c9885d55fa@gmail.com>
Date: Tue, 12 Jun 2018 10:49:19 -0700
From: Florian Fainelli <f.fainelli@...il.com>
To: Alexander Duyck <alexander.h.duyck@...el.com>,
intel-wired-lan@...osl.org, jeffrey.t.kirsher@...el.com,
netdev@...r.kernel.org
Subject: Re: [jkirsher/next-queue PATCH v2 2/7] net: Add support for
subordinate device traffic classes
On 06/12/2018 08:18 AM, Alexander Duyck wrote:
> This patch is meant to provide the basic tools needed to allow us to create
> subordinate device traffic classes. The general idea here is to allow
> subdividing the queues of a device into queue groups accessible through an
> upper device such as a macvlan.
>
> The idea here is to enforce the idea that an upper device has to be a
> single queue device, ideally with IFF_NO_QUQUE set. With that being the
> case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
> for the upper device are unused. As such we could reuse those in order to
> support subdividing the lower device and distributing those queues between
> the subordinate devices.
This is not necessarily a valid paradigm to work with. For instance in
DSA we have IFF_NO_QUEUE devices, but we still expose multiple egress
queues because that is how an application can choose how it wants to get
packets transmitted at the switch level. We have a 1:1 representation
between a queue at the net_device level, and what an egress queue at the
switch level is, so things like buffer reservation etc. can be configured.
I think you should consider that an upper device might want to have a
1:1 mapping to the lower device's queues and make that permissible.
Thoughts?
>
> In order to distinguish between a regular set of traffic classes and if a
> device is carrying subordinate traffic classes I changed num_tc from a u8
> to a s16 value and use the negative values to represent the suboordinate
> pool values. So starting at -1 and running to -32768 we can encode those as
> pool values, and the existing values of 0 to 15 can be maintained.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@...el.com>
> ---
> include/linux/netdevice.h | 16 ++++++++
> net/core/dev.c | 89 +++++++++++++++++++++++++++++++++++++++++++++
> net/core/net-sysfs.c | 21 ++++++++++-
> 3 files changed, 124 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3ec9850..41b4660 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -569,6 +569,9 @@ struct netdev_queue {
> * (/sys/class/net/DEV/Q/trans_timeout)
> */
> unsigned long trans_timeout;
> +
> + /* Suboordinate device that the queue has been assigned to */
> + struct net_device *sb_dev;
> /*
> * write-mostly part
> */
> @@ -1978,7 +1981,7 @@ struct net_device {
> #ifdef CONFIG_DCB
> const struct dcbnl_rtnl_ops *dcbnl_ops;
> #endif
> - u8 num_tc;
> + s16 num_tc;
> struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
> u8 prio_tc_map[TC_BITMASK + 1];
>
> @@ -2032,6 +2035,17 @@ int netdev_get_num_tc(struct net_device *dev)
> return dev->num_tc;
> }
>
> +void netdev_unbind_sb_channel(struct net_device *dev,
> + struct net_device *sb_dev);
> +int netdev_bind_sb_channel_queue(struct net_device *dev,
> + struct net_device *sb_dev,
> + u8 tc, u16 count, u16 offset);
> +int netdev_set_sb_channel(struct net_device *dev, u16 channel);
> +static inline int netdev_get_sb_channel(struct net_device *dev)
> +{
> + return max_t(int, -dev->num_tc, 0);
> +}
> +
> static inline
> struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
> unsigned int index)
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 6e18242..27fe4f2 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2068,11 +2068,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
> struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
> int i;
>
> + /* walk through the TCs and see if it falls into any of them */
> for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
> if ((txq - tc->offset) < tc->count)
> return i;
> }
>
> + /* didn't find it, just return -1 to indicate no match */
> return -1;
> }
>
> @@ -2215,7 +2217,14 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> bool active = false;
>
> if (dev->num_tc) {
> + /* Do not allow XPS on subordinate device directly */
> num_tc = dev->num_tc;
> + if (num_tc < 0)
> + return -EINVAL;
> +
> + /* If queue belongs to subordinate dev use its map */
> + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> +
> tc = netdev_txq_to_tc(dev, index);
> if (tc < 0)
> return -EINVAL;
> @@ -2366,11 +2375,25 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> EXPORT_SYMBOL(netif_set_xps_queue);
>
> #endif
> +static void netdev_unbind_all_sb_channels(struct net_device *dev)
> +{
> + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
> +
> + /* Unbind any subordinate channels */
> + while (txq-- != &dev->_tx[0]) {
> + if (txq->sb_dev)
> + netdev_unbind_sb_channel(dev, txq->sb_dev);
> + }
> +}
> +
> void netdev_reset_tc(struct net_device *dev)
> {
> #ifdef CONFIG_XPS
> netif_reset_xps_queues_gt(dev, 0);
> #endif
> + netdev_unbind_all_sb_channels(dev);
> +
> + /* Reset TC configuration of device */
> dev->num_tc = 0;
> memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
> memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
> @@ -2399,11 +2422,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
> #ifdef CONFIG_XPS
> netif_reset_xps_queues_gt(dev, 0);
> #endif
> + netdev_unbind_all_sb_channels(dev);
> +
> dev->num_tc = num_tc;
> return 0;
> }
> EXPORT_SYMBOL(netdev_set_num_tc);
>
> +void netdev_unbind_sb_channel(struct net_device *dev,
> + struct net_device *sb_dev)
> +{
> + struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
> +
> +#ifdef CONFIG_XPS
> + netif_reset_xps_queues_gt(sb_dev, 0);
> +#endif
> + memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
> + memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
> +
> + while (txq-- != &dev->_tx[0]) {
> + if (txq->sb_dev == sb_dev)
> + txq->sb_dev = NULL;
> + }
> +}
> +EXPORT_SYMBOL(netdev_unbind_sb_channel);
> +
> +int netdev_bind_sb_channel_queue(struct net_device *dev,
> + struct net_device *sb_dev,
> + u8 tc, u16 count, u16 offset)
> +{
> + /* Make certain the sb_dev and dev are already configured */
> + if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
> + return -EINVAL;
> +
> + /* We cannot hand out queues we don't have */
> + if ((offset + count) > dev->real_num_tx_queues)
> + return -EINVAL;
> +
> + /* Record the mapping */
> + sb_dev->tc_to_txq[tc].count = count;
> + sb_dev->tc_to_txq[tc].offset = offset;
> +
> + /* Provide a way for Tx queue to find the tc_to_txq map or
> + * XPS map for itself.
> + */
> + while (count--)
> + netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
> +
> +int netdev_set_sb_channel(struct net_device *dev, u16 channel)
> +{
> + /* Do not use a multiqueue device to represent a subordinate channel */
> + if (netif_is_multiqueue(dev))
> + return -ENODEV;
> +
> + /* We allow channels 1 - 32767 to be used for subordinate channels.
> + * Channel 0 is meant to be "native" mode and used only to represent
> + * the main root device. We allow writing 0 to reset the device back
> + * to normal mode after being used as a subordinate channel.
> + */
> + if (channel > S16_MAX)
> + return -EINVAL;
> +
> + dev->num_tc = -channel;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(netdev_set_sb_channel);
> +
> /*
> * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
> * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 335c6a4..bd067b1 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -1054,11 +1054,23 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
> return -ENOENT;
>
> index = get_netdev_queue_index(queue);
> +
> + /* If queue belongs to subordinate dev use its tc mapping */
> + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> +
> tc = netdev_txq_to_tc(dev, index);
> if (tc < 0)
> return -EINVAL;
>
> - return sprintf(buf, "%u\n", tc);
> + /* We can report the traffic class one of two ways:
> + * Subordinate device traffic classes are reported with the traffic
> + * class first, and then the subordinate class so for example TC0 on
> + * subordinate device 2 will be reported as "0-2". If the queue
> + * belongs to the root device it will be reported with just the
> + * traffic class, so just "0" for TC 0 for example.
> + */
> + return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
> + sprintf(buf, "%u\n", tc);
> }
>
> #ifdef CONFIG_XPS
> @@ -1225,7 +1237,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
> index = get_netdev_queue_index(queue);
>
> if (dev->num_tc) {
> + /* Do not allow XPS on subordinate device directly */
> num_tc = dev->num_tc;
> + if (num_tc < 0)
> + return -EINVAL;
> +
> + /* If queue belongs to subordinate dev use its map */
> + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
> +
> tc = netdev_txq_to_tc(dev, index);
> if (tc < 0)
> return -EINVAL;
>
--
Florian
Powered by blists - more mailing lists