Message-Id: <20200624171749.11927-8-tom@herbertland.com>
Date: Wed, 24 Jun 2020 10:17:46 -0700
From: Tom Herbert <tom@...bertland.com>
To: netdev@...r.kernel.org
Cc: Tom Herbert <tom@...bertland.com>
Subject: [RFC PATCH 07/11] net: Introduce global queues
Global queues, or gqids, are an abstract representation of NIC
device queues. They are global in the sense that each gqid can be
mapped to a queue in each device, i.e. if there are multiple
devices in the system, a gqid can map to a different queue, a dqid,
in each device in a one-to-many mapping. gqids are used to
configure packet steering on both send and receive in a generic
way that is not bound to a particular device.
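
For instance (illustrative queue numbers only), gqid 1 could map
to dqid 4 on eth0 and to dqid 7 on eth1:

	gqid	eth0 dqid	eth1 dqid
	1	4		7

A flow steered to gqid 1 is then directed to queue 4 or queue 7
depending on which device sends or receives the packets.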
Each transmit or receive device queue may be reverse mapped to one
gqid. Each device maintains a table mapping gqids to local device
queues; these tables are used in the data path to convert the gqid
of a receive or transmit queue into a device queue relative to the
sending or receiving device.
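
A minimal sketch of the intended transmit side usage (the caller
and its name are hypothetical; netdev_tx_gqid_to_dqid and NO_QUEUE
come from this patch set):

	/* Resolve a device-independent gqid to a txq of the sending
	 * device. Returns NULL if the gqid is unmapped on this device
	 * so the caller can fall back to normal queue selection.
	 */
	static struct netdev_queue *example_pick_txq(struct net_device *dev,
						     u16 gqid)
	{
		u16 dqid = netdev_tx_gqid_to_dqid(dev, gqid);

		if (dqid == NO_QUEUE || dqid >= dev->real_num_tx_queues)
			return NULL;

		return netdev_get_tx_queue(dev, dqid);
	}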
Changes in the patch:
- Add a simple index to netdev_queue and netdev_rx_queue. This
  serves as the dqid (it's just the index in the receive or
  transmit queue array for the device)
- Add gqid to netdev_queue and netdev_rx_queue. This is the
  mapping of a device queue to a gqid. If gqid is NO_QUEUE then
  the queue is unmapped
- Maintain the per-device gqid to dqid maps in netdev_queue_map
  structures in a net_device, one for transmit and one for
  receive
- Add functions that return a dqid given a gqid and a net_device
- Add a global_queue_mapping attribute to the sysfs rx- and tx-
  queue directories for setting device queue mappings (see the
  example below)
- Create the per-device gqid to dqid maps in the sysfs store
  functions
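
For example (hypothetical device and queue numbers), to map gqid 3
to receive queue 0 and transmit queue 0 of eth0:

	echo 3 > /sys/class/net/eth0/queues/rx-0/global_queue_mapping
	echo 3 > /sys/class/net/eth0/queues/tx-0/global_queue_mapping

Reading the attribute reports the current mapping, or "none" if
the queue is unmapped:

	cat /sys/class/net/eth0/queues/rx-0/global_queue_mapping
	3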
---
include/linux/netdevice.h | 75 ++++++++++++++
net/core/dev.c | 20 +++-
net/core/net-sysfs.c | 200 +++++++++++++++++++++++++++++++++++++-
3 files changed, 291 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48ba1c1fc644..ca163925211a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -606,6 +606,10 @@ struct netdev_queue {
#endif
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
int numa_node;
+#endif
+#ifdef CONFIG_RPS
+ u16 index;
+ u16 gqid;
#endif
unsigned long tx_maxrate;
/*
@@ -823,6 +827,8 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
#ifdef CONFIG_RPS
+ u16 index;
+ u16 gqid;
struct rps_map __rcu *rps_map;
struct rps_dev_flow_table __rcu *rps_flow_table;
#endif
@@ -875,6 +881,25 @@ struct xps_dev_maps {
#endif /* CONFIG_XPS */
+#ifdef CONFIG_RPS
+/* Structure to map a global queue to a device queue */
+struct netdev_queue_map {
+ struct rcu_head rcu;
+ unsigned int max_ents;
+ unsigned int set_count;
+ u16 map[0];
+};
+
+/* Allocate queue map in blocks to avoid thrashing */
+#define QUEUE_MAP_ALLOC_BLOCK 128
+
+#define QUEUE_MAP_ALLOC_NUMBER(_num) \
+ (((((_num) - 1) / QUEUE_MAP_ALLOC_BLOCK) + 1) * QUEUE_MAP_ALLOC_BLOCK)
+
+#define QUEUE_MAP_ALLOC_SIZE(_num) (sizeof(struct netdev_queue_map) + \
+ (_num) * sizeof(u16))
+#endif /* CONFIG_RPS */
+
#define TC_MAX_QUEUE 16
#define TC_BITMASK 15
/* HW offloaded queuing disciplines txq count and offset maps */
@@ -2092,6 +2117,10 @@ struct net_device {
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
+#ifdef CONFIG_RPS
+ struct netdev_queue_map __rcu *rx_gqueue_map;
+#endif
+
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_ingress;
#endif
@@ -2122,6 +2151,9 @@ struct net_device {
struct xps_dev_maps __rcu *xps_cpus_map;
struct xps_dev_maps __rcu *xps_rxqs_map;
#endif
+#ifdef CONFIG_RPS
+ struct netdev_queue_map __rcu *tx_gqueue_map;
+#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
#endif
@@ -2218,6 +2250,36 @@ struct net_device {
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
+#ifdef CONFIG_RPS
+static inline u16 netdev_gqid_to_dqid(const struct netdev_queue_map *map,
+ u16 gqid)
+{
+ return (map && gqid < map->max_ents) ? map->map[gqid] : NO_QUEUE;
+}
+
+static inline u16 netdev_tx_gqid_to_dqid(const struct net_device *dev, u16 gqid)
+{
+ u16 dqid;
+
+ rcu_read_lock();
+ dqid = netdev_gqid_to_dqid(rcu_dereference(dev->tx_gqueue_map), gqid);
+ rcu_read_unlock();
+
+ return dqid;
+}
+
+static inline u16 netdev_rx_gqid_to_dqid(const struct net_device *dev, u16 gqid)
+{
+ u16 dqid;
+
+ rcu_read_lock();
+ dqid = netdev_gqid_to_dqid(rcu_dereference(dev->rx_gqueue_map), gqid);
+ rcu_read_unlock();
+
+ return dqid;
+}
+#endif
+
static inline bool netif_elide_gro(const struct net_device *dev)
{
if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog)
@@ -2290,6 +2352,19 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
f(dev, &dev->_tx[i], arg);
}
+static inline void netdev_for_each_tx_queue_index(struct net_device *dev,
+ void (*f)(struct net_device *,
+ struct netdev_queue *,
+ unsigned int index,
+ void *),
+ void *arg)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++)
+ f(dev, &dev->_tx[i], i, arg);
+}
+
#define netdev_lockdep_set_classes(dev) \
{ \
static struct lock_class_key qdisc_tx_busylock_key; \
diff --git a/net/core/dev.c b/net/core/dev.c
index 946940bdd583..f64bf6608775 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9331,6 +9331,10 @@ static int netif_alloc_rx_queues(struct net_device *dev)
for (i = 0; i < count; i++) {
rx[i].dev = dev;
+#ifdef CONFIG_RPS
+ rx[i].index = i;
+ rx[i].gqid = NO_QUEUE;
+#endif
/* XDP RX-queue setup */
err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
@@ -9363,7 +9367,8 @@ static void netif_free_rx_queues(struct net_device *dev)
}
static void netdev_init_one_queue(struct net_device *dev,
- struct netdev_queue *queue, void *_unused)
+ struct netdev_queue *queue,
+ unsigned int index, void *_unused)
{
/* Initialize queue lock */
spin_lock_init(&queue->_xmit_lock);
@@ -9371,6 +9376,10 @@ static void netdev_init_one_queue(struct net_device *dev,
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
+#ifdef CONFIG_RPS
+ queue->index = index;
+ queue->gqid = NO_QUEUE;
+#endif
#ifdef CONFIG_BQL
dql_init(&queue->dql, HZ);
#endif
@@ -9396,7 +9405,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
dev->_tx = tx;
- netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
+ netdev_for_each_tx_queue_index(dev, netdev_init_one_queue, NULL);
spin_lock_init(&dev->tx_global_lock);
return 0;
@@ -9884,7 +9893,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
queue = kzalloc(sizeof(*queue), GFP_KERNEL);
if (!queue)
return NULL;
- netdev_init_one_queue(dev, queue, NULL);
+ netdev_init_one_queue(dev, queue, 0, NULL);
RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
queue->qdisc_sleeping = &noop_qdisc;
rcu_assign_pointer(dev->ingress_queue, queue);
@@ -10041,6 +10050,11 @@ void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
+#ifdef CONFIG_RPS
+ WARN_ON(rcu_dereference_protected(dev->tx_gqueue_map, 1));
+ WARN_ON(rcu_dereference_protected(dev->rx_gqueue_map, 1));
+#endif
+
might_sleep();
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 56d27463d466..3a9d3d9ee8e0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -875,18 +875,166 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
return len;
}
+static void queue_map_release(struct rcu_head *rcu)
+{
+ struct netdev_queue_map *q_map = container_of(rcu,
+ struct netdev_queue_map, rcu);
+ vfree(q_map);
+}
+
+static int set_device_queue_mapping(struct netdev_queue_map **pmap,
+ u16 gqid, u16 dqid, u16 *p_gqid)
+{
+ static DEFINE_MUTEX(global_mapping_table);
+ struct netdev_queue_map *gq_map, *old_gq_map;
+ u16 old_gqid;
+ int ret = 0;
+
+ mutex_lock(&global_mapping_table);
+
+ old_gqid = *p_gqid;
+ if (old_gqid == gqid) {
+ /* Nothing changing */
+ goto out;
+ }
+
+ gq_map = rcu_dereference_protected(*pmap,
+ lockdep_is_held(&global_mapping_table));
+ old_gq_map = gq_map;
+
+ if (gqid == NO_QUEUE) {
+ /* Remove any old mapping (we know that old_gqid cannot be
+ * NO_QUEUE from above)
+ */
+ if (!WARN_ON(!gq_map || old_gqid >= gq_map->max_ents ||
+ gq_map->map[old_gqid] != dqid)) {
+ /* Unset old mapping */
+ gq_map->map[old_gqid] = NO_QUEUE;
+ if (--gq_map->set_count == 0) {
+ /* Done with map so free */
+ rcu_assign_pointer(*pmap, NULL);
+ call_rcu(&gq_map->rcu, queue_map_release);
+ }
+ }
+ *p_gqid = NO_QUEUE;
+
+ goto out;
+ }
+
+ if (!gq_map || gqid >= gq_map->max_ents) {
+ unsigned int max_queues;
+ int i = 0;
+
+ /* Need to create or expand queue map */
+
+ max_queues = QUEUE_MAP_ALLOC_NUMBER(gqid + 1);
+
+ gq_map = vmalloc(QUEUE_MAP_ALLOC_SIZE(max_queues));
+ if (!gq_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ gq_map->max_ents = max_queues;
+
+ if (old_gq_map) {
+ /* Copy old map entries */
+
+ memcpy(gq_map->map, old_gq_map->map,
+ old_gq_map->max_ents * sizeof(gq_map->map[0]));
+ gq_map->set_count = old_gq_map->set_count;
+ i = old_gq_map->max_ents;
+ } else {
+ gq_map->set_count = 0;
+ }
+
+ /* Initialize entries not copied from old map */
+ for (; i < max_queues; i++)
+ gq_map->map[i] = NO_QUEUE;
+ } else if (gq_map->map[gqid] != NO_QUEUE) {
+ /* The global qid is already mapped to another device qid */
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* Set map entry */
+ gq_map->map[gqid] = dqid;
+ gq_map->set_count++;
+
+ if (old_gqid != NO_QUEUE) {
+ /* We know old_gqid is not equal to gqid */
+ if (!WARN_ON(!old_gq_map ||
+ old_gqid >= old_gq_map->max_ents ||
+ old_gq_map->map[old_gqid] != dqid)) {
+ /* Unset old mapping in (new) table */
+ gq_map->map[old_gqid] = NO_QUEUE;
+ gq_map->set_count--;
+ }
+ }
+
+ if (gq_map != old_gq_map) {
+ rcu_assign_pointer(*pmap, gq_map);
+ if (old_gq_map)
+ call_rcu(&old_gq_map->rcu, queue_map_release);
+ }
+
+ /* Save for caller */
+ *p_gqid = gqid;
+
+out:
+ mutex_unlock(&global_mapping_table);
+
+ return ret;
+}
+
+static ssize_t show_rx_queue_global_mapping(struct netdev_rx_queue *queue,
+ char *buf)
+{
+ u16 gqid = queue->gqid;
+
+ if (gqid == NO_QUEUE)
+ return sprintf(buf, "none\n");
+ else
+ return sprintf(buf, "%u\n", gqid);
+}
+
+static ssize_t store_rx_queue_global_mapping(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ unsigned long gqid;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &gqid);
+ if (ret < 0)
+ return ret;
+
+ if (gqid > RPS_MAX_QID || WARN_ON(queue->index > RPS_MAX_QID))
+ return -EINVAL;
+
+ ret = set_device_queue_mapping(&queue->dev->rx_gqueue_map,
+ gqid, queue->index, &queue->gqid);
+ return ret ? : len;
+}
+
static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
= __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);
static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
= __ATTR(rps_flow_cnt, 0644,
show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+static struct rx_queue_attribute rx_queue_global_mapping_attribute __ro_after_init =
+ __ATTR(global_queue_mapping, 0644,
+ show_rx_queue_global_mapping, store_rx_queue_global_mapping);
#endif /* CONFIG_RPS */
static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
#ifdef CONFIG_RPS
&rps_cpus_attribute.attr,
&rps_dev_flow_table_cnt_attribute.attr,
+ &rx_queue_global_mapping_attribute.attr,
#endif
NULL
};
@@ -896,8 +1044,11 @@ static void rx_queue_release(struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
#ifdef CONFIG_RPS
- struct rps_map *map;
struct rps_dev_flow_table *flow_table;
+ struct rps_map *map;
+
+ set_device_queue_mapping(&queue->dev->rx_gqueue_map, NO_QUEUE,
+ queue->index, &queue->gqid);
map = rcu_dereference_protected(queue->rps_map, 1);
if (map) {
@@ -1152,6 +1303,45 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
sprintf(buf, "%u\n", tc);
}
+#ifdef CONFIG_RPS
+static ssize_t show_queue_global_queue_mapping(struct netdev_queue *queue,
+ char *buf)
+{
+ u16 gqid = queue->gqid;
+
+ if (gqid == NO_QUEUE)
+ return sprintf(buf, "none\n");
+ else
+ return sprintf(buf, "%u\n", gqid);
+}
+
+static ssize_t store_queue_global_queue_mapping(struct netdev_queue *queue,
+ const char *buf, size_t len)
+{
+ unsigned long gqid;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &gqid);
+ if (ret < 0)
+ return ret;
+
+ if (gqid > RPS_MAX_QID || WARN_ON(queue->index > RPS_MAX_QID))
+ return -EINVAL;
+
+ ret = set_device_queue_mapping(&queue->dev->tx_gqueue_map,
+ gqid, queue->index, &queue->gqid);
+ return ret ? : len;
+}
+
+static struct netdev_queue_attribute global_queue_mapping_attribute __ro_after_init =
+ __ATTR(global_queue_mapping, 0644,
+ show_queue_global_queue_mapping,
+ store_queue_global_queue_mapping);
+#endif
+
#ifdef CONFIG_XPS
static ssize_t tx_maxrate_show(struct netdev_queue *queue,
char *buf)
@@ -1483,6 +1673,9 @@ static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
&queue_trans_timeout.attr,
&queue_traffic_class.attr,
+#ifdef CONFIG_RPS
+ &global_queue_mapping_attribute.attr,
+#endif
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
&xps_rxqs_attribute.attr,
@@ -1496,6 +1689,11 @@ static void netdev_queue_release(struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
+#ifdef CONFIG_RPS
+ set_device_queue_mapping(&queue->dev->tx_gqueue_map, NO_QUEUE,
+ queue->index, &queue->gqid);
+#endif
+
memset(kobj, 0, sizeof(*kobj));
dev_put(queue->dev);
}
--
2.25.1