Message-ID: <65634d661003110911w4b23a962gb79955ff2cc19db3@mail.gmail.com>
Date: Thu, 11 Mar 2010 09:11:14 -0800
From: Tom Herbert <therbert@...gle.com>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: davem@...emloft.net, netdev@...r.kernel.org,
bhutchings@...arflare.com, shemminger@...tta.com
Subject: Re: [PATCH v6] rps: Receive Packet Steering
Eric, thanks for the great comments!
>
> - When a netdevice is freed, I believe you leak rps_maps if they were
> previously allocated. I found following patch necessary to fix it.
>
Yes.
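For reference, a minimal sketch of what the fix has to do (your patch
below folds it into rx_queue_release()): any map still attached gets a
deferred free, since readers may still hold RCU references.

	struct rps_map *map = queue->rps_map;

	if (map)
		call_rcu(&map->rcu, rps_map_release);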
> - Maybe use max(RPS_MAP_SIZE(x), L1_CACHE_BYTES) for allocations to
> avoid false sharing for small maps. (Otherwise, the rest of the cache
> line might be given to a user that dirties it at a high rate.)
>
Okay.
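So the allocation in store_rps_map() becomes something like this (the
same pattern your patch below uses):

	/* Round small maps up to a full cache line so the rest of
	 * the line cannot be handed to another, write-heavy user. */
	map = kzalloc(max_t(unsigned,
			    RPS_MAP_SIZE(cpumask_weight(mask)),
			    L1_CACHE_BYTES),
		      GFP_KERNEL);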
> - "struct netdev_rx_queue" being only read in fast path, I am not sure
> an alignement is necessary, apart the false sharing that can be
> separately addressed.
>
If we get rid of the double indirection like you describe below, then
this can be allocated in the same way dev->_tx is (kcalloc). Also,
this might be a good structure for writing stats, etc.
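Roughly, as a sketch (ignoring the cache-line rounding you suggest
above):

	/* One contiguous array, same pattern as dev->_tx. */
	dev->_rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue),
			   GFP_KERNEL);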
>
> - A double indirection to get a struct netdev_rx_queue pointer is
> expensive: not for heavy-load benchmarks, but for latency on
> seldom-used devices. I understand you added this for deferred kobject
> release and kfreeing, but this seems a high price to pay in our fast
> path. I used another solution.
>
Yes, this is much better. The kobject documentation has dire warnings
about placing more than one kobject in a data structure; I think I
took that too literally!
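To summarize the new scheme for the list: each array element embeds its
own kobject, and element 0 carries a use count, so the last kobject
release frees the whole array:

	/* In rx_queue_release(): the array goes away with the last
	 * element's kobject. */
	if (atomic_dec_and_test(&first->count))
		kfree(first);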
>
> - WARN(1, "Recieved packet on %s for queue %u, "
> -> Received
>
Yes, i before e except after c...
> - Use of cpumask_t for a variable on the stack is discouraged; we should
> use cpumask_var_t if possible. Easy for the show/store commands. (See my
> patch.)
>
Yes.
> Yet, adding a memory allocation in net_rx_action() seems overkill, so we
> might use two static masks per cpu and perform a flip? This would help
> machines with 4096 CPUs: To Be Done?
That is an interesting idea. I'll try it.
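Something like this, perhaps (only a sketch, names made up; it relies
on running in softirq context with preemption disabled):

	static DEFINE_PER_CPU(cpumask_t, rps_masks[2]);
	static DEFINE_PER_CPU(int, rps_mask_toggle);

	/* Return a cleared mask; the one handed out last time may
	 * still be in use (e.g. by a pending IPI), so alternate. */
	static cpumask_t *rps_next_mask(void)
	{
		int *toggle = &__get_cpu_var(rps_mask_toggle);
		cpumask_t *mask = &__get_cpu_var(rps_masks)[*toggle];

		*toggle ^= 1;
		cpumask_clear(mask);
		return mask;
	}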
>
>
> - RTNL to guard the rps map exchange ???
> This sounds like BKL (Big Kernel Lock) syndrome to me:
> trying to avoid it if possible is better, because it won't exist
> anymore in ten years with 0.99 probability.
>
Yes.
>
> For ease of discussion, I cooked following patch on top of yours :
>
I will apply it.
> Thanks !
>
> include/linux/netdevice.h | 8 ++-
> net/core/dev.c | 64 ++++++++++-------------------
> net/core/net-sysfs.c | 78 ++++++++++++++++++++++--------------
> 3 files changed, 77 insertions(+), 73 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 468da0a..3f4a986 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -544,9 +544,11 @@ struct rps_map {
>
> /* This structure contains an instance of an RX queue. */
> struct netdev_rx_queue {
> - struct kobject kobj;
> struct rps_map *rps_map;
> -} ____cacheline_aligned_in_smp;
> + struct kobject kobj;
> + struct netdev_rx_queue *first;
> + atomic_t count;
> +};
>
> /*
> * This structure defines the management hooks for network devices.
> @@ -897,7 +899,7 @@ struct net_device {
>
> struct kset *queues_kset;
>
> - struct netdev_rx_queue **_rx;
> + struct netdev_rx_queue *_rx;
>
> /* Number of RX queues allocated at alloc_netdev_mq() time */
> unsigned int num_rx_queues;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 939b1a2..402f23a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2195,15 +2195,15 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> u16 index = skb_get_rx_queue(skb);
> if (unlikely(index >= dev->num_rx_queues)) {
> if (net_ratelimit()) {
> - WARN(1, "Recieved packet on %s for queue %u, "
> + WARN(1, "Received packet on %s for queue %u, "
> "but number of RX queues is %u\n",
> dev->name, index, dev->num_rx_queues);
> }
> goto done;
> }
> - rxqueue = dev->_rx[index];
> + rxqueue = dev->_rx + index;
> } else
> - rxqueue = dev->_rx[0];
> + rxqueue = dev->_rx;
>
> if (!rxqueue->rps_map)
> goto done;
> @@ -5236,20 +5236,17 @@ int register_netdevice(struct net_device *dev)
> * alloc_netdev_mq
> */
>
> - dev->_rx = kzalloc(sizeof(struct netdev_rx_queue *),
> - GFP_KERNEL);
> + dev->_rx = kzalloc(max_t(unsigned,
> + sizeof(struct netdev_rx_queue),
> + L1_CACHE_BYTES),
> + GFP_KERNEL);
> if (!dev->_rx) {
> ret = -ENOMEM;
> goto out;
> }
>
> - dev->_rx[0] = kzalloc(sizeof(struct netdev_rx_queue),
> - GFP_KERNEL);
> - if (!dev->_rx[0]) {
> - kfree(dev->_rx);
> - ret = -ENOMEM;
> - goto out;
> - }
> + dev->_rx[0].first = dev->_rx;
> + atomic_set(&dev->_rx[0].count, 1);
> dev->num_rx_queues = 1;
> }
>
> @@ -5610,7 +5607,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> void (*setup)(struct net_device *), unsigned int queue_count)
> {
> struct netdev_queue *tx;
> - struct netdev_rx_queue **rx;
> + struct netdev_rx_queue *rx;
> struct net_device *dev;
> size_t alloc_size;
> struct net_device *p;
> @@ -5635,37 +5632,30 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
> tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
> if (!tx) {
> - printk(KERN_ERR "alloc_netdev: Unable to allocate "
> - "tx qdiscs.\n");
> + pr_err("alloc_netdev: Unable to allocate tx qdiscs.\n");
> goto free_p;
> }
>
> - /*
> - * Allocate RX queue structures, this includes an array of pointers
> - * and netdev_queue_rx stuctures (individually allocated since
> - * each has a kobject).
> - */
> - rx = kzalloc(queue_count *
> - sizeof(struct netdev_rx_queue *), GFP_KERNEL);
> + rx = kzalloc(max_t(unsigned,
> + queue_count * sizeof(struct netdev_rx_queue),
> + L1_CACHE_BYTES),
> + GFP_KERNEL);
> if (!rx) {
> - printk(KERN_ERR "alloc_netdev: Unable to allocate "
> - "rx queues array.\n");
> + pr_err("alloc_netdev: Unable to allocate rx queues.\n");
> goto free_tx;
> }
> - for (i = 0; i < queue_count; i++) {
> - rx[i] = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
> - if (!rx[i]) {
> - printk(KERN_ERR "alloc_netdev: Unable to allocate "
> - "rx queues.\n");
> - goto free_rx;
> - }
> - }
> + atomic_set(&rx->count, queue_count);
> + /*
> + * The use counter is kept in the first element of this array
> + */
> + for (i = 0; i < queue_count; i++)
> + rx[i].first = rx;
>
> dev = PTR_ALIGN(p, NETDEV_ALIGN);
> dev->padded = (char *)dev - (char *)p;
>
> if (dev_addr_init(dev))
> - goto free_tx;
> + goto free_rx;
>
> dev_unicast_init(dev);
>
> @@ -5693,8 +5683,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> return dev;
>
> free_rx:
> - for (i = 0; i < queue_count; i++)
> - kfree(rx[i]);
> kfree(rx);
> free_tx:
> kfree(tx);
> @@ -5720,12 +5708,6 @@ void free_netdev(struct net_device *dev)
>
> kfree(dev->_tx);
>
> - /*
> - * Free RX queue pointer array. Actual netdev_rx_queue objects are
> - * freed by kobject release.
> - */
> - kfree(dev->_rx);
> -
> /* Flush device addresses */
> dev_addr_flush(dev);
>
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index a07d6ec..de75258 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -516,24 +516,26 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,
> size_t len = 0;
> struct rps_map *map;
> int i;
> - cpumask_t mask;
> + cpumask_var_t mask;
>
> - cpus_clear(mask);
> + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
>
> rcu_read_lock();
> map = rcu_dereference(queue->rps_map);
> if (map) {
> for (i = 0; i < map->len; i++)
> - cpu_set(map->cpus[i], mask);
> + cpumask_set_cpu(map->cpus[i], mask);
>
> - len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
> + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
> if (PAGE_SIZE - len < 3) {
> rcu_read_unlock();
> + free_cpumask_var(mask);
> return -EINVAL;
> }
> }
> rcu_read_unlock();
> -
> + free_cpumask_var(mask);
> len += sprintf(buf + len, "\n");
> return len;
> }
> @@ -550,38 +552,48 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
> const char *buf, size_t len)
> {
> struct rps_map *old_map, *map;
> - cpumask_t mask;
> - int err, cpu, i, weight;
> + cpumask_var_t mask;
> + int err, cpu, i;
> + static DEFINE_SPINLOCK(rps_map_lock);
>
> if (!capable(CAP_NET_ADMIN))
> return -EPERM;
>
> - err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
> - if (err)
> + if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
> + if (err) {
> + free_cpumask_var(mask);
> return err;
> + }
> + map = kzalloc(max_t(unsigned,
> + RPS_MAP_SIZE(cpumask_weight(mask)),
> + L1_CACHE_BYTES),
> + GFP_KERNEL);
> + if (!map) {
> + free_cpumask_var(mask);
> + return -ENOMEM;
> + }
> + i = 0;
> + for_each_cpu_and(cpu, mask, cpu_online_mask)
> + map->cpus[i++] = cpu;
> + if (i)
> + map->len = i;
> + else {
> + kfree(map);
> + map = NULL;
> + }
>
> - weight = cpumask_weight(&mask);
> - if (weight) {
> - map = kzalloc(RPS_MAP_SIZE(weight), GFP_KERNEL);
> - if (!map)
> - return -ENOMEM;
> -
> - cpus_and(mask, mask, cpu_online_map);
> - i = 0;
> - for_each_cpu_mask(cpu, mask)
> - map->cpus[i++] = cpu;
> - map->len = i;
> - } else
> - map = NULL;
> -
> - rtnl_lock();
> + spin_lock(&rps_map_lock);
> old_map = queue->rps_map;
> rcu_assign_pointer(queue->rps_map, map);
> - rtnl_unlock();
> + spin_unlock(&rps_map_lock);
>
> if (old_map)
> call_rcu(&old_map->rcu, rps_map_release);
>
> + free_cpumask_var(mask);
> return len;
> }
>
> @@ -596,8 +608,16 @@ static struct attribute *rx_queue_default_attrs[] = {
> static void rx_queue_release(struct kobject *kobj)
> {
> struct netdev_rx_queue *queue = to_rx_queue(kobj);
> + struct rps_map *map = queue->rps_map;
> + struct netdev_rx_queue *first = queue->first;
>
> - kfree(queue);
> + if (map)
> + call_rcu(&map->rcu, rps_map_release);
> + /*
> + * Free the array containing us only if all elems were released
> + */
> + if (atomic_dec_and_test(&first->count))
> + kfree(first);
> }
>
> static struct kobj_type rx_queue_ktype = {
> @@ -609,7 +629,7 @@ static struct kobj_type rx_queue_ktype = {
> static int rx_queue_add_kobject(struct net_device *net, int index)
> {
> int error = 0;
> - struct netdev_rx_queue *queue = net->_rx[index];
> + struct netdev_rx_queue *queue = net->_rx + index;
> struct kobject *kobj = &queue->kobj;
>
> kobj->kset = net->queues_kset;
> @@ -642,7 +662,7 @@ static int rx_queue_register_kobjects(struct net_device *net)
>
> if (error)
> while (--i >= 0)
> - kobject_put(&net->_rx[i]->kobj);
> + kobject_put(&net->_rx[i].kobj);
>
> return error;
> }
> @@ -652,7 +672,7 @@ static void rx_queue_remove_kobjects(struct net_device *net)
> int i;
>
> for (i = 0; i < net->num_rx_queues; i++)
> - kobject_put(&net->_rx[i]->kobj);
> + kobject_put(&net->_rx[i].kobj);
> kset_unregister(net->queues_kset);
> }
>
>
>