Message-ID: <65634d661003110911w4b23a962gb79955ff2cc19db3@mail.gmail.com>
Date: Thu, 11 Mar 2010 09:11:14 -0800
From: Tom Herbert <therbert@...gle.com>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: davem@...emloft.net, netdev@...r.kernel.org,
bhutchings@...arflare.com, shemminger@...tta.com
Subject: Re: [PATCH v6] rps: Receive Packet Steering
Eric, thanks for the great comments!
>
> - When a netdevice is freed, I believe you leak rps_maps if they were
> previously allocated. I found following patch necessary to fix it.
>
Yes.
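For reference, a minimal sketch of what the fix has to do (your patch
below folds it into rx_queue_release()): any map still attached gets a
deferred free, since readers may still hold RCU references.

	struct rps_map *map = queue->rps_map;

	if (map)
		call_rcu(&map->rcu, rps_map_release);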
> - Maybe use max(RPS_MAP_SIZE(x), L1_CACHE_BYTES) for allocations to
> avoid false sharing for small maps. (Otherwise, the rest of the cache
> line might be given to a user that dirties it at a high rate.)
>
Okay.
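So the allocation in store_rps_map() becomes something like this (the
same pattern your patch below uses):

	/* Round small maps up to a full cache line so the rest of
	 * the line cannot be handed to another, write-heavy user. */
	map = kzalloc(max_t(unsigned,
			    RPS_MAP_SIZE(cpumask_weight(mask)),
			    L1_CACHE_BYTES),
		      GFP_KERNEL);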
> - "struct netdev_rx_queue" being only read in fast path, I am not sure
> an alignement is necessary, apart the false sharing that can be
> separately addressed.
>
If we get rid of the double indirection like you describe below, then
this can be allocated in the same way dev->_tx is (kcalloc). Also,
this might be a good structure for writing stats, etc.
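Roughly, as a sketch (ignoring the cache-line rounding you suggest
above):

	/* One contiguous array, same pattern as dev->_tx. */
	dev->_rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue),
			   GFP_KERNEL);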
>
> - A double indirection to get a struct netdev_rx_queue pointer is
> expensive: not for heavy-load benchmarks, but for latency on
> seldom-used devices. I understand you added this for deferred kobject
> release and kfreeing, but this seems a high price to pay in our fast
> path. I used another solution.
>
Yes, this is much better. The kobject documentation has dire warnings
about placing more than one kobject in a data structure; I think I
took that too literally!
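To summarize the new scheme for the list: each array element embeds its
own kobject, and element 0 carries a use count, so the last kobject
release frees the whole array:

	/* In rx_queue_release(): the array goes away with the last
	 * element's kobject. */
	if (atomic_dec_and_test(&first->count))
		kfree(first);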
>
> - WARN(1, "Recieved packet on %s for queue %u, "
> -> Received
>
Yes, i before e except after c...
> - Use of cpumask_t for a variable on the stack is discouraged; we should
> use cpumask_var_t if possible. Easy for the show/store commands. (See my
> patch.)
>
Yes.
> Yet, adding a memory allocation in net_rx_action() seems overkill, so we
> might use two static masks per cpu and perform a flip? This would help
> machines with 4096 CPUs: To Be Done?
That is an interesting idea. I'll try it.
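Something like this, perhaps (only a sketch, names made up; it relies
on running in softirq context with preemption disabled):

	static DEFINE_PER_CPU(cpumask_t, rps_masks[2]);
	static DEFINE_PER_CPU(int, rps_mask_toggle);

	/* Return a cleared mask; the one handed out last time may
	 * still be in use (e.g. by a pending IPI), so alternate. */
	static cpumask_t *rps_next_mask(void)
	{
		int *toggle = &__get_cpu_var(rps_mask_toggle);
		cpumask_t *mask = &__get_cpu_var(rps_masks)[*toggle];

		*toggle ^= 1;
		cpumask_clear(mask);
		return mask;
	}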
>
>
> - RTNL to guard the rps map exchange ???
> This sounds like BKL (Big Kernel Lock) syndrome to me:
> trying to avoid it if possible is better, because it won't exist
> anymore in ten years with 0.99 probability.
>
Yes.
>
> For ease of discussion, I cooked following patch on top of yours :
>
I will apply it.
> Thanks !
>
> include/linux/netdevice.h | 8 ++-
> net/core/dev.c | 64 ++++++++++-------------------
> net/core/net-sysfs.c | 78 ++++++++++++++++++++++--------------
> 3 files changed, 77 insertions(+), 73 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 468da0a..3f4a986 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -544,9 +544,11 @@ struct rps_map {
>
> /* This structure contains an instance of an RX queue. */
> struct netdev_rx_queue {
> - struct kobject kobj;
> struct rps_map *rps_map;
> -} ____cacheline_aligned_in_smp;
> + struct kobject kobj;
> + struct netdev_rx_queue *first;
> + atomic_t count;
> +};
>
> /*
> * This structure defines the management hooks for network devices.
> @@ -897,7 +899,7 @@ struct net_device {
>
> struct kset *queues_kset;
>
> - struct netdev_rx_queue **_rx;
> + struct netdev_rx_queue *_rx;
>
> /* Number of RX queues allocated at alloc_netdev_mq() time */
> unsigned int num_rx_queues;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 939b1a2..402f23a 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2195,15 +2195,15 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
> u16 index = skb_get_rx_queue(skb);
> if (unlikely(index >= dev->num_rx_queues)) {
> if (net_ratelimit()) {
> - WARN(1, "Recieved packet on %s for queue %u, "
> + WARN(1, "Received packet on %s for queue %u, "
> "but number of RX queues is %u\n",
> dev->name, index, dev->num_rx_queues);
> }
> goto done;
> }
> - rxqueue = dev->_rx[index];
> + rxqueue = dev->_rx + index;
> } else
> - rxqueue = dev->_rx[0];
> + rxqueue = dev->_rx;
>
> if (!rxqueue->rps_map)
> goto done;
> @@ -5236,20 +5236,17 @@ int register_netdevice(struct net_device *dev)
> * alloc_netdev_mq
> */
>
> - dev->_rx = kzalloc(sizeof(struct netdev_rx_queue *),
> - GFP_KERNEL);
> + dev->_rx = kzalloc(max_t(unsigned,
> + sizeof(struct netdev_rx_queue),
> + L1_CACHE_BYTES),
> + GFP_KERNEL);
> if (!dev->_rx) {
> ret = -ENOMEM;
> goto out;
> }
>
> - dev->_rx[0] = kzalloc(sizeof(struct netdev_rx_queue),
> - GFP_KERNEL);
> - if (!dev->_rx[0]) {
> - kfree(dev->_rx);
> - ret = -ENOMEM;
> - goto out;
> - }
> + dev->_rx[0].first = dev->_rx;
> + atomic_set(&dev->_rx[0].count, 1);
> dev->num_rx_queues = 1;
> }
>
> @@ -5610,7 +5607,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> void (*setup)(struct net_device *), unsigned int queue_count)
> {
> struct netdev_queue *tx;
> - struct netdev_rx_queue **rx;
> + struct netdev_rx_queue *rx;
> struct net_device *dev;
> size_t alloc_size;
> struct net_device *p;
> @@ -5635,37 +5632,30 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
>
> tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
> if (!tx) {
> - printk(KERN_ERR "alloc_netdev: Unable to allocate "
> - "tx qdiscs.\n");
> + pr_err("alloc_netdev: Unable to allocate tx qdiscs.\n");
> goto free_p;
> }
>
> - /*
> - * Allocate RX queue structures, this includes an array of pointers
> - * and netdev_queue_rx stuctures (individually allocated since
> - * each has a kobject).
> - */
> - rx = kzalloc(queue_count *
> - sizeof(struct netdev_rx_queue *), GFP_KERNEL);
> + rx = kzalloc(max_t(unsigned,
> + queue_count * sizeof(struct netdev_rx_queue),
> + L1_CACHE_BYTES),
> + GFP_KERNEL);
> if (!rx) {
> - printk(KERN_ERR "alloc_netdev: Unable to allocate "
> - "rx queues array.\n");
> + pr_err("alloc_netdev: Unable to allocate rx queues.\n");
> goto free_tx;
> }
> - for (i = 0; i < queue_count; i++) {
> - rx[i] = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
> - if (!rx[i]) {
> - printk(KERN_ERR "alloc_netdev: Unable to allocate "
> - "rx queues.\n");
> - goto free_rx;
> - }
> - }
> + atomic_set(&rx->count, queue_count);
> + /*
> + * The use counter is kept in the first element of this array
> + */
> + for (i = 0; i < queue_count; i++)
> + rx[i].first = rx;
>
> dev = PTR_ALIGN(p, NETDEV_ALIGN);
> dev->padded = (char *)dev - (char *)p;
>
> if (dev_addr_init(dev))
> - goto free_tx;
> + goto free_rx;
>
> dev_unicast_init(dev);
>
> @@ -5693,8 +5683,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
> return dev;
>
> free_rx:
> - for (i = 0; i < queue_count; i++)
> - kfree(rx[i]);
> kfree(rx);
> free_tx:
> kfree(tx);
> @@ -5720,12 +5708,6 @@ void free_netdev(struct net_device *dev)
>
> kfree(dev->_tx);
>
> - /*
> - * Free RX queue pointer array. Actual netdev_rx_queue objects are
> - * freed by kobject release.
> - */
> - kfree(dev->_rx);
> -
> /* Flush device addresses */
> dev_addr_flush(dev);
>
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index a07d6ec..de75258 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -516,24 +516,26 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue,
> size_t len = 0;
> struct rps_map *map;
> int i;
> - cpumask_t mask;
> + cpumask_var_t mask;
>
> - cpus_clear(mask);
> + if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
>
> rcu_read_lock();
> map = rcu_dereference(queue->rps_map);
> if (map) {
> for (i = 0; i < map->len; i++)
> - cpu_set(map->cpus[i], mask);
> + cpumask_set_cpu(map->cpus[i], mask);
>
> - len += cpumask_scnprintf(buf + len, PAGE_SIZE, &mask);
> + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
> if (PAGE_SIZE - len < 3) {
> rcu_read_unlock();
> + free_cpumask_var(mask);
> return -EINVAL;
> }
> }
> rcu_read_unlock();
> -
> + free_cpumask_var(mask);
> len += sprintf(buf + len, "\n");
> return len;
> }
> @@ -550,38 +552,48 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
> const char *buf, size_t len)
> {
> struct rps_map *old_map, *map;
> - cpumask_t mask;
> - int err, cpu, i, weight;
> + cpumask_var_t mask;
> + int err, cpu, i;
> + static DEFINE_SPINLOCK(rps_map_lock);
>
> if (!capable(CAP_NET_ADMIN))
> return -EPERM;
>
> - err = bitmap_parse(buf, len, cpumask_bits(&mask), nr_cpumask_bits);
> - if (err)
> + if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
> + if (err) {
> + free_cpumask_var(mask);
> return err;
> + }
> + map = kzalloc(max_t(unsigned,
> + RPS_MAP_SIZE(cpumask_weight(mask)),
> + L1_CACHE_BYTES),
> + GFP_KERNEL);
> + if (!map) {
> + free_cpumask_var(mask);
> + return -ENOMEM;
> + }
> + i = 0;
> + for_each_cpu_and(cpu, mask, cpu_online_mask)
> + map->cpus[i++] = cpu;
> + if (i)
> + map->len = i;
> + else {
> + kfree(map);
> + map = NULL;
> + }
>
> - weight = cpumask_weight(&mask);
> - if (weight) {
> - map = kzalloc(RPS_MAP_SIZE(weight), GFP_KERNEL);
> - if (!map)
> - return -ENOMEM;
> -
> - cpus_and(mask, mask, cpu_online_map);
> - i = 0;
> - for_each_cpu_mask(cpu, mask)
> - map->cpus[i++] = cpu;
> - map->len = i;
> - } else
> - map = NULL;
> -
> - rtnl_lock();
> + spin_lock(&rps_map_lock);
> old_map = queue->rps_map;
> rcu_assign_pointer(queue->rps_map, map);
> - rtnl_unlock();
> + spin_unlock(&rps_map_lock);
>
> if (old_map)
> call_rcu(&old_map->rcu, rps_map_release);
>
> + free_cpumask_var(mask);
> return len;
> }
>
> @@ -596,8 +608,16 @@ static struct attribute *rx_queue_default_attrs[] = {
> static void rx_queue_release(struct kobject *kobj)
> {
> struct netdev_rx_queue *queue = to_rx_queue(kobj);
> + struct rps_map *map = queue->rps_map;
> + struct netdev_rx_queue *first = queue->first;
>
> - kfree(queue);
> + if (map)
> + call_rcu(&map->rcu, rps_map_release);
> + /*
> + * Free the array containing us only if all elems were released
> + */
> + if (atomic_dec_and_test(&first->count))
> + kfree(first);
> }
>
> static struct kobj_type rx_queue_ktype = {
> @@ -609,7 +629,7 @@ static struct kobj_type rx_queue_ktype = {
> static int rx_queue_add_kobject(struct net_device *net, int index)
> {
> int error = 0;
> - struct netdev_rx_queue *queue = net->_rx[index];
> + struct netdev_rx_queue *queue = net->_rx + index;
> struct kobject *kobj = &queue->kobj;
>
> kobj->kset = net->queues_kset;
> @@ -642,7 +662,7 @@ static int rx_queue_register_kobjects(struct net_device *net)
>
> if (error)
> while (--i >= 0)
> - kobject_put(&net->_rx[i]->kobj);
> + kobject_put(&net->_rx[i].kobj);
>
> return error;
> }
> @@ -652,7 +672,7 @@ static void rx_queue_remove_kobjects(struct net_device *net)
> int i;
>
> for (i = 0; i < net->num_rx_queues; i++)
> - kobject_put(&net->_rx[i]->kobj);
> + kobject_put(&net->_rx[i].kobj);
> kset_unregister(net->queues_kset);
> }
>
>
>