netdev - Re: [PATCH net-next] net: rfs: add hash collision detection

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 6 Feb 2015 14:21:29 -0800
From:	Tom Herbert <therbert@...gle.com>
To:	Eric Dumazet <eric.dumazet@...il.com>
Cc:	David Miller <davem@...emloft.net>,
	netdev <netdev@...r.kernel.org>, Ying Cai <ycai@...gle.com>,
	Willem de Bruijn <willemb@...gle.com>
Subject: Re: [PATCH net-next] net: rfs: add hash collision detection

On Fri, Feb 6, 2015 at 12:59 PM, Eric Dumazet <eric.dumazet@...il.com> wrote:
> From: Eric Dumazet <edumazet@...gle.com>
>
> Receive Flow Steering is a nice solution but suffers from
> hash collisions when a mix of connected and unconnected traffic
> is received on the host, when flow hash table is populated.
>
> Also, clearing flow in inet_release() makes RFS not very good
> for short lived flows, as many packets can follow close().
> (FIN , ACK packets, ...)
>
> This patch extends the information stored into global hash table
> to not only include cpu number, but upper part of the hash value.
>
> I use a 32bit value, and dynamically split it in two parts.
>
> For host with less than 64 possible cpus, this gives 6 bits for the
> cpu number, and 26 (32-6) bits for the upper part of the hash.
>
> Since hash bucket selection use low order bits of the hash, we have
> a full hash match, if /proc/sys/net/core/rps_sock_flow_entries is big
> enough.
>
> If the hash found in flow table does not match, we fallback to RPS (if
> it is enabled for the rxqueue).
>
> This means that a packet for an non connected flow can avoid the
> IPI through a unrelated/victim CPU.
>
> This also means we no longer have to clear the table at socket
> close time, and this helps short lived flows performance.
>

Acked-by: Tom Herbert <therbert@...gle.com>

Eric, looks awesome! Can you share any performance numbers?

Thanks,
Tom

> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
>  drivers/net/tun.c          |    5 ---
>  include/linux/netdevice.h  |   34 ++++++++++++------------
>  include/net/sock.h         |   24 -----------------
>  net/core/dev.c             |   48 +++++++++++++++++++----------------
>  net/core/sysctl_net_core.c |    2 -
>  net/ipv4/af_inet.c         |    2 -
>  6 files changed, 47 insertions(+), 68 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ad7d3d5f3ee5..857dca47bf80 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
>  {
>         tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
>                   e->rxhash, e->queue_index);
> -       sock_rps_reset_flow_hash(e->rps_rxhash);
>         hlist_del_rcu(&e->hash_link);
>         kfree_rcu(e, rcu);
>         --tun->flow_count;
> @@ -373,10 +372,8 @@ unlock:
>   */
>  static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
>  {
> -       if (unlikely(e->rps_rxhash != hash)) {
> -               sock_rps_reset_flow_hash(e->rps_rxhash);
> +       if (unlikely(e->rps_rxhash != hash))
>                 e->rps_rxhash = hash;
> -       }
>  }
>
>  /* We try to identify a flow through its rxhash first. The reason that
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index ce784d5018e0..ab3b7cef4638 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -644,39 +644,39 @@ struct rps_dev_flow_table {
>  /*
>   * The rps_sock_flow_table contains mappings of flows to the last CPU
>   * on which they were processed by the application (set in recvmsg).
> + * Each entry is a 32bit value. Upper part is the high order bits
> + * of flow hash, lower part is cpu number.
> + * rps_cpu_mask is used to partition the space, depending on number of
> + * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
> + * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
> + * meaning we use 32-6=26 bits for the hash.
>   */
>  struct rps_sock_flow_table {
> -       unsigned int mask;
> -       u16 ents[0];
> +       u32     mask;
> +       u32     ents[0];
>  };
> -#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
> -    ((_num) * sizeof(u16)))
> +#define        RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
>
>  #define RPS_NO_CPU 0xffff
>
> +extern u32 rps_cpu_mask;
> +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
> +
>  static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
>                                         u32 hash)
>  {
>         if (table && hash) {
> -               unsigned int cpu, index = hash & table->mask;
> +               unsigned int index = hash & table->mask;
> +               u32 val = hash & ~rps_cpu_mask;
>
>                 /* We only give a hint, preemption can change cpu under us */
> -               cpu = raw_smp_processor_id();
> +               val |= raw_smp_processor_id();
>
> -               if (table->ents[index] != cpu)
> -                       table->ents[index] = cpu;
> +               if (table->ents[index] != val)
> +                       table->ents[index] = val;
>         }
>  }
>
> -static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
> -                                      u32 hash)
> -{
> -       if (table && hash)
> -               table->ents[hash & table->mask] = RPS_NO_CPU;
> -}
> -
> -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
> -
>  #ifdef CONFIG_RFS_ACCEL
>  bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
>                          u16 filter_id);
> diff --git a/include/net/sock.h b/include/net/sock.h
> index d28b8fededd6..e13824570b0f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
>  #endif
>  }
>
> -static inline void sock_rps_reset_flow_hash(__u32 hash)
> -{
> -#ifdef CONFIG_RPS
> -       struct rps_sock_flow_table *sock_flow_table;
> -
> -       rcu_read_lock();
> -       sock_flow_table = rcu_dereference(rps_sock_flow_table);
> -       rps_reset_sock_flow(sock_flow_table, hash);
> -       rcu_read_unlock();
> -#endif
> -}
> -
>  static inline void sock_rps_record_flow(const struct sock *sk)
>  {
>  #ifdef CONFIG_RPS
> @@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
>  #endif
>  }
>
> -static inline void sock_rps_reset_flow(const struct sock *sk)
> -{
> -#ifdef CONFIG_RPS
> -       sock_rps_reset_flow_hash(sk->sk_rxhash);
> -#endif
> -}
> -
>  static inline void sock_rps_save_rxhash(struct sock *sk,
>                                         const struct sk_buff *skb)
>  {
>  #ifdef CONFIG_RPS
> -       if (unlikely(sk->sk_rxhash != skb->hash)) {
> -               sock_rps_reset_flow(sk);
> +       if (unlikely(sk->sk_rxhash != skb->hash))
>                 sk->sk_rxhash = skb->hash;
> -       }
>  #endif
>  }
>
>  static inline void sock_rps_reset_rxhash(struct sock *sk)
>  {
>  #ifdef CONFIG_RPS
> -       sock_rps_reset_flow(sk);
>         sk->sk_rxhash = 0;
>  #endif
>  }
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a3a96ffc67f4..8be38675e1a8 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
>  /* One global table that all flow-based protocols share. */
>  struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
>  EXPORT_SYMBOL(rps_sock_flow_table);
> +u32 rps_cpu_mask __read_mostly;
> +EXPORT_SYMBOL(rps_cpu_mask);
>
>  struct static_key rps_needed __read_mostly;
>
> @@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>                        struct rps_dev_flow **rflowp)
>  {
> -       struct netdev_rx_queue *rxqueue;
> -       struct rps_map *map;
> +       const struct rps_sock_flow_table *sock_flow_table;
> +       struct netdev_rx_queue *rxqueue = dev->_rx;
>         struct rps_dev_flow_table *flow_table;
> -       struct rps_sock_flow_table *sock_flow_table;
> +       struct rps_map *map;
>         int cpu = -1;
> -       u16 tcpu;
> +       u32 tcpu;
>         u32 hash;
>
>         if (skb_rx_queue_recorded(skb)) {
>                 u16 index = skb_get_rx_queue(skb);
> +
>                 if (unlikely(index >= dev->real_num_rx_queues)) {
>                         WARN_ONCE(dev->real_num_rx_queues > 1,
>                                   "%s received packet on queue %u, but number "
> @@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>                                   dev->name, index, dev->real_num_rx_queues);
>                         goto done;
>                 }
> -               rxqueue = dev->_rx + index;
> -       } else
> -               rxqueue = dev->_rx;
> +               rxqueue += index;
> +       }
>
> +       /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
> +
> +       flow_table = rcu_dereference(rxqueue->rps_flow_table);
>         map = rcu_dereference(rxqueue->rps_map);
> -       if (map) {
> -               if (map->len == 1 &&
> -                   !rcu_access_pointer(rxqueue->rps_flow_table)) {
> -                       tcpu = map->cpus[0];
> -                       if (cpu_online(tcpu))
> -                               cpu = tcpu;
> -                       goto done;
> -               }
> -       } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
> +       if (!flow_table && !map)
>                 goto done;
> -       }
>
>         skb_reset_network_header(skb);
>         hash = skb_get_hash(skb);
>         if (!hash)
>                 goto done;
>
> -       flow_table = rcu_dereference(rxqueue->rps_flow_table);
>         sock_flow_table = rcu_dereference(rps_sock_flow_table);
>         if (flow_table && sock_flow_table) {
> -               u16 next_cpu;
>                 struct rps_dev_flow *rflow;
> +               u32 next_cpu;
> +               u32 ident;
> +
> +               /* First check into global flow table if there is a match */
> +               ident = sock_flow_table->ents[hash & sock_flow_table->mask];
> +               if ((ident ^ hash) & ~rps_cpu_mask)
> +                       goto try_rps;
>
> +               next_cpu = ident & rps_cpu_mask;
> +
> +               /* OK, now we know there is a match,
> +                * we can look at the local (per receive queue) flow table
> +                */
>                 rflow = &flow_table->flows[hash & flow_table->mask];
>                 tcpu = rflow->cpu;
>
> -               next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
> -
>                 /*
>                  * If the desired CPU (where last recvmsg was done) is
>                  * different from current CPU (one in the rx-queue flow
> @@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
>                 }
>         }
>
> +try_rps:
> +
>         if (map) {
>                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
>                 if (cpu_online(tcpu)) {
> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
> index fde21d19e61b..7a31be5e361f 100644
> --- a/net/core/sysctl_net_core.c
> +++ b/net/core/sysctl_net_core.c
> @@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
>                                         mutex_unlock(&sock_flow_mutex);
>                                         return -ENOMEM;
>                                 }
> -
> +                               rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
>                                 sock_table->mask = size - 1;
>                         } else
>                                 sock_table = orig_sock_table;
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index a44773c8346c..d2e49baaff63 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
>         if (sk) {
>                 long timeout;
>
> -               sock_rps_reset_flow(sk);
> -
>                 /* Applications forget to leave groups before exiting */
>                 ip_mc_drop_socket(sk);
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html