Date:	Thu, 8 Oct 2015 20:40:11 -0700
From:	Tom Herbert <tom@...bertland.com>
To:	Eric Dumazet <edumazet@...gle.com>
Cc:	"David S . Miller" <davem@...emloft.net>,
	netdev <netdev@...r.kernel.org>,
	Eric Dumazet <eric.dumazet@...il.com>
Subject: Re: [PATCH v3 net-next 1/4] net: SO_INCOMING_CPU setsockopt() support

On Thu, Oct 8, 2015 at 7:33 PM, Eric Dumazet <edumazet@...gle.com> wrote:
> SO_INCOMING_CPU, as added in commit 2c8c56e15df3, was a getsockopt() command
> to fetch the cpu handling incoming packets for a particular TCP flow after
> accept().
>
> This commit adds setsockopt() support and extends the SO_REUSEPORT selection
> logic: if a TCP listener or UDP socket has this option set, a packet is
> delivered to this socket only if the CPU handling the packet matches the
> specified one.
>
> This allows building very efficient TCP servers, using one listener per
> RX queue, as the associated TCP listener should then only accept flows
> handled in softirq by the same cpu.
> This provides optimal NUMA behavior and keeps cpu caches hot.
>
> Note that __inet_lookup_listener() still has to iterate over the list of
> all listeners. The following patch puts sk_refcnt in a different cache line
> so that this iteration only touches shared, read-mostly cache lines.
>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
>  include/net/sock.h          | 10 ++++------
>  net/core/sock.c             |  5 +++++
>  net/ipv4/inet_hashtables.c  |  2 ++
>  net/ipv4/udp.c              |  6 +++++-
>  net/ipv6/inet6_hashtables.c |  2 ++
>  net/ipv6/udp.c              | 11 +++++++----
>  6 files changed, 25 insertions(+), 11 deletions(-)
>
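
For anyone wanting to try this from userspace: a minimal sketch of the
intended usage, i.e. one SO_REUSEPORT listener per RX queue, each pinned
with the new SO_INCOMING_CPU setsockopt(). This assumes SO_INCOMING_CPU is
visible in the userspace socket headers (define it by hand on older ones);
the helper name is made up for illustration and error handling is omitted.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_CPU
#define SO_INCOMING_CPU 49	/* asm-generic value; a few arches differ */
#endif

/* Hypothetical helper: one listener on @port, pinned to @cpu. */
static int make_pinned_listener(int cpu, int port)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
	/* Only deliver flows whose softirq processing ran on @cpu. */
	setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	listen(fd, SOMAXCONN);

	return fd;	/* accept() from a thread affined to @cpu */
}

Calling this once per cpu, with each accept() loop affined to the matching
cpu, is what gives the NUMA and cache locality described above.
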
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dfe2eb8e1132..08abffe32236 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -150,6 +150,7 @@ typedef __u64 __bitwise __addrpair;
>   *     @skc_node: main hash linkage for various protocol lookup tables
>   *     @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
>   *     @skc_tx_queue_mapping: tx queue number for this connection
> + *     @skc_incoming_cpu: record/match cpu processing incoming packets
>   *     @skc_refcnt: reference count
>   *
>   *     This is the minimal network layer representation of sockets, the header
> @@ -212,6 +213,8 @@ struct sock_common {
>                 struct hlist_nulls_node skc_nulls_node;
>         };
>         int                     skc_tx_queue_mapping;
> +       int                     skc_incoming_cpu;
> +
>         atomic_t                skc_refcnt;
>         /* private: */
>         int                     skc_dontcopy_end[0];
> @@ -274,7 +277,6 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> -  *    @sk_incoming_cpu: record cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_timer: sock cleanup timer
> @@ -331,6 +333,7 @@ struct sock {
>  #define sk_v6_daddr            __sk_common.skc_v6_daddr
>  #define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
>  #define sk_cookie              __sk_common.skc_cookie
> +#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
>
>         socket_lock_t           sk_lock;
>         struct sk_buff_head     sk_receive_queue;
> @@ -353,11 +356,6 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> -       u16                     sk_incoming_cpu;
> -       /* 16bit hole
> -        * Warned : sk_incoming_cpu can be set from softirq,
> -        * Do not use this hole without fully understanding possible issues.
> -        */
>
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 7dd1263e4c24..1071f9380250 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -988,6 +988,10 @@ set_rcvbuf:
>                                          sk->sk_max_pacing_rate);
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               sk->sk_incoming_cpu = val;
> +               break;
> +
>         default:
>                 ret = -ENOPROTOOPT;
>                 break;
> @@ -2353,6 +2357,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>
>         sk->sk_max_pacing_rate = ~0U;
>         sk->sk_pacing_rate = ~0U;
> +       sk->sk_incoming_cpu = -1;
>         /*
>          * Before updating sk_refcnt, we must commit prior changes to memory
>          * (Documentation/RCU/rculist_nulls.txt for details)
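
For completeness, the pre-existing getsockopt() side (from commit
2c8c56e15df3, mentioned in the changelog) can be used to check which cpu
actually handled an accepted flow. A small sketch, same assumptions as
above, no error handling:

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_CPU
#define SO_INCOMING_CPU 49
#endif

/* Print the cpu that handled this accepted flow. */
static void report_incoming_cpu(int connfd)
{
	int cpu = -1;
	socklen_t len = sizeof(cpu);

	if (getsockopt(connfd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) == 0)
		printf("flow handled on cpu %d\n", cpu);
}
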
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index bed8886a4b6c..08643a3616af 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -185,6 +185,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score += 4;
>                 }
> +               if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +                       score++;
>         }
>         return score;
>  }
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index e1fc129099ea..24ec14f9825c 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                         return -1;
>                 score += 4;
>         }
> -
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
>         return score;
>  }
>
> @@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score += 4;
>         }
>
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
> +
>         return score;
>  }
>
> diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
> index 6ac8dad0138a..21ace5a2bf7c 100644
> --- a/net/ipv6/inet6_hashtables.c
> +++ b/net/ipv6/inet6_hashtables.c
> @@ -114,6 +114,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                                 return -1;
>                         score++;
>                 }
> +               if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +                       score++;
>         }
>         return score;
>  }
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 0aba654f5b91..01bcb49619ee 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -182,10 +182,12 @@ static inline int compute_score(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
> +
>         return score;
>  }
>
> -#define SCORE2_MAX (1 + 1 + 1)
>  static inline int compute_score2(struct sock *sk, struct net *net,
>                                  const struct in6_addr *saddr, __be16 sport,
>                                  const struct in6_addr *daddr,
> @@ -223,6 +225,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
>                 score++;
>         }
>
> +       if (sk->sk_incoming_cpu == raw_smp_processor_id())
> +               score++;
> +
>         return score;
>  }
>
> @@ -251,8 +256,7 @@ begin:
>                                 hash = udp6_ehashfn(net, daddr, hnum,
>                                                     saddr, sport);
>                                 matches = 1;
> -                       } else if (score == SCORE2_MAX)
> -                               goto exact_match;
> +                       }

Do we care about losing this optimization? It's not done in IPv4, but I
can imagine there is an argument that address comparisons in IPv6 are more
expensive, hence this might make sense...

>                 } else if (score == badness && reuseport) {
>                         matches++;
>                         if (reciprocal_scale(hash, matches) == 0)
> @@ -269,7 +273,6 @@ begin:
>                 goto begin;
>
>         if (result) {
> -exact_match:
>                 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
>                         result = NULL;
>                 else if (unlikely(compute_score2(result, net, saddr, sport,
> --
> 2.6.0.rc2.230.g3dd15c0
>